Skip to content

Commit

Permalink
feat(bigquery): do not send preview field and rows if max_preview_row…
Browse files Browse the repository at this point in the history
…s is -1 (#46)

* feat: adds maxPreviewRows -1 logic to not send previewRows and Fields

* Fix: adds sink config in compass sink for emitUnpopulated

* fix: review comments

* Fix: more comments

* fix: last comment

---------

Co-authored-by: Utsav Agarwal <[email protected]>
  • Loading branch information
utsav14nov and utsav14nov authored Nov 28, 2023
1 parent 16f8852 commit a147080
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 27 deletions.
6 changes: 4 additions & 2 deletions plugins/extractors/bigquery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ source:
config:
project_id: google-project-id
table_pattern: gofood.fact_
max_preview_rows: 3
exclude:
datasets:
- dataset_a
Expand Down Expand Up @@ -49,7 +50,7 @@ source:
| `table_pattern` | `string` | `gofood.fact_` | Regex pattern to filter which bigquery table to scan (whitelist) | *optional* |
| `max_page_size` | `int` | `100` | max page size hint used for fetching datasets/tables/rows from bigquery | *optional* |
| `include_column_profile` | `bool` | `true` | true if you want to profile the column values, such as min, max, med, avg, top, and freq | *optional* |
| `max_preview_rows` | `int` | `30` | max number of preview rows to fetch, `0` will skip preview fetching. Default to `30`. | *optional* |
| `max_preview_rows` | `int` | `30` | max number of preview rows to fetch; `0` skips preview fetching, and `-1` additionally leaves the `preview_fields` and `preview_rows` fields unset in the asset data. Defaults to `30`. | *optional* |
| `mix_values` | `bool` | `false` | true if you want to mix the column values with the preview rows. Default to `false`. | *optional* |
| `collect_table_usage` | `boolean` | `false` | toggle feature to collect table usage, `true` will enable collecting table usage. Default to `false`. | *optional* |
| `usage_period_in_day` | `int` | `7` | collects logs from `(now - usage_period_in_day)` until `now`. Only matters if `collect_table_usage` is true. Defaults to `7`. | *optional* |
Expand All @@ -60,7 +61,8 @@ source:
- Leaving `service_account_json` and `service_account_base64` blank will default
to [Google's default authentication][google-default-auth]. It is
recommended if Meteor instance runs inside the same Google Cloud environment as the BigQuery project.
- Service account needs to have `bigquery.privateLogsViewer` role to be able to collect bigquery audit logs
- Service account needs to have `bigquery.privateLogsViewer` role to be able to collect bigquery audit logs.
- Setting `max_preview_rows` to `-1` skips preview fetching and leaves the `preview_fields` and `preview_rows` fields unset in the asset data.

## Outputs

Expand Down
55 changes: 31 additions & 24 deletions plugins/extractors/bigquery/bigquery.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,21 @@ var summary string
type Config struct {
ProjectID string `mapstructure:"project_id" validate:"required"`
// ServiceAccountBase64 takes precedence over ServiceAccountJSON field
ServiceAccountBase64 string `mapstructure:"service_account_base64"`
ServiceAccountJSON string `mapstructure:"service_account_json"`
MaxPageSize int `mapstructure:"max_page_size"`
DatasetPageSize int `mapstructure:"dataset_page_size"`
TablePageSize int `mapstructure:"table_page_size"`
TablePattern string `mapstructure:"table_pattern"`
Exclude Exclude `mapstructure:"exclude"`
IncludeColumnProfile bool `mapstructure:"include_column_profile"`
MaxPreviewRows int `mapstructure:"max_preview_rows" default:"30"`
MixValues bool `mapstructure:"mix_values" default:"false"`
IsCollectTableUsage bool `mapstructure:"collect_table_usage" default:"false"`
UsagePeriodInDay int64 `mapstructure:"usage_period_in_day" default:"7"`
UsageProjectIDs []string `mapstructure:"usage_project_ids"`
BuildViewLineage bool `mapstructure:"build_view_lineage" default:"false"`
ServiceAccountBase64 string `mapstructure:"service_account_base64"`
ServiceAccountJSON string `mapstructure:"service_account_json"`
MaxPageSize int `mapstructure:"max_page_size"`
DatasetPageSize int `mapstructure:"dataset_page_size"`
TablePageSize int `mapstructure:"table_page_size"`
TablePattern string `mapstructure:"table_pattern"`
Exclude Exclude `mapstructure:"exclude"`
IncludeColumnProfile bool `mapstructure:"include_column_profile"`
// MaxPreviewRows is the maximum number of preview rows to fetch; 0 skips fetching, and -1 additionally leaves PreviewFields and PreviewRows unset in the asset data.
MaxPreviewRows int `mapstructure:"max_preview_rows" default:"30"`
MixValues bool `mapstructure:"mix_values" default:"false"`
IsCollectTableUsage bool `mapstructure:"collect_table_usage" default:"false"`
UsagePeriodInDay int64 `mapstructure:"usage_period_in_day" default:"7"`
UsageProjectIDs []string `mapstructure:"usage_project_ids"`
BuildViewLineage bool `mapstructure:"build_view_lineage" default:"false"`
}

type Exclude struct {
Expand Down Expand Up @@ -444,15 +445,21 @@ func (e *Extractor) buildAsset(ctx context.Context, t *bigquery.Table, md *bigqu
}
}

table, err := anypb.New(&v1beta2.Table{
Columns: e.buildColumns(ctx, md.Schema, md),
PreviewFields: previewFields,
PreviewRows: previewRows,
Profile: tableProfile,
Attributes: utils.TryParseMapToProto(attributesData),
CreateTime: timestamppb.New(md.CreationTime),
UpdateTime: timestamppb.New(md.LastModifiedTime),
})
tableData := &v1beta2.Table{
Columns: e.buildColumns(ctx, md.Schema, md),
Profile: tableProfile,
Attributes: utils.TryParseMapToProto(attributesData),
CreateTime: timestamppb.New(md.CreationTime),
UpdateTime: timestamppb.New(md.LastModifiedTime),
}

maxPreviewRows := e.config.MaxPreviewRows
if maxPreviewRows != -1 {
tableData.PreviewFields = previewFields
tableData.PreviewRows = previewRows
}

table, err := anypb.New(tableData)
if err != nil {
e.logger.Warn("error creating Any struct", "error", err)
}
Expand Down Expand Up @@ -515,7 +522,7 @@ func (e *Extractor) buildColumn(ctx context.Context, field *bigquery.FieldSchema

func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table, md *bigquery.TableMetadata) (fields []string, rows *structpb.ListValue, err error) {
maxPreviewRows := e.config.MaxPreviewRows
if maxPreviewRows == 0 {
if maxPreviewRows <= 0 {
return nil, nil, nil
}

Expand Down
3 changes: 3 additions & 0 deletions plugins/sinks/compass/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ sinks:
labels:
myCustom: $properties.attributes.myCustomField
sampleLabel: $properties.labels.sampleLabelField
remove_unset_fields_in_data: false
```
### *Notes*
- When `remove_unset_fields_in_data` is `true`, fields that are unset in the source data are omitted from the final sink data instead of being emitted with their zero values. Defaults to `false`.
## Contributing

Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-sink) for information on contributing to this module.
4 changes: 3 additions & 1 deletion plugins/sinks/compass/sink.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ type Config struct {
Host string `mapstructure:"host" validate:"required"`
Headers map[string]string `mapstructure:"headers"`
Labels map[string]string `mapstructure:"labels"`
// RemoveUnsetFieldsInData, when true, omits fields that are unset in the source data from the final sink data instead of emitting them with zero values.
RemoveUnsetFieldsInData bool `mapstructure:"remove_unset_fields_in_data"`
}

var info = plugins.Info{
Expand Down Expand Up @@ -193,7 +195,7 @@ func (s *Sink) buildCompassData(anyData *anypb.Any) (map[string]interface{}, err

data, err := protojson.MarshalOptions{
UseProtoNames: true,
EmitUnpopulated: true,
EmitUnpopulated: !s.config.RemoveUnsetFieldsInData,
}.Marshal(anyData)
if err != nil {
return nil, fmt.Errorf("marshaling asset data: %w", err)
Expand Down

0 comments on commit a147080

Please sign in to comment.