Skip to content

Commit

Permalink
added examples and fixed column-parsing bug
Browse files Browse the repository at this point in the history
  • Loading branch information
shirleycohen committed Aug 29, 2022
1 parent 16913b9 commit 0f65ab5
Show file tree
Hide file tree
Showing 15 changed files with 313 additions and 56 deletions.
16 changes: 12 additions & 4 deletions DataCatalogUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def apply_dynamic_config(self, fields, uri, config_uuid, template_uuid, tag_hist

creation_status = constants.SUCCESS

print('uri: ' + uri)
#print('uri: ' + uri)

error_exists = False

Expand Down Expand Up @@ -292,7 +292,10 @@ def apply_dynamic_config(self, fields, uri, config_uuid, template_uuid, tag_hist
query_expression = field['query_expression']

# parse and run query in BQ
query_str = self.parse_query_expression(uri, query_expression)
if column != "":
query_str = self.parse_query_expression(uri, query_expression, column)
else:
query_str = self.parse_query_expression(uri, query_expression)
#print('returned query_str: ' + query_str)

field_value, error_exists = self.run_query(bq_client, query_str, batch_mode, store)
Expand Down Expand Up @@ -1406,7 +1409,7 @@ def apply_static_propagated_tag(self, config_status, source_res, view_res, colum
return creation_status


def parse_query_expression(self, uri, query_expression):
def parse_query_expression(self, uri, query_expression, column=None):

#print("*** enter parse_query_expression ***")
#print("uri: " + uri)
Expand All @@ -1424,6 +1427,7 @@ def parse_query_expression(self, uri, query_expression):
column_index = query_expression.rfind("$column", 0)

#print('table_index: ', table_index)
#print('column_index: ', column_index)

if project_index != -1:
project_end = uri.find('/')
Expand Down Expand Up @@ -1481,8 +1485,12 @@ def parse_query_expression(self, uri, query_expression):
query_str = query_expression

if column_index != -1:
query_str = query_str.replace('$column', column)

if query_str == None:
query_str = query_expression.replace('$column', column)
else:
query_str = query_str.replace('$column', column)

return query_str


Expand Down
3 changes: 2 additions & 1 deletion Resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,8 @@ def find_gcs_resources(uris):
#excluded_uris=None
#resources = Resources.get_bq_resources(included_uris, excluded_uris)

included_uris = 'bigquery/project/tag-engine-develop/dataset/finwire/FINWIRE*_CMP/industryID'
#included_uris = 'bigquery/project/tag-engine-develop/dataset/finwire/FINWIRE*_CMP/industryID'
included_uris = 'bigquery/project/data-mesh-343422/dataset/oltp/Account/ca_st_id'
#included_uris = 'gs://discovery-area/austin_311_service_requests.parquet'
#included_uris = 'gs://discovery-area/cities_311/austin_311_service_requests.parquet', 'gs://discovery-area/cities_311/san_francisco_311_service_requests/*'
#excluded_uris = 'gs://discovery-area/cities_311/san_francisco_311_service_requests/000000000003'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"template_id": "data_product",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
{
"field_id": "data_domain",
"field_value": "Finance"
},
{
"field_id": "data_subdomain",
"field_value": "Financial_Insights"
},
{
"field_id": "data_product_name",
"field_value": "Finwire Archive"
},
{
"field_id": "data_product_description",
"field_value": "This dataset contains financial information about companies and securities obtained from a financial newswire service that has been archived over an extended period of time."
},
{
"field_id": "data_confidentiality",
"field_value": "Public"
},
{
"field_id": "business_criticality",
"field_value": "Medium"
},
{
"field_id": "business_owner",
"field_value": "John Williams"
},
{
"field_id": "technical_owner",
"field_value": "Emily Doe"
},
{
"field_id": "number_data_resources",
"field_value": "409"
},
{
"field_id": "storage_location",
"field_value": "us-central1"
},
{
"field_id": "data_retention_period",
"field_value": "7_years"
},
{
"field_id": "data_latency_slo",
"field_value": "quarterly"
},
{
"field_id": "documentation_link",
"field_value": "https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-di_v1.1.0.pdf"
},
{
"field_id": "access_request_link",
"field_value": "go/sphinx/finwire-archive"
},
{
"field_id": "data_product_status",
"field_value": "DRAFT"
},
{
"field_id": "last_modified_date",
"field_value": "2022-08-28"
}
],
"included_uris": "bigquery/project/data-mesh-343422/dataset/finance",
"excluded_uris": "",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
"region": "us-central1",
"fields": [
{
"field_id": "data_confidentiality",
"query_expression": "select case when count > 0 then 'SENSITIVE' else 'SHARED_INTERNALLY' end as data_confidentiality from (select count(*) as count from tag-engine-develop.tag_history.data_attribute where starts_with(asset_name, '$project/dataset/$dataset/table/$table/'))"
"field_id": "data_sensitivity",
"query_expression": "select sensitive_type from tag-engine-develop.tag_history.data_sensitivity ds join $project.reference.SensitiveCategory sc on ds.sensitive_type = sc.category where starts_with(asset_name, '$project/dataset/$dataset/table/$table/') order by rank desc limit 1"
},
{
"field_id": "num_fields",
"query_expression": "select count(*) from $project.$dataset.INFORMATION_SCHEMA.COLUMNS where table_name = '$table'"
},
{
{
"field_id": "num_records",
"query_expression": "select row_count from `$project.$dataset.__TABLES__` where table_id = '$table'"
},
Expand All @@ -23,6 +23,10 @@
"field_id": "recent_data_update",
"query_expression": "select cast(timestamp_millis(last_modified_time) as datetime) from `$project.$dataset.__TABLES__` where table_id = '$table'"
},
{
"field_id": "actual_data_latency",
"query_expression": "select timestamp_diff(max(start_time), min(start_time), second) / (count(distinct(start_time)) - 1) from `$project`.`region-us-central1`.INFORMATION_SCHEMA.JOBS j1, unnest(referenced_tables) as r where statement_type in ('INSERT', 'UPDATE', 'DELETE') and r.project_id = '$project' and r.dataset_id = '$dataset' and r.table_id = '$table'"
},
{
"field_id": "global_id_customer",
"query_expression": "select False"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"template_id": "data_attribute",
"template_id": "data_sensitivity",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
Expand All @@ -10,11 +10,11 @@
"field_id": "sensitive_type"
}
],
"dlp_dataset": "bigquery/project/data-mesh-343422/dataset/finance_dlp",
"mapping_table": "bigquery/project/data-mesh-343422/dataset/reference/SensitiveCategory",
"dlp_dataset": "bigquery/project/data-mesh-343422/dataset/finance_dlp",
"mapping_table": "bigquery/project/data-mesh-343422/dataset/reference/SensitiveCategory",
"included_uris": "bigquery/project/data-mesh-343422/dataset/finance/*",
"create_policy_tags": true,
"taxonomy_id": "projects/data-mesh-344315/locations/us-central1/taxonomies/2563112035556857653",
"create_policy_tags": true,
"taxonomy_id": "projects/data-mesh-344315/locations/us-central1/taxonomies/2563112035556857653",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"template_id": "data_standardization",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
{
"field_id": "degree",
"query_expression": "with finwire_unmatches as (select count(*) umatched_counts from $project.$dataset.$table f1 where $column not in (select in_id from reference.Industry)), finwire_total as (select count(*) as total_counts from $project.$dataset.$table) select round(safe_divide((total_counts - umatched_counts), total_counts) * 100) as degree from finwire_unmatches, finwire_total"
}
],
"included_uris": "bigquery/project/data-mesh-343422/dataset/finance/FINWIRE*_CMP/industryID",
"excluded_uris": "",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"template_id": "data_product",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
{
"field_id": "data_domain",
"field_value": "Finance"
},
{
"field_id": "data_subdomain",
"field_value": "Trade_and_Banking"
},
{
"field_id": "data_product_name",
"field_value": "Trade Transactions"
},
{
"field_id": "data_product_description",
"field_value": "Data on customers, accounts, brokers, securities, trade details, account balances, and market information from the system of record."
},
{
"field_id": "data_confidentiality",
"field_value": "Confidential"
},
{
"field_id": "business_criticality",
"field_value": "High"
},
{
"field_id": "business_owner",
"field_value": "Sunil Kumar"
},
{
"field_id": "technical_owner",
"field_value": "Sudipta Nelson"
},
{
"field_id": "number_data_resources",
"field_value": "13"
},
{
"field_id": "storage_location",
"field_value": "us-central1"
},
{
"field_id": "data_retention_period",
"field_value": "2_years"
},
{
"field_id": "data_latency_slo",
"field_value": "daily"
},
{
"field_id": "documentation_link",
"field_value": "https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-di_v1.1.0.pdf"
},
{
"field_id": "access_request_link",
"field_value": "go/sphinx/trade-tx"
},
{
"field_id": "data_product_status",
"field_value": "DRAFT"
},
{
"field_id": "last_modified_date",
"field_value": "2022-08-30"
}
],
"included_uris": "bigquery/project/data-mesh-343422/dataset/oltp",
"excluded_uris": "",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
"region": "us-central1",
"fields": [
{
"field_id": "data_confidentiality",
"query_expression": "select case when count > 0 then 'SENSITIVE' else 'SHARED_INTERNALLY' end as data_confidentiality from (select count(*) as count from tag-engine-develop.tag_history.data_attribute where starts_with(asset_name, '$project/dataset/$dataset/table/$table/'))"
"field_id": "data_sensitivity",
"query_expression": "select sensitive_type from tag-engine-develop.tag_history.data_sensitivity ds join $project.reference.SensitiveCategory sc on ds.sensitive_type = sc.category where starts_with(asset_name, '$project/dataset/$dataset/table/$table/') order by rank desc limit 1"
},
{
"field_id": "num_fields",
"query_expression": "select count(*) from $project.$dataset.INFORMATION_SCHEMA.COLUMNS where table_name = '$table'"
},
{
{
"field_id": "num_records",
"query_expression": "select row_count from `$project.$dataset.__TABLES__` where table_id = '$table'"
},
Expand All @@ -23,6 +23,10 @@
"field_id": "recent_data_update",
"query_expression": "select cast(timestamp_millis(last_modified_time) as datetime) from `$project.$dataset.__TABLES__` where table_id = '$table'"
},
{
"field_id": "actual_data_latency",
"query_expression": "select timestamp_diff(max(start_time), min(start_time), second) / (count(distinct(start_time)) - 1) from `$project`.`region-us-central1`.INFORMATION_SCHEMA.JOBS j1, unnest(referenced_tables) as r where statement_type in ('INSERT', 'UPDATE', 'DELETE') and r.project_id = '$project' and r.dataset_id = '$dataset' and r.table_id = '$table'"
},
{
"field_id": "global_id_customer",
"query_expression": "select False"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"template_id": "data_attribute",
"template_id": "data_sensitivity",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
Expand All @@ -10,11 +10,11 @@
"field_id": "sensitive_type"
}
],
"dlp_dataset": "bigquery/project/data-mesh-343422/dataset/oltp_dlp",
"mapping_table": "bigquery/project/data-mesh-343422/dataset/reference/SensitiveCategory",
"dlp_dataset": "bigquery/project/data-mesh-343422/dataset/oltp_dlp",
"mapping_table": "bigquery/project/data-mesh-343422/dataset/reference/SensitiveCategory",
"included_uris": "bigquery/project/data-mesh-343422/dataset/oltp/*",
"create_policy_tags": true,
"taxonomy_id": "projects/data-mesh-344315/locations/us-central1/taxonomies/2563112035556857653",
"create_policy_tags": true,
"taxonomy_id": "projects/data-mesh-344315/locations/us-central1/taxonomies/2563112035556857653",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"template_id": "data_standardization",
"project_id": "data-mesh-344315",
"region": "us-central1",
"fields": [
{
"field_id": "degree",
"query_expression": "with status_unmatches as (select count(*) umatched_counts from $project.$dataset.$table where $column not in (select st_id from $project.reference.StatusType)), records_total as (select count(*) as total_counts from $project.$dataset.$table) select round(safe_divide((total_counts - umatched_counts), total_counts) * 100) as degree from status_unmatches, records_total"
}
],
"included_uris": "bigquery/project/data-mesh-343422/dataset/oltp/Account/ca_st_id",
"excluded_uris": "",
"refresh_mode": "ON_DEMAND",
"tag_history": true,
"tag_stream": false
}

Loading

0 comments on commit 0f65ab5

Please sign in to comment.