Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions datacontract/imports/bigquery_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def convert_bigquery_schema(
return data_contract_specification


def import_table_fields(table_fields):
def import_table_fields(table_fields, in_array=False):
imported_fields = {}
for field in table_fields:
field_name = field.get("name")
Expand All @@ -136,44 +136,71 @@ def import_table_fields(table_fields):
imported_fields[field_name].description = field.get("description")

if field.get("type") == "RECORD":
imported_fields[field_name].type = "object"
imported_fields[field_name].fields = import_table_fields(field.get("fields"))
if field.get("mode") == "REPEATED":
imported_fields[field_name].type = "array"
imported_fields[field_name].items = Field(
type="record", fields= import_table_fields(field.get("fields"), in_array=True))
elif field.get("mode") == "NULLABLE":
imported_fields[field_name].type = "struct"
imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=True)
else:
imported_fields[field_name].type = "object"
imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=True)

elif field.get("type") == "STRUCT":
imported_fields[field_name].type = "struct"
imported_fields[field_name].fields = import_table_fields(field.get("fields"))
imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=in_array)
elif field.get("type") == "RANGE":
# This is a range of date/datetime/timestamp but multiple values
# So we map it to an array
imported_fields[field_name].type = "array"
imported_fields[field_name].items = Field(
type=map_type_from_bigquery(field["rangeElementType"].get("type"))
type=map_type_from_bigquery(field["rangeElementType"].get("type"), in_array=True)
)
else: # primitive type
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
elif field.get("type") == "TIME":
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
imported_fields[field_name].config = {"bigqueryType": "TIME"}
elif field.get("type") == "GEOGRAPHY":
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
imported_fields[field_name].config = {"bigqueryType": "GEOGRAPHY"}
elif field.get("type") == "JSON":
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
imported_fields[field_name].config = {"bigqueryType": "JSON"}

else:
if field.get("mode") == "REPEATED": # not a type record meaning type ARRAY<STRING> ARRAY<INTEGER>
imported_fields[field_name].type = "array"
imported_fields[field_name].items = Field(
type= map_type_from_bigquery(field.get("type"), in_array=True))
else: # primitive type
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)

if field.get("type") == "STRING":
# in bigquery both string and bytes have maxLength but in the datacontracts
# spec it is only valid for strings
if field.get("maxLength") is not None:
imported_fields[field_name].maxLength = int(field.get("maxLength"))
imported_fields[field_name].config = {"bigqueryType": f"STRING({field.get('maxLength')})"}

if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC":
if field.get("precision") is not None:
imported_fields[field_name].precision = int(field.get("precision"))

if field.get("scale") is not None:
imported_fields[field_name].scale = int(field.get("scale"))
if field.get("precision") is not None and field.get("scale") is not None:
imported_fields[field_name].config = {"bigqueryType": f"{ field.get('type')}({field.get('precision')}, {field.get('scale')})"}

return imported_fields


def map_type_from_bigquery(bigquery_type_str: str):
def map_type_from_bigquery(bigquery_type_str: str, in_array=False):
if bigquery_type_str == "STRING":
return "string"
elif bigquery_type_str == "BYTES":
return "bytes"
elif bigquery_type_str == "INTEGER":
return "int"
return "bigint" if in_array else "int"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, why does the type change to bigint when being inside an array? Not clear to me.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi so regarding that INT64 type this is something that is internal to bigquery where the data_type of bq table within a struct or an array is represented as an INT64 and not an INTEGER.

  1. create or replace table project_id.dataset_id.simple as (
    select * from
    (select [1,2,3] as array_int64)
    left join
    (select struct(1 as elem) as struct_int64)
    on true
    )

  2. when looking at the table you see the fields as INTEGER
    Capture d'écran 2025-05-21 131625

  3. But when looking at the information_schema: SELECT * FROM project_id.dataset_id.INFORMATION_SCHEMA.COLUMNS where table_name ='simple'
    you see that the bigquery internal representation is an INT64
    Capture d'écran 2025-05-21 131420

elif bigquery_type_str == "INT64":
return "bigint"
elif bigquery_type_str == "FLOAT":
Expand All @@ -187,9 +214,9 @@ def map_type_from_bigquery(bigquery_type_str: str):
elif bigquery_type_str == "DATE":
return "date"
elif bigquery_type_str == "TIME":
return "timestamp_ntz"
return "object"
elif bigquery_type_str == "DATETIME":
return "timestamp"
return "timestamp_ntz"
elif bigquery_type_str == "NUMERIC":
return "numeric"
elif bigquery_type_str == "BIGNUMERIC":
Expand Down
34 changes: 29 additions & 5 deletions tests/fixtures/bigquery/export/bq_table_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,35 @@
"description": "a simple timestamp_tz field"
},
{
"name": "timestamp_ntz_field",
"name": "datetime_field",
"type": "DATETIME",
"mode": "NULLABLE",
"description": "a simple timestamp_ntz field"
"description": "a simple timestamp_ntz field (bq datetime)"
},
{
"name": "date_field",
"type": "DATE",
"mode": "NULLABLE",
"description": "a simple date field"
},
{
"name": "time_field",
"type": "TIME",
"mode": "NULLABLE",
"description": "a time field"
},
{
"name": "geography_field",
"type": "GEOGRAPHY",
"mode": "NULLABLE",
"description": "a geography field"
},
{
"name": "json_field",
"type": "JSON",
"mode": "NULLABLE",
"description": "a json field"
},
{
"name": "number_field",
"type": "NUMERIC",
Expand Down Expand Up @@ -194,7 +212,13 @@
"type": "DATE",
"mode": "NULLABLE",
"description": "a non required date field"
}
},
{
"name": "subfield_3",
"type": "INT64",
"mode": "NULLABLE",
"description": "an integer field"
}
]
},
{
Expand All @@ -211,7 +235,7 @@
},
{
"name": "subfield_2",
"type": "INTEGER",
"type": "INT64",
"mode": "NULLABLE",
"description": "a non required int field"
}
Expand All @@ -225,7 +249,7 @@
},
{
"name": "int_array_field",
"type": "INTEGER",
"type": "INT64",
"mode": "REPEATED",
"description": "an int array"
},
Expand Down
30 changes: 26 additions & 4 deletions tests/fixtures/bigquery/export/datacontract.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,32 @@ models:
type: timestamp_tz
required: false
description: a simple timestamp_tz field
timestamp_ntz_field:
datetime_field:
type: timestamp_ntz
required: false
description: a simple timestamp_ntz field
description: a simple timestamp_ntz field (bq datetime)
date_field:
type: date
required: false
description: a simple date field
time_field:
type: object
required: false
description: a time field
config:
bigqueryType: TIME
geography_field:
type: object
required: false
description: a geography field
config:
bigqueryType: GEOGRAPHY
json_field:
type: object
required: false
description: a json field
config:
bigqueryType: JSON
number_field:
type: number
required: false
Expand Down Expand Up @@ -141,6 +159,10 @@ models:
type: date
required: false
description: a non required date field
subfield_3:
type: bigint
required: false
description: an integer field
struct_field:
type: struct
required: false
Expand All @@ -151,7 +173,7 @@ models:
required: true
description: a required bytes field
subfield_2:
type: int
type: bigint
required: false
description: a non required int field
string_array_field:
Expand All @@ -165,7 +187,7 @@ models:
required: false
description: an int array
items:
type: int
type: bigint
complex_array_field:
type: array
required: false
Expand Down
33 changes: 33 additions & 0 deletions tests/fixtures/bigquery/import/complete_table_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,39 @@
"mode": "NULLABLE",
"name": "JSON_Field",
"type": "JSON"
},
{
"description": "an array of string",
"mode": "REPEATED",
"name": "Array_of_string",
"type": "STRING"
},
{
"description": "an array of int",
"mode": "REPEATED",
"name": "Array_of_int",
"type": "INTEGER"
},
{
"description": "an array of objects that has multiple fields that should carry through",
"fields": [
{
"name": "Id",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "an id field"
},
{
"name": "Name",
"type": "STRING",
"mode": "NULLABLE",
"description": "a name field"

}
],
"mode": "REPEATED",
"name": "Array_of_struct_col",
"type": "RECORD"
}
]
},
Expand Down
50 changes: 45 additions & 5 deletions tests/fixtures/bigquery/import/datacontract.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ models:
required: true
description: A required String field with a maximum length
maxLength: 42
config:
bigqueryType: STRING(42)
Bytes field:
type: bytes
required: false
Expand Down Expand Up @@ -51,11 +53,13 @@ models:
required: false
description: A Date field
Time_Field:
type: timestamp_ntz
type: object
required: false
description: A time field
config:
bigqueryType: TIME
Datetime_Field:
type: timestamp
type: timestamp_ntz
required: false
description: A Datetime field
Numeric_Field:
Expand All @@ -64,14 +68,18 @@ models:
description: A Numeric field with precision 5 and scale 3
precision: 5
scale: 3
config:
bigqueryType: NUMERIC(5, 3)
Bignumeric_field:
type: double
required: false
description: A bignumeric field with precision 8 and sclae 4
precision: 8
scale: 4
config:
bigqueryType: BIGNUMERIC(8, 4)
Record_field:
type: object
type: struct
required: false
description: A record field with two subfields
fields:
Expand All @@ -80,20 +88,52 @@ models:
required: false
description: subfield 1 of type string
subfield_2:
type: int
type: bigint
required: false
description: Subfield 2 of type integer
Range_field:
type: array
required: false
description: a datetime range
items:
type: timestamp
type: timestamp_ntz
Geography_Field:
type: object
required: false
description: a geography field
config:
bigqueryType: GEOGRAPHY
JSON_Field:
type: object
required: false
description: a json field
config:
bigqueryType: JSON
Array_of_string:
type: array
required: false
description: an array of string
items:
type: string
Array_of_int:
type: array
required: false
description: an array of int
items:
type: bigint
Array_of_struct_col:
type: array
required: false
description: an array of objects that has multiple fields that should carry through
items:
type: record
fields:
Id:
type: bigint
required: false
description: an id field
Name:
type: string
required: false
description: a name field