-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for importing from iceberg table definition (#452)
Assumes the table definition is in JSON format as defined by https://iceberg.apache.org/spec/#appendix-c-json-serialization. Eventually it might be nice to add support for ingesting the table definition directly from the catalog (via the pyiceberg sdk), but I did not attempt to tackle that in this first iteration.
- Loading branch information
1 parent
179a687
commit bd7d8b4
Showing
11 changed files
with
464 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
from typing import Dict, Any | ||
|
||
from datacontract.imports.importer import Importer | ||
from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field | ||
|
||
from pyiceberg.schema import Schema | ||
from pyiceberg import types as iceberg_types | ||
from pydantic import ValidationError | ||
|
||
from datacontract.model.exceptions import DataContractException | ||
|
||
|
||
class IcebergImporter(Importer): | ||
def import_source( | ||
self, data_contract_specification: DataContractSpecification, source: str, import_args: dict | ||
) -> DataContractSpecification: | ||
schema = load_and_validate_iceberg_schema(source) | ||
return import_iceberg( | ||
data_contract_specification, | ||
schema, | ||
import_args.get("iceberg_table"), | ||
) | ||
|
||
|
||
def load_and_validate_iceberg_schema(source: str) -> Schema: | ||
with open(source, "r") as file: | ||
try: | ||
return Schema.model_validate_json(file.read()) | ||
except ValidationError as e: | ||
raise DataContractException( | ||
type="schema", | ||
name="Parse iceberg schema", | ||
reason=f"Failed to validate iceberg schema from {source}: {e}", | ||
engine="datacontract", | ||
) | ||
|
||
|
||
def import_iceberg( | ||
data_contract_specification: DataContractSpecification, schema: Schema, table_name: str | ||
) -> DataContractSpecification: | ||
if data_contract_specification.models is None: | ||
data_contract_specification.models = {} | ||
|
||
model = Model(type="table", title=table_name) | ||
|
||
for field in schema.fields: | ||
model.fields[field.name] = _field_from_nested_field(field) | ||
|
||
data_contract_specification.models[table_name] = model | ||
return data_contract_specification | ||
|
||
|
||
def _field_from_nested_field(nested_field: iceberg_types.NestedField) -> Field: | ||
""" | ||
Converts an Iceberg NestedField into a Field object for the data contract. | ||
Args: | ||
nested_field: The Iceberg NestedField to convert. | ||
Returns: | ||
Field: The generated Field object. | ||
""" | ||
field = Field( | ||
title=nested_field.name, | ||
required=nested_field.required, | ||
config=build_field_config(nested_field), | ||
) | ||
|
||
if nested_field.doc is not None: | ||
field.description = nested_field.doc | ||
|
||
return _type_from_iceberg_type(field, nested_field.field_type) | ||
|
||
|
||
def _type_from_iceberg_type(field: Field, iceberg_type: iceberg_types.IcebergType) -> Field: | ||
""" | ||
Maps Iceberg data types to the Data Contract type system and updates the field. | ||
Args: | ||
field: The Field object to update. | ||
iceberg_type: The Iceberg data type to map. | ||
Returns: | ||
Field: The updated Field object. | ||
""" | ||
field.type = _data_type_from_iceberg(iceberg_type) | ||
|
||
if field.type == "array": | ||
field.items = _type_from_iceberg_type(Field(required=iceberg_type.element_required), iceberg_type.element_type) | ||
|
||
elif field.type == "map": | ||
field.keys = _type_from_iceberg_type(Field(required=True), iceberg_type.key_type) | ||
field.values = _type_from_iceberg_type(Field(required=iceberg_type.value_required), iceberg_type.value_type) | ||
|
||
elif field.type == "object": | ||
field.fields = {nf.name: _field_from_nested_field(nf) for nf in iceberg_type.fields} | ||
|
||
return field | ||
|
||
|
||
def build_field_config(iceberg_field: iceberg_types.NestedField) -> Dict[str, Any]: | ||
config = {} | ||
|
||
if iceberg_field.field_id > 0: | ||
config["icebergFieldId"] = iceberg_field.field_id | ||
|
||
if iceberg_field.initial_default is not None: | ||
config["icebergInitialDefault"] = iceberg_field.initial_default | ||
|
||
if iceberg_field.write_default is not None: | ||
config["icebergWriteDefault"] = iceberg_field.write_default | ||
|
||
return config | ||
|
||
|
||
def _data_type_from_iceberg(type: iceberg_types.IcebergType) -> str: | ||
""" | ||
Convert an Iceberg field type to a datacontract field type | ||
Args: | ||
type: The Iceberg field type | ||
Returns: | ||
str: The datacontract field type | ||
""" | ||
if isinstance(type, iceberg_types.BooleanType): | ||
return "boolean" | ||
if isinstance(type, iceberg_types.IntegerType): | ||
return "integer" | ||
if isinstance(type, iceberg_types.LongType): | ||
return "long" | ||
if isinstance(type, iceberg_types.FloatType): | ||
return "float" | ||
if isinstance(type, iceberg_types.DoubleType): | ||
return "double" | ||
if isinstance(type, iceberg_types.DecimalType): | ||
return "decimal" | ||
if isinstance(type, iceberg_types.DateType): | ||
return "date" | ||
if isinstance(type, iceberg_types.TimeType): | ||
# there isn't a great mapping for the iceberg type "time", just map to string for now | ||
return "string" | ||
if isinstance(type, iceberg_types.TimestampType): | ||
return "timestamp_ntz" | ||
if isinstance(type, iceberg_types.TimestamptzType): | ||
return "timestamp_tz" | ||
if isinstance(type, iceberg_types.StringType): | ||
return "string" | ||
if isinstance(type, iceberg_types.UUIDType): | ||
return "string" | ||
if isinstance(type, iceberg_types.BinaryType): | ||
return "bytes" | ||
if isinstance(type, iceberg_types.FixedType): | ||
return "bytes" | ||
if isinstance(type, iceberg_types.MapType): | ||
return "map" | ||
if isinstance(type, iceberg_types.ListType): | ||
return "array" | ||
if isinstance(type, iceberg_types.StructType): | ||
return "object" | ||
|
||
raise ValueError(f"Unknown Iceberg type: {type}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"fields": "not a list" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
{ | ||
"type": "struct", | ||
"fields": [ | ||
{ | ||
"id": 1, | ||
"name": "foo", | ||
"type": "string", | ||
"required": false | ||
}, | ||
{ | ||
"id": 2, | ||
"name": "bar", | ||
"type": "int", | ||
"required": true | ||
}, | ||
{ | ||
"id": 3, | ||
"name": "baz", | ||
"type": "boolean", | ||
"required": false | ||
}, | ||
{ | ||
"id": 4, | ||
"name": "qux", | ||
"type": { | ||
"type": "list", | ||
"element-id": 5, | ||
"element": "string", | ||
"element-required": true | ||
}, | ||
"required": true | ||
}, | ||
{ | ||
"id": 6, | ||
"name": "quux", | ||
"type": { | ||
"type": "map", | ||
"key-id": 7, | ||
"key": "string", | ||
"value-id": 8, | ||
"value": { | ||
"type": "map", | ||
"key-id": 9, | ||
"key": "string", | ||
"value-id": 10, | ||
"value": "int", | ||
"value-required": true | ||
}, | ||
"value-required": true | ||
}, | ||
"required": true | ||
}, | ||
{ | ||
"id": 11, | ||
"name": "location", | ||
"type": { | ||
"type": "list", | ||
"element-id": 12, | ||
"element": { | ||
"type": "struct", | ||
"fields": [ | ||
{ | ||
"id": 13, | ||
"name": "latitude", | ||
"type": "float", | ||
"required": false | ||
}, | ||
{ | ||
"id": 14, | ||
"name": "longitude", | ||
"type": "float", | ||
"required": false | ||
} | ||
] | ||
}, | ||
"element-required": true | ||
}, | ||
"required": true | ||
}, | ||
{ | ||
"id": 15, | ||
"name": "person", | ||
"type": { | ||
"type": "struct", | ||
"fields": [ | ||
{ | ||
"id": 16, | ||
"name": "name", | ||
"type": "string", | ||
"required": false | ||
}, | ||
{ | ||
"id": 17, | ||
"name": "age", | ||
"type": "int", | ||
"required": true | ||
} | ||
] | ||
}, | ||
"required": false | ||
} | ||
], | ||
"schema-id": 1, | ||
"identifier-field-ids": [ | ||
2 | ||
] | ||
} |
Oops, something went wrong.