From 286f30f9edfc95e65fcdf88fe177f9ee9ec5bc02 Mon Sep 17 00:00:00 2001 From: vovavovavovavova <39351371+vovavovavovavova@users.noreply.github.com> Date: Wed, 16 Jun 2021 14:16:32 +0300 Subject: [PATCH] CDK: tool for auto-generating schemas from the stream output (#3948) * tool for creating schemas from configured_catalog within genson stripping `required` * gen from all output (add_schema for all) * all except extra_strategies * apply extra strategies * merge && small upd * upd docstring --- tools/integrations/schema_generator.py | 77 ++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tools/integrations/schema_generator.py diff --git a/tools/integrations/schema_generator.py b/tools/integrations/schema_generator.py new file mode 100644 index 0000000000000..c741267d249bf --- /dev/null +++ b/tools/integrations/schema_generator.py @@ -0,0 +1,77 @@ +# +# MIT License +# +# Copyright (c) 2020 Airbyte +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

import json
import os
import sys
from typing import Dict, Iterable

from airbyte_cdk.models import AirbyteMessage, Type
from genson import SchemaBuilder
from genson.schema.strategies.object import Object


class NoRequiredObj(Object):
    """
    Object strategy that never emits a "required" list.

    It behaves like genson's ``Object`` strategy, except that the generated
    schema carries no "required" entry for the objects it has parsed, so the
    output is not cluttered with an unnecessary extra field.
    """

    def to_schema(self):
        """Return the schema of the merged objects without a "required" key."""
        # Deliberately start the method lookup *after* ``Object`` in the MRO:
        # this method replaces ``Object.to_schema`` instead of extending it.
        schema = super(Object, self).to_schema()
        schema["type"] = "object"
        if self._properties:
            schema["properties"] = self._properties_to_schema(self._properties)
        if self._pattern_properties:
            schema["patternProperties"] = self._properties_to_schema(self._pattern_properties)
        return schema


class NoRequiredSchemaBuilder(SchemaBuilder):
    """Schema builder that registers ``NoRequiredObj`` as an extra strategy."""

    EXTRA_STRATEGIES = (NoRequiredObj,)


def _collect_builders(lines: Iterable[str]) -> Dict[str, NoRequiredSchemaBuilder]:
    """Feed every RECORD message found in *lines* into a per-stream builder.

    Each non-blank line is parsed as one serialized ``AirbyteMessage``.
    Messages whose type is not ``Type.RECORD`` are ignored.

    Returns a mapping of stream name to the builder that accumulated that
    stream's records, in order of first appearance.
    """
    builders: Dict[str, NoRequiredSchemaBuilder] = {}
    for line in lines:
        # A blank line (e.g. a trailing newline in piped output) is not a
        # message; parsing it would abort the whole run.
        if not line.strip():
            continue
        message = AirbyteMessage.parse_raw(line)
        if message.type != Type.RECORD:
            continue
        stream_name = message.record.stream
        builder = builders.get(stream_name)
        if builder is None:
            builder = NoRequiredSchemaBuilder()
            builders[stream_name] = builder
        builder.add_object(message.record.data)
    return builders


def _write_schemas(builders: Dict[str, NoRequiredSchemaBuilder], folder: str) -> None:
    """Write one ``<stream name>.json`` schema file per builder into *folder*."""
    for stream_name, builder in builders.items():
        # Build the schema before opening the file so that a failure here
        # does not leave an empty file behind.
        schema = builder.to_schema()
        output_file_name = os.path.join(folder, stream_name + ".json")
        with open(output_file_name, "w") as outfile:
            json.dump(schema, outfile, indent=2, sort_keys=True)


def main() -> None:
    """Generate JSON schemas from Airbyte messages read on stdin.

    Reads one serialized ``AirbyteMessage`` per line from standard input,
    merges the data of all RECORD messages per stream, and writes the
    resulting schemas to ``schemas/<stream name>.json`` under the current
    working directory. The ``schemas`` folder is created if it is missing.
    """
    default_folder = os.path.join(os.getcwd(), "schemas")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(default_folder, exist_ok=True)

    builders = _collect_builders(sys.stdin)
    _write_schemas(builders, default_folder)


if __name__ == "__main__":
    main()