Commit fabd0e9

Merge branch 'devel' into feat/graceful-signal-handler
2 parents 9b7827f + 0dcdcf0


60 files changed: +1692 -870 lines

.github/workflows/build_docs.yml

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+name: docs | build docs
+
+on:
+  workflow_call:
+  workflow_dispatch:
+
+jobs:
+  build_docs:
+    name: docs | build docs
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out
+        uses: actions/checkout@master
+
+      - uses: pnpm/action-setup@v2
+        with:
+          version: 9.13.2
+
+      - uses: actions/setup-node@v5
+        with:
+          node-version: '22'
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install node dependencies
+        run: cd docs/website && npm install
+
+      - name: Install python dependencies
+        run: cd docs/website && pip install -r requirements.txt
+
+      - name: Build docs
+        run: cd docs/website && npm run build:cloudflare

.github/workflows/main.yml

Lines changed: 5 additions & 0 deletions

@@ -30,6 +30,11 @@ jobs:
   test_docs_snippets:
     name: test snippets in docs
     uses: ./.github/workflows/test_docs_snippets.yml
+
+  # NOTE: we build docs the same way as on cloudflare, so we can catch problems early
+  build_docs:
+    name: build docs
+    uses: ./.github/workflows/build_docs.yml
 
   lint:
     name: lint on all python versions

.github/workflows/test_common.yml

Lines changed: 2 additions & 2 deletions

@@ -63,11 +63,11 @@ jobs:
          - os: windows-latest
            python-version: "3.11"
            shell: cmd
-           pytest_args: '-m "not forked"'
+           pytest_args: '-m "not forked and not rfam"'
          - os: windows-latest
            python-version: "3.13"
            shell: cmd
-           pytest_args: '-m "not forked"'
+           pytest_args: '-m "not forked and not rfam"'
 
    defaults:
      run:
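
The Windows jobs now also deselect tests marked `rfam` (tests that reach the public Rfam MySQL demo database). A minimal sketch of how such a marker is typically declared and applied with pytest; the module and test body are hypothetical, only the `rfam` marker name comes from this commit:

```python
# hypothetical test module; the marker itself would be registered under
# `markers` in pytest.ini / pyproject.toml so `-m "not rfam"` can deselect it
import pytest


@pytest.mark.rfam
def test_family_table_from_public_rfam_db():
    # talks to the public Rfam MySQL instance, so Windows runners skip it
    # via: pytest -m "not forked and not rfam"
    ...
```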

.github/workflows/tools_deploy_docs.yml

Lines changed: 0 additions & 20 deletions
This file was deleted.

dlt/_workspace/_templates/_core_source_templates/sql_database_pipeline.py

Lines changed: 2 additions & 39 deletions

@@ -9,7 +9,6 @@
 
 from dlt.sources.sql_database import sql_database, sql_table, Table
 
-from sqlalchemy.sql.sqltypes import TypeEngine
 import sqlalchemy as sa
 
 
@@ -105,46 +104,13 @@ def load_standalone_table_resource() -> None:
         defer_table_reflect=True,
     )
 
-    # Run the resources together
-    info = pipeline.extract([family, genome], write_disposition="merge")
+    # Run the resources together (just take one page of results to make it faster)
+    info = pipeline.extract([family.add_limit(1), genome.add_limit(1)], write_disposition="merge")
     print(info)
     # Show inferred columns
     print(pipeline.default_schema.to_pretty_yaml())
 
 
-def select_columns() -> None:
-    """Uses table adapter callback to modify list of columns to be selected"""
-    pipeline = dlt.pipeline(
-        pipeline_name="rfam_database",
-        destination="duckdb",
-        dataset_name="rfam_data_cols",
-        dev_mode=True,
-    )
-
-    def table_adapter(table: Table) -> Table:
-        print(table.name)
-        if table.name == "family":
-            # this is SqlAlchemy table. _columns are writable
-            # let's drop updated column
-            table._columns.remove(table.columns["updated"])  # type: ignore
-        return table
-
-    family = sql_table(
-        credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
-        table="family",
-        chunk_size=10,
-        reflection_level="full_with_precision",
-        table_adapter_callback=table_adapter,
-    )
-
-    # also we do not want the whole table, so we add limit to get just one chunk (10 records)
-    pipeline.run(family.add_limit(1))
-    # only 10 rows
-    print(pipeline.last_trace.last_normalize_info)
-    # no "updated" column in "family" table
-    print(pipeline.default_schema.to_pretty_yaml())
-
-
 def select_with_end_value_and_row_order() -> None:
     """Gets data from a table within a specified range and sorts rows descending"""
     pipeline = dlt.pipeline(
@@ -347,9 +313,6 @@ def specify_columns_to_load() -> None:
    # Load selected tables with different settings
    # load_select_tables_from_database()
 
-   # load a table and select columns
-   # select_columns()
-
    # load_entire_database()
    # select_with_end_value_and_row_order()
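
The template now caps the Rfam demo extraction with `add_limit(1)` so only one chunk per resource is pulled. A minimal sketch of the same pattern on a generic resource; the resource name and data below are hypothetical, `add_limit` is the dlt call used above:

```python
import dlt


@dlt.resource(name="numbers")  # hypothetical demo resource yielding pages of items
def numbers():
    for page in range(1000):
        yield [{"n": page * 100 + i} for i in range(100)]


# add_limit(1) stops after the first yielded page, keeping demo runs fast
pipeline = dlt.pipeline(pipeline_name="limit_demo", destination="duckdb", dev_mode=True)
print(pipeline.run(numbers().add_limit(1)))
```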

dlt/_workspace/mcp/tools/mcp_tools.py

Lines changed: 5 additions & 5 deletions

@@ -193,22 +193,22 @@ def register_with(self, mcp_server: FastMCP) -> None:
         pipeline_name = self.pipeline.pipeline_name
         mcp_server.add_tool(
             self.available_tables,
-            name=f"available_tables_in_pipeline_{pipeline_name}",
+            name="available_tables",
             description=f"All available tables in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.table_head,
-            name=f"table_head_in_pipeline_{pipeline_name}",
+            name="table_head",
             description=f"Get the first 10 rows of the table in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.table_schema,
-            name=f"table_schema_in_pipeline_{pipeline_name}",
+            name="table_schema",
             description=f"Get the schema of the table in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.query_sql,
-            name=f"query_sql_in_pipeline_{pipeline_name}",
+            name="query_sql",
             description=(
                 f"Executes sql statement on a given pipeline {pipeline_name} and returns the result "
                 "as | delimited csv. Use this tool for simple analysis where the number of rows is "
@@ -219,7 +219,7 @@ def register_with(self, mcp_server: FastMCP) -> None:
         )
         mcp_server.add_tool(
             self.bookmark_sql,
-            name=f"bookmark_sql_in_pipeline_{pipeline_name}",
+            name="bookmark_sql",
             description=(
                 f"Executes sql statement on a pipeline {pipeline_name} and bookmarks it under "
                 "given bookmark for further processing. Use this tool when you need to select "

dlt/common/configuration/resolve.py

Lines changed: 3 additions & 1 deletion

@@ -158,8 +158,10 @@ def _maybe_parse_native_value(
            .as_dict_nondefault()
            .items()
        }
-    except (ValueError, NotImplementedError) as v_err:
+    except ValueError as v_err:
         raise InvalidNativeValue(type(config), type(native_value), embedded_sections, v_err)
+    except NotImplementedError:
+        pass
 
     return native_value  # type: ignore[no-any-return]
dlt/common/libs/pydantic.py

Lines changed: 1 addition & 1 deletion

@@ -385,7 +385,7 @@ def validate_and_filter_items(
             deleted.add(err_idx)
         else:
             raise NotImplementedError(
-                f"`{column_mode=:}` not implemented for Pydantic validation"
+                f"`{data_mode=:}` not implemented for Pydantic validation"
             )
 
     # validate again with error items removed
dlt/common/schema/schema.py

Lines changed: 5 additions & 3 deletions

@@ -418,13 +418,14 @@ def filter_row_with_hint(
     def merge_hints(
         self,
         new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
-        """Merges existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles setting at the end
+        """Merges or replaces existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles settings at the end.
 
         NOTE: you can manipulate default hints collection directly via `Schema.settings` as long as you call Schema._compile_settings() at the end.
         """
-        self._merge_hints(new_hints, normalize_identifiers)
+        self._merge_hints(new_hints, replace=replace, normalize_identifiers=normalize_identifiers)
         self._compile_settings()
 
     def update_preferred_types(
@@ -813,6 +814,7 @@ def _infer_hint(self, hint_type: TColumnDefaultHint, col_name: str) -> bool:
     def _merge_hints(
         self,
         new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
         """Used by `merge_hints` method, does not compile settings at the end"""
@@ -829,7 +831,7 @@ def _merge_hints(
         default_hints = self._settings.setdefault("default_hints", {})
         # add `new_hints` to existing hints
         for h, l in new_hints.items():
-            if h in default_hints:
+            if h in default_hints and not replace:
                 extend_list_deduplicated(default_hints[h], l, utils.canonical_simple_regex)
             else:
                 # set new hint type
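
A hedged usage sketch of the new `replace` flag: with `replace=False` (the default) regexes are appended to any existing list for that hint, with `replace=True` the list for that hint is overwritten. The hint key and regexes below are illustrative, not taken from this commit:

```python
from dlt.common.schema import Schema

schema = Schema("example")

# default behaviour: extend the existing "not_null" hint list (deduplicated)
schema.merge_hints({"not_null": ["re:^updated_at$"]})

# new behaviour: drop whatever was there and keep only this regex for the hint
schema.merge_hints({"not_null": ["re:^updated_at$"]}, replace=True)
```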

docs/examples/custom_destination_lancedb/custom_destination_lancedb.py

Lines changed: 21 additions & 10 deletions

@@ -16,6 +16,7 @@
 We'll learn how to:
 - Use the [custom destination](../dlt-ecosystem/destinations/destination.md)
 - Delegate the embeddings to LanceDB using OpenAI Embeddings
+- Use Pydantic for unified dlt and lancedb schema validation
 """
 
 __source_name__ = "spotify"
@@ -59,10 +60,11 @@
 
 
 class EpisodeSchema(LanceModel):
+    """Used for dlt and lance schema validation"""
+
     id: str  # noqa: A003
     name: str
     description: str = func.SourceField()
-    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
     release_date: datetime.date
     audio_preview_url: str
     duration_ms: int
@@ -71,6 +73,12 @@ class EpisodeSchema(LanceModel):
     # there is more data but we are not using it ...
 
 
+class EpisodeSchemaVector(EpisodeSchema):
+    """Adds lance vector field"""
+
+    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
+
+
 @dataclass(frozen=True)
 class Shows:
     monday_morning_data_chat: str = "3Km3lBNzJpc1nOTJUtbtMh"
@@ -120,11 +128,20 @@ def spotify_shows(
         yield dlt.resource(
             client.paginate(url, params={"limit": 50}),
             name=show_name,
-            write_disposition="merge",
             primary_key="id",
             parallelized=True,
             max_table_nesting=0,
-        )
+            # reuse lance model to filter out all non-matching items and extra columns from spotify api
+            # 1. unknown columns are removed ("columns": "discard_value")
+            # 2. non validating items (ie. without id or url) are removed ("data_type": "discard_row")
+            # 3. for some reason None values are returned as well 🤯, add_filter takes care of that
+            columns=EpisodeSchema,
+            schema_contract={
+                "tables": "evolve",
+                "columns": "discard_value",
+                "data_type": "discard_row",
+            },
+        ).add_filter(lambda i: i is not None)
 
 
 @dlt.destination(batch_size=250, name="lancedb")
@@ -135,13 +152,7 @@ def lancedb_destination(items: TDataItems, table: TTableSchema) -> None:
     try:
         tbl = db.open_table(table["name"])
     except ValueError:
-        tbl = db.create_table(table["name"], schema=EpisodeSchema)
-
-    # remove all fields that are not in the schema
-    for item in items:
-        keys_to_remove = [key for key in item.keys() if key not in EpisodeSchema.model_fields]
-        for key in keys_to_remove:
-            del item[key]
+        tbl = db.create_table(table["name"], schema=EpisodeSchemaVector)
 
     tbl.add(items)
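
The rewritten resource leans on dlt's schema contracts plus the Pydantic/LanceDB model instead of manually deleting unknown keys in the destination. A minimal standalone sketch of the same contract settings on a plain Pydantic model; the model, resource, and sample rows are hypothetical, while the contract values are the ones used above:

```python
import dlt
from pydantic import BaseModel


class Episode(BaseModel):
    id: str
    name: str


@dlt.resource(
    name="episodes",
    primary_key="id",
    columns=Episode,  # the model defines the expected columns and types
    schema_contract={
        "tables": "evolve",
        "columns": "discard_value",  # unknown keys (e.g. "extra") are dropped
        "data_type": "discard_row",  # rows failing validation (e.g. missing "id") are dropped
    },
)
def episodes():
    yield [
        {"id": "1", "name": "intro", "extra": "ignored"},
        {"name": "no id, gets discarded"},
    ]


pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb", dev_mode=True)
print(pipeline.run(episodes()))
```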
