Commit fabd0e9

Merge branch 'devel' into feat/graceful-signal-handler
2 parents 9b7827f + 0dcdcf0


60 files changed: +1692 -870 lines

.github/workflows/build_docs.yml

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+name: docs | build docs
+
+on:
+  workflow_call:
+  workflow_dispatch:
+
+jobs:
+  build_docs:
+    name: docs | build docs
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out
+        uses: actions/checkout@master
+
+      - uses: pnpm/action-setup@v2
+        with:
+          version: 9.13.2
+
+      - uses: actions/setup-node@v5
+        with:
+          node-version: '22'
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install node dependencies
+        run: cd docs/website && npm install
+
+      - name: Install python dependencies
+        run: cd docs/website && pip install -r requirements.txt
+
+      - name: Build docs
+        run: cd docs/website && npm run build:cloudflare

.github/workflows/main.yml

Lines changed: 5 additions & 0 deletions

@@ -30,6 +30,11 @@ jobs:
   test_docs_snippets:
     name: test snippets in docs
     uses: ./.github/workflows/test_docs_snippets.yml
+
+  # NOTE: we build docs the same way as on cloudflare, so we can catch problems early
+  build_docs:
+    name: build docs
+    uses: ./.github/workflows/build_docs.yml
 
   lint:
     name: lint on all python versions

.github/workflows/test_common.yml

Lines changed: 2 additions & 2 deletions

@@ -63,11 +63,11 @@ jobs:
          - os: windows-latest
            python-version: "3.11"
            shell: cmd
-           pytest_args: '-m "not forked"'
+           pytest_args: '-m "not forked and not rfam"'
          - os: windows-latest
            python-version: "3.13"
            shell: cmd
-           pytest_args: '-m "not forked"'
+           pytest_args: '-m "not forked and not rfam"'
 
    defaults:
      run:
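
The Windows jobs now also deselect tests marked `rfam` (tests that reach the public Rfam MySQL demo database). A minimal sketch of how such a marker is typically declared and applied with pytest; the module and test body are hypothetical, only the `rfam` marker name comes from this commit:

```python
# hypothetical test module; the marker itself would be registered under
# `markers` in pytest.ini / pyproject.toml so `-m "not rfam"` can deselect it
import pytest


@pytest.mark.rfam
def test_family_table_from_public_rfam_db():
    # talks to the public Rfam MySQL instance, so Windows runners skip it
    # via: pytest -m "not forked and not rfam"
    ...
```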

.github/workflows/tools_deploy_docs.yml

Lines changed: 0 additions & 20 deletions
This file was deleted.

dlt/_workspace/_templates/_core_source_templates/sql_database_pipeline.py

Lines changed: 2 additions & 39 deletions

@@ -9,7 +9,6 @@
 
 from dlt.sources.sql_database import sql_database, sql_table, Table
 
-from sqlalchemy.sql.sqltypes import TypeEngine
 import sqlalchemy as sa
 
 
@@ -105,46 +104,13 @@ def load_standalone_table_resource() -> None:
         defer_table_reflect=True,
     )
 
-    # Run the resources together
-    info = pipeline.extract([family, genome], write_disposition="merge")
+    # Run the resources together (just take one page of results to make it faster)
+    info = pipeline.extract([family.add_limit(1), genome.add_limit(1)], write_disposition="merge")
     print(info)
     # Show inferred columns
     print(pipeline.default_schema.to_pretty_yaml())
 
 
-def select_columns() -> None:
-    """Uses table adapter callback to modify list of columns to be selected"""
-    pipeline = dlt.pipeline(
-        pipeline_name="rfam_database",
-        destination="duckdb",
-        dataset_name="rfam_data_cols",
-        dev_mode=True,
-    )
-
-    def table_adapter(table: Table) -> Table:
-        print(table.name)
-        if table.name == "family":
-            # this is SqlAlchemy table. _columns are writable
-            # let's drop updated column
-            table._columns.remove(table.columns["updated"])  # type: ignore
-        return table
-
-    family = sql_table(
-        credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
-        table="family",
-        chunk_size=10,
-        reflection_level="full_with_precision",
-        table_adapter_callback=table_adapter,
-    )
-
-    # also we do not want the whole table, so we add limit to get just one chunk (10 records)
-    pipeline.run(family.add_limit(1))
-    # only 10 rows
-    print(pipeline.last_trace.last_normalize_info)
-    # no "updated" column in "family" table
-    print(pipeline.default_schema.to_pretty_yaml())
-
-
 def select_with_end_value_and_row_order() -> None:
     """Gets data from a table within a specified range and sorts rows descending"""
     pipeline = dlt.pipeline(
@@ -347,9 +313,6 @@ def specify_columns_to_load() -> None:
    # Load selected tables with different settings
    # load_select_tables_from_database()
 
-   # load a table and select columns
-   # select_columns()
-
    # load_entire_database()
    # select_with_end_value_and_row_order()
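
The template now caps the Rfam demo extraction with `add_limit(1)` so only one chunk per resource is pulled. A minimal sketch of the same pattern on a generic resource; the resource name and data below are hypothetical, `add_limit` is the dlt call used above:

```python
import dlt


@dlt.resource(name="numbers")  # hypothetical demo resource yielding pages of items
def numbers():
    for page in range(1000):
        yield [{"n": page * 100 + i} for i in range(100)]


# add_limit(1) stops after the first yielded page, keeping demo runs fast
pipeline = dlt.pipeline(pipeline_name="limit_demo", destination="duckdb", dev_mode=True)
print(pipeline.run(numbers().add_limit(1)))
```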

dlt/_workspace/mcp/tools/mcp_tools.py

Lines changed: 5 additions & 5 deletions

@@ -193,22 +193,22 @@ def register_with(self, mcp_server: FastMCP) -> None:
         pipeline_name = self.pipeline.pipeline_name
         mcp_server.add_tool(
             self.available_tables,
-            name=f"available_tables_in_pipeline_{pipeline_name}",
+            name="available_tables",
             description=f"All available tables in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.table_head,
-            name=f"table_head_in_pipeline_{pipeline_name}",
+            name="table_head",
             description=f"Get the first 10 rows of the table in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.table_schema,
-            name=f"table_schema_in_pipeline_{pipeline_name}",
+            name="table_schema",
             description=f"Get the schema of the table in the pipeline {pipeline_name}",
         )
         mcp_server.add_tool(
             self.query_sql,
-            name=f"query_sql_in_pipeline_{pipeline_name}",
+            name="query_sql",
             description=(
                 f"Executes sql statement on a given pipeline {pipeline_name} and returns the result "
                 "as | delimited csv. Use this tool for simple analysis where the number of rows is "
@@ -219,7 +219,7 @@ def register_with(self, mcp_server: FastMCP) -> None:
         )
         mcp_server.add_tool(
             self.bookmark_sql,
-            name=f"bookmark_sql_in_pipeline_{pipeline_name}",
+            name="bookmark_sql",
             description=(
                 f"Executes sql statement on a pipeline {pipeline_name} and bookmarks it under "
                 "given bookmark for further processing. Use this tool when you need to select "

dlt/common/configuration/resolve.py

Lines changed: 3 additions & 1 deletion

@@ -158,8 +158,10 @@ def _maybe_parse_native_value(
            .as_dict_nondefault()
            .items()
        }
-    except (ValueError, NotImplementedError) as v_err:
+    except ValueError as v_err:
         raise InvalidNativeValue(type(config), type(native_value), embedded_sections, v_err)
+    except NotImplementedError:
+        pass
 
     return native_value  # type: ignore[no-any-return]
dlt/common/libs/pydantic.py

Lines changed: 1 addition & 1 deletion

@@ -385,7 +385,7 @@ def validate_and_filter_items(
             deleted.add(err_idx)
         else:
             raise NotImplementedError(
-                f"`{column_mode=:}` not implemented for Pydantic validation"
+                f"`{data_mode=:}` not implemented for Pydantic validation"
             )
 
     # validate again with error items removed
dlt/common/schema/schema.py

Lines changed: 5 additions & 3 deletions

@@ -418,13 +418,14 @@ def filter_row_with_hint(
     def merge_hints(
         self,
         new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
-        """Merges existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles setting at the end
+        """Merges or replaces existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles settings at the end.
 
         NOTE: you can manipulate default hints collection directly via `Schema.settings` as long as you call Schema._compile_settings() at the end.
         """
-        self._merge_hints(new_hints, normalize_identifiers)
+        self._merge_hints(new_hints, replace=replace, normalize_identifiers=normalize_identifiers)
         self._compile_settings()
 
     def update_preferred_types(
@@ -813,6 +814,7 @@ def _infer_hint(self, hint_type: TColumnDefaultHint, col_name: str) -> bool:
     def _merge_hints(
         self,
         new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
         """Used by `merge_hints` method, does not compile settings at the end"""
@@ -829,7 +831,7 @@ def _merge_hints(
         default_hints = self._settings.setdefault("default_hints", {})
         # add `new_hints` to existing hints
         for h, l in new_hints.items():
-            if h in default_hints:
+            if h in default_hints and not replace:
                 extend_list_deduplicated(default_hints[h], l, utils.canonical_simple_regex)
             else:
                 # set new hint type
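
A hedged usage sketch of the new `replace` flag: with `replace=False` (the default) regexes are appended to any existing list for that hint, with `replace=True` the list for that hint is overwritten. The hint key and regexes below are illustrative, not taken from this commit:

```python
from dlt.common.schema import Schema

schema = Schema("example")

# default behaviour: extend the existing "not_null" hint list (deduplicated)
schema.merge_hints({"not_null": ["re:^updated_at$"]})

# new behaviour: drop whatever was there and keep only this regex for the hint
schema.merge_hints({"not_null": ["re:^updated_at$"]}, replace=True)
```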

docs/examples/custom_destination_lancedb/custom_destination_lancedb.py

Lines changed: 21 additions & 10 deletions

@@ -16,6 +16,7 @@
 We'll learn how to:
 - Use the [custom destination](../dlt-ecosystem/destinations/destination.md)
 - Delegate the embeddings to LanceDB using OpenAI Embeddings
+- Use Pydantic for unified dlt and lancedb schema validation
 """
 
 __source_name__ = "spotify"
@@ -59,10 +60,11 @@
 
 
 class EpisodeSchema(LanceModel):
+    """Used for dlt and lance schema validation"""
+
     id: str  # noqa: A003
     name: str
     description: str = func.SourceField()
-    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
     release_date: datetime.date
     audio_preview_url: str
     duration_ms: int
@@ -71,6 +73,12 @@ class EpisodeSchema(LanceModel):
     # there is more data but we are not using it ...
 
 
+class EpisodeSchemaVector(EpisodeSchema):
+    """Adds lance vector field"""
+
+    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
+
+
 @dataclass(frozen=True)
 class Shows:
     monday_morning_data_chat: str = "3Km3lBNzJpc1nOTJUtbtMh"
@@ -120,11 +128,20 @@ def spotify_shows(
         yield dlt.resource(
             client.paginate(url, params={"limit": 50}),
             name=show_name,
-            write_disposition="merge",
             primary_key="id",
             parallelized=True,
             max_table_nesting=0,
-        )
+            # reuse lance model to filter out all non-matching items and extra columns from spotify api
+            # 1. unknown columns are removed ("columns": "discard_value")
+            # 2. non validating items (ie. without id or url) are removed ("data_type": "discard_row")
+            # 3. for some reason None values are returned as well 🤯, add_filter takes care of that
+            columns=EpisodeSchema,
+            schema_contract={
+                "tables": "evolve",
+                "columns": "discard_value",
+                "data_type": "discard_row",
+            },
+        ).add_filter(lambda i: i is not None)
 
 
 @dlt.destination(batch_size=250, name="lancedb")
@@ -135,13 +152,7 @@ def lancedb_destination(items: TDataItems, table: TTableSchema) -> None:
     try:
         tbl = db.open_table(table["name"])
     except ValueError:
-        tbl = db.create_table(table["name"], schema=EpisodeSchema)
-
-    # remove all fields that are not in the schema
-    for item in items:
-        keys_to_remove = [key for key in item.keys() if key not in EpisodeSchema.model_fields]
-        for key in keys_to_remove:
-            del item[key]
+        tbl = db.create_table(table["name"], schema=EpisodeSchemaVector)
 
     tbl.add(items)
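
The rewritten resource leans on dlt's schema contracts plus the Pydantic/LanceDB model instead of manually deleting unknown keys in the destination. A minimal standalone sketch of the same contract settings on a plain Pydantic model; the model, resource, and sample rows are hypothetical, while the contract values are the ones used above:

```python
import dlt
from pydantic import BaseModel


class Episode(BaseModel):
    id: str
    name: str


@dlt.resource(
    name="episodes",
    primary_key="id",
    columns=Episode,  # the model defines the expected columns and types
    schema_contract={
        "tables": "evolve",
        "columns": "discard_value",  # unknown keys (e.g. "extra") are dropped
        "data_type": "discard_row",  # rows failing validation (e.g. missing "id") are dropped
    },
)
def episodes():
    yield [
        {"id": "1", "name": "intro", "extra": "ignored"},
        {"name": "no id, gets discarded"},
    ]


pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb", dev_mode=True)
print(pipeline.run(episodes()))
```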
