Merged
36 commits
9b10706
remove some duplicate test utils
sh-rp Apr 28, 2025
a569649
use dataset to get table counts
sh-rp Apr 28, 2025
b266825
add exception for sftp but use dataset otherwise for loading table co…
sh-rp Apr 28, 2025
6f3f2a5
update checking of empty tables in filesystem tests
sh-rp Apr 28, 2025
fe3ca8c
support filesystemsqlclient for tables that have prefixes rather than…
sh-rp Apr 28, 2025
c11643f
fix table location resolution for internal tables
sh-rp Apr 28, 2025
bcad180
make sftp check raise same errors as filesystemsqlclient
sh-rp Apr 28, 2025
39eb08e
more cleanup
sh-rp Apr 28, 2025
40ca7bd
fix replace disposition tests
sh-rp Apr 28, 2025
01a3754
simplify table count code in many places
sh-rp Apr 28, 2025
0f6e343
small cleanup
sh-rp Apr 28, 2025
583cbaf
fix tables to dicts function
sh-rp Apr 29, 2025
61553a4
disable databricks and synapse ibis backend tests
sh-rp Apr 24, 2025
b5fd22b
simplify table assertions
sh-rp Apr 29, 2025
a87f5d5
add tests for tests :)
sh-rp Apr 29, 2025
40a2318
fix two tests
sh-rp Apr 29, 2025
2cceb5d
fix dbt tests
sh-rp Apr 29, 2025
d88c641
makes open table locations to work in windows fs
rudolfix Apr 29, 2025
c458862
review comments
sh-rp May 4, 2025
91aa41b
adds docstrings plus linting to pipeline utils
sh-rp May 4, 2025
1f71909
fix docstring linting on utils class
sh-rp May 5, 2025
1911b6f
bump adlfs in lockfile
sh-rp May 5, 2025
7ea806b
Merge branch 'devel' into tests/simplify-load-utils
rudolfix May 15, 2025
540a3bd
Merge branch 'devel' into tests/simplify-load-utils
sh-rp May 20, 2025
3d7b0e5
test loading abfss first
sh-rp May 20, 2025
af17763
test getting tables one by one for azure
sh-rp May 26, 2025
c176a73
Merge branch 'devel' into tests/simplify-load-utils
sh-rp May 26, 2025
2a90c76
fix resolving of sql_client
sh-rp May 26, 2025
bed3a5c
change folder detection
sh-rp May 26, 2025
33e58aa
add comment for abfss fix
sh-rp May 26, 2025
d756a8a
move abfss fallback into utils method
sh-rp May 27, 2025
c2f0a6d
normalizes trailing separator in paths in filesystem
rudolfix May 27, 2025
a0056ae
fixes two tests
sh-rp May 28, 2025
2ed8889
fix glob resolution for tables that have nested folders
sh-rp May 28, 2025
5d432b6
removes globs from duckdb filesystem sql client, adds tests for edge …
rudolfix May 29, 2025
9d6722d
disables globbing for iceberg, adds optional autorefresh flag for vie…
rudolfix May 29, 2025
3 changes: 2 additions & 1 deletion Makefile
@@ -97,7 +97,8 @@ lint-docstrings:
dlt/common/destination/dataset.py \
dlt/destinations/impl/**/factory.py \
dlt/pipeline/pipeline.py \
dlt/pipeline/__init__.py
dlt/pipeline/__init__.py \
tests/pipeline/utils.py

test:
poetry run pytest tests
7 changes: 6 additions & 1 deletion dlt/common/destination/client.py
@@ -14,6 +14,7 @@
Dict,
Any,
TypeVar,
Tuple,
)
from typing_extensions import Annotated
import datetime # noqa: 251
@@ -655,7 +656,11 @@ def get_open_table_catalog(self, table_format: TTableFormat, catalog_name: str =

@abstractmethod
def get_open_table_location(self, table_format: TTableFormat, table_name: str) -> str:
"""Computes location in which table metadata is stored. Does not verify if table exists."""
"""Computes the location in which the table is stored, which is typically a "folder" with table
data and metadata. Does not verify that the table exists.
Returns:
str: fully formed url with the table location
"""

@abstractmethod
def load_open_table(self, table_format: TTableFormat, table_name: str, **kwargs: Any) -> Any:
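For illustration, a minimal sketch of the contract this abstract method describes: a fully formed URL ending with a separator for "folder"-style locations, computed without checking that the table exists. The helper below is hypothetical, not dlt's implementation.

# Hypothetical sketch of the get_open_table_location contract (names are illustrative).
def open_table_location(bucket_url: str, dataset_name: str, table_folder: str) -> str:
    # fully formed url; trailing separator marks a "folder"; existence is not verified
    return f"{bucket_url.rstrip('/')}/{dataset_name}/{table_folder}/"

print(open_table_location("s3://my-bucket", "my_dataset", "my_table"))
# -> s3://my-bucket/my_dataset/my_table/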
4 changes: 2 additions & 2 deletions dlt/common/libs/pyiceberg.py
@@ -102,7 +102,7 @@ def evolve_table(
table = catalog.load_table(table_id)
except NoSuchTableError:
# add table to catalog
metadata_path = f"{table_location}/metadata"
metadata_path = f"{table_location.rstrip('/')}/metadata"
if client.fs_client.exists(metadata_path):
# found metadata; register existing table
table = register_table(
@@ -166,7 +166,7 @@ def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]:
def get_last_metadata_file(
metadata_path: str, fs_client: AbstractFileSystem, config: FilesystemConfiguration
) -> str:
# TODO: implement faster way to obtain `last_metadata_file` (listing is slow)
# TODO: read version-hint.txt first and save it in filesystem
try:
metadata_files = [f for f in fs_client.ls(metadata_path) if f.endswith(".json")]
except FileNotFoundError:
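The TODO above concerns how `get_last_metadata_file` finds the newest Iceberg metadata file by listing the metadata folder. A minimal sketch of that idea, assuming an fsspec-compatible filesystem object; the real helper also threads through the filesystem configuration:

# Sketch: pick the newest Iceberg metadata file from a folder listing (assumes fsspec-style ls()).
def last_metadata_file(metadata_path: str, fs_client) -> str:
    metadata_files = [f for f in fs_client.ls(metadata_path) if f.endswith(".json")]
    if not metadata_files:
        raise FileNotFoundError(metadata_path)
    # metadata file names carry a zero-padded version prefix, so the lexicographically
    # greatest path is the latest snapshot
    return sorted(metadata_files)[-1]

# the rstrip('/') above avoids a double slash when the table location ends with a separator
table_location = "s3://bucket/my_dataset/my_table/"
metadata_path = f"{table_location.rstrip('/')}/metadata"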
17 changes: 14 additions & 3 deletions dlt/common/storages/configuration.py
@@ -107,11 +107,17 @@ def _make_az_url(scheme: str, fs_path: str, bucket_url: str) -> str:
# az://<container_name>@<storage_account_name>.dfs.core.windows.net/<path>
# fs_path always starts with container
split_path = fs_path.split("/", maxsplit=1)
# preserve slash at the end
if len(split_path) == 2 and split_path[1] == "":
split_path[1] = "/"
# if just a container name, add empty path
if len(split_path) == 1:
split_path.append("")
container, path = split_path
netloc = f"{container}@{parsed_bucket_url.hostname}"
return urlunparse(parsed_bucket_url._replace(path=path, scheme=scheme, netloc=netloc))
# this strips trailing slash
uri = urlunparse(parsed_bucket_url._replace(path=path, scheme=scheme, netloc=netloc))
return uri
return f"{scheme}://{fs_path}"
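For reference, a self-contained sketch of the container/path split performed above, using only the standard library; the real helper additionally reuses the hostname, query and fragment of the configured bucket_url:

# Standalone sketch of building an az/abfss url from "container/path" (stdlib only).
from urllib.parse import urlunparse

def make_az_url(scheme: str, fs_path: str, account_host: str) -> str:
    split_path = fs_path.split("/", maxsplit=1)
    if len(split_path) == 1:  # just a container name -> add empty path
        split_path.append("")
    container, path = split_path
    netloc = f"{container}@{account_host}"
    # a trailing separator that is part of `path` is carried into the resulting url
    return urlunparse((scheme, netloc, path, "", "", ""))

print(make_az_url("abfss", "my-container/my_dataset/my_table/", "acct.dfs.core.windows.net"))
# -> abfss://my-container@acct.dfs.core.windows.net/my_dataset/my_table/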


@@ -121,8 +127,12 @@ def _make_file_url(scheme: str, fs_path: str, bucket_url: str) -> str:
netloc is never set. UNC paths are represented as file://host/path
"""
p_ = pathlib.Path(fs_path)
# will remove trailing separator
p_ = p_.expanduser().resolve()
return p_.as_uri()
uri = p_.as_uri()
if fs_path.endswith(os.path.sep):
uri += "/"
return uri


MAKE_URI_DISPATCH = {"az": _make_az_url, "file": _make_file_url, "sftp": _make_sftp_url}
@@ -135,7 +145,8 @@ def _make_file_url(scheme: str, fs_path: str, bucket_url: str) -> str:


def make_fsspec_url(scheme: str, fs_path: str, bucket_url: str) -> str:
"""Creates url from `fs_path` and `scheme` using bucket_url as an `url` template
"""Creates a url from `fs_path` and `scheme` using bucket_url as a `url` template. If `fs_path`
ends with a separator (indicating a folder), the separator is preserved.

Args:
scheme (str): scheme of the resulting url
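A short sketch of the trailing-separator behaviour for local paths described above: pathlib's as_uri() drops a trailing separator, so it is re-appended whenever the original path indicated a folder (stdlib only, POSIX separators assumed):

# Sketch of file:// url creation that preserves a trailing separator for "folder" paths.
import os
import pathlib

def make_file_url(fs_path: str) -> str:
    p_ = pathlib.Path(fs_path).expanduser().resolve()  # resolve() removes a trailing separator
    uri = p_.as_uri()
    if fs_path.endswith(os.path.sep):  # folder -> put the separator back
        uri += "/"
    return uri

print(make_file_url("/var/data/my_dataset/my_table/"))   # e.g. file:///var/data/my_dataset/my_table/
print(make_file_url("/var/data/my_dataset/file.jsonl"))   # e.g. file:///var/data/my_dataset/file.jsonl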
2 changes: 1 addition & 1 deletion dlt/destinations/dataset/ibis_relation.py
@@ -40,7 +40,7 @@ def query(self) -> Any:
"""build the query"""
from dlt.helpers.ibis import ibis

target_dialect = self._dataset._destination.capabilities().sqlglot_dialect
target_dialect = self._dataset.sql_client.capabilities.sqlglot_dialect

# render sql directly if possible
if target_dialect not in TRANSPILE_VIA_DEFAULT:
8 changes: 6 additions & 2 deletions dlt/destinations/fs_client.py
@@ -14,11 +14,15 @@ class FSClientBase(ABC):
@property
@abstractmethod
def dataset_path(self) -> str:
"""A path within a bucket to tables in a dataset, ending with separator"""
pass

@abstractmethod
def get_table_dir(self, table_name: str) -> str:
"""returns directory for given table"""
"""Returns a directory containing table files, ending with separator.
Native filesystem paths are used for local filesystems.
Note that many tables can share the same table dir.
"""
pass

@abstractmethod
@@ -28,7 +32,7 @@ def get_table_dirs(self, table_names: Iterable[str]) -> List[str]:

@abstractmethod
def list_table_files(self, table_name: str) -> List[str]:
"""returns all filepaths for a given table"""
"""Returns all filepaths for a given table. Native filesystem paths are used for local filesystems."""
pass

@abstractmethod
10 changes: 7 additions & 3 deletions dlt/destinations/impl/duckdb/sql_client.py
@@ -210,14 +210,18 @@ def _make_database_exception(cls, ex: Exception) -> Exception:
# duckdb raises TypeError on malformed query parameters
return DatabaseTransientException(duckdb.ProgrammingError(ex))
elif isinstance(ex, duckdb.IOException):
message = str(ex)
if (
"read from delta table" in str(ex) and "No files in log segment" in str(ex)
) or "Path does not exist" in str(ex):
"read from delta table" in message and "No files in log segment" in message
) or "Path does not exist" in message:
# delta scanner when no delta data or metadata exists in the location
return DatabaseUndefinedRelation(ex)
if "Could not guess Iceberg table version" in str(ex):
if "Could not guess Iceberg table version" in message:
# same but iceberg
return DatabaseUndefinedRelation(ex)
if "No files found" in message:
# glob patterns not found
return DatabaseUndefinedRelation(ex)
return DatabaseTransientException(ex)
elif isinstance(ex, duckdb.InternalException):
if "INTERNAL Error: Value::LIST(values)" in str(ex):
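The new branches above classify duckdb IO errors by message fragments. A simplified, hedged sketch of that classification (plain Python; the fragments are the ones used in the diff, the helper name is illustrative):

# Simplified sketch: map duckdb IO error messages to "undefined relation" vs. transient errors.
UNDEFINED_RELATION_FRAGMENTS = (
    "No files in log segment",                # empty delta table location
    "Path does not exist",
    "Could not guess Iceberg table version",  # empty iceberg table location
    "No files found",                         # glob pattern matched nothing
)

def classify_io_error(message: str) -> str:
    if any(fragment in message for fragment in UNDEFINED_RELATION_FRAGMENTS):
        return "undefined_relation"
    return "transient"

assert classify_io_error("IO Error: No files found that match the pattern ...") == "undefined_relation"
assert classify_io_error("IO Error: connection reset") == "transient"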
2 changes: 2 additions & 0 deletions dlt/destinations/impl/filesystem/configuration.py
@@ -23,6 +23,8 @@ class FilesystemDestinationClientConfiguration(FilesystemConfigurationWithLocalF
extra_placeholders: Optional[TExtraPlaceholders] = None
max_state_files: int = 100
"""Maximum number of pipeline state files to keep; 0 or negative value disables cleanup."""
always_refresh_views: bool = False
"""Always refresh table scanner views by pointing them at the newest table metadata or by globbing table files"""

@resolve_type("credentials")
def resolve_credentials_type(self) -> Type[CredentialsConfiguration]:
3 changes: 3 additions & 0 deletions dlt/destinations/impl/filesystem/factory.py
@@ -86,6 +86,7 @@ def __init__(
layout: str = DEFAULT_FILE_LAYOUT,
extra_placeholders: Optional[TExtraPlaceholders] = None,
current_datetime: Optional[TCurrentDateTime] = None,
always_refresh_views: bool = None,
destination_name: str = None,
environment: str = None,
**kwargs: Any,
@@ -113,6 +114,7 @@ def __init__(
are mapped to string values or to callables evaluated at runtime.
current_datetime (Optional[TCurrentDateTime]): Current datetime used by date/time related placeholders. If not provided, load package creation timestamp
will be used.
always_refresh_views (bool, optional): Always refresh sql_client views by pointing them at the newest table metadata or by globbing table files
destination_name (str, optional): Name of the destination, can be used in config section to differentiate between multiple of the same type
environment (str, optional): Environment of the destination
**kwargs (Any): Additional arguments passed to the destination config
@@ -123,6 +125,7 @@ def __init__(
layout=layout,
extra_placeholders=extra_placeholders,
current_datetime=current_datetime,
always_refresh_views=always_refresh_views,
destination_name=destination_name,
environment=environment,
**kwargs,
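A possible way to enable the new flag from user code, assuming the factory is exposed as dlt.destinations.filesystem (a sketch; the bucket_url is illustrative and the flag can also be supplied through dlt's regular configuration mechanisms):

# Sketch: ask the filesystem destination to always refresh sql_client scanner views.
import dlt

dest = dlt.destinations.filesystem(
    bucket_url="s3://my-bucket/prefix",   # illustrative bucket
    always_refresh_views=True,            # re-create views with the newest metadata / file listing
)

pipeline = dlt.pipeline("my_pipeline", destination=dest, dataset_name="my_dataset")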
20 changes: 15 additions & 5 deletions dlt/destinations/impl/filesystem/filesystem.py
@@ -320,7 +320,7 @@ def dataset_path(self) -> str:
"""A path within a bucket to tables in a dataset
NOTE: dataset_name changes if with_staging_dataset is active
"""
return self.pathlib.join(self.bucket_path, self.dataset_name) # type: ignore[no-any-return]
return self.pathlib.join(self.bucket_path, self.dataset_name, "") # type: ignore[no-any-return]

@contextmanager
def with_staging_dataset(self) -> Iterator["FilesystemClient"]:
@@ -478,14 +478,18 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema:
return table

def get_table_dir(self, table_name: str, remote: bool = False) -> str:
"""Returns a directory containing table files, ending with separator.
Note that many tables can share the same table dir
"""
# dlt tables do not respect layout (for now)
table_prefix = self.get_table_prefix(table_name)
table_dir: str = self.pathlib.dirname(table_prefix)
table_dir: str = self.pathlib.dirname(table_prefix) + self.pathlib.sep
if remote:
table_dir = self.make_remote_url(table_dir)
return table_dir

def get_table_prefix(self, table_name: str) -> str:
"""For table prefixes that are folders, trailing separator will be preserved"""
# dlt tables do not respect layout (for now)
if table_name.startswith(self.schema._dlt_tables_prefix):
# dlt tables get a layout where each table is a folder
@@ -896,9 +900,15 @@ def get_open_table_catalog(self, table_format: TTableFormat, catalog_name: str =
return catalog

def get_open_table_location(self, table_format: TTableFormat, table_name: str) -> str:
"""All tables have location, also those in "native" table format."""
folder = self.get_table_dir(table_name)
location = self.make_remote_url(folder)
"""All tables have a location, also those in "native" table format. In the case of the
filesystem destination, the native format is a set of parquet/csv/jsonl files where a table
may be placed in a separate folder or share a common prefix defined in the layout.
Locations of native tables are normalized to include a trailing separator
if the path is a "folder" (this includes buckets).
Note: the location is a fully formed url.
"""
prefix = self.get_table_prefix(table_name)
location = self.make_remote_url(prefix)
if self.config.is_local_filesystem and os.name == "nt":
# pyiceberg cannot deal with windows absolute urls
location = location.replace("file:///", "file://")
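For reference, a small sketch of the prefix/dir relationship used above: the table prefix may point at a dedicated folder or at a file-name prefix inside a shared folder, while the table dir always ends with a separator (posixpath stands in for the client's path module; the prefixes are illustrative):

# Sketch: derive a table dir (always ending with a separator) from a table prefix.
import posixpath

def table_dir_from_prefix(table_prefix: str) -> str:
    # dirname() drops everything after the last separator:
    #   "my_dataset/my_table/"       -> "my_dataset/my_table" + "/"   (folder-per-table layout)
    #   "my_dataset/my_table__part"  -> "my_dataset" + "/"            (shared-folder prefix layout)
    return posixpath.dirname(table_prefix) + posixpath.sep

print(table_dir_from_prefix("my_dataset/my_table/"))       # -> my_dataset/my_table/
print(table_dir_from_prefix("my_dataset/my_table__part"))  # -> my_dataset/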
87 changes: 46 additions & 41 deletions dlt/destinations/impl/filesystem/sql_client.py
@@ -1,6 +1,5 @@
from typing import Any, TYPE_CHECKING
from typing import Any, TYPE_CHECKING, Tuple, List
import os
import re
import duckdb

from dlt.common import logger
@@ -45,17 +44,15 @@ def __init__(
def can_create_view(self, table_schema: PreparedTableSchema) -> bool:
if table_schema.get("table_format") in ("delta", "iceberg"):
return True
file_format = self.get_file_format(table_schema)
return file_format in ("jsonl", "parquet", "csv")
# checking the file type is expensive, so we optimistically allow view creation and prune later
return True

def get_file_format(self, table_schema: PreparedTableSchema) -> str:
def get_file_format_and_files(self, table_schema: PreparedTableSchema) -> Tuple[str, List[str]]:
table_name = table_schema["name"]
if table_name in self.schema.dlt_table_names():
return "jsonl"
files = self.remote_client.list_table_files(table_name)
if len(files) == 0:
raise DestinationUndefinedEntity(table_name)
return os.path.splitext(files[0])[1][1:]
return os.path.splitext(files[0])[1][1:], files

def create_secret(
self,
@@ -85,13 +82,18 @@ def create_secret(
raise ValueError(
f"Cannot create secret or register filesystem for protocol {protocol}"
)

return True

def open_connection(self) -> duckdb.DuckDBPyConnection:
first_connection = self.credentials.never_borrowed
super().open_connection()

if first_connection:
# TODO: we need to frontload the httpfs extension for abfss for some reason
if self.is_abfss:
self._conn.sql("LOAD httpfs;")

# create single authentication for the whole client
self.create_secret(
self.remote_client.config.bucket_url, self.remote_client.config.credentials
@@ -101,9 +103,12 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:
return self._conn

def should_replace_view(self, view_name: str, table_schema: PreparedTableSchema) -> bool:
# we use alternative method to get snapshot on abfss and we need to replace
# the view each time to control the freshness (abfss cannot glob)
return self.is_abfss # and table_format == "iceberg"
if self.remote_client.config.always_refresh_views:
table_format = table_schema.get("table_format")
if table_format == "delta":
# delta will auto refresh
return False
return self.remote_client.config.always_refresh_views

@raise_database_error
def create_view(self, view_name: str, table_schema: PreparedTableSchema) -> None:
@@ -139,40 +144,37 @@ def _escape_column_name(col_name: str) -> str:
# create from statement
from_statement = ""
if table_format == "delta":
table_location = table_location.rstrip("/")
from_statement = f"delta_scan('{table_location}')"
elif table_format == "iceberg":
table_location = table_location.rstrip("/")
if not self.iceberg_initialized:
self._setup_iceberg(self._conn)
self.iceberg_initialized = True
if self.is_abfss:
# duckdb can't glob on abfss 🤯
from dlt.common.libs.pyiceberg import get_last_metadata_file

metadata_path = f"{table_location}/metadata"
last_metadata_file = get_last_metadata_file(
metadata_path, self.remote_client.fs_client, self.remote_client.config
)
from_statement = (
f"iceberg_scan('{last_metadata_file}', skip_schema_inference=false)"
)
from dlt.common.libs.pyiceberg import get_last_metadata_file

metadata_path = f"{table_location}/metadata"
last_metadata_file = get_last_metadata_file(
metadata_path, self.remote_client.fs_client, self.remote_client.config
)
if ".gz." in last_metadata_file:
compression = ", metadata_compression_codec = 'gzip'"
else:
# skip schema inference to make nested data types work
# https://github.com/duckdb/duckdb_iceberg/issues/47
from_statement = (
f"iceberg_scan('{table_location}', version='?', allow_moved_paths = true,"
" skip_schema_inference=false)"
)
compression = ""

from_statement = (
f"iceberg_scan('{last_metadata_file}'{compression}, skip_schema_inference=false)"
)
else:
# get file format from schema
# get file format and list of table files
# NOTE: this does not support cases where table contains many different file formats
first_file_type = self.get_file_format(table_schema)

# build files string
supports_wildcard_notation = not self.is_abfss

resolved_files_string = f"'{table_location}/**/*.{first_file_type}'"
if not supports_wildcard_notation:
files = self.remote_client.list_table_files(table_name)
# NOTE: since we must list all the files anyway, we just pass them to duckdb without further globbing;
# the list is already in memory and the maximum query size in duckdb is very large
first_file_type, files = self.get_file_format_and_files(table_schema)
if protocol == "file":
resolved_files_string = ",".join(map(lambda f: f"'{f}'", files))
else:
resolved_files_string = ",".join(map(lambda f: f"'{protocol}://{f}'", files))

if first_file_type == "parquet":
@@ -219,11 +221,14 @@
)

else:
raise NotImplementedError(
f"Unknown filetype {first_file_type} for table {table_name}. Currently only"
" jsonl and parquet files as well as delta and iceberg tables are"
" supported."
)
# we skipped checking the file type in can_create_view to avoid repeating expensive globs,
# so unknown file types are silently skipped here
return
# raise NotImplementedError(
# f"Unknown filetype {first_file_type} for table {table_name}. Currently only"
# " jsonl and parquet files as well as delta and iceberg tables are"
# " supported."
# )

# create table
view_name = self.make_qualified_table_name(view_name)
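Putting the branches above together, a condensed, hedged sketch of how the FROM clause for a scanner view could be assembled per table format. String building only; delta_scan, iceberg_scan, read_parquet and read_json are the duckdb functions referenced in this file, and the exact options the real client passes may differ:

# Condensed sketch of building the duckdb FROM clause for a scanner view.
def build_from_statement(table_format, table_location="", last_metadata_file="",
                         files=(), file_type=""):
    if table_format == "delta":
        return f"delta_scan('{table_location.rstrip('/')}')"
    if table_format == "iceberg":
        # scan a concrete metadata file instead of globbing the table location
        return f"iceberg_scan('{last_metadata_file}', skip_schema_inference=false)"
    # "native" tables: pass the explicit file list, no globbing required
    files_string = ",".join(f"'{f}'" for f in files)
    if file_type == "parquet":
        return f"read_parquet([{files_string}])"
    if file_type == "jsonl":
        # illustrative jsonl read; the real client configures columns and format explicitly
        return f"read_json([{files_string}], format = 'newline_delimited')"
    return ""  # unknown file types are silently skipped, as in can_create_view/create_view

print(build_from_statement("delta", table_location="s3://bucket/my_dataset/my_table/"))
# -> delta_scan('s3://bucket/my_dataset/my_table')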
1 change: 1 addition & 0 deletions dlt/destinations/path_utils.py
@@ -1,4 +1,5 @@
import re
import os
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple

from dlt.common import logger