41 commits
98b9e81
Merge pull request #772 from dlt-hub/devel
rudolfix Nov 18, 2023
0b8db33
bumps version to 0.3.25
rudolfix Nov 18, 2023
9497d0c
Merge pull request #773 from dlt-hub/rfix/bumps-to-0.3.25
rudolfix Nov 19, 2023
d89ef92
fix typo
AstrakhantsevaAA Nov 20, 2023
a91b35b
debug
AstrakhantsevaAA Nov 20, 2023
5a0c208
debug
AstrakhantsevaAA Nov 20, 2023
05a58f2
fix fork checking
AstrakhantsevaAA Nov 20, 2023
a1e4844
mypy
AstrakhantsevaAA Nov 20, 2023
4976a88
black
AstrakhantsevaAA Nov 20, 2023
c8d2a74
fix slack link (#779)
AstrakhantsevaAA Nov 21, 2023
ad33225
Merge pull request #778 from dlt-hub/docs/fix_private_key
sh-rp Nov 22, 2023
52004fb
docs updates
sh-rp Nov 23, 2023
0b5dc59
Merge pull request #786 from dlt-hub/d#/devel_docs
sh-rp Nov 23, 2023
47e18e1
Added blog post- webhook. (#770)
dat-a-man Nov 24, 2023
4bfecef
adding the blog
rahuljo Nov 29, 2023
b42c930
adding an image at the top
rahuljo Nov 29, 2023
c69e78d
Adding the data lineage blog. (#789)
zem360 Nov 29, 2023
dd079f9
updating from Matt's comments
rahuljo Nov 29, 2023
804e1a1
Merge pull request #794 from dlt-hub/sap_snowflake_blog
matthauskrzykowski Nov 30, 2023
5c2e12a
fixes duckdb native connection snippet typo
rudolfix Nov 30, 2023
e573340
dlt-kestra-demo blog (#799)
anuunchin Dec 1, 2023
7a2f9c3
Fix/test utils (#795)
AstrakhantsevaAA Dec 1, 2023
406a104
Added Personio documentation. (#798)
dat-a-man Dec 4, 2023
c41a6bb
Hot fix: add skipifgithubfork to nested_data example (#802)
AstrakhantsevaAA Dec 4, 2023
9d6c7fd
Added Kinesis documentation. (#804)
dat-a-man Dec 6, 2023
9cdabf3
788 clarify docs intro (#797)
deanja Dec 6, 2023
70ebfb8
Fix links to source code (#805)
AstrakhantsevaAA Dec 6, 2023
72fb722
Updated docs for "Kinesis Message Format" (#818)
dat-a-man Dec 12, 2023
1c8cf39
Clarify docs dev process (#809)
deanja Dec 12, 2023
0e7a6e3
Updated destinations for dependencies. (Install `dlt` with ...) (#812)
dat-a-man Dec 12, 2023
6d28846
Docs sidebar UI bug 650 (#803)
deanja Dec 12, 2023
4517943
adding the aws-taktile blog
rahuljo Dec 13, 2023
b814e62
fixing blurb and repeating line in sap blog
rahuljo Dec 14, 2023
6dce0eb
Merge pull request #824 from dlt-hub/rahul-fix-blurb
rahuljo Dec 14, 2023
82f3fe7
updating blog image
rahuljo Dec 14, 2023
2c1cd58
Docs/website/docs/examples/qdrant eg (#775)
hibajamal Dec 14, 2023
78803a1
fixing blurb in motherduck blog
rahuljo Dec 14, 2023
53551dd
Merge pull request #825 from dlt-hub/rahul-fix-blurb-motherduck-blog
rahuljo Dec 14, 2023
9792881
making reviewed changes
rahuljo Dec 15, 2023
a04a3b2
Merge pull request #821 from dlt-hub/aws-taktile-blog
rahuljo Dec 15, 2023
97ea115
Personio doc: added more endpoints (#829)
AstrakhantsevaAA Dec 18, 2023
6 changes: 5 additions & 1 deletion .github/workflows/test_doc_snippets.yml
@@ -24,6 +24,10 @@ env:
RUNTIME__SLACK_INCOMING_HOOK: ${{ secrets.RUNTIME__SLACK_INCOMING_HOOK }}
# Mongodb url for nested data example
MONGODB_PIPELINE__SOURCES__CONNECTION_URL: ${{ secrets.MONGODB_PIPELINE__SOURCES__CONNECTION_URL }}
# Qdrant credentials
DESTINATION__QDRANT__CREDENTIALS__LOCATION: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__LOCATION }}
DESTINATION__QDRANT__CREDENTIALS__API_KEY: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__API_KEY }}

jobs:

run_lint:
@@ -59,7 +63,7 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow
run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant --with docs --without airflow

- name: Run linter and tests
run: make test-and-lint-snippets
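As an aside on how these secrets reach the snippets: dlt's configuration resolution maps double-underscore-delimited environment variables onto config sections. A minimal sketch of that mapping follows; the pipeline name and credential values are hypothetical, not taken from this PR:

```py
import os

import dlt

# the double-underscore path maps to config section destination.qdrant.credentials, key "location"
os.environ["DESTINATION__QDRANT__CREDENTIALS__LOCATION"] = "https://your-cluster.qdrant.io"  # hypothetical value
os.environ["DESTINATION__QDRANT__CREDENTIALS__API_KEY"] = "not-a-real-key"  # hypothetical value

# any pipeline targeting the qdrant destination picks up these credentials at load time,
# just as the CI job above does for the qdrant_zendesk snippet
pipeline = dlt.pipeline(pipeline_name="snippets_ci_sketch", destination="qdrant")
```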
1 change: 1 addition & 0 deletions docs/examples/CONTRIBUTING.md
@@ -24,6 +24,7 @@ Note: All paths in this guide are relative to the `dlt` repository directory.
- Use `# @@@DLT_REMOVE` to remove test code from final code example.
- Test your snippets locally first with command:
- `cd docs/website/docs/examples/<example-name>/code && pytest --ignore=node_modules -s -v`.
- Add the `@skipifgithubfork` decorator to your main snippet function; see this [example](https://github.com/dlt-hub/dlt/blob/master/docs/website/docs/examples/chess_production/code/chess-snippets.py#L1-L4) and the sketch after this diff.

## Run npm start

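For reference, the decorator usage that the bullet above links to (chess-snippets.py, lines 1-4) looks roughly like the sketch below; the import path and function name are assumptions, so check the linked file for the exact form:

```py
from tests.utils import skipifgithubfork  # assumed import path for the decorator


@skipifgithubfork
def example_snippet() -> None:
    # the example's main snippet code, executed by pytest, goes here
    ...
```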
39 changes: 39 additions & 0 deletions docs/examples/connector_x_arrow/load_arrow.py
@@ -0,0 +1,39 @@
import connectorx as cx

import dlt
from dlt.sources.credentials import ConnectionStringCredentials

def read_sql_x(
conn_str: ConnectionStringCredentials = dlt.secrets.value,
query: str = dlt.config.value,
):
yield cx.read_sql(
conn_str.to_native_representation(),
query,
return_type="arrow2",
protocol="binary",
)

def genome_resource():
# create genome resource with merge on `upid` primary key
genome = dlt.resource(
name="genome",
write_disposition="merge",
primary_key="upid",
standalone=True,
)(read_sql_x)(
"mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type]
"SELECT * FROM genome ORDER BY created LIMIT 1000",
)
# add incremental on created at
genome.apply_hints(incremental=dlt.sources.incremental("created"))
return genome


if __name__ == "__main__":
pipeline = dlt.pipeline(destination="duckdb")
genome = genome_resource()

print(pipeline.run(genome))
print(pipeline.last_trace.last_normalize_info)
# NOTE: run pipeline again to see that no more records got loaded thanks to incremental loading
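After running the example above, the loaded rows can be inspected through the pipeline's SQL client. A minimal sketch, meant to be appended right after `pipeline.run(genome)` so that `pipeline` is in scope (the column selection is illustrative):

```py
# query the duckdb dataset the pipeline just loaded
with pipeline.sql_client() as client:
    with client.execute_query(
        "SELECT upid, created FROM genome ORDER BY created DESC LIMIT 5"
    ) as cursor:
        print(cursor.fetchall())
```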
8 changes: 8 additions & 0 deletions docs/examples/qdrant_zendesk/.dlt/example.secrets.toml
@@ -0,0 +1,8 @@
[destination.qdrant.credentials]
location = ""
api_key = ""

[sources.zendesk.credentials]
password = ""
subdomain = ""
email = ""
172 changes: 172 additions & 0 deletions docs/examples/qdrant_zendesk/qdrant.py
@@ -0,0 +1,172 @@
from typing import Optional, Dict, Any, Tuple

import dlt
from dlt.common import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TAnyDateTime
from dlt.sources.helpers.requests import client
from dlt.destinations.qdrant import qdrant_adapter
from qdrant_client import QdrantClient

from dlt.common.configuration.inject import with_config

# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
@dlt.source(max_table_nesting=2)
def zendesk_support(
credentials: Dict[str, str] = dlt.secrets.value,
start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008
end_date: Optional[TAnyDateTime] = None,
):
"""
Retrieves data from Zendesk Support for tickets events.

Args:
credentials: Zendesk credentials (default: dlt.secrets.value)
start_date: Start date for data extraction (default: 2000-01-01)
end_date: End date for data extraction (default: None).
If end time is not provided, the incremental loading will be
enabled, and after the initial run, only new data will be retrieved.

Returns:
DltResource.
"""
# Convert start_date and end_date to Pendulum datetime objects
start_date_obj = ensure_pendulum_datetime(start_date)
end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None

# Extract credentials from secrets dictionary
auth = (credentials["email"], credentials["password"])
subdomain = credentials["subdomain"]
url = f"https://{subdomain}.zendesk.com"

# we use `append` write disposition, because objects in tickets_data endpoint are never updated
# so we do not need to merge
# we set primary_key to allow deduplication of events by the `incremental` below in the rare case
# when two events have the same timestamp
@dlt.resource(primary_key="id", write_disposition="append")
def tickets_data(
updated_at: dlt.sources.incremental[
pendulum.DateTime
] = dlt.sources.incremental(
"updated_at",
initial_value=start_date_obj,
end_value=end_date_obj,
allow_external_schedulers=True,
)
):
# URL For ticket events
# 'https://d3v-dlthub.zendesk.com/api/v2/incremental/tickets_data.json?start_time=946684800'
event_pages = get_pages(
url=url,
endpoint="/api/v2/incremental/tickets",
auth=auth,
data_point_name="tickets",
params={"start_time": updated_at.last_value.int_timestamp},
)
for page in event_pages:
yield ([_fix_date(ticket) for ticket in page])

# stop loading when using end_value and end is reached.
# unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves
if updated_at.end_out_of_range:
return

return tickets_data


# helper function to fix the datetime format
def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]:
if not value:
return None
return ensure_pendulum_datetime(value)

# modify dates to return datetime objects instead
def _fix_date(ticket):
ticket["updated_at"] = _parse_date_or_none(ticket["updated_at"])
ticket["created_at"] = _parse_date_or_none(ticket["created_at"])
ticket["due_at"] = _parse_date_or_none(ticket["due_at"])
return ticket

# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
def get_pages(
url: str,
endpoint: str,
auth: Tuple[str, str],
data_point_name: str,
params: Optional[Dict[str, Any]] = None,
):
"""
Makes a request to a paginated endpoint and returns a generator of data items per page.

Args:
url: The base URL.
endpoint: The url to the endpoint, e.g. /api/v2/calls
auth: Credentials for authentication.
data_point_name: The key which data items are nested under in the response object (e.g. calls)
params: Optional dict of query params to include in the request.

Returns:
Generator of pages, each page is a list of dict data items.
"""
# update the page size to enable cursor pagination
params = params or {}
params["per_page"] = 1000
headers = None

# make request and keep looping until there is no next page
get_url = f"{url}{endpoint}"
while get_url:
response = client.get(
get_url, headers=headers, auth=auth, params=params
)
response.raise_for_status()
response_json = response.json()
result = response_json[data_point_name]
yield result

get_url = None
# See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format
if not response_json["end_of_stream"]:
get_url = response_json["next_page"]

if __name__ == "__main__":
# create a pipeline with an appropriate name
pipeline = dlt.pipeline(
pipeline_name="qdrant_zendesk_pipeline",
destination="qdrant",
dataset_name="zendesk_data",
)

# run the dlt pipeline and save info about the load process
load_info = pipeline.run(
# here we use a special function to tell Qdrant which fields to embed
qdrant_adapter(
zendesk_support(), # retrieve tickets data
embed=["subject", "description"],
)
)

print(load_info)


# define a helper that creates a Qdrant client, with credentials injected from dlt secrets

@with_config(sections=("destination", "credentials"))
def get_qdrant_client(location=dlt.secrets.value, api_key=dlt.secrets.value):
return QdrantClient(
url=location,
api_key=api_key,
)

# running the Qdrant client to connect to your Qdrant database
qdrant_client = get_qdrant_client()

# view Qdrant collections; you'll find your dataset here:
print(qdrant_client.get_collections())

# query Qdrant with prompt: getting tickets info close to "cancellation"
response = qdrant_client.query(
"zendesk_data_content", # collection/dataset name with the 'content' suffix -> tickets content table
query_text=["cancel", "cancel subscription"], # prompt to search
limit=3 # limit the number of results to the nearest 3 embeddings
)
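To make the result of the query above tangible, the returned hits can be iterated; a short sketch, assuming the qdrant-client `QueryResponse` objects expose `score` and `document` (check the client version you use):

```py
# print the similarity score and stored text of each nearest-neighbor hit returned above
for hit in response:
    print(hit.score, hit.document)
```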
1 change: 1 addition & 0 deletions docs/website/.npmrc
@@ -0,0 +1 @@
engine-strict=true
47 changes: 37 additions & 10 deletions docs/website/README.md
@@ -1,38 +1,65 @@
# Website

This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. The actual content resides in the `docs` folder.
The website is a Node.js application.

The documentation is generated using [Docusaurus 2](https://docusaurus.io/), a modern static website generator.
Docusaurus consumes content from the `./docs` folder (at `./docs/website/docs` in this repo). The content includes:

### Installation
- markdown files
- code snippets
- API documentation, which pydoc generates into `./docs/api_reference` when the Node package is run.

On the production website the documentation appears at https://dlthub.com/docs and the default documentation page is https://dlthub.com/docs/intro.

Docusaurus also consumes blog posts (from `./blog`), which appear at https://dlthub.com/docs/blog.

## Installation

With `website` as your working directory:

```
$ npm install
```

### Site Configuration
That command installs our Node.js package defined in `package.json`.

### Are you new to Node?

`npm` is a package manager bundled with Node.js. If `npm install` complained that you have an old version, try:

```
nvm install --lts
```

That command installs and uses the latest stable version of Node.js (and therefore `npm`). Then retry the Installation steps above.

`nvm` is the Node Version Manager, and yes, you may need to install that first, using your usual way of installing software on your OS.

The site is configured to run under the `/docs` path. The `build` command is properly configured.
## Local Development

### Local Development
In this mode, most of your authoring changes will be reflected live in the browser just by saving files, without having to restart the server. Type:

```
$ npm run start
```

This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
That command starts a local development web server and opens a browser window. It then takes a few seconds for Docusaurus to generate pages before the website displays.
You may get a "Page Not Found" error when browsing at `/docs/`. This does not happen on the production website, whose default page is the "Ïntroduction" page at `/docs/intro`.

For most authoring purposes, once you are happy with your changes running locally, you can create a GitHub PR without needing to do the following build and deployment steps.

### Local Build
## Local Build

```
$ npm run build
```

This command generates static content into the `build` directory and can be served using any static contents hosting service ie. `npm run serve`
That command generates static content into the `build` directory, which can be served using any static contents hosting service, for example, `npm run serve`

## Deployment

### Deployment
The site is deployed using `netlify`. The `netlify` build command is:

The site is deployed using `netlify`. The `netlify` build command is as follows:
```
npm run build:netlify
```
2 changes: 1 addition & 1 deletion docs/website/blog/2023-08-14-dlt-motherduck-blog.md
@@ -4,7 +4,7 @@ title: "dlt-dbt-DuckDB-MotherDuck: My super simple and highly customizable appro
image: /img/dlt-motherduck-logos.png
authors:
name: Rahul Joshi
title: Data Science Intern at dltHub
title: Developer Relations at dltHub
url: https://github.com/rahuljo
image_url: https://avatars.githubusercontent.com/u/28861929?v=4
tags: [BigQuery, dbt, dlt, DuckDB, GitHub, Metabase, MotherDuck]