
Commit 66200ad

Add 'where' argument to Python API.

1 parent d01e74b

10 files changed: +87 -29 lines changed

.github/workflows/ci.yml

Lines changed: 9 additions & 7 deletions

@@ -43,13 +43,14 @@ jobs:
         run: "poetry install"

       # BigQuery start
-      - id: 'auth'
-        uses: 'google-github-actions/auth@v1'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'

-      - name: 'Set up BigQuery Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v1'
+      # - id: 'auth'
+      #   uses: 'google-github-actions/auth@v1'
+      #   with:
+      #     credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'
+
+      # - name: 'Set up BigQuery Cloud SDK'
+      #   uses: 'google-github-actions/setup-gcloud@v1'

       # - name: 'Use gcloud CLI'
       #   run: "gcloud config configurations list"
@@ -64,9 +65,10 @@ jobs:
          DATADIFF_SNOWFLAKE_URI: '${{ secrets.DATADIFF_SNOWFLAKE_URI }}'
          DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
          DATADIFF_TRINO_URI: '${{ secrets.DATADIFF_TRINO_URI }}'
-          DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
+          # DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
          DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
          DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
+          DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
        run: |
          chmod +x tests/waiting_for_stack_up.sh
          ./tests/waiting_for_stack_up.sh && TEST_ACROSS_ALL_DBS=0 poetry run unittest-parallel -j 16

.github/workflows/ci_full.yml

Lines changed: 14 additions & 13 deletions

@@ -8,9 +8,12 @@ on:
 #      - '!dev/**'
   pull_request:
     branches: [ master ]
-
   workflow_dispatch:

+permissions:
+  id-token: write  # This is required for requesting the JWT
+  contents: read   # This is required for actions/checkout
+
 jobs:
   unit_tests:
     strategy:
@@ -40,19 +43,16 @@ jobs:
         run: "poetry install"

       # BigQuery start
-      - id: 'auth'
-        uses: 'google-github-actions/auth@v1'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'
-
-      - name: 'Set up BigQuery Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v1'
+      # - id: 'auth'
+      #   uses: 'google-github-actions/auth@v1'
+      #   with:
+      #     credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'

-      # - name: 'Use gcloud CLI'
-      #   run: "gcloud config configurations list"
+      # - name: 'Set up BigQuery Cloud SDK'
+      #   uses: 'google-github-actions/setup-gcloud@v1'

-      - name: "Install BigQuery for Python"
-        run: poetry add google-cloud-bigquery
+      # - name: "Install BigQuery for Python"
+      #   run: poetry add google-cloud-bigquery

       # BigQuery end

@@ -62,7 +62,8 @@ jobs:
          DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
          DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
          DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
-          DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
+          # DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
+          DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
        run: |
          chmod +x tests/waiting_for_stack_up.sh
          ./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16

data_diff/__init__.py

Lines changed: 5 additions & 1 deletion

@@ -62,6 +62,8 @@ def diff_tables(
     max_threadpool_size: Optional[int] = 1,
     # Algorithm
     algorithm: Algorithm = Algorithm.AUTO,
+    # An additional 'where' expression to restrict the search space.
+    where: str = None,
     # Into how many segments to bisect per iteration (hashdiff only)
     bisection_factor: int = DEFAULT_BISECTION_FACTOR,
     # When should we stop bisecting and compare locally (in row count; hashdiff only)
@@ -92,6 +94,7 @@ def diff_tables(
         max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
             Only relevant when `threaded` is ``True``.
             There may be many pools, so number of actual threads can be a lot higher.
+        where (str, optional): An additional 'where' expression to restrict the search space.
         algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`)
         bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
         bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
@@ -106,7 +109,7 @@ def diff_tables(


     Note:
         The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
-        `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`.
+        `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `where`.
         If different values are needed per table, it's possible to omit them here, and instead set
         them directly when creating each :class:`TableSegment`.
@@ -135,6 +138,7 @@ def diff_tables(
             max_key=max_key,
             min_update=min_update,
             max_update=max_update,
+            where=where,
         ).items()
         if v is not None
     }
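Taken together, these hunks thread a single `where` string from the public `diff_tables()` signature down into both `TableSegment` instances. A minimal usage sketch, not part of the commit — the connection strings, table, and filter are hypothetical, and `connect_to_table` is the package's existing helper for building a `TableSegment`:

from data_diff import connect_to_table, diff_tables

# Hypothetical databases and table names, for illustration only.
table1 = connect_to_table("postgresql://user:pass@localhost/db", "ratings", key_columns=("id",))
table2 = connect_to_table("snowflake://user:pass@account/db/SCHEMA?warehouse=WH", "RATINGS", key_columns=("id",))

# The new 'where' expression restricts the search space on both sides,
# so only rows matching the filter are compared.
for sign, row in diff_tables(table1, table2, where="created_at >= '2022-01-01'"):
    print(sign, row)  # sign is '+' or '-', row is a tuple of column values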

data_diff/__main__.py

Lines changed: 2 additions & 1 deletion

@@ -20,7 +20,7 @@
 from .databases import connect
 from .parse_time import parse_time_before, UNITS_STR, ParseError
 from .config import apply_config_from_file
-from .tracking import disable_tracking
+from .tracking import disable_tracking, set_entrypoint_name
 from .version import __version__


@@ -32,6 +32,7 @@
     "-": "red",
 }

+set_entrypoint_name("CLI")

 def _remove_passwords_in_dict(d: dict):
     for k, v in d.items():

data_diff/sqeleton/databases/bigquery.py

Lines changed: 16 additions & 3 deletions

@@ -145,17 +145,30 @@ def close(self):
         self._client.close()

     def select_table_schema(self, path: DbPath) -> str:
-        schema, name = self._normalize_table_path(path)
-
+        project, schema, name = self._normalize_table_path(path)
         return (
             "SELECT column_name, data_type, 6 as datetime_precision, 38 as numeric_precision, 9 as numeric_scale "
-            f"FROM {schema}.INFORMATION_SCHEMA.COLUMNS "
+            f"FROM `{project}`.`{schema}`.INFORMATION_SCHEMA.COLUMNS "
             f"WHERE table_name = '{name}' AND table_schema = '{schema}'"
         )

     def query_table_unique_columns(self, path: DbPath) -> List[str]:
         return []

+    def _normalize_table_path(self, path: DbPath) -> DbPath:
+        if len(path) == 0:
+            raise ValueError(f"{self.name}: Bad table path for {self}: ()")
+        elif len(path) == 1:
+            return (self.project, self.default_schema, path[0])
+        elif len(path) == 2:
+            return (self.project,) + path
+        elif len(path) == 3:
+            return path
+        else:
+            raise ValueError(
+                f"{self.name}: Bad table path for {self}: '{'.'.join(path)}'. Expected form: [project.]schema.table"
+            )
+
     def parse_table_name(self, name: str) -> DbPath:
         path = parse_table_name(name)
         return tuple(i for i in self._normalize_table_path(path) if i is not None)
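In effect, every lookup now resolves to a full (project, schema, table) triple before being interpolated into the INFORMATION_SCHEMA query above. A sketch of the mapping implemented by `_normalize_table_path` (names hypothetical):

# ("tbl",)                   -> (self.project, self.default_schema, "tbl")
# ("dataset", "tbl")         -> (self.project, "dataset", "tbl")
# ("proj", "dataset", "tbl") -> ("proj", "dataset", "tbl")   # already fully qualified
# ("a", "b", "c", "d")       -> ValueError("... Expected form: [project.]schema.table")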

data_diff/sqeleton/databases/databricks.py

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ def set_timezone_to_utc(self) -> str:

 class Databricks(ThreadedDatabase):
     dialect = Dialect()
-    CONNECT_URI_HELP = "databricks://:<access_token>@<server_name>/<http_path>"
+    CONNECT_URI_HELP = "databricks://:<access_token>@<server_hostname>/<http_path>"
     CONNECT_URI_PARAMS = ["catalog", "schema"]

     def __init__(self, *, thread_count, **kw):

data_diff/sqeleton/databases/redshift.py

Lines changed: 29 additions & 1 deletion

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Dict
 from ..abcs.database_types import Float, TemporalType, FractionalType, DbPath
 from ..abcs.mixins import AbstractMixin_MD5
 from .postgresql import (
@@ -70,3 +70,31 @@ def select_table_schema(self, path: DbPath) -> str:
             "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale FROM information_schema.columns "
             f"WHERE table_name = '{table.lower()}' AND table_schema = '{schema.lower()}'"
         )
+
+    def select_external_table_schema(self, path: DbPath) -> str:
+        schema, table = self._normalize_table_path(path)
+
+        return f"""SELECT
+                columnname AS column_name
+                , CASE WHEN external_type = 'string' THEN 'varchar' ELSE external_type END AS data_type
+                , NULL AS datetime_precision
+                , NULL AS numeric_precision
+                , NULL AS numeric_scale
+            FROM svv_external_columns
+            WHERE tablename = '{table.lower()}' AND schemaname = '{schema.lower()}'
+            """
+
+    def query_external_table_schema(self, path: DbPath) -> Dict[str, tuple]:
+        rows = self.query(self.select_external_table_schema(path), list)
+        if not rows:
+            raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
+
+        d = {r[0]: r for r in rows}
+        assert len(d) == len(rows)
+        return d
+
+    def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
+        try:
+            return super().query_table_schema(path)
+        except RuntimeError:
+            return self.query_external_table_schema(path)
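The override means a schema lookup that finds no rows in `information_schema.columns` is retried against `svv_external_columns`, the catalog view where Redshift exposes Spectrum external tables. A usage sketch (the URI, schema, and table names are hypothetical; `connect` is the package's existing URI-based connector):

from data_diff.databases import connect

db = connect("redshift://user:password@host:5439/dev")

# Regular table: resolved from information_schema.columns as before.
print(db.query_table_schema(("public", "ratings")))

# Spectrum external table: information_schema returns no rows, so the
# lookup falls back to svv_external_columns.
print(db.query_table_schema(("spectrum_schema", "events")))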

data_diff/table_segment.py

Lines changed: 1 addition & 1 deletion

@@ -50,8 +50,8 @@ class TableSegment:
     max_key: DbKey = None
     min_update: DbTime = None
     max_update: DbTime = None
-
     where: str = None
+
     case_sensitive: bool = True
     _schema: Schema = None
data_diff/tracking.py

Lines changed: 9 additions & 0 deletions

@@ -40,6 +40,8 @@ def _load_profile():
 g_tracking_enabled = True
 g_anonymous_id = None

+entrypoint_name = "Python API"
+

 def disable_tracking():
     global g_tracking_enabled
@@ -50,6 +52,11 @@ def is_tracking_enabled():
     return g_tracking_enabled


+def set_entrypoint_name(s):
+    global entrypoint_name
+    entrypoint_name = s
+
+
 def get_anonymous_id():
     global g_anonymous_id
     if g_anonymous_id is None:
@@ -70,6 +77,7 @@ def create_start_event_json(diff_options: Dict[str, Any]):
             "python_version": f"{platform.python_version()}/{platform.python_implementation()}",
             "diff_options": diff_options,
             "data_diff_version:": __version__,
+            "entrypoint_name": entrypoint_name,
         },
     }
@@ -99,6 +107,7 @@ def create_end_event_json(
             "diff_rows_cnt": diff_count,
             "error_message": error,
             "data_diff_version:": __version__,
+            "entrypoint_name": entrypoint_name,
         },
     }
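With `set_entrypoint_name`, any wrapper can label its telemetry the way `__main__.py` now labels the CLI; events are tagged "Python API" otherwise. A sketch (the wrapper name is hypothetical):

from data_diff.tracking import set_entrypoint_name

set_entrypoint_name("Airflow operator")  # hypothetical entrypoint label

# Subsequent events built by create_start_event_json / create_end_event_json
# will carry "entrypoint_name": "Airflow operator".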

tests/common.py

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@
 TEST_SNOWFLAKE_CONN_STRING: str = os.environ.get("DATADIFF_SNOWFLAKE_URI") or None
 TEST_PRESTO_CONN_STRING: str = os.environ.get("DATADIFF_PRESTO_URI") or None
 TEST_BIGQUERY_CONN_STRING: str = os.environ.get("DATADIFF_BIGQUERY_URI") or None
-TEST_REDSHIFT_CONN_STRING: str = None
+TEST_REDSHIFT_CONN_STRING: str = os.environ.get("DATADIFF_REDSHIFT_URI") or None
 TEST_ORACLE_CONN_STRING: str = None
 TEST_DATABRICKS_CONN_STRING: str = os.environ.get("DATADIFF_DATABRICKS_URI")
 TEST_TRINO_CONN_STRING: str = os.environ.get("DATADIFF_TRINO_URI") or None
