Skip to content

Commit

Permalink
feat: load examples from config instead of code (apache#12026)
Browse files Browse the repository at this point in the history
* feat: load examples from config instead of code

* Remove database

* Update data URL
  • Loading branch information
betodealmeida authored Dec 15, 2020
1 parent e0079bb commit 5e811a1
Show file tree
Hide file tree
Showing 16 changed files with 444 additions and 207 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ combine_as_imports = true
include_trailing_comma = true
line_length = 88
known_first_party = superset
known_third_party =alembic,apispec,backoff,bleach,cachelib,celery,click,colorama,contextlib2,cron_descriptor,croniter,cryptography,dateutil,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,freezegun,geohash,geopy,humanize,isodate,jinja2,jwt,markdown,markupsafe,marshmallow,msgpack,numpy,pandas,parameterized,parsedatetime,pathlib2,pgsanity,polyline,prison,pyarrow,pyhive,pytest,pytz,redis,retry,selenium,setuptools,simplejson,slack,sqlalchemy,sqlalchemy_utils,sqlparse,werkzeug,wtforms,wtforms_json,yaml
known_third_party =alembic,apispec,backoff,bleach,cachelib,celery,click,colorama,contextlib2,cron_descriptor,croniter,cryptography,dateutil,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,freezegun,geohash,geopy,humanize,isodate,jinja2,jwt,markdown,markupsafe,marshmallow,msgpack,numpy,pandas,parameterized,parsedatetime,pathlib2,pgsanity,pkg_resources,polyline,prison,pyarrow,pyhive,pytest,pytz,redis,retry,selenium,setuptools,simplejson,slack,sqlalchemy,sqlalchemy_utils,sqlparse,werkzeug,wtforms,wtforms_json,yaml
multi_line_output = 3
order_by_type = false

Expand Down
6 changes: 3 additions & 3 deletions superset/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,6 @@ def load_examples_run(
print("Loading [Birth names]")
examples.load_birth_names(only_metadata, force)

print("Loading [Unicode test data]")
examples.load_unicode_test_data(only_metadata, force)

if not load_test_data:
print("Loading [Random time series data]")
examples.load_random_time_series_data(only_metadata, force)
Expand Down Expand Up @@ -164,6 +161,9 @@ def load_examples_run(
print("Loading [Tabbed dashboard]")
examples.load_tabbed_dashboard(only_metadata)

# load examples that are stored as YAML config files
examples.load_from_configs()


@with_appcontext
@superset.command()
Expand Down
19 changes: 1 addition & 18 deletions superset/commands/importers/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,6 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, Dict, List, Optional, Set

from marshmallow import Schema, validate
Expand Down Expand Up @@ -106,7 +89,7 @@ def validate(self) -> None:
metadata = None

# validate that the type declared in METADATA_FILE_NAME is correct
if metadata:
if metadata and "type" in metadata:
type_validator = validate.Equal(self.dao.model_cls.__name__) # type: ignore
try:
type_validator(metadata["type"])
Expand Down
119 changes: 119 additions & 0 deletions superset/commands/importers/v1/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, Dict, List, Tuple

from marshmallow import Schema
from sqlalchemy.orm import Session
from sqlalchemy.sql import select

from superset import db
from superset.charts.commands.importers.v1.utils import import_chart
from superset.charts.schemas import ImportV1ChartSchema
from superset.commands.exceptions import CommandException
from superset.commands.importers.v1 import ImportModelsCommand
from superset.dao.base import BaseDAO
from superset.dashboards.commands.importers.v1.utils import (
find_chart_uuids,
import_dashboard,
update_id_refs,
)
from superset.dashboards.schemas import ImportV1DashboardSchema
from superset.databases.commands.importers.v1.utils import import_database
from superset.databases.schemas import ImportV1DatabaseSchema
from superset.datasets.commands.importers.v1.utils import import_dataset
from superset.datasets.schemas import ImportV1DatasetSchema
from superset.models.core import Database
from superset.models.dashboard import dashboard_slices


class ImportExamplesCommand(ImportModelsCommand):

    """Import examples"""

    dao = BaseDAO
    model_name = "model"
    schemas: Dict[str, Schema] = {
        "charts/": ImportV1ChartSchema(),
        "dashboards/": ImportV1DashboardSchema(),
        "datasets/": ImportV1DatasetSchema(),
        "databases/": ImportV1DatabaseSchema(),
    }
    import_error = CommandException

    # pylint: disable=too-many-locals
    @staticmethod
    def _import(
        session: Session, configs: Dict[str, Any], overwrite: bool = False
    ) -> None:
        """Import example objects from parsed YAML configs.

        Objects are imported in dependency order — databases, then datasets,
        then charts, then dashboards — so that each pass can resolve the IDs
        created by the previous one.
        """
        # import databases first; everything else depends on them
        for file_name, config in configs.items():
            if file_name.startswith("databases/"):
                import_database(session, config, overwrite=overwrite)

        # import datasets
        # TODO (betodealmeida): once we have all examples being imported we can
        # have a stable UUID for the database stored in the dataset YAML; for
        # now we need to fetch the current ID.
        examples_id = (
            db.session.query(Database).filter_by(database_name="examples").one().id
        )
        dataset_info: Dict[str, Dict[str, Any]] = {}
        for file_name, config in configs.items():
            if file_name.startswith("datasets/"):
                config["database_id"] = examples_id
                dataset = import_dataset(session, config, overwrite=overwrite)
                dataset_info[str(dataset.uuid)] = {
                    "datasource_id": dataset.id,
                    "datasource_type": "view" if dataset.is_sqllab_view else "table",
                    "datasource_name": dataset.table_name,
                }

        # import charts
        chart_ids: Dict[str, int] = {}
        for file_name, config in configs.items():
            if file_name.startswith("charts/"):
                # update datasource id, type, and name from the dataset imported above
                config.update(dataset_info[config["dataset_uuid"]])
                chart = import_chart(session, config, overwrite=overwrite)
                chart_ids[str(chart.uuid)] = chart.id

        # store the existing relationship between dashboards and charts so we
        # don't insert duplicate rows into dashboard_slices below
        existing_relationships = session.execute(
            select([dashboard_slices.c.dashboard_id, dashboard_slices.c.slice_id])
        ).fetchall()

        # import dashboards
        dashboard_chart_ids: List[Tuple[int, int]] = []
        for file_name, config in configs.items():
            if file_name.startswith("dashboards/"):
                config = update_id_refs(config, chart_ids)
                dashboard = import_dashboard(session, config, overwrite=overwrite)
                for uuid in find_chart_uuids(config["position"]):
                    chart_id = chart_ids[uuid]
                    if (dashboard.id, chart_id) not in existing_relationships:
                        dashboard_chart_ids.append((dashboard.id, chart_id))

        # set ref in the dashboard_slices table
        values = [
            {"dashboard_id": dashboard_id, "slice_id": chart_id}
            for (dashboard_id, chart_id) in dashboard_chart_ids
        ]
        # guard against an empty executemany: SQLAlchemy treats an empty
        # parameter list as a single parameterless execution, which would
        # attempt a bare INSERT into dashboard_slices
        if values:
            # pylint: disable=no-value-for-parameter (sqlalchemy/issues/4656)
            session.execute(dashboard_slices.insert(), values)
2 changes: 1 addition & 1 deletion superset/commands/importers/v1/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def remove_root(file_path: str) -> str:

class MetadataSchema(Schema):
    """Schema for the metadata file bundled with a v1 import archive."""

    # bundle format version; must equal IMPORT_VERSION exactly
    version = fields.String(required=True, validate=validate.Equal(IMPORT_VERSION))
    # object type — optional, since some bundles (e.g. examples) mix
    # several object types and cannot declare a single one
    type = fields.String(required=False)
    timestamp = fields.DateTime()


Expand Down
14 changes: 2 additions & 12 deletions superset/dashboards/commands/importers/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from typing import Any, Dict, Iterator, List, Set, Tuple
from typing import Any, Dict, List, Set, Tuple

from marshmallow import Schema
from sqlalchemy.orm import Session
Expand All @@ -26,6 +26,7 @@
from superset.commands.importers.v1 import ImportModelsCommand
from superset.dashboards.commands.exceptions import DashboardImportError
from superset.dashboards.commands.importers.v1.utils import (
find_chart_uuids,
import_dashboard,
update_id_refs,
)
Expand All @@ -38,17 +39,6 @@
from superset.models.dashboard import dashboard_slices


def find_chart_uuids(position: Dict[str, Any]) -> Iterator[str]:
"""Find all chart UUIDs in a dashboard"""
for child in position.values():
if (
isinstance(child, dict)
and child["type"] == "CHART"
and "uuid" in child["meta"]
):
yield child["meta"]["uuid"]


class ImportDashboardsCommand(ImportModelsCommand):

"""Import dashboards"""
Expand Down
9 changes: 5 additions & 4 deletions superset/dashboards/commands/importers/v1/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import json
import logging
from typing import Any, Dict
from typing import Any, Dict, Set

from sqlalchemy.orm import Session

Expand All @@ -29,6 +29,10 @@
JSON_KEYS = {"position": "position_json", "metadata": "json_metadata"}


def find_chart_uuids(position: Dict[str, Any]) -> Set[str]:
    """Return the UUIDs of all charts referenced in a dashboard position map."""
    uuid_to_id = build_uuid_to_id_map(position)
    return set(uuid_to_id.keys())


def build_uuid_to_id_map(position: Dict[str, Any]) -> Dict[str, int]:
return {
child["meta"]["uuid"]: child["meta"]["chartId"]
Expand All @@ -43,9 +47,6 @@ def build_uuid_to_id_map(position: Dict[str, Any]) -> Dict[str, int]:

def update_id_refs(config: Dict[str, Any], chart_ids: Dict[str, int]) -> Dict[str, Any]:
"""Update dashboard metadata to use new IDs"""
if not config.get("metadata"):
return config

fixed = config.copy()

# build map old_id => new_id
Expand Down
56 changes: 56 additions & 0 deletions superset/datasets/commands/importers/v1/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,48 @@

import json
import logging
import re
from typing import Any, Dict
from urllib import request

import pandas as pd
from sqlalchemy import Date, Float, String
from sqlalchemy.orm import Session
from sqlalchemy.sql.visitors import VisitableType

from superset.connectors.sqla.models import SqlaTable

logger = logging.getLogger(__name__)

CHUNKSIZE = 512
VARCHAR = re.compile(r"VARCHAR\((\d+)\)", re.IGNORECASE)

JSON_KEYS = {"params", "template_params", "extra"}


def get_sqla_type(native_type: str) -> VisitableType:
    """Convert a native column type name into a SQLAlchemy type instance.

    Supports ``DATE``, ``FLOAT`` (case-insensitive) and ``VARCHAR(n)``.

    :param native_type: the type name as stored in the dataset YAML
    :raises ValueError: if the type is not one of the supported names
    """
    native_type_upper = native_type.upper()

    if native_type_upper == "DATE":
        return Date()

    if native_type_upper == "FLOAT":
        return Float()

    match = VARCHAR.match(native_type)
    if match:
        size = int(match.group(1))
        return String(size)

    # raise a specific exception type instead of bare Exception; still
    # caught by any existing ``except Exception`` handlers
    raise ValueError(f"Unknown type: {native_type}")


def get_dtype(df: pd.DataFrame, dataset: SqlaTable) -> Dict[str, VisitableType]:
    """Build the ``dtype`` argument for ``DataFrame.to_sql``: a map from
    column name to SQLAlchemy type, restricted to dataset columns that are
    actually present in the dataframe.
    """
    frame_columns = set(df.keys())
    dtype: Dict[str, VisitableType] = {}
    for col in dataset.columns:
        if col.column_name in frame_columns:
            dtype[col.column_name] = get_sqla_type(col.type)
    return dtype


def import_dataset(
session: Session, config: Dict[str, Any], overwrite: bool = False
) -> SqlaTable:
Expand Down Expand Up @@ -55,9 +86,34 @@ def import_dataset(
# should we delete columns and metrics not present in the current import?
sync = ["columns", "metrics"] if overwrite else []

# should we also load data into the dataset?
data_uri = config.get("data")

# import recursively to include columns and metrics
dataset = SqlaTable.import_from_dict(session, config, recursive=True, sync=sync)
if dataset.id is None:
session.flush()

# load data
if data_uri:
data = request.urlopen(data_uri)
df = pd.read_csv(data, encoding="utf-8")
dtype = get_dtype(df, dataset)

# convert temporal columns
for column_name, sqla_type in dtype.items():
if isinstance(sqla_type, Date):
df[column_name] = pd.to_datetime(df[column_name])

df.to_sql(
dataset.table_name,
con=session.connection(),
schema=dataset.schema,
if_exists="replace",
chunksize=CHUNKSIZE,
dtype=dtype,
index=False,
method="multi",
)

return dataset
1 change: 1 addition & 0 deletions superset/datasets/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,4 @@ class ImportV1DatasetSchema(Schema):
metrics = fields.List(fields.Nested(ImportV1MetricSchema))
version = fields.String(required=True)
database_uuid = fields.UUID(required=True)
data = fields.URL()
2 changes: 1 addition & 1 deletion superset/examples/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
from .random_time_series import load_random_time_series_data
from .sf_population_polygons import load_sf_population_polygons
from .tabbed_dashboard import load_tabbed_dashboard
from .unicode_test_data import load_unicode_test_data
from .utils import load_from_configs
from .world_bank import load_world_bank_health_n_pop
40 changes: 40 additions & 0 deletions superset/examples/configs/charts/Unicode_Cloud.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
slice_name: Unicode Cloud
viz_type: word_cloud
params:
granularity_sqla: dttm
groupby: []
limit: '100'
metric:
aggregate: SUM
column:
column_name: value
expressionType: SIMPLE
label: Value
rotation: square
row_limit: 50000
series: short_phrase
since: 100 years ago
size_from: '10'
size_to: '70'
until: now
viz_type: word_cloud
cache_timeout: null
uuid: 609e26d8-8e1e-4097-9751-931708e24ee4
version: 1.0.0
dataset_uuid: a6771c73-96fc-44c6-8b6e-9d303955ea48
Loading

0 comments on commit 5e811a1

Please sign in to comment.