[KED-2891] Implement spark.DeltaTable dataset (#964)

Signed-off-by: Yetunde Dada <yetudada@users.noreply.github.com>
Showing 16 changed files with 423 additions and 89 deletions.
@@ -0,0 +1,110 @@
"""``AbstractVersionedDataSet`` implementation to access DeltaTables using | ||
``delta-spark`` | ||
""" | ||
from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any, Dict

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

from kedro.extras.datasets.spark.spark_dataset import (
    _split_filepath,
    _strip_dbfs_prefix,
)
from kedro.io.core import AbstractDataSet, DataSetError


class DeltaTableDataSet(AbstractDataSet):
    """``DeltaTableDataSet`` loads data into DeltaTable objects.

    Example adding a catalog entry with
    `YAML API <https://kedro.readthedocs.io/en/stable/05_data/\
    01_data_catalog.html#using-the-data-catalog-with-the-yaml-api>`_:

    .. code-block:: yaml

        >>> weather@spark:
        >>>   type: spark.SparkDataSet
        >>>   filepath: data/02_intermediate/data.parquet
        >>>   file_format: "delta"
        >>>
        >>> weather@delta:
        >>>   type: spark.DeltaTableDataSet
        >>>   filepath: data/02_intermediate/data.parquet

    Example using Python API:
    ::

        >>> from pyspark.sql import SparkSession
        >>> from pyspark.sql.types import (StructField, StringType,
        >>>                                IntegerType, StructType)
        >>>
        >>> from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet
        >>>
        >>> schema = StructType([StructField("name", StringType(), True),
        >>>                      StructField("age", IntegerType(), True)])
        >>>
        >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)]
        >>>
        >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
        >>>
        >>> data_set = SparkDataSet(filepath="test_data", file_format="delta")
        >>> data_set.save(spark_df)
        >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data")
        >>> delta_table = deltatable_dataset.load()
        >>>
        >>> delta_table.update()
    """

    # This dataset cannot be used with ``ParallelRunner``,
    # therefore it has the attribute ``_SINGLE_PROCESS = True``.
    # For parallelism within a Spark pipeline, please consider
    # using ``ThreadRunner`` instead.
    _SINGLE_PROCESS = True

    def __init__(self, filepath: str, credentials: Dict[str, Any] = None) -> None:
        """Creates a new instance of ``DeltaTableDataSet``.

        Args:
            filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks
                and working with data written to mount points, specify ``filepath``s
                for (versioned) ``SparkDataSet``s starting with ``/dbfs/mnt``.
            credentials: Credentials to access the S3 bucket, such as
                ``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.
                Optional keyword arguments passed to ``hdfs.client.InsecureClient``
                if ``filepath`` prefix is ``hdfs://``. Ignored otherwise.
        """
        # ``credentials`` are currently unused beyond this assignment.
        credentials = deepcopy(credentials) or {}
        fs_prefix, filepath = _split_filepath(filepath)

        self._fs_prefix = fs_prefix
        self._filepath = PurePosixPath(filepath)

    @staticmethod
    def _get_spark():
        return SparkSession.builder.getOrCreate()

    def _load(self) -> DeltaTable:
        load_path = self._fs_prefix + str(self._filepath)
        return DeltaTable.forPath(self._get_spark(), load_path)

    def _save(self, data: Any) -> None:
        raise DataSetError(f"{self.__class__.__name__} is a read-only dataset type")

    def _exists(self) -> bool:
        load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath))

        try:
            # Attempt a Delta-format read as the existence check; a path
            # that is not a Delta table raises ``AnalysisException``.
            self._get_spark().read.load(path=load_path, format="delta")
        except AnalysisException as exception:
            if "is not a Delta table" in exception.desc:
                return False
            raise

        return True

    def _describe(self) -> Dict[str, Any]:
        return dict(filepath=str(self._filepath), fs_prefix=self._fs_prefix)
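
A minimal usage sketch of the new dataset, assuming the "test_data" Delta table from the docstring example has already been saved; ``DeltaTable.update`` is part of the public delta-spark API, not something added by this commit:

    from pyspark.sql.functions import col

    from kedro.extras.datasets.spark import DeltaTableDataSet

    # The dataset is read-only from Kedro's side; mutations go through
    # the DeltaTable object returned by ``load()``.
    delta_table = DeltaTableDataSet(filepath="test_data").load()

    # Transactionally update matching rows in place via the Delta log.
    delta_table.update(condition=col("name") == "Alex", set={"age": col("age") + 1})

Because of the class-level ``_SINGLE_PROCESS = True`` flag, pipelines touching this dataset cannot use ``ParallelRunner``; ``ThreadRunner`` is the suggested alternative, as the inline comment above notes.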