diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ce00bab --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/ambv/black + rev: stable + hooks: + - id: black + args: ['--line-length', '120'] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v0.780' # Use the sha / tag you want to point at + hooks: + - id: mypy + args: ['--config-file', 'mypy.ini'] +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.8.2 + hooks: + - id: flake8 + args: ['--config', 'flake8.cfg'] + additional_dependencies: + - isort<5 # https://github.com/gforcada/flake8-isort/issues/88 \ No newline at end of file diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..e2f6153 --- /dev/null +++ b/Pipfile @@ -0,0 +1,15 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +black = "==19.10b0" +flake8 = "*" +mypy = "*" +pre-commit = "*" + +[packages] + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..b2b37b6 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,276 @@ +{ + "_meta": { + "hash": { + "sha256": "5266a49950c62dadb10a8abefd02e0e37a60a7f2c3e9c1aa3129fe3f026c61bd" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": {}, + "develop": { + "appdirs": { + "hashes": [ + "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", + "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128" + ], + "version": "==1.4.4" + }, + "attrs": { + "hashes": [ + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==19.3.0" + }, + "black": { + "hashes": [ + "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b", + "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539" + ], + "index": "pypi", + "version": "==19.10b0" + }, + "cfgv": { + "hashes": [ + "sha256:1ccf53320421aeeb915275a196e23b3b8ae87dea8ac6698b1638001d4a486d53", + "sha256:c8e8f552ffcc6194f4e18dd4f68d9aef0c0d58ae7e7be8c82bee3c5e9edfa513" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==3.1.0" + }, + "click": { + "hashes": [ + "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a", + "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==7.1.2" + }, + "distlib": { + "hashes": [ + "sha256:8c09de2c67b3e7deef7184574fc060ab8a793e7adbb183d942c389c8b13c52fb", + "sha256:edf6116872c863e1aa9d5bb7cb5e05a022c519a4594dc703843343a9ddd9bff1" + ], + "version": "==0.3.1" + }, + "filelock": { + "hashes": [ + "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59", + "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836" + ], + "version": "==3.0.12" + }, + "flake8": { + "hashes": [ + "sha256:15e351d19611c887e482fb960eae4d44845013cc142d42896e9862f775d8cf5c", + "sha256:f04b9fcbac03b0a3e58c0ab3a0ecc462e023a9faf046d57794184028123aa208" + ], + "index": "pypi", + "version": "==3.8.3" + }, + "identify": { + "hashes": [ + "sha256:c4d07f2b979e3931894170a9e0d4b8281e6905ea6d018c326f7ffefaf20db680", + "sha256:dac33eff90d57164e289fb20bf4e131baef080947ee9bf45efcd0da8d19064bf" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.4.21" + }, + "importlib-metadata": { + "hashes": [ + "sha256:90bb658cdbbf6d1735b6341ce708fc7024a3e14e99ffdc5783edea9f9b077f83", + "sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070" + ], + "markers": "python_version < '3.8'", + "version": "==1.7.0" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "mypy": { + "hashes": [ + "sha256:2c6cde8aa3426c1682d35190b59b71f661237d74b053822ea3d748e2c9578a7c", + "sha256:3fdda71c067d3ddfb21da4b80e2686b71e9e5c72cca65fa216d207a358827f86", + "sha256:5dd13ff1f2a97f94540fd37a49e5d255950ebcdf446fb597463a40d0df3fac8b", + "sha256:6731603dfe0ce4352c555c6284c6db0dc935b685e9ce2e4cf220abe1e14386fd", + "sha256:6bb93479caa6619d21d6e7160c552c1193f6952f0668cdda2f851156e85186fc", + "sha256:81c7908b94239c4010e16642c9102bfc958ab14e36048fa77d0be3289dda76ea", + "sha256:9c7a9a7ceb2871ba4bac1cf7217a7dd9ccd44c27c2950edbc6dc08530f32ad4e", + "sha256:a4a2cbcfc4cbf45cd126f531dedda8485671545b43107ded25ce952aac6fb308", + "sha256:b7fbfabdbcc78c4f6fc4712544b9b0d6bf171069c6e0e3cb82440dd10ced3406", + "sha256:c05b9e4fb1d8a41d41dec8786c94f3b95d3c5f528298d769eb8e73d293abc48d", + "sha256:d7df6eddb6054d21ca4d3c6249cae5578cb4602951fd2b6ee2f5510ffb098707", + "sha256:e0b61738ab504e656d1fe4ff0c0601387a5489ca122d55390ade31f9ca0e252d", + "sha256:eff7d4a85e9eea55afa34888dfeaccde99e7520b51f867ac28a48492c0b1130c", + "sha256:f05644db6779387ccdb468cc47a44b4356fc2ffa9287135d05b70a98dc83b89a" + ], + "index": "pypi", + "version": "==0.782" + }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, + "nodeenv": { + "hashes": [ + "sha256:4b0b77afa3ba9b54f4b6396e60b0c83f59eaeb2d63dc3cc7a70f7f4af96c82bc" + ], + "version": "==1.4.0" + }, + "pathspec": { + "hashes": [ + "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0", + "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061" + ], + "version": "==0.8.0" + }, + "pre-commit": { + "hashes": [ + "sha256:1657663fdd63a321a4a739915d7d03baedd555b25054449090f97bb0cb30a915", + "sha256:e8b1315c585052e729ab7e99dcca5698266bedce9067d21dc909c23e3ceed626" + ], + "index": "pypi", + "version": "==2.6.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367", + "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.6.0" + }, + "pyflakes": { + "hashes": [ + "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92", + "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.2.0" + }, + "pyyaml": { + "hashes": [ + "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97", + "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76", + "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2", + "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648", + "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf", + "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f", + "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2", + "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee", + "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d", + "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c", + "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a" + ], + "version": "==5.3.1" + }, + "regex": { + "hashes": [ + "sha256:08997a37b221a3e27d68ffb601e45abfb0093d39ee770e4257bd2f5115e8cb0a", + "sha256:112e34adf95e45158c597feea65d06a8124898bdeac975c9087fe71b572bd938", + "sha256:1700419d8a18c26ff396b3b06ace315b5f2a6e780dad387e4c48717a12a22c29", + "sha256:2f6f211633ee8d3f7706953e9d3edc7ce63a1d6aad0be5dcee1ece127eea13ae", + "sha256:52e1b4bef02f4040b2fd547357a170fc1146e60ab310cdbdd098db86e929b387", + "sha256:55b4c25cbb3b29f8d5e63aeed27b49fa0f8476b0d4e1b3171d85db891938cc3a", + "sha256:5aaa5928b039ae440d775acea11d01e42ff26e1561c0ffcd3d805750973c6baf", + "sha256:654cb773b2792e50151f0e22be0f2b6e1c3a04c5328ff1d9d59c0398d37ef610", + "sha256:690f858d9a94d903cf5cada62ce069b5d93b313d7d05456dbcd99420856562d9", + "sha256:6ad8663c17db4c5ef438141f99e291c4d4edfeaacc0ce28b5bba2b0bf273d9b5", + "sha256:89cda1a5d3e33ec9e231ece7307afc101b5217523d55ef4dc7fb2abd6de71ba3", + "sha256:92d8a043a4241a710c1cf7593f5577fbb832cf6c3a00ff3fc1ff2052aff5dd89", + "sha256:95fa7726d073c87141f7bbfb04c284901f8328e2d430eeb71b8ffdd5742a5ded", + "sha256:97712e0d0af05febd8ab63d2ef0ab2d0cd9deddf4476f7aa153f76feef4b2754", + "sha256:b2ba0f78b3ef375114856cbdaa30559914d081c416b431f2437f83ce4f8b7f2f", + "sha256:bae83f2a56ab30d5353b47f9b2a33e4aac4de9401fb582b55c42b132a8ac3868", + "sha256:c78e66a922de1c95a208e4ec02e2e5cf0bb83a36ceececc10a72841e53fbf2bd", + "sha256:cf59bbf282b627130f5ba68b7fa3abdb96372b24b66bdf72a4920e8153fc7910", + "sha256:e3cdc9423808f7e1bb9c2e0bdb1c9dc37b0607b30d646ff6faf0d4e41ee8fee3", + "sha256:e9b64e609d37438f7d6e68c2546d2cb8062f3adb27e6336bc129b51be20773ac", + "sha256:fbff901c54c22425a5b809b914a3bfaf4b9570eee0e5ce8186ac71eb2025191c" + ], + "version": "==2020.6.8" + }, + "six": { + "hashes": [ + "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", + "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.15.0" + }, + "toml": { + "hashes": [ + "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f", + "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88" + ], + "version": "==0.10.1" + }, + "typed-ast": { + "hashes": [ + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" + ], + "version": "==1.4.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5", + "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae", + "sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392" + ], + "version": "==3.7.4.2" + }, + "virtualenv": { + "hashes": [ + "sha256:c11a475400e98450403c0364eb3a2d25d42f71cf1493da64390487b666de4324", + "sha256:e10cc66f40cbda459720dfe1d334c4dc15add0d80f09108224f171006a97a172" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==20.0.26" + }, + "zipp": { + "hashes": [ + "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b", + "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96" + ], + "markers": "python_version >= '3.6'", + "version": "==3.1.0" + } + } +} diff --git a/df_and_order/common.py b/df_and_order/common.py new file mode 100644 index 0000000..60e8a47 --- /dev/null +++ b/df_and_order/common.py @@ -0,0 +1,3 @@ +from typing import Any + +DF_TYPE = Any diff --git a/df_and_order/df_cache.py b/df_and_order/df_cache.py index 41459b8..95b3a4f 100644 --- a/df_and_order/df_cache.py +++ b/df_and_order/df_cache.py @@ -1,7 +1,8 @@ -import pandas as pd from abc import ABC, abstractmethod from typing import Optional +from df_and_order.common import DF_TYPE + class DfCache(ABC): """ @@ -18,26 +19,29 @@ class DfCache(ABC): load_kwargs: dict Custom kwargs for load method you may want to pass to your implementation. """ - def __init__(self, - save_args: Optional[list] = None, - save_kwargs: Optional[dict] = None, - load_args: Optional[list] = None, - load_kwargs: Optional[dict] = None): + + def __init__( + self, + save_args: Optional[list] = None, + save_kwargs: Optional[dict] = None, + load_args: Optional[list] = None, + load_kwargs: Optional[dict] = None, + ): self._save_args = save_args or [] self._save_kwargs = save_kwargs or {} self._load_args = load_args or [] self._load_kwargs = load_kwargs or {} - def save(self, df: pd.DataFrame, path: str): - self._save(df=df, path=path, *self._save_args, **self._save_kwargs) + def save(self, df: DF_TYPE, path: str): + self._save(df=df, path=path, *self._save_args, **self._save_kwargs) # type: ignore - def load(self, path: str) -> pd.DataFrame: - return self._load(path=path, *self._load_args, **self._load_kwargs) + def load(self, path: str) -> DF_TYPE: + return self._load(path=path, *self._load_args, **self._load_kwargs) # type: ignore @abstractmethod - def _save(self, df: pd.DataFrame, path: str, *args, **kwargs): + def _save(self, df: DF_TYPE, path: str, *args, **kwargs): pass @abstractmethod - def _load(self, path: str, *args, **kwargs) -> pd.DataFrame: + def _load(self, path: str, *args, **kwargs) -> DF_TYPE: pass diff --git a/df_and_order/df_config.py b/df_and_order/df_config.py index 12a5e69..4b6fe05 100644 --- a/df_and_order/df_config.py +++ b/df_and_order/df_config.py @@ -1,15 +1,15 @@ -from typing import Optional +from typing import Optional, Dict, Any import os import yaml from df_and_order.df_transform import DfTransformConfig -CONFIG_FILENAME = 'df_config.yaml' +CONFIG_FILENAME = "df_config.yaml" -DF_ID_KEY = 'df_id' -DF_INITIAL_FORMAT_KEY = 'initial_df_format' -METADATA_KEY = 'metadata' -TRANSFORMS_KEY = 'transforms' +DF_ID_KEY = "df_id" +DF_INITIAL_FORMAT_KEY = "initial_df_format" +METADATA_KEY = "metadata" +TRANSFORMS_KEY = "transforms" class DfConfig: @@ -23,12 +23,14 @@ class DfConfig: dir_path: str Location of a folder where config file should be read from. """ + def __init__(self, df_id: str, dir_path: str): self._dir_path = dir_path config_dict = self._read_config(path=self._config_path()) df_check = config_dict[DF_ID_KEY] == df_id - assert df_check, "config_dict doesn't belong to requested dataframe" + if not df_check: + raise Exception("Critical: config_dict doesn't belong to requested dataframe") self._config_dict = config_dict @@ -71,19 +73,20 @@ def _config_path_for(dir_path: str) -> str: """ Gets a path to the config using the given dir_path """ - return DfConfig._path_for_file(dir_path=dir_path, - filename=CONFIG_FILENAME) + return DfConfig._path_for_file(dir_path=dir_path, filename=CONFIG_FILENAME) @staticmethod def _path_for_file(dir_path: str, filename: str) -> str: return os.path.join(dir_path, filename) @staticmethod - def create_config(dir_path: str, - df_id: str, - initial_df_format: str, - metadata: Optional[dict] = None, - transform: Optional[DfTransformConfig] = None): + def create_config( + dir_path: str, + df_id: str, + initial_df_format: str, + metadata: Optional[dict] = None, + transform: Optional[DfTransformConfig] = None, + ): """ Creates config file at the given location. @@ -108,7 +111,7 @@ def create_config(dir_path: str, raise Exception(f"Config for df {df_id} already exists") config_path = DfConfig._config_path_for(dir_path=dir_path) - config_dict = { + config_dict: Dict[str, Any] = { DF_ID_KEY: df_id, DF_INITIAL_FORMAT_KEY: initial_df_format, } @@ -118,12 +121,9 @@ def create_config(dir_path: str, if transform: transform_id, transform_dict = transform.to_dict() - config_dict[TRANSFORMS_KEY] = { - transform_id: transform_dict - } + config_dict[TRANSFORMS_KEY] = {transform_id: transform_dict} - DfConfig._save_at_path(config_dict=config_dict, - config_path=config_path) + DfConfig._save_at_path(config_dict=config_dict, config_path=config_path) @property def df_id(self) -> str: @@ -147,17 +147,16 @@ def transforms_by(self, transform_id: str) -> DfTransformConfig: DfTransformConfig that describes all the transformation steps. """ maybe_transforms_dict = self._config_dict.get(TRANSFORMS_KEY) - assert maybe_transforms_dict, "Config contains no transforms" + if not maybe_transforms_dict: + raise Exception("Config contains no transforms") maybe_transform_dict = maybe_transforms_dict.get(transform_id) - assert maybe_transform_dict, f"Requested transform {transform_id} was not found" - result = DfTransformConfig.from_dict(transform_id=transform_id, - transform_dict=maybe_transform_dict) + if not maybe_transform_dict: + raise Exception(f"Requested transform {transform_id} was not found") + result = DfTransformConfig.from_dict(transform_id=transform_id, transform_dict=maybe_transform_dict) return result - def register_transform(self, - transform: DfTransformConfig, - filename: str): + def register_transform(self, transform: DfTransformConfig, filename: str): """ Adds a new transform to the config file if possible. @@ -176,11 +175,11 @@ def register_transform(self, maybe_transforms = self._config_dict.get(TRANSFORMS_KEY, {}) if maybe_transforms and maybe_transforms.get(transform_id): file_format = transform.df_format - full_filename = f'{filename}.{file_format}' - path_to_cached_file = self._path_for_file(dir_path=self._dir_path, - filename=full_filename) + full_filename = f"{filename}.{file_format}" + path_to_cached_file = self._path_for_file(dir_path=self._dir_path, filename=full_filename) already_cached = DfConfig._is_file_exists(path=path_to_cached_file) - assert not already_cached, f"Result of the transform {transform_id} already exists." + if already_cached: + raise Exception(f"Result of the transform {transform_id} already exists.") maybe_transforms[transform_id] = transform_dict self._config_dict[TRANSFORMS_KEY] = maybe_transforms @@ -190,10 +189,9 @@ def _save(self): """ Serializes config to the disk """ - self._save_at_path(config_dict=self._config_dict, - config_path=self._config_path()) + self._save_at_path(config_dict=self._config_dict, config_path=self._config_path()) @staticmethod def _save_at_path(config_dict: dict, config_path: str): - with open(config_path, 'w') as config_file: + with open(config_path, "w") as config_file: yaml.dump(config_dict, config_file) diff --git a/df_and_order/df_reader.py b/df_and_order/df_reader.py index 6bd9053..9a28921 100644 --- a/df_and_order/df_reader.py +++ b/df_and_order/df_reader.py @@ -1,8 +1,8 @@ import os import yaml -import pandas as pd from typing import Optional, Dict, List, Any +from df_and_order.common import DF_TYPE from df_and_order.df_transform import DfTransformConfig from df_and_order.df_config import DfConfig from df_and_order.df_cache import DfCache @@ -26,15 +26,12 @@ class DfReader: how to read&save them. Provide a map where a format extension is used as a key and DfCache instance as a value. See DfCache class documentation for the details. """ - def __init__(self, - dir_path: str, - format_to_cache_map: Dict[str, DfCache]): + + def __init__(self, dir_path: str, format_to_cache_map: Dict[str, DfCache]): self._dir_path = dir_path self._format_to_cache_map = format_to_cache_map - def df_exists(self, - df_id: str, - transform_id: Optional[str] = None) -> bool: + def df_exists(self, df_id: str, transform_id: Optional[str] = None) -> bool: """ Checks whether a dataframe file exists at the provided path. @@ -56,18 +53,12 @@ def df_exists(self, return False df_config = self._get_config(df_id=df_id) - df_path = self._df_path(df_config=df_config, - df_id=df_id, - transform_id=transform_id) + df_path = self._df_path(df_config=df_config, df_id=df_id, transform_id=transform_id) return DfReader._is_file_exists(path=df_path) - def _df_last_modified_ts(self, - df_id: str, - transform_id: Optional[str] = None) -> float: + def _df_last_modified_ts(self, df_id: str, transform_id: Optional[str] = None) -> float: df_config = self._get_config(df_id=df_id) - df_path = self._df_path(df_config=df_config, - df_id=df_id, - transform_id=transform_id) + df_path = self._df_path(df_config=df_config, df_id=df_id, transform_id=transform_id) result = FileInspector.last_modified_date(file_path=df_path) return result @@ -75,39 +66,39 @@ def _df_last_modified_ts(self, def _is_file_exists(path: str): return os.path.exists(path=path) - def create_df_config(self, - df_id: str, - initial_df_format: str, - metadata: Optional[dict] = None, - transform: Optional[DfTransformConfig] = None): + def create_df_config( + self, + df_id: str, + initial_df_format: str, + metadata: Optional[dict] = None, + transform: Optional[DfTransformConfig] = None, + ): """ Just a forwarding to DfConfig method, see docs in DfConfig. """ - DfConfig.create_config(dir_path=self._df_dir_path(df_id=df_id), - df_id=df_id, - initial_df_format=initial_df_format, - metadata=metadata, - transform=transform) - - def register_transform(self, - df_id: str, - df_config: DfConfig, - transform: DfTransformConfig): + DfConfig.create_config( + dir_path=self._df_dir_path(df_id=df_id), + df_id=df_id, + initial_df_format=initial_df_format, + metadata=metadata, + transform=transform, + ) + + def register_transform(self, df_id: str, df_config: DfConfig, transform: DfTransformConfig): """ Forms a filename for the given dataframe and adds a new transform to the config file if possible. In general it's just a forwarding to DfConfig method, see docs in DfConfig. """ - filename = self.df_filename(df_config=df_config, - df_id=df_id, - transform=transform) - df_config.register_transform(transform=transform, - filename=filename) - - def read(self, - df_id: str, - transform_id: Optional[str] = None, - transform: Optional[DfTransformConfig] = None, - forced: bool = False) -> pd.DataFrame: + filename = self.df_filename(df_config=df_config, df_id=df_id, transform=transform) + df_config.register_transform(transform=transform, filename=filename) + + def read( + self, + df_id: str, + transform_id: Optional[str] = None, + transform: Optional[DfTransformConfig] = None, + forced: bool = False, + ) -> DF_TYPE: """ Reads a dataframe from the disk. If you want a transformed version of your dataframe, but it's still not persisted, it first creates it and then reads it into memory. @@ -129,137 +120,97 @@ def read(self, """ if transform_id and transform: - raise AttributeError('Provide either transform_id or transform_config') + raise AttributeError("Provide either transform_id or transform_config") df_config = self._get_config(df_id=df_id) self._update_transforms_state(df_id=df_id) - if transform_id or transform: + is_transform = transform_id or transform + if is_transform: if not transform: + assert isinstance(transform_id, str) transform = df_config.transforms_by(transform_id=transform_id) - return self._read_transformed(df_id=df_id, - transform=transform, - df_config=df_config, - forced=forced) - else: - return self._read_initial(df_id=df_id, - df_config=df_config) + return self._read_transformed(df_id=df_id, transform=transform, df_config=df_config, forced=forced) - def _read_initial(self, - df_id: str, - df_config: DfConfig) -> pd.DataFrame: + return self._read_initial(df_id=df_id, df_config=df_config) + + def _read_initial(self, df_id: str, df_config: DfConfig) -> DF_TYPE: """ Reads the original dataframe from the disk """ df_format = df_config.initial_df_format - return self._read_df(df_id=df_id, - df_format=df_format, - df_config=df_config) + return self._read_df(df_id=df_id, df_format=df_format, df_config=df_config) - def _read_transformed(self, - df_id: str, - transform: DfTransformConfig, - df_config: DfConfig, - forced: bool = False) -> pd.DataFrame: + def _read_transformed( + self, df_id: str, transform: DfTransformConfig, df_config: DfConfig, forced: bool = False, + ) -> DF_TYPE: """ Reads the transformed dataframe from the disk or creates it if needed. """ transform_id = transform.transform_id - self.register_transform(df_id=df_id, - df_config=df_config, - transform=transform) + self.register_transform(df_id=df_id, df_config=df_config, transform=transform) - transformed_df_exists = self.df_exists(df_id=df_id, - transform_id=transform_id) + transformed_df_exists = self.df_exists(df_id=df_id, transform_id=transform_id) if transformed_df_exists: - return self._try_to_read_cached_transform(df_id=df_id, - df_config=df_config, - forced=forced, - transform=transform) - - df = self._read_source_df_for_transform(df_id=df_id, - df_config=df_config, - forced=forced, - transform=transform) - df = self._apply_df_transforms(df_id=df_id, - df=df, - df_config=df_config, - transform=transform) + return self._try_to_read_cached_transform( + df_id=df_id, df_config=df_config, forced=forced, transform=transform + ) + + df = self._read_source_df_for_transform(df_id=df_id, df_config=df_config, forced=forced, transform=transform) + df = self._apply_df_transforms(df_id=df_id, df=df, df_config=df_config, transform=transform) return df - def _read_source_df_for_transform(self, - df_id: str, - df_config: DfConfig, - forced: bool, - transform: DfTransformConfig): + def _read_source_df_for_transform( + self, df_id: str, df_config: DfConfig, forced: bool, transform: DfTransformConfig, + ): if transform.source_id: source_transform = df_config.transforms_by(transform_id=transform.source_id) - df = self._read_transformed(df_id=df_id, - transform=source_transform, - df_config=df_config, - forced=forced) + df = self._read_transformed(df_id=df_id, transform=source_transform, df_config=df_config, forced=forced,) else: - df = self._read_initial(df_id=df_id, - df_config=df_config) + df = self._read_initial(df_id=df_id, df_config=df_config) return df - def _try_to_read_cached_transform(self, - df_id: str, - df_config: DfConfig, - forced: bool, - transform: DfTransformConfig): + def _try_to_read_cached_transform( + self, df_id: str, df_config: DfConfig, forced: bool, transform: DfTransformConfig, + ): if transform.source_id: source_transform = df_config.transforms_by(transform_id=transform.source_id) - self._try_validate_transform(df_id=df_id, - transform=source_transform, - forced=forced, - child_transform=transform) - - self._try_validate_transform(df_id=df_id, - transform=transform, - forced=forced) - return self._read_cached_transformed(df_id=df_id, - transform=transform, - df_config=df_config, - forced=forced) - - def _apply_df_transforms(self, - df_id: str, - df: pd.DataFrame, - df_config: DfConfig, - transform: DfTransformConfig): + self._try_check_transform( + df_id=df_id, transform=source_transform, forced=forced, child_transform=transform, + ) + + self._try_check_transform(df_id=df_id, transform=transform, forced=forced) + return self._read_cached_transformed(df_id=df_id, transform=transform, df_config=df_config, forced=forced) + + def _apply_df_transforms(self, df_id: str, df: DF_TYPE, df_config: DfConfig, transform: DfTransformConfig): transform_id = transform.transform_id df_format = transform.df_format both_transform_types_are_present = transform.in_memory_steps and transform.permanent_steps df_shape_before = df.shape if transform.source_in_memory_steps: - df = DfReader._apply_transform_steps(df=df, - steps=transform.source_in_memory_steps) + df = DfReader._apply_transform_steps(df=df, steps=transform.source_in_memory_steps) if transform.in_memory_steps: - df = DfReader._apply_transform_steps(df=df, - steps=transform.in_memory_steps) + df = DfReader._apply_transform_steps(df=df, steps=transform.in_memory_steps) df_shape_after_in_memory = df.shape df_shape_has_changed = df_shape_before != df_shape_after_in_memory if both_transform_types_are_present and df_shape_has_changed: - raise Exception(f"Error: A permanent transform is also present, hence your " - f"in-memory transform can't modify the shape of the initial df.") + raise Exception( + "Error: A permanent transform is also present, hence your " + "in-memory transform can't modify the shape of the initial df." + ) if transform.permanent_steps: - df = DfReader._apply_transform_steps(df=df, - steps=transform.permanent_steps) + df = DfReader._apply_transform_steps(df=df, steps=transform.permanent_steps) - df_path = self._df_path(df_config=df_config, - df_id=df_id, - transform_id=transform_id) + df_path = self._df_path(df_config=df_config, df_id=df_id, transform_id=transform_id) df_cache = self._get_df_cache(df_format=df_format) df_cache.save(df=df, path=df_path) source_transform = None if transform.source_id: source_transform = df_config.transforms_by(transform_id=transform.source_id) - state = DfTransformState(transform=transform, - source_transform=source_transform) + state = DfTransformState(transform=transform, source_transform=source_transform) self._save_transform_state(df_id=df_id, state=state) return df @@ -276,8 +227,7 @@ def _transforms_states(self, df_id: str) -> Dict[str, DfTransformState]: result = {} for transform_id, state_dict in state_dicts.items(): - state = DfTransformState.from_dict(transform_id=transform_id, - state_dict=state_dict) + state = DfTransformState.from_dict(transform_id=transform_id, state_dict=state_dict) result[transform_id] = state return result @@ -292,79 +242,172 @@ def _transforms_state_dicts(self, df_id: str) -> Dict[str, Any]: return result_dict def _save_transforms_state_file(self, df_id: str, transforms_state: Dict[str, Any]): - with open(self._transforms_state_file_path(df_id=df_id), 'w') as config_file: + with open(self._transforms_state_file_path(df_id=df_id), "w") as config_file: yaml.dump(transforms_state, config_file) def _transforms_state_file_path(self, df_id: str) -> str: path = self._df_dir_path(df_id=df_id) - file_path = os.path.join(path, '.transforms_state.yaml') + file_path = os.path.join(path, ".transforms_state.yaml") return file_path - def _try_validate_transform(self, - df_id: str, - transform: DfTransformConfig, - forced: bool, - child_transform: Optional[DfTransformConfig] = None): - if len(transform.permanent_steps) == 0: + # TODO: all the checks should be extracted from the reader + def _try_check_transform( + self, + df_id: str, + transform: DfTransformConfig, + forced: bool, + child_transform: Optional[DfTransformConfig] = None, + ): + if child_transform and not self._can_start_transform_state_check(df_id=df_id, transform=transform): return + if child_transform: + # in case of child transform we better check the source separately + # e.g. a source config can match source_transform in a child's state + # but the source file was generated using different config before + self._try_check_transform(df_id=df_id, transform=transform, forced=forced) + + self._check_transform(df_id=df_id, transform=transform, forced=forced, child_transform=child_transform) + + def _check_transform( + self, + df_id: str, + transform: DfTransformConfig, + forced: bool, + child_transform: Optional[DfTransformConfig] = None, + ): + """ + Performs various checks for the given transform + """ + self._check_ts(df_id=df_id, forced=forced, transform=transform) + + # which transform are we checking + transform_id = self._transform_state_id(transform=transform, child_transform=child_transform) + + # saved state for the transform + transform_state = self._transform_state(df_id=df_id, transform_id=transform_id) + + # is it possible to check such a state + can_proceed = self._can_proceed_with_transform_state( + transform_id=transform.transform_id, transform_state=transform_state, forced=forced + ) + + if not can_proceed: + # user forced the check, so exiting early + return + + # now we are sure that transform_state is present, for mypy + assert transform_state + + # ready to truly check the state + self._check_transform_state( + transform_state=transform_state, transform=transform, forced=forced, child_transform=child_transform + ) + + def _can_start_transform_state_check(self, df_id: str, transform: DfTransformConfig) -> bool: + if len(transform.permanent_steps) == 0: + return False + if not self.df_exists(df_id=df_id, transform_id=transform.transform_id): # at first I wanted to throw an exception here, but if the parent transform # is no longer present on the disk, such fact shouldn't prevent you from using a # serialized child transform. It would be a problem if persisted parent transform # is incompatible with the one used to generate a child transform's file # P.S. at least I understand it... - return + return False + + return True - if len(transform.permanent_steps) > 0: - self._validate_ts(df_id=df_id, forced=forced, transform=transform) + @staticmethod + def _transform_state_id(transform: DfTransformConfig, child_transform: Optional[DfTransformConfig]): + if child_transform: + return child_transform.transform_id + return transform.transform_id + def _transform_state(self, df_id: str, transform_id: str) -> Optional[DfTransformState]: transforms_state = self._transforms_states(df_id=df_id) + transform_state = transforms_state.get(transform_id) + + return transform_state + + @staticmethod + def _can_proceed_with_transform_state( + transform_id: str, transform_state: Optional[DfTransformState], forced: bool + ) -> bool: + """ + Checks whether transform_state exists on disk. If not, we can't perform checks. + If forced, only warning will be shown, otherwise it raises Exception. + """ + if not transform_state: + base_warning = f"{transform_id} transform was " f"persisted with an unknown configuration" + if forced: + print( + f"VERY IMPORTANT WARNING: {base_warning}, but " + f"reading it anyway because the operation was forced." + ) + return False + + raise Exception(f"{base_warning}, can't safely load it") + + return True + + def _check_transform_state( + self, + transform_state: DfTransformState, + transform: DfTransformConfig, + forced: bool, + child_transform: Optional[DfTransformConfig] = None, + ): + # if child_transform is present - we are interested in its source_transform + # saved state since then the transform parameter represents source's current state if child_transform: - maybe_transform_state = transforms_state.get(child_transform.transform_id) - saved_transform = maybe_transform_state.source_transform - # in case of child transform we better check the source separately - # e.g. a source config can match source_transform in a child's state - # but the source file was generated using different config before - self._try_validate_transform(df_id=df_id, - transform=transform, - forced=forced) + transform_from_state = transform_state.source_transform else: - maybe_transform_state = transforms_state.get(transform.transform_id) - saved_transform = maybe_transform_state.transform - if not maybe_transform_state: - base_warning = f"{transform.transform_id} transform was " \ - f"persisted with an unknown configuration" - if forced: - print(f"VERY IMPORTANT WARNING: {base_warning}, but " - f"reading it anyway because the operation was forced.") - return - else: - raise Exception(f"{base_warning}, can't safely load it") + transform_from_state = transform_state.transform + + if transform_from_state: + # the only thing left is to compare the saved state with the current one + # P.S. It would be nicer if I checked NOT case and raised an exception there, + # but I need to convince mypy that transform_from_state is present + self._check_transform_with_state( + transform=transform, transform_from_state=transform_from_state, forced=forced + ) + return - valid = saved_transform.to_dict()[1] == transform.to_dict()[1] + raise Exception(f"Critical: couldn't get the transform from transform_state: {transform.transform_id}") + + @staticmethod + def _check_transform_with_state( + transform: DfTransformConfig, transform_from_state: DfTransformConfig, forced: bool + ): + valid = transform_from_state.to_dict()[1] == transform.to_dict()[1] if not valid: - base_warning = f"You've changed the df_config.yaml file of {transform.transform_id}" \ - f" transform, so it's incompatible with the persisted version" + base_warning = ( + f"You've changed the df_config.yaml file of {transform.transform_id}" + f" transform, so it's incompatible with the persisted version" + ) if forced: - print(f"VERY IMPORTANT WARNING: {base_warning}, but " - f"reading it anyway because the operation was forced.") + print( + f"VERY IMPORTANT WARNING: {base_warning}, but " + f"reading it anyway because the operation was forced." + ) return - else: - raise Exception(base_warning) + + raise Exception(base_warning) def _update_transforms_state(self, df_id: str): + """ + When a file of a persisted transform gets deleted - + the method removes the transform's config from saved state config + """ transforms_state = self._transforms_state_dicts(df_id=df_id) transform_ids_to_delete = [] for transform_id, transform_dict in transforms_state.items(): - transform = DfTransformConfig.from_dict(transform_id=transform_id, - transform_dict=transform_dict) + transform = DfTransformConfig.from_dict(transform_id=transform_id, transform_dict=transform_dict) df_config = self._get_config(df_id=df_id) - filename = DfReader.df_filename(df_config=df_config, - df_id=df_id, - transform=transform) + filename = DfReader.df_filename(df_config=df_config, df_id=df_id, transform=transform) file_path = self._df_dir_path(df_id=df_id, filename=filename) if not DfReader._is_file_exists(path=file_path): transform_ids_to_delete.append(transform_id) @@ -372,36 +415,37 @@ def _update_transforms_state(self, df_id: str): for transform_id in transform_ids_to_delete: del transforms_state[transform_id] - self._save_transforms_state_file(df_id=df_id, - transforms_state=transforms_state) + self._save_transforms_state_file(df_id=df_id, transforms_state=transforms_state) - def _read_cached_transformed(self, - df_id: str, - transform: DfTransformConfig, - df_config: DfConfig, - forced: bool = False) -> pd.DataFrame: + def _read_cached_transformed( + self, df_id: str, transform: DfTransformConfig, df_config: DfConfig, forced: bool = False, + ) -> DF_TYPE: df_format = transform.df_format transform_id = transform.transform_id if transform.permanent_steps: - self._validate_ts(df_id=df_id, forced=forced, transform=transform) + self._check_ts(df_id=df_id, forced=forced, transform=transform) - df = self._read_df(df_config=df_config, - df_id=df_id, - df_format=df_format, - transform_id=transform_id) + df = self._read_df(df_config=df_config, df_id=df_id, df_format=df_format, transform_id=transform_id,) if transform.in_memory_steps: - df = DfReader._apply_transform_steps(df=df, - steps=transform.in_memory_steps) + df = DfReader._apply_transform_steps(df=df, steps=transform.in_memory_steps) return df - def _validate_ts(self, df_id: str, forced: bool, transform: DfTransformConfig): + def _check_ts(self, df_id: str, forced: bool, transform: DfTransformConfig): + """ + If a python file with a transform was changed after a transform + was persisted on disk - we can no longer guarantee that the persisted file is still relevant. + This method checks timestamp and raises Exception if needed. + ( modifications in the file could bring some drastic changes in the transform itself ). + """ + if len(transform.permanent_steps) == 0: + return + # if the code of one of the steps was modified since the transformed dataframe # was cached - we need to stop and warn a user about the need of regenerating it - df_last_modified_date = self._df_last_modified_ts(df_id=df_id, - transform_id=transform.transform_id) + df_last_modified_date = self._df_last_modified_ts(df_id=df_id, transform_id=transform.transform_id) def _filter_func(step_config: DfTransformStepConfig): # built-in types override the method to provide true last modification date @@ -413,29 +457,30 @@ def _filter_func(step_config: DfTransformStepConfig): outdated_steps = list(filter(_filter_func, transform.permanent_steps)) if len(outdated_steps) > 0: steps_module_paths = [step.module_path for step in outdated_steps] - base_warning = f'{steps_module_paths} steps of {transform.transform_id} transform were changed since the df was generated' + base_warning = ( + f"{steps_module_paths} steps of {transform.transform_id} " + f"transform were changed since the df was generated" + ) if forced: print(f"Warning: {base_warning}, reading it anyway because the operation was forced") else: - raise Exception(f'{base_warning}, ' - 'delete the file and try again to regenerate the df.') + raise Exception(f"{base_warning}, " "delete the file and try again to regenerate the df.") @staticmethod - def _apply_transform_steps(df: pd.DataFrame, - steps: List[DfTransformStepConfig]) -> pd.DataFrame: + def _apply_transform_steps(df: DF_TYPE, steps: List[DfTransformStepConfig]) -> DF_TYPE: """ Applies all the steps for a transformation on the given dataframe. Parameters ---------- - df: pd.DataFrame + df: DF_TYPE Initial dataframe to perform transformations on. steps: list of DfTransformStepConfig List of objects that represent a step of the whole transformation. Returns ------- - pd.DataFrame, fully transformed initial dataframe + DF_TYPE, fully transformed initial dataframe """ transform_steps = [DfTransformStep.build_transform(config=step_config) for step_config in steps] for transform in transform_steps: @@ -443,11 +488,7 @@ def _apply_transform_steps(df: pd.DataFrame, return df - def _read_df(self, - df_id: str, - df_format: str, - df_config: DfConfig, - transform_id: Optional[str] = None) -> pd.DataFrame: + def _read_df(self, df_id: str, df_format: str, df_config: DfConfig, transform_id: Optional[str] = None,) -> DF_TYPE: """ General method for reading a dataframe from the disk. @@ -464,21 +505,16 @@ def _read_df(self, Returns ------- - pd.DataFrame, the requested dataframe + DF_TYPE, the requested dataframe """ - df_path = self._df_path(df_config=df_config, - df_id=df_id, - transform_id=transform_id) + df_path = self._df_path(df_config=df_config, df_id=df_id, transform_id=transform_id) df_cache = self._get_df_cache(df_format=df_format) df = df_cache.load(path=df_path) return df - def _df_path(self, - df_config: DfConfig, - df_id: str, - transform_id: Optional[str] = None) -> str: + def _df_path(self, df_config: DfConfig, df_id: str, transform_id: Optional[str] = None) -> str: """ Forms a path to the dataframe. @@ -498,17 +534,13 @@ def _df_path(self, transform = None if transform_id: transform = df_config.transforms_by(transform_id=transform_id) - filename = DfReader.df_filename(df_config=df_config, - df_id=df_id, - transform=transform) + filename = DfReader.df_filename(df_config=df_config, df_id=df_id, transform=transform) result = self._df_dir_path(df_id=df_id, filename=filename) return result @staticmethod - def df_filename(df_config: DfConfig, - df_id: str, - transform: Optional[DfTransformConfig] = None): + def df_filename(df_config: DfConfig, df_id: str, transform: Optional[DfTransformConfig] = None): """ Forms a filename for the dataframe @@ -526,9 +558,9 @@ def df_filename(df_config: DfConfig, str, a filename for the dataframe. """ if transform: - return f'{transform.transform_id}_{df_id}.{transform.df_format}' + return f"{transform.transform_id}_{df_id}.{transform.df_format}" - return f'{df_id}.{df_config.initial_df_format}' + return f"{df_id}.{df_config.initial_df_format}" def _df_dir_path(self, df_id: str, filename: Optional[str] = None) -> str: """ @@ -545,8 +577,7 @@ def _df_dir_path(self, df_id: str, filename: Optional[str] = None) -> str: ------- str, absolute path to the desired item """ - path = self._df_dir_path_for(dir_path=self._dir_path, - df_id=df_id) + path = self._df_dir_path_for(dir_path=self._dir_path, df_id=df_id) if filename: path = os.path.join(path, filename) @@ -554,8 +585,7 @@ def _df_dir_path(self, df_id: str, filename: Optional[str] = None) -> str: return path @staticmethod - def _df_dir_path_for(dir_path: str, - df_id: str) -> str: + def _df_dir_path_for(dir_path: str, df_id: str) -> str: return os.path.join(dir_path, df_id) def _get_config(self, df_id: str) -> DfConfig: @@ -571,8 +601,7 @@ def _get_config(self, df_id: str) -> DfConfig: ------- DfConfig instance. """ - result = self._get_config_for(dir_path=self._df_dir_path(df_id=df_id), - df_id=df_id) + result = self._get_config_for(dir_path=self._df_dir_path(df_id=df_id), df_id=df_id) return result @staticmethod @@ -591,8 +620,7 @@ def _get_config_for(dir_path: str, df_id: str) -> DfConfig: ------- """ - result = DfConfig(df_id=df_id, - dir_path=dir_path) + result = DfConfig(df_id=df_id, dir_path=dir_path) return result def _get_df_cache(self, df_format: str) -> DfCache: @@ -610,5 +638,5 @@ def _get_df_cache(self, df_format: str) -> DfCache: """ df_cache = self._format_to_cache_map.get(df_format) if not df_cache: - raise ValueError(f'Unknown df_format, df_cache was not provided: {df_format}') + raise ValueError(f"Unknown df_format, df_cache was not provided: {df_format}") return df_cache diff --git a/df_and_order/df_transform.py b/df_and_order/df_transform.py index 336c8c7..3940ae0 100644 --- a/df_and_order/df_transform.py +++ b/df_and_order/df_transform.py @@ -1,18 +1,19 @@ -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Any, Dict from df_and_order.df_transform_step import DfTransformStepConfig -TRANSFORM_ID_KEY = 'transform_id' -TRANSFORM_DF_FORMAT_KEY = 'df_format' -TRANSFORM_SOURCE_ID_KEY = 'source_id' -TRANSFORM_SOURCE_IN_MEMORY_KEY = 'source_in_memory' -TRANSFORM_IN_MEMORY_KEY = 'in_memory' -TRANSFORM_PERMANENT_KEY = 'permanent' +TRANSFORM_ID_KEY = "transform_id" +TRANSFORM_DF_FORMAT_KEY = "df_format" +TRANSFORM_SOURCE_ID_KEY = "source_id" +TRANSFORM_SOURCE_IN_MEMORY_KEY = "source_in_memory" +TRANSFORM_IN_MEMORY_KEY = "in_memory" +TRANSFORM_PERMANENT_KEY = "permanent" class DfTransformConfig: """ + Describes how a transformation should be performed. For any transformation one or many steps can be used. Those steps can be performed in memory only or their result @@ -37,14 +38,18 @@ class DfTransformConfig: permanent_steps: list of DfTransformStepConfig Those steps result of which is persisted on the disk for future access. """ - def __init__(self, - transform_id: str, - df_format: Optional[str] = None, - source_id: Optional[str] = None, - source_in_memory_steps: Optional[List[DfTransformStepConfig]] = None, - in_memory_steps: Optional[List[DfTransformStepConfig]] = None, - permanent_steps: Optional[List[DfTransformStepConfig]] = None): - assert in_memory_steps or permanent_steps, "Provide at least one type of transformations" + + def __init__( + self, + transform_id: str, + df_format: str, + source_id: Optional[str] = None, + source_in_memory_steps: Optional[List[DfTransformStepConfig]] = None, + in_memory_steps: Optional[List[DfTransformStepConfig]] = None, + permanent_steps: Optional[List[DfTransformStepConfig]] = None, + ): + if in_memory_steps is None and permanent_steps is None: + raise Exception("Provide at least one type of transformations") self._transform_id = transform_id self._df_format = df_format @@ -57,12 +62,14 @@ def __eq__(self, other: object) -> bool: if not isinstance(other, DfTransformConfig): return False - result = self.transform_id == other.transform_id \ - and self.df_format == other.df_format \ - and self.source_id == other.source_id \ - and self.source_in_memory_steps == other.source_in_memory_steps \ - and self.in_memory_steps == other.in_memory_steps \ - and self.permanent_steps == other.permanent_steps + result = ( + self.transform_id == other.transform_id + and self.df_format == other.df_format + and self.source_id == other.source_id + and self.source_in_memory_steps == other.source_in_memory_steps + and self.in_memory_steps == other.in_memory_steps + and self.permanent_steps == other.permanent_steps + ) return result @@ -71,7 +78,7 @@ def transform_id(self) -> str: return self._transform_id @property - def df_format(self) -> Optional[str]: + def df_format(self) -> str: return self._df_format @property @@ -79,15 +86,15 @@ def source_id(self) -> Optional[str]: return self._source_id @property - def source_in_memory_steps(self) -> Optional[List[DfTransformStepConfig]]: + def source_in_memory_steps(self) -> List[DfTransformStepConfig]: return self._source_in_memory_steps @property - def in_memory_steps(self) -> Optional[List[DfTransformStepConfig]]: + def in_memory_steps(self) -> List[DfTransformStepConfig]: return self._in_memory_steps @property - def permanent_steps(self) -> Optional[List[DfTransformStepConfig]]: + def permanent_steps(self) -> List[DfTransformStepConfig]: return self._permanent_steps def to_dict(self) -> Tuple[str, dict]: @@ -98,10 +105,7 @@ def to_dict(self) -> Tuple[str, dict]: ------- tuple of str and dict, transform_id and transform's dictionary representation. """ - result = {} - - if self._df_format: - result[TRANSFORM_DF_FORMAT_KEY] = self._df_format + result: Dict[str, Any] = {TRANSFORM_DF_FORMAT_KEY: self._df_format} if self._source_id: result[TRANSFORM_SOURCE_ID_KEY] = self._source_id @@ -121,8 +125,7 @@ def to_dict(self) -> Tuple[str, dict]: return self.transform_id, result @staticmethod - def from_dict(transform_id: str, - transform_dict: dict): + def from_dict(transform_id: str, transform_dict: dict): """ Builds DfTransformConfig instance out of serialized dictionary. @@ -139,25 +142,30 @@ def from_dict(transform_id: str, """ step_configs_getter = DfTransformConfig._transform_step_configs_from df_format = transform_dict.get(TRANSFORM_DF_FORMAT_KEY) + if df_format is None: + raise Exception( + "Provide a file format you want your transform " "to be saved in using df_format key in the config" + ) + source_id = transform_dict.get(TRANSFORM_SOURCE_ID_KEY) - source_in_memory_transforms = step_configs_getter(transform_dict=transform_dict, - key=TRANSFORM_SOURCE_IN_MEMORY_KEY) - in_memory_transforms = step_configs_getter(transform_dict=transform_dict, - key=TRANSFORM_IN_MEMORY_KEY) - permanent_transforms = step_configs_getter(transform_dict=transform_dict, - key=TRANSFORM_PERMANENT_KEY) - - result = DfTransformConfig(transform_id=transform_id, - df_format=df_format, - source_id=source_id, - source_in_memory_steps=source_in_memory_transforms, - in_memory_steps=in_memory_transforms, - permanent_steps=permanent_transforms) + source_in_memory_transforms = step_configs_getter( + transform_dict=transform_dict, key=TRANSFORM_SOURCE_IN_MEMORY_KEY + ) + in_memory_transforms = step_configs_getter(transform_dict=transform_dict, key=TRANSFORM_IN_MEMORY_KEY) + permanent_transforms = step_configs_getter(transform_dict=transform_dict, key=TRANSFORM_PERMANENT_KEY) + + result = DfTransformConfig( + transform_id=transform_id, + df_format=df_format, + source_id=source_id, + source_in_memory_steps=source_in_memory_transforms, + in_memory_steps=in_memory_transforms, + permanent_steps=permanent_transforms, + ) return result @staticmethod - def _transform_step_configs_from(transform_dict: dict, - key: str) -> Optional[List[DfTransformStepConfig]]: + def _transform_step_configs_from(transform_dict: dict, key: str) -> Optional[List[DfTransformStepConfig]]: """ When deserializing we need to transform steps into objects as well. @@ -175,9 +183,6 @@ def _transform_step_configs_from(transform_dict: dict, step_dicts = transform_dict.get(key) steps = None if step_dicts: - steps = [ - DfTransformStepConfig.from_dict(step_dict=step_dict) - for step_dict in step_dicts - ] + steps = [DfTransformStepConfig.from_dict(step_dict=step_dict) for step_dict in step_dicts] return steps diff --git a/df_and_order/df_transform_state.py b/df_and_order/df_transform_state.py index b961888..0cddb18 100644 --- a/df_and_order/df_transform_state.py +++ b/df_and_order/df_transform_state.py @@ -30,8 +30,7 @@ def to_dict(self) -> Tuple[str, dict]: return transform_id, result @staticmethod - def from_dict(transform_id: str, - state_dict: dict): + def from_dict(transform_id: str, state_dict: dict): """ Builds DfTransformState instance out of serialized dictionary. @@ -53,12 +52,9 @@ def from_dict(transform_id: str, del state_dict[TRANSFORM_STATE_SOURCE_KEY] transform_dict = state_dict - transform = DfTransformConfig.from_dict(transform_id=transform_id, - transform_dict=transform_dict) + transform = DfTransformConfig.from_dict(transform_id=transform_id, transform_dict=transform_dict) if source_dict: - source_transform = DfTransformConfig.from_dict(transform_id=transform.source_id, - transform_dict=source_dict) - result = DfTransformState(transform=transform, - source_transform=source_transform) + source_transform = DfTransformConfig.from_dict(transform_id=transform.source_id, transform_dict=source_dict) + result = DfTransformState(transform=transform, source_transform=source_transform) return result diff --git a/df_and_order/df_transform_step.py b/df_and_order/df_transform_step.py index 1388271..1fad919 100644 --- a/df_and_order/df_transform_step.py +++ b/df_and_order/df_transform_step.py @@ -1,12 +1,18 @@ -import pandas as pd from dataclasses import dataclass from abc import ABC, abstractmethod -from df_and_order.helpers import build_class_instance, get_file_path_from_module_path, FileInspector, \ - get_module_path_from_type +from typing import Dict, Any -TRANSFORM_STEP_MODULE_PATH_KEY = 'module_path' -TRANSFORM_STEP_PARAMS_KEY = 'params' +from df_and_order.common import DF_TYPE +from df_and_order.helpers import ( + build_class_instance, + get_file_path_from_module_path, + FileInspector, + get_module_path_from_type, +) + +TRANSFORM_STEP_MODULE_PATH_KEY = "module_path" +TRANSFORM_STEP_PARAMS_KEY = "params" @dataclass @@ -15,6 +21,7 @@ class DfTransformStepConfig: Stores module path of some DfTransformStep as well as its init parameters. """ + module_path: str params: dict @@ -22,16 +29,14 @@ def __eq__(self, other: object) -> bool: if not isinstance(other, DfTransformStepConfig): return False - result = self.module_path == other.module_path \ - and self.params == other.params + result = self.module_path == other.module_path and self.params == other.params return result @staticmethod def from_step_type(step_type, params: dict): module_path = get_module_path_from_type(py_type=step_type) - step_config = DfTransformStepConfig(module_path=module_path, - params=params) + step_config = DfTransformStepConfig(module_path=module_path, params=params) return step_config @staticmethod @@ -39,16 +44,15 @@ def from_dict(step_dict: dict): module_path = step_dict[TRANSFORM_STEP_MODULE_PATH_KEY] params = step_dict.get(TRANSFORM_STEP_PARAMS_KEY) or {} - step_config = DfTransformStepConfig(module_path=module_path, - params=params) + step_config = DfTransformStepConfig(module_path=module_path, params=params) return step_config def to_dict(self) -> dict: - step_dict = { + step_dict: Dict[str, Any] = { TRANSFORM_STEP_MODULE_PATH_KEY: self.module_path, } - if len(self.params): + if len(self.params) > 0: step_dict[TRANSFORM_STEP_PARAMS_KEY] = self.params return step_dict @@ -60,6 +64,7 @@ class DfTransformStep(ABC): Every subclass must implement 'transform' method with custom logic. """ + @staticmethod def step_last_modified_ts(step_config: DfTransformStepConfig) -> float: file_path = get_file_path_from_module_path(module_path=step_config.module_path) @@ -83,10 +88,9 @@ def build_transform(config: DfTransformStepConfig): """ params = config.params - transform = build_class_instance(module_path=config.module_path, - init_params=params) + transform = build_class_instance(module_path=config.module_path, init_params=params) return transform @abstractmethod - def transform(self, df: pd.DataFrame) -> pd.DataFrame: + def transform(self, df: DF_TYPE) -> DF_TYPE: pass diff --git a/df_and_order/helpers.py b/df_and_order/helpers.py index 3171e3f..72fdea7 100644 --- a/df_and_order/helpers.py +++ b/df_and_order/helpers.py @@ -58,9 +58,9 @@ def split_module_path(module_path: str) -> Tuple[str, str]: ------- Tuple of strings, one for the module name, second for the class name """ - components = module_path.split('.') + components = module_path.split(".") class_name = components[-1] - module_name = '.'.join(components[:-1]) + module_name = ".".join(components[:-1]) return module_name, class_name @@ -97,7 +97,7 @@ def get_module_path_from_type(py_type) -> str: ------- String representation of full module path like module1.submodule2.ClassName """ - result = inspect.getmodule(py_type).__name__ + '.' + py_type.__name__ + result = inspect.getmodule(py_type).__name__ + "." + py_type.__name__ # type: ignore return result diff --git a/df_and_order/steps.py b/df_and_order/steps/pd.py similarity index 98% rename from df_and_order/steps.py rename to df_and_order/steps/pd.py index 7da082d..c134f2e 100644 --- a/df_and_order/steps.py +++ b/df_and_order/steps/pd.py @@ -1,5 +1,5 @@ -from typing import List import pandas as pd +from typing import List from df_and_order.df_transform_step import DfTransformStep, DfTransformStepConfig @@ -8,6 +8,7 @@ class DropColsTransformStep(DfTransformStep): """ Simply drops some undesired columns from a dataframe. """ + def __init__(self, cols: List[str]): super().__init__() @@ -25,6 +26,7 @@ class DatesTransformStep(DfTransformStep): """ Converts cols to datetime type """ + def __init__(self, cols: List[str]): super().__init__() @@ -38,4 +40,4 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: for col in self._dates_cols: df[col] = pd.to_datetime(df[col]) - return df \ No newline at end of file + return df diff --git a/examples/How-To-Cleaned.ipynb b/examples/How-To-Cleaned.ipynb index 76a6715..412c0cb 100644 --- a/examples/How-To-Cleaned.ipynb +++ b/examples/How-To-Cleaned.ipynb @@ -476,7 +476,7 @@ "source": [ "from df_and_order.df_transform import DfTransformConfig\n", "from df_and_order.df_transform_step import DfTransformStepConfig\n", - "from df_and_order.steps import DropColsTransformStep, DatesTransformStep\n", + "from df_and_order.steps.pd import DropColsTransformStep, DatesTransformStep\n", "\n", "# we describe all the steps required\n", "in_memory_steps = [\n", @@ -903,6 +903,7 @@ "bad_in_memory_transform_id = 'bad_in_memory'\n", "# here's the instance of our entire transform\n", "bad_in_memory_transform = DfTransformConfig(transform_id=bad_in_memory_transform_id, \n", + " df_format='csv',\n", " in_memory_steps=in_memory_steps,\n", " permanent_steps=permanent_steps)" ] diff --git a/examples/How-To.ipynb b/examples/How-To.ipynb index 91c8da5..b4f6bdb 100644 --- a/examples/How-To.ipynb +++ b/examples/How-To.ipynb @@ -267,7 +267,7 @@ "output_type": "stream", "text": [ "total 8\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 2 11:36 super_demo_df_2020.csv\r\n" + "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 9 20:40 super_demo_df_2020.csv\r\n" ] } ], @@ -733,13 +733,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from df_and_order.df_transform import DfTransformConfig\n", "from df_and_order.df_transform_step import DfTransformStepConfig\n", - "from df_and_order.steps import DropColsTransformStep, DatesTransformStep\n", + "from df_and_order.steps.pd import DropColsTransformStep, DatesTransformStep\n", "\n", "# we describe all the steps required\n", "in_memory_steps = [\n", @@ -759,7 +759,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -832,7 +832,7 @@ "4 5 five 2020-05-21" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -845,7 +845,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -885,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -900,11 +900,11 @@ " model_input:\r\n", " df_format: csv\r\n", " in_memory:\r\n", - " - module_path: df_and_order.steps.DropColsTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DropColsTransformStep\r\n", " params:\r\n", " cols:\r\n", " - redundant_col\r\n", - " - module_path: df_and_order.steps.DatesTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DatesTransformStep\r\n", " params:\r\n", " cols:\r\n", " - date_col\r\n" @@ -931,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1004,7 +1004,7 @@ "4 5 five 2020-05-21" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1022,7 +1022,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1101,7 +1101,7 @@ "4 5 five 2020-05-21 0" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1113,7 +1113,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1147,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1174,7 +1174,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1247,7 +1247,7 @@ "4 5 five 2020-05-21" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1260,7 +1260,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1275,23 +1275,23 @@ " model_input:\r\n", " df_format: csv\r\n", " in_memory:\r\n", - " - module_path: df_and_order.steps.DropColsTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DropColsTransformStep\r\n", " params:\r\n", " cols:\r\n", " - redundant_col\r\n", - " - module_path: df_and_order.steps.DatesTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DatesTransformStep\r\n", " params:\r\n", " cols:\r\n", " - date_col\r\n", " model_input_permanent:\r\n", " df_format: csv\r\n", " in_memory:\r\n", - " - module_path: df_and_order.steps.DatesTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DatesTransformStep\r\n", " params:\r\n", " cols:\r\n", " - date_col\r\n", " permanent:\r\n", - " - module_path: df_and_order.steps.DropColsTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DropColsTransformStep\r\n", " params:\r\n", " cols:\r\n", " - redundant_col\r\n" @@ -1304,7 +1304,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1312,9 +1312,9 @@ "output_type": "stream", "text": [ "total 24\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 645 Jul 2 11:37 df_config.yaml\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 114 Jul 2 11:37 model_input_permanent_super_demo_df_2020.csv\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 2 11:36 super_demo_df_2020.csv\r\n" + "-rw-r--r-- 1 ilya.tyutin staff 657 Jul 9 20:41 df_config.yaml\r\n", + "-rw-r--r-- 1 ilya.tyutin staff 114 Jul 9 20:41 model_input_permanent_super_demo_df_2020.csv\r\n", + "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 9 20:40 super_demo_df_2020.csv\r\n" ] } ], @@ -1333,7 +1333,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1406,7 +1406,7 @@ "4 5 five 2020-05-21" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1469,7 +1469,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1478,7 +1478,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1489,10 +1489,7 @@ "\r\n", "class DummyTransformStep(DfTransformStep):\r\n", " def transform(self, df):\r\n", - " return df\r\n", - "\r\n", - "\r\n", - "\r\n" + " return df\r\n" ] } ], @@ -1509,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1524,7 +1521,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1603,7 +1600,7 @@ "4 5 five 2020-05-21 0" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1615,7 +1612,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1634,23 +1631,23 @@ " model_input:\r\n", " df_format: csv\r\n", " in_memory:\r\n", - " - module_path: df_and_order.steps.DropColsTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DropColsTransformStep\r\n", " params:\r\n", " cols:\r\n", " - redundant_col\r\n", - " - module_path: df_and_order.steps.DatesTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DatesTransformStep\r\n", " params:\r\n", " cols:\r\n", " - date_col\r\n", " model_input_permanent:\r\n", " df_format: csv\r\n", " in_memory:\r\n", - " - module_path: df_and_order.steps.DatesTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DatesTransformStep\r\n", " params:\r\n", " cols:\r\n", " - date_col\r\n", " permanent:\r\n", - " - module_path: df_and_order.steps.DropColsTransformStep\r\n", + " - module_path: df_and_order.steps.pd.DropColsTransformStep\r\n", " params:\r\n", " cols:\r\n", " - redundant_col\r\n" @@ -1663,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1671,10 +1668,10 @@ "output_type": "stream", "text": [ "total 32\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 746 Jul 2 11:37 df_config.yaml\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 2 11:37 dummy_super_demo_df_2020.csv\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 114 Jul 2 11:37 model_input_permanent_super_demo_df_2020.csv\r\n", - "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 2 11:36 super_demo_df_2020.csv\r\n" + "-rw-r--r-- 1 ilya.tyutin staff 758 Jul 9 20:41 df_config.yaml\r\n", + "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 9 20:41 dummy_super_demo_df_2020.csv\r\n", + "-rw-r--r-- 1 ilya.tyutin staff 114 Jul 9 20:41 model_input_permanent_super_demo_df_2020.csv\r\n", + "-rw-r--r-- 1 ilya.tyutin staff 138 Jul 9 20:40 super_demo_df_2020.csv\r\n" ] } ], @@ -1691,7 +1688,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -1708,7 +1705,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1718,12 +1715,13 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mamazing_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdummy_transform_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, df_id, transform_id, transform, forced)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m forced=forced)\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m return self._read_initial(df_id=df_id,\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_read_transformed\u001b[0;34m(self, df_id, transform, df_config, forced)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m transform=transform)\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m df = self._read_source_df_for_transform(df_id=df_id,\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_try_to_read_cached_transform\u001b[0;34m(self, df_id, df_config, forced, transform)\u001b[0m\n\u001b[1;32m 221\u001b[0m self._try_validate_transform(df_id=df_id,\n\u001b[1;32m 222\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 223\u001b[0;31m forced=forced)\n\u001b[0m\u001b[1;32m 224\u001b[0m return self._read_cached_transformed(df_id=df_id,\n\u001b[1;32m 225\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_try_validate_transform\u001b[0;34m(self, df_id, transform, forced, child_transform)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpermanent_steps\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 320\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_ts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0mtransforms_state\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_transforms_states\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_validate_ts\u001b[0;34m(self, df_id, forced, transform)\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Warning: {base_warning}, reading it anyway because the operation was forced\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 420\u001b[0;31m raise Exception(f'{base_warning}, '\n\u001b[0m\u001b[1;32m 421\u001b[0m 'delete the file and try again to regenerate the df.')\n\u001b[1;32m 422\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mamazing_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdummy_transform_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, df_id, transform_id, transform, forced)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0mtransform\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransforms_by\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransform_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_transformed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_initial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_read_transformed\u001b[0;34m(self, df_id, transform, df_config, forced)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtransformed_df_exists\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m return self._try_to_read_cached_transform(\n\u001b[0;32m--> 158\u001b[0;31m \u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 160\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_try_to_read_cached_transform\u001b[0;34m(self, df_id, df_config, forced, transform)\u001b[0m\n\u001b[1;32m 183\u001b[0m )\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_check_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_cached_transformed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_try_check_transform\u001b[0;34m(self, df_id, transform, forced, child_transform)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_check_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 270\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchild_transform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchild_transform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 271\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 272\u001b[0m def _check_transform(\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_check_transform\u001b[0;34m(self, df_id, transform, forced, child_transform)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0mPerforms\u001b[0m \u001b[0mvarious\u001b[0m \u001b[0mchecks\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 281\u001b[0m \"\"\"\n\u001b[0;32m--> 282\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_ts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;31m# which transform are we checking\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_check_ts\u001b[0;34m(self, df_id, forced, transform)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Warning: {base_warning}, reading it anyway because the operation was forced\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{base_warning}, \"\u001b[0m \u001b[0;34m\"delete the file and try again to regenerate the df.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mException\u001b[0m: ['example_steps.steps.DummyTransformStep'] steps of dummy transform were changed since the df was generated, delete the file and try again to regenerate the df." ] } @@ -1743,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1830,7 +1828,7 @@ "4 5 five 2020-05-21 0" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1853,7 +1851,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1862,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1941,7 +1939,7 @@ "4 5 five 2020-05-21 0" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1966,7 +1964,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1985,13 +1983,14 @@ "bad_in_memory_transform_id = 'bad_in_memory'\n", "# here's the instance of our entire transform\n", "bad_in_memory_transform = DfTransformConfig(transform_id=bad_in_memory_transform_id, \n", + " df_format='csv',\n", " in_memory_steps=in_memory_steps,\n", " permanent_steps=permanent_steps)" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -2001,10 +2000,10 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mamazing_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbad_in_memory_transform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, df_id, transform_id, transform, forced)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m forced=forced)\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m return self._read_initial(df_id=df_id,\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_read_transformed\u001b[0;34m(self, df_id, transform, df_config, forced)\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 189\u001b[0;31m transform=transform)\n\u001b[0m\u001b[1;32m 190\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_apply_df_transforms\u001b[0;34m(self, df_id, df, df_config, transform)\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0mdf_shape_has_changed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_shape_before\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdf_shape_after_in_memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mboth_transform_types_are_present\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mdf_shape_has_changed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 247\u001b[0;31m raise Exception(f\"Error: A permanent transform is also present, hence your \"\n\u001b[0m\u001b[1;32m 248\u001b[0m f\"in-memory transform can't modify the shape of the initial df.\")\n\u001b[1;32m 249\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpermanent_steps\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mamazing_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbad_in_memory_transform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, df_id, transform_id, transform, forced)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0mtransform\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransforms_by\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransform_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_transformed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_initial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_read_transformed\u001b[0;34m(self, df_id, transform, df_config, forced)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_source_df_for_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforced\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforced\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_df_transforms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/df-and-order/df_and_order/df_reader.py\u001b[0m in \u001b[0;36m_apply_df_transforms\u001b[0;34m(self, df_id, df, df_config, transform)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mboth_transform_types_are_present\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mdf_shape_has_changed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 200\u001b[0m raise Exception(\n\u001b[0;32m--> 201\u001b[0;31m \u001b[0;34m\"Error: A permanent transform is also present, hence your \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 202\u001b[0m \u001b[0;34m\"in-memory transform can't modify the shape of the initial df.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 203\u001b[0m )\n", "\u001b[0;31mException\u001b[0m: Error: A permanent transform is also present, hence your in-memory transform can't modify the shape of the initial df." ] } diff --git a/examples/example_steps/steps.py b/examples/example_steps/steps.py index ea8147f..104cb95 100644 --- a/examples/example_steps/steps.py +++ b/examples/example_steps/steps.py @@ -1,5 +1,6 @@ from df_and_order.df_transform_step import DfTransformStep + class DummyTransformStep(DfTransformStep): def transform(self, df): return df diff --git a/flake8.cfg b/flake8.cfg new file mode 100644 index 0000000..8455a91 --- /dev/null +++ b/flake8.cfg @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 120 +ignore = E731,W503 +exclude = + .git, + tests/* \ No newline at end of file diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..1215375 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +ignore_missing_imports = True \ No newline at end of file diff --git a/setup.py b/setup.py index 48592eb..84ec6de 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ # This call to setup() does all the work setup( name="df-and-order", - version="0.2.1", + version="0.2.2", description="Using df-and-order your interactions with dataframes become very clean and predictable.", long_description=README, long_description_content_type="text/markdown", @@ -23,7 +23,8 @@ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", ], - packages=find_packages(exclude=("*.tests", "*.tests.*", "tests.*", "tests", "*.examples", "*.examples.*", "examples.*", "examples")), + packages=find_packages( + exclude=("*.tests", "*.tests.*", "tests.*", "tests", "*.examples", "*.examples.*", "examples.*", "examples") + ), include_package_data=True, - install_requires=["pandas"], ) diff --git a/tests/dates_transform.py b/tests/dates_transform.py index 8db73e6..24b9a6a 100644 --- a/tests/dates_transform.py +++ b/tests/dates_transform.py @@ -1,6 +1,6 @@ +import pandas as pd from typing import List -import pandas as pd from df_and_order.df_transform_step import DfTransformStep @@ -15,4 +15,4 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: for col in self._dates_cols: df[col] = pd.to_datetime(df[col]) - return df \ No newline at end of file + return df diff --git a/tests/drop_cols_transform.py b/tests/drop_cols_transform.py index b584741..208dee4 100644 --- a/tests/drop_cols_transform.py +++ b/tests/drop_cols_transform.py @@ -9,10 +9,11 @@ class TestDropColsTransformStep(DfTransformStep): """ Simply drops some undesired columns from a dataframe. """ + def __init__(self, cols_to_drop: List[str]): super().__init__() self._cols_to_drop = cols_to_drop def transform(self, df: pd.DataFrame) -> pd.DataFrame: - return df.drop(self._cols_to_drop, axis=1) \ No newline at end of file + return df.drop(self._cols_to_drop, axis=1) diff --git a/tests/test_df_cache.py b/tests/test_df_cache.py index 93c16e4..29c6677 100644 --- a/tests/test_df_cache.py +++ b/tests/test_df_cache.py @@ -11,10 +11,7 @@ def save_args(): @pytest.fixture() def save_kwargs(): - return { - 'a': 1, - 'b': 2 - } + return {"a": 1, "b": 2} @pytest.fixture() @@ -24,10 +21,7 @@ def load_args(): @pytest.fixture() def load_kwargs(): - return { - 'c': 3, - 'd': 4 - } + return {"c": 3, "d": 4} class TestDfCache(DfCache): @@ -40,30 +34,22 @@ def _load(self, path: str, *args, **kwargs) -> pd.DataFrame: @pytest.fixture() def instance(save_args, save_kwargs, load_args, load_kwargs): - result = TestDfCache(save_args=save_args, - save_kwargs=save_kwargs, - load_args=load_args, - load_kwargs=load_kwargs) + result = TestDfCache(save_args=save_args, save_kwargs=save_kwargs, load_args=load_args, load_kwargs=load_kwargs,) return result def test_save(mocker, instance, save_args, save_kwargs): - save_mock = mocker.patch.object(instance, '_save') + save_mock = mocker.patch.object(instance, "_save") df = mocker.Mock() - path = '/some/path' + path = "/some/path" instance.save(df=df, path=path) - save_mock.assert_called_with(df=df, - path=path, - *save_args, - **save_kwargs) + save_mock.assert_called_with(df=df, path=path, *save_args, **save_kwargs) def test_load(mocker, instance, load_args, load_kwargs): - load_mock = mocker.patch.object(instance, '_load') - path = '/some/path' + load_mock = mocker.patch.object(instance, "_load") + path = "/some/path" instance.load(path=path) - load_mock.assert_called_with(path=path, - *load_args, - **load_kwargs) + load_mock.assert_called_with(path=path, *load_args, **load_kwargs) diff --git a/tests/test_df_config.py b/tests/test_df_config.py index e4d9b71..46a449e 100644 --- a/tests/test_df_config.py +++ b/tests/test_df_config.py @@ -1,47 +1,56 @@ import pytest import copy -from df_and_order.df_config import DfConfig, DF_ID_KEY, CONFIG_FILENAME, DF_INITIAL_FORMAT_KEY, METADATA_KEY, TRANSFORMS_KEY +from df_and_order.df_config import ( + DfConfig, + DF_ID_KEY, + CONFIG_FILENAME, + DF_INITIAL_FORMAT_KEY, + METADATA_KEY, + TRANSFORMS_KEY, +) from df_and_order.df_transform import DfTransformConfig from df_and_order.df_transform_step import DfTransformStepConfig @pytest.fixture() def df_id(): - return 'test_id' + return "test_id" @pytest.fixture() def test_path(): - return 'path/to/config/' + return "path/to/config/" @pytest.fixture def transformed_format(): - return 'trans_format' + return "trans_format" @pytest.fixture() def transform(transformed_format): - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='123', params={'a': 1, 'b': 'b'}) - ], - permanent_steps=[ - DfTransformStepConfig(module_path='456', params={'c': 2, 'd': 1.2}) - ]) + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[DfTransformStepConfig(module_path="123", params={"a": 1, "b": "b"})], + permanent_steps=[DfTransformStepConfig(module_path="456", params={"c": 2, "d": 1.2})], + ) return transform def test_df_id_match(mocker, df_id, test_path): valid_df_id = df_id - mocker.patch.object(DfConfig, DfConfig._read_config.__name__, lambda _, path: {DF_ID_KEY: valid_df_id}) + mocker.patch.object( + DfConfig, DfConfig._read_config.__name__, lambda _, path: {DF_ID_KEY: valid_df_id}, + ) DfConfig(df_id=df_id, dir_path=test_path) def test_df_id_mismatch(mocker, df_id, test_path): - wrong_df_id = 'wrong_id' - mocker.patch.object(DfConfig, DfConfig._read_config.__name__, lambda _, path: {DF_ID_KEY: wrong_df_id}) + wrong_df_id = "wrong_id" + mocker.patch.object( + DfConfig, DfConfig._read_config.__name__, lambda _, path: {DF_ID_KEY: wrong_df_id}, + ) with pytest.raises(Exception): DfConfig(df_id=df_id, dir_path=test_path) @@ -57,25 +66,25 @@ def test_create_config_when_existing(mocker, df_id, test_path): save_mock = mocker.patch.object(DfConfig, DfConfig._save_at_path.__name__) with pytest.raises(Exception): - DfConfig.create_config(dir_path=test_path, - df_id=df_id, - initial_df_format='') + DfConfig.create_config(dir_path=test_path, df_id=df_id, initial_df_format="") save_mock.assert_not_called() -@pytest.mark.parametrize("metadata", [{'meta': 'shmeta'}, None], ids=['meta', 'no_meta']) -@pytest.mark.parametrize("use_transform", [True, False], ids=['transform', 'no_transform']) +@pytest.mark.parametrize("metadata", [{"meta": "shmeta"}, None], ids=["meta", "no_meta"]) +@pytest.mark.parametrize("use_transform", [True, False], ids=["transform", "no_transform"]) def test_create_config(mocker, df_id, test_path, transform, metadata, use_transform): - initial_df_format = 'init_format' + initial_df_format = "init_format" save_mock = mocker.patch.object(DfConfig, DfConfig._save_at_path.__name__) - DfConfig.create_config(dir_path=test_path, - df_id=df_id, - initial_df_format=initial_df_format, - metadata=metadata, - transform=transform if use_transform else None) + DfConfig.create_config( + dir_path=test_path, + df_id=df_id, + initial_df_format=initial_df_format, + metadata=metadata, + transform=transform if use_transform else None, + ) config_dict = { DF_ID_KEY: df_id, @@ -87,24 +96,18 @@ def test_create_config(mocker, df_id, test_path, transform, metadata, use_transf if use_transform: transform_id, transform_dict = transform.to_dict() - config_dict[TRANSFORMS_KEY] = { - transform_id: transform_dict - } + config_dict[TRANSFORMS_KEY] = {transform_id: transform_dict} - save_mock.assert_called_with(config_dict=config_dict, - config_path=test_path + CONFIG_FILENAME) + save_mock.assert_called_with(config_dict=config_dict, config_path=test_path + CONFIG_FILENAME) def test_properties(mocker, df_id, test_path): - initial_df_format = 'init_format' + initial_df_format = "init_format" config_dict = { DF_ID_KEY: df_id, DF_INITIAL_FORMAT_KEY: initial_df_format, } - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=config_dict) + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=config_dict) assert config.initial_df_format == initial_df_format assert config.df_id == df_id @@ -114,44 +117,25 @@ def test_transform_by_no_transforms(mocker, df_id, test_path): config_dict = { DF_ID_KEY: df_id, } - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=config_dict) + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=config_dict) with pytest.raises(Exception): - config.transforms_by(transform_id='whatever') + config.transforms_by(transform_id="whatever") def test_transform_by_no_transform(mocker, df_id, test_path): - config_dict = { - DF_ID_KEY: df_id, - TRANSFORMS_KEY: { - 'some_transform_id': {} - } - } - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=config_dict) + config_dict = {DF_ID_KEY: df_id, TRANSFORMS_KEY: {"some_transform_id": {}}} + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=config_dict) with pytest.raises(Exception): - config.transforms_by(transform_id='whatever') + config.transforms_by(transform_id="whatever") def test_transform_by(mocker, df_id, test_path, transform): transform_id, transform_dict = transform.to_dict() - config_dict = { - DF_ID_KEY: df_id, - TRANSFORMS_KEY: { - transform_id: transform_dict - } - } - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=config_dict) + config_dict = {DF_ID_KEY: df_id, TRANSFORMS_KEY: {transform_id: transform_dict}} + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=config_dict) transform_result = config.transforms_by(transform_id=transform_id) _, transform_result_dict = transform_result.to_dict() @@ -160,63 +144,49 @@ def test_transform_by(mocker, df_id, test_path, transform): def test_register_transform_already_cached(mocker, df_id, test_path, transform, transformed_format): - initial_df_format = 'init_format' + initial_df_format = "init_format" transform_id, transform_dict = transform.to_dict() config_dict = { DF_ID_KEY: df_id, DF_INITIAL_FORMAT_KEY: initial_df_format, - TRANSFORMS_KEY: { - transform_id: transform_dict - } + TRANSFORMS_KEY: {transform_id: transform_dict}, } - transfom_filename = f'{transform_id}_{df_id}' - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=config_dict) + transfom_filename = f"{transform_id}_{df_id}" + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=config_dict) - test_path_to_cached_file = test_path + transfom_filename + f'.{transformed_format}' + test_path_to_cached_file = test_path + transfom_filename + f".{transformed_format}" is_file_exists_mock = mocker.patch.object(DfConfig, DfConfig._is_file_exists.__name__) is_file_exists_mock.return_value = True with pytest.raises(Exception): - config.register_transform(transform=transform, - filename=transfom_filename) + config.register_transform(transform=transform, filename=transfom_filename) is_file_exists_mock.assert_called_with(path=test_path_to_cached_file) def test_register_transform(mocker, df_id, test_path, transform, transformed_format): save_mock = mocker.patch.object(DfConfig, DfConfig._save_at_path.__name__) - initial_df_format = 'init_format' + initial_df_format = "init_format" transform_id, transform_dict = transform.to_dict() config_dict = { DF_ID_KEY: df_id, DF_INITIAL_FORMAT_KEY: initial_df_format, - TRANSFORMS_KEY: { - transform_id: transform_dict - } + TRANSFORMS_KEY: {transform_id: transform_dict}, } - config = _get_config(mocker=mocker, - df_id=df_id, - dir_path=test_path, - config_dict=copy.deepcopy(config_dict)) + config = _get_config(mocker=mocker, df_id=df_id, dir_path=test_path, config_dict=copy.deepcopy(config_dict),) - updated_transform = DfTransformConfig(transform_id=transform_id, - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='razdva', params={'asd': 123}) - ]) + updated_transform = DfTransformConfig( + transform_id=transform_id, + df_format=transformed_format, + in_memory_steps=[DfTransformStepConfig(module_path="razdva", params={"asd": 123})], + ) - config.register_transform(transform=updated_transform, filename='whatever') + config.register_transform(transform=updated_transform, filename="whatever") _, updated_transform_config = updated_transform.to_dict() - config_dict[TRANSFORMS_KEY] = { - transform_id: updated_transform_config - } - save_mock.assert_called_with(config_dict=config_dict, - config_path=test_path + CONFIG_FILENAME) + config_dict[TRANSFORMS_KEY] = {transform_id: updated_transform_config} + save_mock.assert_called_with(config_dict=config_dict, config_path=test_path + CONFIG_FILENAME) def _get_config(mocker, df_id, dir_path, config_dict: dict): diff --git a/tests/test_df_reader.py b/tests/test_df_reader.py index 0431d3d..398c711 100644 --- a/tests/test_df_reader.py +++ b/tests/test_df_reader.py @@ -13,26 +13,28 @@ @pytest.fixture() def df_id(): - return 'test_id' + return "test_id" @pytest.fixture() def test_path(): - return 'path/to/config/' + return "path/to/config/" @pytest.fixture def transformed_format(): - return 'trans_format' + return "trans_format" @pytest.fixture() def initial_df(): - result = pd.DataFrame({ - 'a': ['2020-02-01', '2020-02-02', '2020-02-03'], - 'b': [1, 2, 3], - 'c': ['2020-01-01', '2020-01-02', '2020-01-03'] - }) + result = pd.DataFrame( + { + "a": ["2020-02-01", "2020-02-02", "2020-02-03"], + "b": [1, 2, 3], + "c": ["2020-01-01", "2020-01-02", "2020-01-03"], + } + ) return result @@ -46,41 +48,38 @@ def build_config(mocker, df_id, test_path, initial_format): return result -@pytest.mark.parametrize("transform_id", ['trans_1', None], ids=['trans', 'no_trans']) -@pytest.mark.parametrize("valid_case", [True, False], ids=['valid', 'not_valid']) +@pytest.mark.parametrize("transform_id", ["trans_1", None], ids=["trans", "no_trans"]) +@pytest.mark.parametrize("valid_case", [True, False], ids=["valid", "not_valid"]) def test_df_exists(mocker, df_id, test_path, transform_id, valid_case, transformed_format): mocker.patch.object(DfConfig, DfConfig.config_exists.__name__, lambda dir_path: True) - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format='init_format') + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format="init_format") mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) if transform_id: - filename = f'{transform_id}_{df_id}.{transformed_format}' + filename = f"{transform_id}_{df_id}.{transformed_format}" else: - filename = f'{df_id}.{config.initial_df_format}' + filename = f"{df_id}.{config.initial_df_format}" if not valid_case: - filename += 'garbage' + filename += "garbage" - test_file_path = f'{test_path}{df_id}/{filename}' + test_file_path = f"{test_path}{df_id}/{filename}" mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: path == test_file_path) reader = DfReader(dir_path=test_path, format_to_cache_map={}) - transform = DfTransformConfig(transform_id=transform_id, - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig( - module_path='tests.drop_cols_transform.TestDropColsTransformStep', - params={'cols_to_drop': ['b']}), - ]) + transform = DfTransformConfig( + transform_id=transform_id, + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.drop_cols_transform.TestDropColsTransformStep", params={"cols_to_drop": ["b"]}, + ), + ], + ) if transform_id: # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) - reader.register_transform(df_id=df_id, - df_config=config, - transform=transform) + reader.register_transform(df_id=df_id, df_config=config, transform=transform) if valid_case: assert reader.df_exists(df_id=df_id, transform_id=transform_id) else: @@ -96,65 +95,85 @@ def test_create_df_config(mocker, df_id, test_path): config_create_mock = mocker.patch.object(DfConfig, DfConfig.create_config.__name__) - reader.create_df_config(df_id=df_id, - initial_df_format=initial_format, - metadata=metadata, - transform=transform) + reader.create_df_config( + df_id=df_id, initial_df_format=initial_format, metadata=metadata, transform=transform, + ) dir_path = _df_dir_path(dir_path=test_path, df_id=df_id) - config_create_mock.assert_called_with(dir_path=dir_path, - df_id=df_id, - initial_df_format=initial_format, - metadata=metadata, - transform=transform) + config_create_mock.assert_called_with( + dir_path=dir_path, df_id=df_id, initial_df_format=initial_format, metadata=metadata, transform=transform, + ) def test_register_transform(mocker, df_id, test_path, transformed_format): - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format='init_format') + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format="init_format") mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) register_transform_mock = mocker.patch.object(DfConfig, DfConfig.register_transform.__name__) transform = mocker.Mock() - transform.transform_id = 'trans_id' + transform.transform_id = "trans_id" transform.df_format = transformed_format - filename = f'{transform.transform_id}_{df_id}.{transformed_format}' + filename = f"{transform.transform_id}_{df_id}.{transformed_format}" reader = DfReader(dir_path=test_path, format_to_cache_map={}) - reader.register_transform(df_id=df_id, - df_config=config, - transform=transform) + reader.register_transform(df_id=df_id, df_config=config, transform=transform) + + register_transform_mock.assert_called_with(transform=transform, filename=filename) + + +@pytest.mark.parametrize("existing_transform", [True, False], ids=["existing_transform", "not_existing_transform"]) +def test_register_transform_after_read(mocker, df_id, test_path, initial_df, transformed_format, existing_transform): + initial_df_format = "init_format" + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=initial_df_format) + mocker.patch.object(DfConfig, DfConfig.config_exists.__name__, lambda dir_path: True) + mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) + mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) + + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + permanent_steps=[ + DfTransformStepConfig( + module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": ["a"]}, + ), + ], + ) - register_transform_mock.assert_called_with(transform=transform, - filename=filename) + # do nothing when it comes to save the config + mocker.patch.object(DfConfig, DfConfig._save.__name__) + + if existing_transform: + config.register_transform(transform=transform, filename="test") + + init_cache = TestDfCache() + transform_cache = TestDfCache() + reader = DfReader( + dir_path=test_path, format_to_cache_map={initial_df_format: init_cache, transformed_format: transform_cache}, + ) + # ignore states for this test + mocker.patch.object(reader, reader._save_transforms_state_file.__name__) + mocker.patch.object(reader, reader._try_to_read_cached_transform.__name__) + + reader.read(df_id=df_id, transform=transform) def test_read_both_transform_id_and_transform(mocker, df_id, test_path): reader = DfReader(dir_path=test_path, format_to_cache_map={}) with pytest.raises(Exception): - reader.read(df_id=df_id, - transform_id='', - transform=mocker.Mock()) + reader.read(df_id=df_id, transform_id="", transform=mocker.Mock()) def test_read_initial(mocker, df_id, test_path, initial_df): - init_format = 'init_format' + init_format = "init_format" init_cache = TestDfCache() load_mock = mocker.patch.object(init_cache, init_cache.load.__name__) load_mock.return_value = initial_df - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache - }) - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + reader = DfReader(dir_path=test_path, format_to_cache_map={init_format: init_cache}) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) mocker.patch.object(reader, reader._save_transforms_state_file.__name__) mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) df_dir_path = _df_dir_path(dir_path=test_path, df_id=df_id) - df_path = f'{df_dir_path}/{df_id}.{config.initial_df_format}' + df_path = f"{df_dir_path}/{df_id}.{config.initial_df_format}" test_initial_df = reader.read(df_id=df_id) @@ -163,8 +182,8 @@ def test_read_initial(mocker, df_id, test_path, initial_df): assert initial_df.equals(test_initial_df) -@pytest.mark.parametrize("permanent", [True, False], ids=['permanent', 'not_permanent']) -@pytest.mark.parametrize("from_scratch", [True, False], ids=['from_scratch', 'not_from_scratch']) +@pytest.mark.parametrize("permanent", [True, False], ids=["permanent", "not_permanent"]) +@pytest.mark.parametrize("from_scratch", [True, False], ids=["from_scratch", "not_from_scratch"]) def test_read_transformed(mocker, df_id, test_path, permanent, from_scratch, initial_df): # setup df cache init_cache = TestDfCache() @@ -174,43 +193,44 @@ def test_read_transformed(mocker, df_id, test_path, permanent, from_scratch, ini init_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - transformed_format = 'trans_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) + init_format = "init_format" + transformed_format = "trans_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) # setup transform permanent_steps = [] if permanent: permanent_steps = [ - DfTransformStepConfig(module_path='tests.drop_cols_transform.TestDropColsTransformStep', - params={'cols_to_drop': ['b']}), + DfTransformStepConfig( + module_path="tests.drop_cols_transform.TestDropColsTransformStep", params={"cols_to_drop": ["b"]}, + ), ] - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='tests.zero_transform.TestZeroTransformStep', - params={'zero_cols': ['a']}), - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ], - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": ["a"]}, + ), + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ), + ], + permanent_steps=permanent_steps, + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) # ignore states for this test mocker.patch.object(reader, reader._save_transforms_state_file.__name__) - mocker.patch.object(reader, reader._try_validate_transform.__name__) + mocker.patch.object(reader, reader._try_check_transform.__name__) # reader.register_transform(df_id=df_id, df_config=config, transform=transform) # override the config getter @@ -237,86 +257,75 @@ def test_read_transformed(mocker, df_id, test_path, permanent, from_scratch, ini # checks if from_scratch: - init_df_path = f'{df_dir_path}/{df_id}.{config.initial_df_format}' + init_df_path = f"{df_dir_path}/{df_id}.{config.initial_df_format}" init_load_mock.assert_called_with(path=init_df_path) if permanent: - test_columns = {'a', 'c'} + test_columns = {"a", "c"} else: - test_columns = {'a', 'b', 'c'} + test_columns = {"a", "b", "c"} assert set(transformed_df.columns) == test_columns - assert transformed_df['a'].unique()[0] == 0 - assert transformed_df.select_dtypes(include=[np.datetime64]).columns[0] == 'c' + assert transformed_df["a"].unique()[0] == 0 + assert transformed_df.select_dtypes(include=[np.datetime64]).columns[0] == "c" if from_scratch and permanent: - df_path = f'{df_dir_path}/{transform.transform_id}_{df_id}.{transformed_format}' - trans_save_mock.assert_called_with(df=transformed_df, - path=df_path) + df_path = f"{df_dir_path}/{transform.transform_id}_{df_id}.{transformed_format}" + trans_save_mock.assert_called_with(df=transformed_df, path=df_path) -@pytest.mark.parametrize("is_df_outdated", [True, False], ids=['outdated', 'not_outdated']) -@pytest.mark.parametrize("forced", [True, False], ids=['forced', 'not_forced']) +@pytest.mark.parametrize("is_df_outdated", [True, False], ids=["outdated", "not_outdated"]) +@pytest.mark.parametrize("forced", [True, False], ids=["forced", "not_forced"]) def test_read_transformed_check_ts(mocker, df_id, test_path, initial_df, is_df_outdated, forced): # if not from scratch we need to stub cached df - zero_module_path = 'tests.zero_transform.TestZeroTransformStep' + zero_module_path = "tests.zero_transform.TestZeroTransformStep" zero_last_modified_ts = 1.0 - drop_module_path = 'tests.drop_cols_transform.TestDropColsTransformStep' + drop_module_path = "tests.drop_cols_transform.TestDropColsTransformStep" drop_last_modified_ts = 2.0 if is_df_outdated: df_last_modified_ts = min(zero_last_modified_ts, drop_last_modified_ts) - 1.0 else: df_last_modified_ts = max(zero_last_modified_ts, drop_last_modified_ts) + 1.0 - mocker.patch.object(DfReader, - DfReader.df_exists.__name__, - lambda _, df_id, transform_id: True) - mocker.patch.object(DfReader, - DfReader._df_last_modified_ts.__name__, - lambda _, df_id, transform_id: df_last_modified_ts) - - transformed_format = 'transformed_format' - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format='init_format') + mocker.patch.object(DfReader, DfReader.df_exists.__name__, lambda _, df_id, transform_id: True) + mocker.patch.object( + DfReader, DfReader._df_last_modified_ts.__name__, lambda _, df_id, transform_id: df_last_modified_ts, + ) + + transformed_format = "transformed_format" + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format="init_format") # do nothing when it comes to save config mocker.patch.object(DfConfig, DfConfig._save.__name__) # override config getter mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) permanent_steps = [ - DfTransformStepConfig(module_path=zero_module_path, - params={'zero_cols': ['a']}), - DfTransformStepConfig(module_path=drop_module_path, - params={'cols_to_drop': ['b']}), + DfTransformStepConfig(module_path=zero_module_path, params={"zero_cols": ["a"]}), + DfTransformStepConfig(module_path=drop_module_path, params={"cols_to_drop": ["b"]}), ] - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[], - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id="transform_id", df_format=transformed_format, in_memory_steps=[], permanent_steps=permanent_steps, + ) trans_cache = TestDfCache() trans_load_mock = mocker.patch.object(trans_cache, trans_cache.load.__name__) trans_df = mocker.Mock() trans_load_mock.return_value = trans_df - reader = DfReader(dir_path=test_path, format_to_cache_map={ - transformed_format: trans_cache - }) + reader = DfReader(dir_path=test_path, format_to_cache_map={transformed_format: trans_cache}) # ignore states for this test mocker.patch.object(reader, reader._save_transforms_state_file.__name__) - mocker.patch.object(reader, reader._try_validate_transform.__name__) + mocker.patch.object(reader, reader._try_check_transform.__name__) # reader.register_transform(df_id=df_id, df_config=config, transform=transform) def last_modified_date(file_path: str): - if 'zero_transform' in file_path: + if "zero_transform" in file_path: return zero_last_modified_ts - elif 'drop_cols_transform' in file_path: + elif "drop_cols_transform" in file_path: return drop_last_modified_ts - else: - raise ValueError('???') + + raise ValueError("???") mocker.patch.object(FileInspector, FileInspector.last_modified_date.__name__, last_modified_date) @@ -336,31 +345,23 @@ def test_in_memory_shape_when_permanent_is_present(mocker, df_id, test_path, ini init_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) # setup transforms - drop_cols_module_path = 'tests.drop_cols_transform.TestDropColsTransformStep' - dates_module_path = 'tests.dates_transform.TestDatesTransformStep' - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path=drop_cols_module_path, - params={'cols_to_drop': ['a']}), - ], - permanent_steps=[ - DfTransformStepConfig(module_path=dates_module_path, - params={'dates_cols': ['c']}) - ]) + drop_cols_module_path = "tests.drop_cols_transform.TestDropColsTransformStep" + dates_module_path = "tests.dates_transform.TestDatesTransformStep" + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[DfTransformStepConfig(module_path=drop_cols_module_path, params={"cols_to_drop": ["a"]}),], + permanent_steps=[DfTransformStepConfig(module_path=dates_module_path, params={"dates_cols": ["c"]})], + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) # override the config getter @@ -378,33 +379,30 @@ def test_source_id(mocker, df_id, test_path, initial_df, transformed_format): init_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) # setup transforms - drop_cols_module_path = 'tests.drop_cols_transform.TestDropColsTransformStep' - dates_module_path = 'tests.dates_transform.TestDatesTransformStep' - zero_module_path = 'tests.zero_transform.TestZeroTransformStep' - source_transform_id = 'source_transform_id' - source_transform = DfTransformConfig(transform_id=source_transform_id, - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path=dates_module_path, - params={'dates_cols': ['c']}) - ]) - transform = DfTransformConfig(transform_id='test_transform_id', - df_format=transformed_format, - source_id=source_transform_id, - source_in_memory_steps=[ - DfTransformStepConfig(module_path=drop_cols_module_path, - params={'cols_to_drop': ['a']}) - ], in_memory_steps=[ - DfTransformStepConfig(module_path=zero_module_path, - params={'zero_cols': []}) - ]) + drop_cols_module_path = "tests.drop_cols_transform.TestDropColsTransformStep" + dates_module_path = "tests.dates_transform.TestDatesTransformStep" + zero_module_path = "tests.zero_transform.TestZeroTransformStep" + source_transform_id = "source_transform_id" + source_transform = DfTransformConfig( + transform_id=source_transform_id, + df_format=transformed_format, + in_memory_steps=[DfTransformStepConfig(module_path=dates_module_path, params={"dates_cols": ["c"]})], + ) + transform = DfTransformConfig( + transform_id="test_transform_id", + df_format=transformed_format, + source_id=source_transform_id, + source_in_memory_steps=[ + DfTransformStepConfig(module_path=drop_cols_module_path, params={"cols_to_drop": ["a"]}) + ], + in_memory_steps=[DfTransformStepConfig(module_path=zero_module_path, params={"zero_cols": []})], + ) transformed_df = initial_df.copy() for step_config in source_transform.in_memory_steps: @@ -415,26 +413,23 @@ def test_source_id(mocker, df_id, test_path, initial_df, transformed_format): trans_load_mock.return_value = transformed_df # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) # override the config getter mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) # ignore states for this test mocker.patch.object(reader, reader._save_transforms_state_file.__name__) - mocker.patch.object(reader, reader._try_validate_transform.__name__) + mocker.patch.object(reader, reader._try_check_transform.__name__) # reader.register_transform(df_id=df_id, df_config=config, transform=source_transform) transformed_df = reader.read(df_id=df_id, transform=transform) - assert transformed_df.select_dtypes(include=[np.datetime64]).columns[0] == 'c' - assert 'a' not in set(transformed_df.columns) + assert transformed_df.select_dtypes(include=[np.datetime64]).columns[0] == "c" + assert "a" not in set(transformed_df.columns) -@pytest.mark.parametrize("valid_state", [True, False], ids=['valid_state', 'not_valid_state']) +@pytest.mark.parametrize("valid_state", [True, False], ids=["valid_state", "not_valid_state"]) def test_permanent_transforms_state(mocker, df_id, test_path, valid_state, initial_df, transformed_format): # setup df cache init_cache = TestDfCache() @@ -443,35 +438,36 @@ def test_permanent_transforms_state(mocker, df_id, test_path, valid_state, initi init_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) # setup transform permanent_steps = [ - DfTransformStepConfig(module_path='tests.drop_cols_transform.TestDropColsTransformStep', - params={'cols_to_drop': ['b']}), + DfTransformStepConfig( + module_path="tests.drop_cols_transform.TestDropColsTransformStep", params={"cols_to_drop": ["b"]}, + ), ] - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='tests.zero_transform.TestZeroTransformStep', - params={'zero_cols': ['a']}), - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ], - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": ["a"]}, + ), + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ), + ], + permanent_steps=permanent_steps, + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) mocker.patch.object(reader, DfReader._save_transforms_state_file.__name__) @@ -479,9 +475,7 @@ def test_permanent_transforms_state(mocker, df_id, test_path, valid_state, initi if not valid_state: del transform_dict[TRANSFORM_IN_MEMORY_KEY] - transform_state = { - transform.transform_id: transform_dict - } + transform_state = {transform.transform_id: transform_dict} mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) mocker.patch.object(reader, DfReader._transforms_state_dicts.__name__, lambda df_id: transform_state) reader.register_transform(df_id=df_id, df_config=config, transform=transform) @@ -504,8 +498,11 @@ def test_permanent_transforms_state(mocker, df_id, test_path, valid_state, initi assert transformed_df.equals(test_transformed_df) -@pytest.mark.parametrize("valid_state", [True, False], ids=['valid_state', 'not_valid_state']) -def test_permanent_source_transforms_state(mocker, df_id, test_path, valid_state, initial_df, transformed_format): +@pytest.mark.parametrize("valid_state", [True, False], ids=["valid_state", "not_valid_state"]) +@pytest.mark.parametrize("forced", [True, False], ids=["forced", "not_forced"]) +def test_permanent_source_transforms_state( + mocker, df_id, test_path, valid_state, initial_df, transformed_format, forced +): # setup df cache init_cache = TestDfCache() transform_cache = TestDfCache() @@ -513,48 +510,49 @@ def test_permanent_source_transforms_state(mocker, df_id, test_path, valid_state init_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) # setup transform source_permanent_steps = [ - DfTransformStepConfig(module_path='tests.zero_transform.TestZeroTransformStep', - params={'zero_cols': ['a']}) + DfTransformStepConfig(module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": ["a"]},) ] permanent_steps = [ - DfTransformStepConfig(module_path='tests.drop_cols_transform.TestDropColsTransformStep', - params={'cols_to_drop': ['b']}), + DfTransformStepConfig( + module_path="tests.drop_cols_transform.TestDropColsTransformStep", params={"cols_to_drop": ["b"]}, + ), ] - source_transform_id = 'source_transform_id' - source_transform = DfTransformConfig(transform_id=source_transform_id, - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig( - module_path='tests.zero_transform.TestZeroTransformStep', - params={'cols_to_drop': ['b']}), - ], - permanent_steps=source_permanent_steps) - - transform = DfTransformConfig(transform_id='transform_id', - source_id=source_transform_id, - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ], - permanent_steps=permanent_steps) + source_transform_id = "source_transform_id" + source_transform = DfTransformConfig( + transform_id=source_transform_id, + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.zero_transform.TestZeroTransformStep", params={"cols_to_drop": ["b"]}, + ), + ], + permanent_steps=source_permanent_steps, + ) + + transform = DfTransformConfig( + transform_id="transform_id", + source_id=source_transform_id, + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + permanent_steps=permanent_steps, + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) mocker.patch.object(reader, DfReader._save_transforms_state_file.__name__) @@ -565,7 +563,7 @@ def test_permanent_source_transforms_state(mocker, df_id, test_path, valid_state transform_state = { transform.transform_id: transform_dict, - source_transform.transform_id: source_transform_dict + source_transform.transform_id: source_transform_dict, } transform_dict[TRANSFORM_STATE_SOURCE_KEY] = source_transform_dict mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) @@ -583,15 +581,19 @@ def test_permanent_source_transforms_state(mocker, df_id, test_path, valid_state trans_load_mock = mocker.patch.object(transform_cache, transform_cache.load.__name__) trans_load_mock.return_value = test_transformed_df - if not valid_state: - with pytest.raises(Exception): - reader.read(df_id=df_id, transform=transform) - else: - transformed_df = reader.read(df_id=df_id, transform=transform) + if valid_state: + transformed_df = reader.read(df_id=df_id, transform=transform, forced=forced) assert transformed_df.equals(test_transformed_df) + else: + if forced: + transformed_df = reader.read(df_id=df_id, transform=transform, forced=forced) + assert transformed_df.equals(test_transformed_df) + else: + with pytest.raises(Exception): + reader.read(df_id=df_id, transform=transform, forced=forced) -@pytest.mark.parametrize("df_exists", [True, False], ids=['df_exists', 'not_df_exists']) +@pytest.mark.parametrize("df_exists", [True, False], ids=["df_exists", "not_df_exists"]) def test_transforms_state_cleanup(mocker, df_id, test_path, df_exists, initial_df, transformed_format): # setup df cache init_cache = TestDfCache() @@ -602,31 +604,42 @@ def test_transforms_state_cleanup(mocker, df_id, test_path, df_exists, initial_d transform_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) - - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ]) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) + + before_transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + permanent_steps=[ + DfTransformStepConfig(module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": "a"}) + ], + ) + + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) mocker.patch.object(reader, DfReader._save_transforms_state_file.__name__) - transform_dict = transform.to_dict()[1] transform_state = { - transform.transform_id: transform_dict, + transform.transform_id: before_transform.to_dict()[1], } mocker.patch.object(reader, DfReader._transforms_state_dicts.__name__, lambda df_id: transform_state) reader.register_transform(df_id=df_id, df_config=config, transform=transform) @@ -635,7 +648,9 @@ def test_transforms_state_cleanup(mocker, df_id, test_path, df_exists, initial_d if df_exists: mocker.patch.object(DfReader, DfReader.df_exists.__name__, lambda _, df_id, transform_id: True) - reader.read(df_id=df_id, transform=transform) + mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) + with pytest.raises(Exception): + reader.read(df_id=df_id, transform=transform) else: reader.read(df_id=df_id, transform=transform) assert len(transform_state) == 0 @@ -651,34 +666,36 @@ def test_transforms_state_save(mocker, df_id, test_path, initial_df, transformed transform_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) - - source_transform_id = 'source_transform_id' - source_transform = DfTransformConfig(transform_id=source_transform_id, - df_format=transformed_format, - permanent_steps=[ - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ]) - - transform = DfTransformConfig(transform_id='transform_id', - source_id=source_transform_id, - df_format=transformed_format, - permanent_steps=[ - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ]) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) + + source_transform_id = "source_transform_id" + source_transform = DfTransformConfig( + transform_id=source_transform_id, + df_format=transformed_format, + permanent_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + ) + + transform = DfTransformConfig( + transform_id="transform_id", + source_id=source_transform_id, + df_format=transformed_format, + permanent_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) save_mock = mocker.patch.object(reader, DfReader._save_transforms_state_file.__name__) @@ -698,7 +715,7 @@ def test_transforms_state_save(mocker, df_id, test_path, initial_df, transformed save_mock.assert_called_with(df_id=df_id, transforms_state=test_transform_state) -@pytest.mark.parametrize("df_exists", [True, False], ids=['df_exists', 'not_df_exists']) +@pytest.mark.parametrize("df_exists", [True, False], ids=["df_exists", "not_df_exists"]) def test_transforms_state_no_file(mocker, df_id, test_path, df_exists, initial_df, transformed_format): # setup df cache init_cache = TestDfCache() @@ -709,31 +726,42 @@ def test_transforms_state_no_file(mocker, df_id, test_path, df_exists, initial_d transform_load_mock.return_value = initial_df # setup reader - init_format = 'init_format' - reader = DfReader(dir_path=test_path, format_to_cache_map={ - init_format: init_cache, - transformed_format: transform_cache - }) - - transform = DfTransformConfig(transform_id='transform_id', - df_format=transformed_format, - in_memory_steps=[ - DfTransformStepConfig(module_path='tests.dates_transform.TestDatesTransformStep', - params={'dates_cols': ['c']}) - ]) + init_format = "init_format" + reader = DfReader( + dir_path=test_path, format_to_cache_map={init_format: init_cache, transformed_format: transform_cache,}, + ) + + before_transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + permanent_steps=[ + DfTransformStepConfig(module_path="tests.zero_transform.TestZeroTransformStep", params={"zero_cols": "a"}) + ], + ) + + transform = DfTransformConfig( + transform_id="transform_id", + df_format=transformed_format, + in_memory_steps=[ + DfTransformStepConfig( + module_path="tests.dates_transform.TestDatesTransformStep", params={"dates_cols": ["c"]}, + ) + ], + ) # setup config - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format=init_format) + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format=init_format) # do nothing when it comes to save the config mocker.patch.object(DfConfig, DfConfig._save.__name__) mocker.patch.object(reader, DfReader._save_transforms_state_file.__name__) - transform_dict = transform.to_dict()[1] transform_state = { - transform.transform_id: transform_dict, + transform.transform_id: before_transform.to_dict()[1], } mocker.patch.object(reader, DfReader._transforms_state_dicts.__name__, lambda df_id: transform_state) reader.register_transform(df_id=df_id, df_config=config, transform=transform) @@ -742,17 +770,20 @@ def test_transforms_state_no_file(mocker, df_id, test_path, df_exists, initial_d if df_exists: mocker.patch.object(DfReader, DfReader.df_exists.__name__, lambda _, df_id, transform_id: True) - reader.read(df_id=df_id, transform=transform) + mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) + with pytest.raises(Exception): + reader.read(df_id=df_id, transform=transform) else: reader.read(df_id=df_id, transform=transform) assert len(transform_state) == 0 -@pytest.mark.parametrize("forced", [True, False], ids=['forced', 'not_forced']) -@pytest.mark.parametrize("is_df_outdated", [True, False], ids=['outdated', 'not_outdated']) + +@pytest.mark.parametrize("forced", [True, False], ids=["forced", "not_forced"]) +@pytest.mark.parametrize("is_df_outdated", [True, False], ids=["outdated", "not_outdated"]) def test_transform_state_source_check_ts(mocker, df_id, test_path, initial_df, is_df_outdated, forced): - zero_module_path = 'tests.zero_transform.TestZeroTransformStep' + zero_module_path = "tests.zero_transform.TestZeroTransformStep" zero_last_modified_ts = 1.0 - drop_module_path = 'tests.drop_cols_transform.TestDropColsTransformStep' + drop_module_path = "tests.drop_cols_transform.TestDropColsTransformStep" drop_last_modified_ts = 2.0 valid_ts = max(zero_last_modified_ts, drop_last_modified_ts) + 1.0 if is_df_outdated: @@ -760,56 +791,53 @@ def test_transform_state_source_check_ts(mocker, df_id, test_path, initial_df, i else: df_last_modified_ts = valid_ts - transformed_format = 'transformed_format' - config = build_config(mocker=mocker, - df_id=df_id, - test_path=test_path, - initial_format='init_format') + transformed_format = "transformed_format" + config = build_config(mocker=mocker, df_id=df_id, test_path=test_path, initial_format="init_format") # do nothing when it comes to save config mocker.patch.object(DfConfig, DfConfig._save.__name__) # override config getter mocker.patch.object(DfReader, DfReader._get_config.__name__, lambda _, df_id: config) source_permanent_steps = [ - DfTransformStepConfig(module_path=zero_module_path, - params={'zero_cols': ['a']}), + DfTransformStepConfig(module_path=zero_module_path, params={"zero_cols": ["a"]}), ] permanent_steps = [ - DfTransformStepConfig(module_path=drop_module_path, - params={'cols_to_drop': ['b']}), + DfTransformStepConfig(module_path=drop_module_path, params={"cols_to_drop": ["b"]}), ] - source_id = 'source_transform_id' - source_transform = DfTransformConfig(transform_id=source_id, - df_format=transformed_format, - in_memory_steps=[], - permanent_steps=source_permanent_steps) - - transform = DfTransformConfig(transform_id='transform_id', - source_id=source_id, - df_format=transformed_format, - in_memory_steps=[], - permanent_steps=permanent_steps) - - mocker.patch.object(DfReader, - DfReader.df_exists.__name__, - lambda _, df_id, transform_id: True) - mocker.patch.object(DfReader, - DfReader._df_last_modified_ts.__name__, - lambda _, df_id, transform_id: df_last_modified_ts if transform_id == source_id else valid_ts) + source_id = "source_transform_id" + source_transform = DfTransformConfig( + transform_id=source_id, + df_format=transformed_format, + in_memory_steps=[], + permanent_steps=source_permanent_steps, + ) + + transform = DfTransformConfig( + transform_id="transform_id", + source_id=source_id, + df_format=transformed_format, + in_memory_steps=[], + permanent_steps=permanent_steps, + ) + + mocker.patch.object(DfReader, DfReader.df_exists.__name__, lambda _, df_id, transform_id: True) + mocker.patch.object( + DfReader, + DfReader._df_last_modified_ts.__name__, + lambda _, df_id, transform_id: df_last_modified_ts if transform_id == source_id else valid_ts, + ) trans_cache = TestDfCache() trans_load_mock = mocker.patch.object(trans_cache, trans_cache.load.__name__) trans_df = mocker.Mock() trans_load_mock.return_value = trans_df - reader = DfReader(dir_path=test_path, format_to_cache_map={ - transformed_format: trans_cache - }) + reader = DfReader(dir_path=test_path, format_to_cache_map={transformed_format: trans_cache}) transform_dict = transform.to_dict()[1] source_transform_dict = source_transform.to_dict()[1] transform_state = { transform.transform_id: transform_dict, - source_transform.transform_id: source_transform_dict + source_transform.transform_id: source_transform_dict, } transform_dict[TRANSFORM_STATE_SOURCE_KEY] = source_transform_dict mocker.patch.object(DfReader, DfReader._is_file_exists.__name__, lambda path: True) @@ -821,12 +849,12 @@ def test_transform_state_source_check_ts(mocker, df_id, test_path, initial_df, i reader.register_transform(df_id=df_id, df_config=config, transform=transform) def last_modified_date(file_path: str): - if 'zero_transform' in file_path: + if "zero_transform" in file_path: return zero_last_modified_ts - elif 'drop_cols_transform' in file_path: + elif "drop_cols_transform" in file_path: return drop_last_modified_ts - else: - raise ValueError('???') + + raise ValueError("???") mocker.patch.object(FileInspector, FileInspector.last_modified_date.__name__, last_modified_date) @@ -839,4 +867,4 @@ def last_modified_date(file_path: str): def _df_dir_path(dir_path: str, df_id: str) -> str: - return f'{dir_path}{df_id}' + return f"{dir_path}{df_id}" diff --git a/tests/test_df_transform.py b/tests/test_df_transform.py index e5a69d2..87a6ca4 100644 --- a/tests/test_df_transform.py +++ b/tests/test_df_transform.py @@ -1,25 +1,36 @@ import pytest -from df_and_order.df_transform import DfTransformConfig, TRANSFORM_IN_MEMORY_KEY, TRANSFORM_PERMANENT_KEY, \ - TRANSFORM_SOURCE_IN_MEMORY_KEY, TRANSFORM_SOURCE_ID_KEY, TRANSFORM_DF_FORMAT_KEY -from df_and_order.df_transform_step import DfTransformStepConfig, TRANSFORM_STEP_MODULE_PATH_KEY, \ - TRANSFORM_STEP_PARAMS_KEY +from df_and_order.df_transform import ( + DfTransformConfig, + TRANSFORM_IN_MEMORY_KEY, + TRANSFORM_PERMANENT_KEY, + TRANSFORM_SOURCE_IN_MEMORY_KEY, + TRANSFORM_SOURCE_ID_KEY, + TRANSFORM_DF_FORMAT_KEY, +) +from df_and_order.df_transform_step import ( + DfTransformStepConfig, + TRANSFORM_STEP_MODULE_PATH_KEY, + TRANSFORM_STEP_PARAMS_KEY, +) def test_properties(mocker): - transform_id = 'trans_id' - df_format = 'format' - source_id = 'source_id' + transform_id = "trans_id" + df_format = "format" + source_id = "source_id" source_in_memory_steps = mocker.Mock() in_memory_steps = mocker.Mock() permanent_steps = mocker.Mock() - transform = DfTransformConfig(transform_id=transform_id, - source_id=source_id, - df_format=df_format, - source_in_memory_steps=source_in_memory_steps, - in_memory_steps=in_memory_steps, - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id=transform_id, + source_id=source_id, + df_format=df_format, + source_in_memory_steps=source_in_memory_steps, + in_memory_steps=in_memory_steps, + permanent_steps=permanent_steps, + ) assert transform.transform_id == transform_id assert transform.df_format == df_format @@ -30,94 +41,88 @@ def test_properties(mocker): def test_eq(mocker): - transform_id = 'trans_id' - df_format = 'format' - source_id = 'source_id' + transform_id = "trans_id" + df_format = "format" + source_id = "source_id" source_in_memory_steps = mocker.Mock() in_memory_steps = mocker.Mock() permanent_steps = mocker.Mock() - transform = DfTransformConfig(transform_id=transform_id, - source_id=source_id, - df_format=df_format, - source_in_memory_steps=source_in_memory_steps, - in_memory_steps=in_memory_steps, - permanent_steps=permanent_steps) - - another_transform = DfTransformConfig(transform_id=transform_id, - source_id=source_id, - df_format=df_format, - source_in_memory_steps=source_in_memory_steps, - in_memory_steps=in_memory_steps, - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id=transform_id, + source_id=source_id, + df_format=df_format, + source_in_memory_steps=source_in_memory_steps, + in_memory_steps=in_memory_steps, + permanent_steps=permanent_steps, + ) + + another_transform = DfTransformConfig( + transform_id=transform_id, + source_id=source_id, + df_format=df_format, + source_in_memory_steps=source_in_memory_steps, + in_memory_steps=in_memory_steps, + permanent_steps=permanent_steps, + ) assert transform == another_transform - another_transform._transform_id = 'bad' + another_transform._transform_id = "bad" assert transform != another_transform -@pytest.mark.parametrize("use_source_id", [True, False], ids=['source_id', 'no_source_id']) + +@pytest.mark.parametrize("use_source_id", [True, False], ids=["source_id", "no_source_id"]) def test_to_dict(use_source_id): - transform_id = 'trans_id' - df_format = 'df_format' - source_id = 'test_source_id' if use_source_id else None - source_in_memory_steps = [ - DfTransformStepConfig(module_path='source_inmem1', params={'_a': 1}), - DfTransformStepConfig(module_path='source_inmem2', params={'_b': 2}), - ] if use_source_id else None + transform_id = "trans_id" + df_format = "df_format" + source_id = "test_source_id" if use_source_id else None + source_in_memory_steps = ( + [ + DfTransformStepConfig(module_path="source_inmem1", params={"_a": 1}), + DfTransformStepConfig(module_path="source_inmem2", params={"_b": 2}), + ] + if use_source_id + else None + ) in_memory_steps = [ - DfTransformStepConfig(module_path='inmem1', params={'a': 1}), - DfTransformStepConfig(module_path='inmem2', params={'b': 2}), + DfTransformStepConfig(module_path="inmem1", params={"a": 1}), + DfTransformStepConfig(module_path="inmem2", params={"b": 2}), ] permanent_steps = [ - DfTransformStepConfig(module_path='perm3', params={}), - DfTransformStepConfig(module_path='perm4', params={'c': 3}), + DfTransformStepConfig(module_path="perm3", params={}), + DfTransformStepConfig(module_path="perm4", params={"c": 3}), ] - transform = DfTransformConfig(transform_id=transform_id, - df_format=df_format, - source_id=source_id, - source_in_memory_steps=source_in_memory_steps, - in_memory_steps=in_memory_steps, - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id=transform_id, + df_format=df_format, + source_id=source_id, + source_in_memory_steps=source_in_memory_steps, + in_memory_steps=in_memory_steps, + permanent_steps=permanent_steps, + ) res_transform_id, res_transform_dict = transform.to_dict() res_source_id = res_transform_dict.get(TRANSFORM_SOURCE_ID_KEY) test_transform_dict = { TRANSFORM_DF_FORMAT_KEY: df_format, TRANSFORM_IN_MEMORY_KEY: [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem1', - TRANSFORM_STEP_PARAMS_KEY: {'a': 1} - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem2', - TRANSFORM_STEP_PARAMS_KEY: {'b': 2} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem1", TRANSFORM_STEP_PARAMS_KEY: {"a": 1},}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem2", TRANSFORM_STEP_PARAMS_KEY: {"b": 2},}, ], TRANSFORM_PERMANENT_KEY: [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm3', - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm4', - TRANSFORM_STEP_PARAMS_KEY: {'c': 3} - } - ] + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm3",}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm4", TRANSFORM_STEP_PARAMS_KEY: {"c": 3},}, + ], } if use_source_id: test_transform_dict[TRANSFORM_SOURCE_ID_KEY] = source_id test_transform_dict[TRANSFORM_SOURCE_IN_MEMORY_KEY] = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'source_inmem1', - TRANSFORM_STEP_PARAMS_KEY: {'_a': 1} - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'source_inmem2', - TRANSFORM_STEP_PARAMS_KEY: {'_b': 2} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "source_inmem1", TRANSFORM_STEP_PARAMS_KEY: {"_a": 1},}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "source_inmem2", TRANSFORM_STEP_PARAMS_KEY: {"_b": 2},}, ] assert res_transform_id == transform_id @@ -125,52 +130,38 @@ def test_to_dict(use_source_id): assert res_transform_dict == test_transform_dict -@pytest.mark.parametrize("use_source_id", [True, False], ids=['source_id', 'no_source_id']) +@pytest.mark.parametrize("use_source_id", [True, False], ids=["source_id", "no_source_id"]) def test_from_dict(use_source_id): - source_id = 'test_source_id' if use_source_id else None - df_format = 'format' - source_in_memory_steps = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'source_inmem1', - TRANSFORM_STEP_PARAMS_KEY: {'_a': 1} - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'source_inmem2', - TRANSFORM_STEP_PARAMS_KEY: {'_b': 2} - } - ] if use_source_id else None + source_id = "test_source_id" if use_source_id else None + df_format = "format" + source_in_memory_steps = ( + [ + {TRANSFORM_STEP_MODULE_PATH_KEY: "source_inmem1", TRANSFORM_STEP_PARAMS_KEY: {"_a": 1},}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "source_inmem2", TRANSFORM_STEP_PARAMS_KEY: {"_b": 2},}, + ] + if use_source_id + else None + ) in_memory_steps = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem1', - TRANSFORM_STEP_PARAMS_KEY: {'a': 1} - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem2', - TRANSFORM_STEP_PARAMS_KEY: {'b': 2} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem1", TRANSFORM_STEP_PARAMS_KEY: {"a": 1}}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem2", TRANSFORM_STEP_PARAMS_KEY: {"b": 2}}, ] permanent_steps = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm3', - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm4', - TRANSFORM_STEP_PARAMS_KEY: {'c': 3} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm3",}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm4", TRANSFORM_STEP_PARAMS_KEY: {"c": 3}}, ] transform_dict = { TRANSFORM_DF_FORMAT_KEY: df_format, TRANSFORM_IN_MEMORY_KEY: in_memory_steps, - TRANSFORM_PERMANENT_KEY: permanent_steps + TRANSFORM_PERMANENT_KEY: permanent_steps, } if use_source_id: transform_dict[TRANSFORM_SOURCE_ID_KEY] = source_id transform_dict[TRANSFORM_SOURCE_IN_MEMORY_KEY] = source_in_memory_steps - transform_id = 'trans_id' - transform = DfTransformConfig.from_dict(transform_id=transform_id, - transform_dict=transform_dict) + transform_id = "trans_id" + transform = DfTransformConfig.from_dict(transform_id=transform_id, transform_dict=transform_dict) assert transform.df_format == df_format assert transform.transform_id == transform_id diff --git a/tests/test_df_transform_step.py b/tests/test_df_transform_step.py index 41217ab..059bb70 100644 --- a/tests/test_df_transform_step.py +++ b/tests/test_df_transform_step.py @@ -1,26 +1,29 @@ import pytest -from df_and_order.df_transform_step import TRANSFORM_STEP_MODULE_PATH_KEY, TRANSFORM_STEP_PARAMS_KEY, \ - DfTransformStepConfig +from df_and_order.df_transform_step import ( + TRANSFORM_STEP_MODULE_PATH_KEY, + TRANSFORM_STEP_PARAMS_KEY, + DfTransformStepConfig, +) from tests.dates_transform import TestDatesTransformStep def test_from_step_type(): step_type = TestDatesTransformStep - params = {'cols': ['a']} + params = {"cols": ["a"]} step_config = DfTransformStepConfig.from_step_type(step_type=step_type, params=params) - assert step_config.module_path == 'tests.dates_transform.TestDatesTransformStep' + assert step_config.module_path == "tests.dates_transform.TestDatesTransformStep" assert step_config.params == params def test_eq(): - module_path = 'some/path' - params = {'param1': 1, 'param2': 'value'} + module_path = "some/path" + params = {"param1": 1, "param2": "value"} step_dict = { TRANSFORM_STEP_MODULE_PATH_KEY: module_path, - TRANSFORM_STEP_PARAMS_KEY: params + TRANSFORM_STEP_PARAMS_KEY: params, } step_config = DfTransformStepConfig.from_dict(step_dict=step_dict) @@ -28,17 +31,17 @@ def test_eq(): assert step_config == another_step_config - another_step_config.module_path = 'bad' + another_step_config.module_path = "bad" assert step_config != another_step_config def test_from_dict(): - module_path = 'some/path' - params = {'param1': 1, 'param2': 'value'} + module_path = "some/path" + params = {"param1": 1, "param2": "value"} step_dict = { TRANSFORM_STEP_MODULE_PATH_KEY: module_path, - TRANSFORM_STEP_PARAMS_KEY: params + TRANSFORM_STEP_PARAMS_KEY: params, } step_config = DfTransformStepConfig.from_dict(step_dict=step_dict) @@ -48,8 +51,8 @@ def test_from_dict(): def test_to_dict(): - module_path = 'some/path' - params = {'param1': 1, 'param2': 'value'} + module_path = "some/path" + params = {"param1": 1, "param2": "value"} step_config = DfTransformStepConfig(module_path=module_path, params=params) step_dict = step_config.to_dict() diff --git a/tests/test_transform_state.py b/tests/test_transform_state.py index 4922277..9a31577 100644 --- a/tests/test_transform_state.py +++ b/tests/test_transform_state.py @@ -1,49 +1,58 @@ import pytest -from df_and_order.df_transform import DfTransformConfig, TRANSFORM_DF_FORMAT_KEY, TRANSFORM_IN_MEMORY_KEY, \ - TRANSFORM_PERMANENT_KEY, TRANSFORM_SOURCE_ID_KEY, TRANSFORM_SOURCE_IN_MEMORY_KEY +from df_and_order.df_transform import ( + DfTransformConfig, + TRANSFORM_DF_FORMAT_KEY, + TRANSFORM_IN_MEMORY_KEY, + TRANSFORM_PERMANENT_KEY, + TRANSFORM_SOURCE_ID_KEY, + TRANSFORM_SOURCE_IN_MEMORY_KEY, +) from df_and_order.df_transform_state import DfTransformState, TRANSFORM_STATE_SOURCE_KEY -from df_and_order.df_transform_step import DfTransformStepConfig, TRANSFORM_STEP_MODULE_PATH_KEY, \ - TRANSFORM_STEP_PARAMS_KEY +from df_and_order.df_transform_step import ( + DfTransformStepConfig, + TRANSFORM_STEP_MODULE_PATH_KEY, + TRANSFORM_STEP_PARAMS_KEY, +) def test_properties(mocker): transform = mocker.Mock() source_transform = mocker.Mock() - state = DfTransformState(transform=transform, - source_transform=source_transform) + state = DfTransformState(transform=transform, source_transform=source_transform) assert state.transform == transform assert state.source_transform == source_transform -@pytest.mark.parametrize("use_source", [True, False], ids=['source', 'no_source']) +@pytest.mark.parametrize("use_source", [True, False], ids=["source", "no_source"]) def test_to_dict(use_source): - source_transform_id = 'source_trans_id' - transform_id = 'trans_id' - df_format = 'df_format' + source_transform_id = "source_trans_id" + transform_id = "trans_id" + df_format = "df_format" in_memory_steps = [ - DfTransformStepConfig(module_path='inmem1', params={'a': 1}), - DfTransformStepConfig(module_path='inmem2', params={'b': 2}), + DfTransformStepConfig(module_path="inmem1", params={"a": 1}), + DfTransformStepConfig(module_path="inmem2", params={"b": 2}), ] permanent_steps = [ - DfTransformStepConfig(module_path='perm3', params={}), - DfTransformStepConfig(module_path='perm4', params={'c': 3}), + DfTransformStepConfig(module_path="perm3", params={}), + DfTransformStepConfig(module_path="perm4", params={"c": 3}), ] source_transform = None if use_source: - source_transform = DfTransformConfig(transform_id=source_transform_id, - df_format=df_format, - in_memory_steps=in_memory_steps) + source_transform = DfTransformConfig( + transform_id=source_transform_id, df_format=df_format, in_memory_steps=in_memory_steps, + ) - transform = DfTransformConfig(transform_id=transform_id, - source_id=source_transform_id if use_source else None, - df_format=df_format, - permanent_steps=permanent_steps) + transform = DfTransformConfig( + transform_id=transform_id, + source_id=source_transform_id if use_source else None, + df_format=df_format, + permanent_steps=permanent_steps, + ) - state = DfTransformState(transform=transform, - source_transform=source_transform) + state = DfTransformState(transform=transform, source_transform=source_transform) res_transform_id, res_state_dict = state.to_dict() @@ -57,28 +66,17 @@ def test_to_dict(use_source): assert res_state_dict == test_state_dict -@pytest.mark.parametrize("use_source", [True, False], ids=['source', 'no_source']) +@pytest.mark.parametrize("use_source", [True, False], ids=["source", "no_source"]) def test_from_dict(use_source): - source_id = 'test_source_id' if use_source else None - df_format = 'format' + source_id = "test_source_id" if use_source else None + df_format = "format" in_memory_steps = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem1', - TRANSFORM_STEP_PARAMS_KEY: {'a': 1} - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'inmem2', - TRANSFORM_STEP_PARAMS_KEY: {'b': 2} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem1", TRANSFORM_STEP_PARAMS_KEY: {"a": 1}}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "inmem2", TRANSFORM_STEP_PARAMS_KEY: {"b": 2}}, ] permanent_steps = [ - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm3', - }, - { - TRANSFORM_STEP_MODULE_PATH_KEY: 'perm4', - TRANSFORM_STEP_PARAMS_KEY: {'c': 3} - } + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm3",}, + {TRANSFORM_STEP_MODULE_PATH_KEY: "perm4", TRANSFORM_STEP_PARAMS_KEY: {"c": 3}}, ] transform_dict = { TRANSFORM_DF_FORMAT_KEY: df_format, @@ -89,19 +87,19 @@ def test_from_dict(use_source): if use_source: source_transform_dict = { TRANSFORM_DF_FORMAT_KEY: df_format, - TRANSFORM_PERMANENT_KEY: permanent_steps + TRANSFORM_PERMANENT_KEY: permanent_steps, } state_dict[TRANSFORM_STATE_SOURCE_KEY] = source_transform_dict state_dict[TRANSFORM_SOURCE_ID_KEY] = source_id - transform_id = 'trans_id' - state = DfTransformState.from_dict(transform_id=transform_id, - state_dict=state_dict) + transform_id = "trans_id" + state = DfTransformState.from_dict(transform_id=transform_id, state_dict=state_dict) test_transform = DfTransformConfig.from_dict(transform_id=transform_id, transform_dict=transform_dict) assert state.transform == test_transform if use_source: - test_source_transform = DfTransformConfig.from_dict(transform_id=source_id, - transform_dict=source_transform_dict) + test_source_transform = DfTransformConfig.from_dict( + transform_id=source_id, transform_dict=source_transform_dict + ) assert state.source_transform == test_source_transform diff --git a/tests/zero_transform.py b/tests/zero_transform.py index c54d509..0d88e1b 100644 --- a/tests/zero_transform.py +++ b/tests/zero_transform.py @@ -1,6 +1,5 @@ -from typing import List - import pandas as pd +from typing import List from df_and_order.df_transform_step import DfTransformStep @@ -15,4 +14,4 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: for col in self._zero_cols: df[col] = 0 - return df \ No newline at end of file + return df