From 914f0b7c4fbb350b47a7e0561312e5a82344c74a Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Fri, 25 Mar 2022 12:34:32 -0500 Subject: [PATCH 01/15] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 3c5744c..eefd1fd 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ To download the code in this repo, you can simply use git clone git clone https://github.com/khuyentran1401/Data-science ``` + + # Contents 1. [MLOps](#mlops) 2. [Testing](#testing) From 6e062232068891355835b0d34504d6baddafdd43 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Fri, 25 Mar 2022 12:35:27 -0500 Subject: [PATCH 02/15] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index eefd1fd..b2034f1 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,10 @@ To download the code in this repo, you can simply use git clone git clone https://github.com/khuyentran1401/Data-science ``` - +However, due to the large number of files in this repository, cloning will take around 5 minutes. To clone in a couple of seconds, use [git-lfs](https://git-lfs.github.com/). +```bash +git-lfs clone https://github.com/khuyentran1401/Data-science +``` # Contents 1. 
[MLOps](#mlops) From 598ff08b254177df36be2942d37663a7788e5563 Mon Sep 17 00:00:00 2001 From: Khuyen Date: Fri, 25 Mar 2022 15:11:36 -0500 Subject: [PATCH 03/15] add pandera --- .../pandera_example/pandera.ipynb | 845 ++++++++++-------- 1 file changed, 451 insertions(+), 394 deletions(-) diff --git a/data_science_tools/pandera_example/pandera.ipynb b/data_science_tools/pandera_example/pandera.ipynb index de8a050..2abc0bc 100644 --- a/data_science_tools/pandera_example/pandera.ipynb +++ b/data_science_tools/pandera_example/pandera.ipynb @@ -6,56 +6,13 @@ "id": "fb1ddc46", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T14:56:36.233537Z", - "start_time": "2022-03-25T14:56:32.787203Z" + "end_time": "2022-03-25T18:25:44.708196Z", + "start_time": "2022-03-25T18:25:41.488504Z" } }, "outputs": [], "source": [ - "!pip install pandera" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9fd62fb8", - "metadata": { - "ExecuteTime": { - "end_time": "2022-03-25T14:57:11.376220Z", - "start_time": "2022-03-25T14:57:11.097636Z" - } - }, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 2;\n", - " var nbb_unformatted_code = \"%load_ext nb_black\";\n", - " var nbb_formatted_code = \"%load_ext nb_black\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%load_ext nb_black" + "!pip install pandera ipytest" ] }, { @@ -68,12 +25,12 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 4, "id": "2ef1fb81", "metadata": { "ExecuteTime": { - "end_time": 
"2022-03-25T15:55:26.941755Z", - "start_time": "2022-03-25T15:55:26.895622Z" + "end_time": "2022-03-25T18:31:14.257662Z", + "start_time": "2022-03-25T18:31:14.249575Z" } }, "outputs": [ @@ -140,36 +97,9 @@ "3 orange Aldi 4" ] }, - "execution_count": 64, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 64;\n", - " var nbb_unformatted_code = \"import pandas as pd\\n\\nfruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits\";\n", - " var nbb_formatted_code = \"import pandas as pd\\n\\nfruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -188,64 +118,15 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "551e8ffb", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:09:18.335521Z", - "start_time": "2022-03-25T15:09:18.244731Z" + "end_time": "2022-03-25T18:31:14.658417Z", + "start_time": "2022-03-25T18:31:14.485803Z" } }, - "outputs": [ - { - "ename": "SchemaError", - "evalue": " failed element-wise validator 0:\n\nfailure cases:\n index failure_case\n0 3 4", - 
"output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSchemaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_2209524/275105114.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\"price\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mColumn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCheck\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mless_than\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m })\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfruits\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0mcheck_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m 
\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"schema_component_check\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaErrors\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 596\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_error_dict\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_component\u001b[0m 
\u001b[0;32min\u001b[0m \u001b[0mschema_components\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m result = schema_component(\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0mdf_to_validate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0mlazy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1824\u001b[0m ) -> Union[pd.DataFrame, pd.Series]:\n\u001b[1;32m 1825\u001b[0m \u001b[0;34m\"\"\"Alias for ``validate`` method.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1826\u001b[0;31m return self.validate(\n\u001b[0m\u001b[1;32m 1827\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtail\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1828\u001b[0m )\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 212\u001b[0m )\n\u001b[1;32m 213\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m 
\u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate_column\u001b[0;34m(check_obj, column_name)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m super(Column, copy(self).set_name(column_name)).validate(\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1786\u001b[0m )\n\u001b[1;32m 1787\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1788\u001b[0;31m \u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dataframe_check\"\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1789\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint: disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1790\u001b[0m \u001b[0;31m# catch other exceptions that may occur when executing the\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1781\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1782\u001b[0m check_results.append(\n\u001b[0;32m-> 1783\u001b[0;31m _handle_check_results(\n\u001b[0m\u001b[1;32m 1784\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mcheck_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mcheck_args\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1785\u001b[0m )\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36m_handle_check_results\u001b[0;34m(schema, check_index, check, check_obj, *check_args)\u001b[0m\n\u001b[1;32m 2106\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUserWarning\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2107\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2108\u001b[0;31m raise errors.SchemaError(\n\u001b[0m\u001b[1;32m 2109\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2110\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSchemaError\u001b[0m: failed element-wise validator 0:\n\nfailure cases:\n index failure_case\n0 3 4" - ] - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 16;\n", - " var nbb_unformatted_code = \"import pandera as pa\\nfrom pandera import Column, Check\\n\\navailable_fruits = ['apple', 'banana', 'orange']\\nnearby_stores = ['Aldi', 'Walmart']\\nschema = pa.DataFrameSchema({\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(4)),\\n })\\nschema.validate(fruits)\";\n", - " var nbb_formatted_code = \"import pandera as pa\\nfrom pandera import Column, Check\\n\\navailable_fruits = 
[\\\"apple\\\", \\\"banana\\\", \\\"orange\\\"]\\nnearby_stores = [\\\"Aldi\\\", \\\"Walmart\\\"]\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(4)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import pandera as pa\n", "from pandera import Column, Check\n", @@ -264,12 +145,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "id": "407ac593", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:09:26.585962Z", - "start_time": "2022-03-25T15:09:26.526407Z" + "end_time": "2022-03-25T18:31:14.786395Z", + "start_time": "2022-03-25T18:31:14.771140Z" } }, "outputs": [ @@ -336,36 +217,9 @@ "3 orange Aldi 4" ] }, - "execution_count": 17, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 17;\n", - " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, 
Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -381,12 +235,12 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "id": "737b2ac9", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:17:33.118092Z", - "start_time": "2022-03-25T15:17:33.054620Z" + "end_time": "2022-03-25T18:49:54.579171Z", + "start_time": "2022-03-25T18:49:54.552021Z" } }, "outputs": [ @@ -453,7 +307,7 @@ "3 orange Aldi 4" ] }, - "execution_count": 21, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, @@ -462,9 +316,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 21;\n", - " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5), Check(lambda price: sum(price) < 20)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5), Check(lambda price: sum(price) < 20)),\\n }\\n)\\nschema.validate(fruits)\";\n", + " var nbb_cell_id = 10;\n", + " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, 
[Check.less_than(5), Check(lambda price: sum(price) < 20)]),\\n }\\n)\\nschema.validate(fruits)\";\n", + " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(\\n int, [Check.less_than(5), Check(lambda price: sum(price) < 20)]\\n ),\\n }\\n)\\nschema.validate(fruits)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -490,7 +344,9 @@ " {\n", " \"name\": Column(str, Check.isin(available_fruits)),\n", " \"store\": Column(str, Check.isin(nearby_stores)),\n", - " \"price\": Column(int, Check.less_than(5), Check(lambda price: sum(price) < 20)),\n", + " \"price\": Column(\n", + " int, [Check.less_than(5), Check(lambda price: sum(price) < 20)]\n", + " ),\n", " }\n", ")\n", "schema.validate(fruits)" @@ -645,38 +501,287 @@ }, { "cell_type": "code", - "execution_count": 90, - "id": "f4fe5959", + "execution_count": 15, + "id": "02a75aa4", + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-25T19:05:53.414458Z", + "start_time": "2022-03-25T19:05:53.282131Z" + } + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 15;\n", + " var nbb_unformatted_code = \"import ipytest\\nipytest.autoconfig()\";\n", + " var nbb_formatted_code = \"import ipytest\\n\\nipytest.autoconfig()\";\n", + " var nbb_cells = Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ 
+ "import ipytest\n", + "\n", + "ipytest.autoconfig()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "05df26e0", + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-25T19:11:03.309250Z", + "start_time": "2022-03-25T19:11:03.283422Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 27;\n", + " var nbb_unformatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\ndef get_total_price(fruits: pd.DataFrame, schema: pa.DataFrameSchema):\\n validated = schema.validate(fruits)\\n return validated[\\\"price\\\"].sum()\\n\\n\\nget_total_price(fruits, schema)\";\n", + " var nbb_formatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\ndef get_total_price(fruits: pd.DataFrame, schema: pa.DataFrameSchema):\\n validated = schema.validate(fruits)\\n return validated[\\\"price\\\"].sum()\\n\\n\\nget_total_price(fruits, schema)\";\n", + " var nbb_cells = Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < 
nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fruits = pd.DataFrame(\n", + " {\n", + " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", + " \"store\": [\"Aldi\", \"Walmart\", \"Walmart\", \"Aldi\"],\n", + " \"price\": [2, 1, 3, 4],\n", + " }\n", + ")\n", + "\n", + "schema = pa.DataFrameSchema(\n", + " {\n", + " \"name\": Column(str, Check.isin(available_fruits)),\n", + " \"store\": Column(str, Check.isin(nearby_stores)),\n", + " \"price\": Column(int, Check.less_than(5)),\n", + " }\n", + ")\n", + "\n", + "\n", + "def get_total_price(fruits: pd.DataFrame, schema: pa.DataFrameSchema):\n", + " validated = schema.validate(fruits)\n", + " return validated[\"price\"].sum()\n", + "\n", + "\n", + "get_total_price(fruits, schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a3e6b055", + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-25T19:11:03.765825Z", + "start_time": "2022-03-25T19:11:03.672303Z" + }, + "code_folding": [] + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 28;\n", + " var nbb_unformatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n \\n schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n )\\n assert get_total_price(fruits, schema) == 3\";\n", + " var nbb_formatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = 
pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n \\n schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n )\\n assert get_total_price(fruits, schema) == 3\";\n", + " var nbb_cells = Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m.\u001b[0m\u001b[32m [100%]\u001b[0m\n" + ] + }, + { + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 28;\n", + " var nbb_unformatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n \\n schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n )\\n assert get_total_price(fruits, schema) == 3\";\n", + " var nbb_formatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n \\n schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n )\\n assert get_total_price(fruits, schema) == 3\";\n", + " var nbb_cells = 
Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%ipytest -qq\n", + "def test_get_total_price():\n", + " fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\n", + " \n", + " schema = pa.DataFrameSchema(\n", + " {\n", + " \"name\": Column(str, Check.isin(available_fruits)),\n", + " \"store\": Column(str, Check.isin(nearby_stores)),\n", + " \"price\": Column(int, Check.less_than(5)),\n", + " }\n", + " )\n", + " assert get_total_price(fruits, schema) == 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4fe5959", + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-25T19:17:18.886164Z", + "start_time": "2022-03-25T19:17:18.616689Z" + } + }, + "outputs": [], + "source": [ + "from pandera import check_input, check_output, check_io\n", + "\n", + "fruits = pd.DataFrame(\n", + " {\n", + " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", + " \"store\": [\"Aldi\", \"Walmart\", \"Walmart\", \"Aldi\"],\n", + " \"price\": [\"2\", \"1\", \"3\", \"4\"],\n", + " }\n", + ")\n", + "\n", + "\n", + "@check_input(schema)\n", + "def get_total_price(fruits: pd.DataFrame):\n", + " return fruits.price.sum()\n", + "\n", + "\n", + "get_total_price(fruits)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1078ca06", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T16:14:09.567370Z", - "start_time": "2022-03-25T16:14:09.472030Z" + "end_time": "2022-03-25T19:08:44.613005Z", + "start_time": "2022-03-25T19:08:44.527232Z" } }, "outputs": [ { - "ename": "SchemaError", - "evalue": "error in 
check_input decorator of function 'get_total_price': expected series 'price' to have type int64, got object", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSchemaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/decorators.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(fn, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 216\u001b[0;31m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mvalidate_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m 
\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"schema_component_check\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaErrors\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m result = schema_component(\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0mdf_to_validate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1825\u001b[0m 
\u001b[0;34m\"\"\"Alias for ``validate`` method.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1826\u001b[0;31m return self.validate(\n\u001b[0m\u001b[1;32m 1827\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtail\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate_column\u001b[0;34m(check_obj, column_name)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m super(Column, copy(self).set_name(column_name)).validate(\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1762\u001b[0m )\n\u001b[0;32m-> 1763\u001b[0;31m error_handler.collect_error(\n\u001b[0m\u001b[1;32m 1764\u001b[0m \u001b[0;34m\"wrong_dtype\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSchemaError\u001b[0m: expected series 'price' to have type int64, got object", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mSchemaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_2209524/2013134483.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0mget_total_price\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfruits\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/decorators.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(fn, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 216\u001b[0m 
\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mvalidate_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0m_handle_schema_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"check_input\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mobj_getter\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;31m# get the first key in the same order specified in the\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/decorators.py\u001b[0m in \u001b[0;36m_handle_schema_error\u001b[0;34m(decorator_name, fn, schema, arg_df, schema_error)\u001b[0m\n\u001b[1;32m 91\u001b[0m \"\"\"\n\u001b[1;32m 92\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"error in {decorator_name} decorator of function '{fn.__name__}': 
{schema_error}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m raise errors.SchemaError(\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0marg_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSchemaError\u001b[0m: error in check_input decorator of function 'get_total_price': expected series 'price' to have type int64, got object" + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 25;\n", + " var nbb_unformatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n assert get_total_price(fruits) == 3\";\n", + " var nbb_formatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n assert get_total_price(fruits) == 3\";\n", + " var nbb_cells = Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m.\u001b[0m\u001b[32m [100%]\u001b[0m\n" ] }, { @@ -684,9 +789,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 90;\n", - " var nbb_unformatted_code = \"from pandera import check_input, check_output, check_io\\n\\nfruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", 
\\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [\\\"2\\\", \\\"1\\\", \\\"3\\\", \\\"4\\\"],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\n@check_input(schema)\\ndef get_total_price(fruits: pd.DataFrame):\\n return fruits.price.sum()\\n\\n\\nget_total_price(fruits)\";\n", - " var nbb_formatted_code = \"from pandera import check_input, check_output, check_io\\n\\nfruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [\\\"2\\\", \\\"1\\\", \\\"3\\\", \\\"4\\\"],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\n@check_input(schema)\\ndef get_total_price(fruits: pd.DataFrame):\\n return fruits.price.sum()\\n\\n\\nget_total_price(fruits)\";\n", + " var nbb_cell_id = 25;\n", + " var nbb_unformatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n assert get_total_price(fruits) == 3\";\n", + " var nbb_formatted_code = \"%%ipytest -qq\\ndef test_get_total_price():\\n fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\\n assert get_total_price(fruits) == 3\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -708,31 +813,10 @@ } ], "source": [ - "from pandera import check_input, check_output, check_io\n", - "\n", 
- "fruits = pd.DataFrame(\n", - " {\n", - " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", - " \"store\": [\"Aldi\", \"Walmart\", \"Walmart\", \"Aldi\"],\n", - " \"price\": [\"2\", \"1\", \"3\", \"4\"],\n", - " }\n", - ")\n", - "\n", - "schema = pa.DataFrameSchema(\n", - " {\n", - " \"name\": Column(str, Check.isin(available_fruits)),\n", - " \"store\": Column(str, Check.isin(nearby_stores)),\n", - " \"price\": Column(int, Check.less_than(5)),\n", - " }\n", - ")\n", - "\n", - "\n", - "@check_input(schema)\n", - "def get_total_price(fruits: pd.DataFrame):\n", - " return fruits.price.sum()\n", - "\n", - "\n", - "get_total_price(fruits)" + "%%ipytest -qq\n", + "def test_get_total_price():\n", + " fruits = pd.DataFrame({'name': ['apple', 'banana'], 'store': ['Aldi', 'Walmart'], 'price': [1, 2]})\n", + " assert get_total_price(fruits) == 3" ] }, { @@ -745,25 +829,108 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 30, "id": "f4000414", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T16:14:13.391984Z", - "start_time": "2022-03-25T16:14:13.330242Z" + "end_time": "2022-03-25T19:23:45.156508Z", + "start_time": "2022-03-25T19:23:45.121056Z" }, - "code_folding": [ - 8 - ] + "code_folding": [] }, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namestoreprice
0appleAldi2
1bananaWalmart1
2appleWalmart3
3orangeAldi4
0appleWhole Foods3
1bananaWhole Foods2
2appleSchnucks4
3orangeSchnucks5
\n", + "
" + ], "text/plain": [ - "10" + " name store price\n", + "0 apple Aldi 2\n", + "1 banana Walmart 1\n", + "2 apple Walmart 3\n", + "3 orange Aldi 4\n", + "0 apple Whole Foods 3\n", + "1 banana Whole Foods 2\n", + "2 apple Schnucks 4\n", + "3 orange Schnucks 5" ] }, - "execution_count": 91, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" }, @@ -772,9 +939,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 91;\n", - " var nbb_unformatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nin_schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\n@check_input(in_schema)\\ndef get_total_price(fruits: pd.DataFrame):\\n return fruits.price.sum()\\n\\n\\nget_total_price(fruits)\";\n", - " var nbb_formatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nin_schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\n\\n\\n@check_input(in_schema)\\ndef get_total_price(fruits: pd.DataFrame):\\n return fruits.price.sum()\\n\\n\\nget_total_price(fruits)\";\n", + " var nbb_cell_id = 30;\n", + " var nbb_unformatted_code = \"fruits_nearby = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", 
\\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits_faraway = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Whole Foods\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\", \\\"Schnucks\\\"],\\n \\\"price\\\": [3, 2, 4, 5],\\n }\\n)\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_output(out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", + " var nbb_formatted_code = \"fruits_nearby = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits_faraway = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Whole Foods\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\", \\\"Schnucks\\\"],\\n \\\"price\\\": [3, 2, 4, 5],\\n }\\n)\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_output(out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -796,7 +963,7 @@ } ], "source": [ - "fruits = pd.DataFrame(\n", + "fruits_nearby = pd.DataFrame(\n", " {\n", " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", " 
\"store\": [\"Aldi\", \"Walmart\", \"Walmart\", \"Aldi\"],\n", @@ -804,21 +971,26 @@ " }\n", ")\n", "\n", - "in_schema = pa.DataFrameSchema(\n", + "fruits_faraway = pd.DataFrame(\n", " {\n", - " \"name\": Column(str, Check.isin(available_fruits)),\n", - " \"store\": Column(str, Check.isin(nearby_stores)),\n", - " \"price\": Column(int, Check.less_than(5)),\n", + " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", + " \"store\": [\"Whole Foods\", \"Whole Foods\", \"Schnucks\", \"Schnucks\"],\n", + " \"price\": [3, 2, 4, 5],\n", " }\n", ")\n", "\n", + "out_schema = pa.DataFrameSchema(\n", + " {\"store\": Column(str, Check.isin([\"Aldi\", \"Walmart\", \"Whole Foods\", \"Schnucks\"]))}\n", + ")\n", + "\n", "\n", - "@check_input(in_schema)\n", - "def get_total_price(fruits: pd.DataFrame):\n", - " return fruits.price.sum()\n", + "@check_output(out_schema)\n", + "def combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\n", + " fruits = pd.concat([fruits_nearby, fruits_faraway])\n", + " return fruits\n", "\n", "\n", - "get_total_price(fruits)" + "combine_fruits(fruits_nearby, fruits_faraway)" ] }, { @@ -831,12 +1003,12 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 31, "id": "1fcc660e", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T16:23:49.689288Z", - "start_time": "2022-03-25T16:23:49.595415Z" + "end_time": "2022-03-25T19:23:56.679696Z", + "start_time": "2022-03-25T19:23:56.654195Z" } }, "outputs": [ @@ -931,7 +1103,7 @@ "3 orange Schnucks 5" ] }, - "execution_count": 100, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, @@ -940,9 +1112,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 100;\n", - " var nbb_unformatted_code = \"fruits_nearby = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n 
\\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits_faraway = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Whole Foods\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\", \\\"Schnucks\\\"],\\n \\\"price\\\": [3, 2, 4, 5],\\n }\\n)\\n\\nin_schema = pa.DataFrameSchema({\\\"store\\\": Column(str)})\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_io(fruits_nearby=in_schema, fruits_faraway=in_schema, out=out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", - " var nbb_formatted_code = \"fruits_nearby = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits_faraway = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Whole Foods\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\", \\\"Schnucks\\\"],\\n \\\"price\\\": [3, 2, 4, 5],\\n }\\n)\\n\\nin_schema = pa.DataFrameSchema({\\\"store\\\": Column(str)})\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_io(fruits_nearby=in_schema, fruits_faraway=in_schema, out=out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", + " var nbb_cell_id = 31;\n", + " var nbb_unformatted_code = \"in_schema = pa.DataFrameSchema({\\\"store\\\": 
Column(str)})\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_io(fruits_nearby=in_schema, fruits_faraway=in_schema, out=out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", + " var nbb_formatted_code = \"in_schema = pa.DataFrameSchema({\\\"store\\\": Column(str)})\\n\\nout_schema = pa.DataFrameSchema(\\n {\\\"store\\\": Column(str, Check.isin([\\\"Aldi\\\", \\\"Walmart\\\", \\\"Whole Foods\\\", \\\"Schnucks\\\"]))}\\n)\\n\\n\\n@check_io(fruits_nearby=in_schema, fruits_faraway=in_schema, out=out_schema)\\ndef combine_fruits(fruits_nearby: pd.DataFrame, fruits_faraway: pd.DataFrame):\\n fruits = pd.concat([fruits_nearby, fruits_faraway])\\n return fruits\\n\\n\\ncombine_fruits(fruits_nearby, fruits_faraway)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -964,22 +1136,6 @@ } ], "source": [ - "fruits_nearby = pd.DataFrame(\n", - " {\n", - " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", - " \"store\": [\"Aldi\", \"Walmart\", \"Walmart\", \"Aldi\"],\n", - " \"price\": [2, 1, 3, 4],\n", - " }\n", - ")\n", - "\n", - "fruits_faraway = pd.DataFrame(\n", - " {\n", - " \"name\": [\"apple\", \"banana\", \"apple\", \"orange\"],\n", - " \"store\": [\"Whole Foods\", \"Whole Foods\", \"Schnucks\", \"Schnucks\"],\n", - " \"price\": [3, 2, 4, 5],\n", - " }\n", - ")\n", - "\n", "in_schema = pa.DataFrameSchema({\"store\": Column(str)})\n", "\n", "out_schema = pa.DataFrameSchema(\n", @@ -1014,12 +1170,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 33, "id": "c07491f2", "metadata": { "ExecuteTime": { - "end_time": 
"2022-03-25T15:26:26.968624Z", - "start_time": "2022-03-25T15:26:26.920492Z" + "end_time": "2022-03-25T19:31:14.040777Z", + "start_time": "2022-03-25T19:31:14.023763Z" } }, "outputs": [ @@ -1086,7 +1242,7 @@ "3 orange NaN 4" ] }, - "execution_count": 25, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" }, @@ -1095,8 +1251,8 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 25;\n", - " var nbb_unformatted_code = \"import numpy as np \\n\\nfruits = fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", np.nan],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits\";\n", + " var nbb_cell_id = 33;\n", + " var nbb_unformatted_code = \"import numpy as np\\n\\nfruits = fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", np.nan],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits\";\n", " var nbb_formatted_code = \"import numpy as np\\n\\nfruits = fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", np.nan],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nfruits\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", @@ -1134,62 +1290,15 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "57828b2f", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:34:38.285866Z", - "start_time": "2022-03-25T15:34:38.216261Z" + "end_time": "2022-03-25T19:31:14.622436Z", + "start_time": "2022-03-25T19:31:14.417389Z" } }, - "outputs": [ - { - "ename": "SchemaError", - "evalue": "non-nullable series 'store' contains null values: {3: nan}", - "output_type": "error", - 
"traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSchemaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_2209524/1322757302.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m }\n\u001b[1;32m 9\u001b[0m )\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfruits\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0mcheck_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m \u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"schema_component_check\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaErrors\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
596\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_error_dict\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_component\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mschema_components\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m result = schema_component(\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0mdf_to_validate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m 
\u001b[0mlazy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1824\u001b[0m ) -> Union[pd.DataFrame, pd.Series]:\n\u001b[1;32m 1825\u001b[0m \u001b[0;34m\"\"\"Alias for ``validate`` method.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1826\u001b[0;31m return self.validate(\n\u001b[0m\u001b[1;32m 1827\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtail\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1828\u001b[0m )\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 212\u001b[0m )\n\u001b[1;32m 213\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in 
\u001b[0;36mvalidate_column\u001b[0;34m(check_obj, column_name)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m super(Column, copy(self).set_name(column_name)).validate(\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1718\u001b[0m \u001b[0mseries\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnulls\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconstants\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mN_FAILURE_CASES\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1719\u001b[0m )\n\u001b[0;32m-> 1720\u001b[0;31m error_handler.collect_error(\n\u001b[0m\u001b[1;32m 1721\u001b[0m \u001b[0;34m\"series_contains_nulls\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1722\u001b[0m errors.SchemaError(\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSchemaError\u001b[0m: non-nullable series 'store' contains null values: {3: nan}" - ] - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 44;\n", - " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(\\n str, Check.isin(nearby_stores), allow_duplicates=False\\n ),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores), allow_duplicates=False),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "schema = pa.DataFrameSchema(\n", " {\n", 
@@ -1203,12 +1312,12 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 35, "id": "cfb2b445", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:34:58.551165Z", - "start_time": "2022-03-25T15:34:58.496437Z" + "end_time": "2022-03-25T19:31:18.520397Z", + "start_time": "2022-03-25T19:31:18.496748Z" } }, "outputs": [ @@ -1275,7 +1384,7 @@ "3 orange NaN 4" ] }, - "execution_count": 45, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, @@ -1284,8 +1393,8 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 45;\n", - " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(\\n str, Check.isin(nearby_stores), nullable=True\\n ),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", + " var nbb_cell_id = 35;\n", + " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores), nullable=True),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores), nullable=True),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", @@ -1328,7 +1437,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "a0d79c0a", "metadata": { "ExecuteTime": { @@ -1336,54 +1445,7 @@ "start_time": "2022-03-25T15:35:48.217538Z" } }, - "outputs": [ - { - "ename": "SchemaError", - "evalue": "series 'store' contains duplicate values: {2: 'Walmart'}", - "output_type": "error", - 
"traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSchemaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_2209524/4157376358.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m }\n\u001b[1;32m 7\u001b[0m )\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfruits\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0mcheck_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m \u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"schema_component_check\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSchemaErrors\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
596\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_error_dict\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mschema_component\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mschema_components\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m result = schema_component(\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0mdf_to_validate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m 
\u001b[0mlazy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1824\u001b[0m ) -> Union[pd.DataFrame, pd.Series]:\n\u001b[1;32m 1825\u001b[0m \u001b[0;34m\"\"\"Alias for ``validate`` method.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1826\u001b[0;31m return self.validate(\n\u001b[0m\u001b[1;32m 1827\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtail\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlazy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1828\u001b[0m )\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 212\u001b[0m )\n\u001b[1;32m 213\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schema_components.py\u001b[0m in 
\u001b[0;36mvalidate_column\u001b[0;34m(check_obj, column_name)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mvalidate_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m super(Column, copy(self).set_name(column_name)).validate(\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0mcheck_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/schemas.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, check_obj, head, tail, sample, random_state, lazy, inplace)\u001b[0m\n\u001b[1;32m 1741\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1742\u001b[0m )\n\u001b[0;32m-> 1743\u001b[0;31m error_handler.collect_error(\n\u001b[0m\u001b[1;32m 1744\u001b[0m \u001b[0;34m\"series_contains_duplicates\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1745\u001b[0m errors.SchemaError(\n", - "\u001b[0;32m~/Data-science/venv/lib/python3.8/site-packages/pandera/error_handlers.py\u001b[0m in \u001b[0;36mcollect_error\u001b[0;34m(self, reason_code, schema_error, original_exc)\u001b[0m\n\u001b[1;32m 30\u001b[0m \"\"\"\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mschema_error\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moriginal_exc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;31m# delete data of validated object from SchemaError object to prevent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSchemaError\u001b[0m: series 'store' contains duplicate values: {2: 'Walmart'}" - ] - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 46;\n", - " var nbb_unformatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores), nullable=True, allow_duplicates=False),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_formatted_code = \"schema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(\\n str, Check.isin(nearby_stores), nullable=True, allow_duplicates=False\\n ),\\n \\\"price\\\": Column(int, Check.less_than(5)),\\n }\\n)\\nschema.validate(fruits)\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "schema = pa.DataFrameSchema(\n", " {\n", @@ -1407,12 +1469,12 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 38, "id": "0e7e9deb", "metadata": { "ExecuteTime": { - "end_time": "2022-03-25T15:46:13.522139Z", - "start_time": 
"2022-03-25T15:46:13.457828Z" + "end_time": "2022-03-25T19:38:21.614473Z", + "start_time": "2022-03-25T19:38:21.589580Z" } }, "outputs": [ @@ -1425,7 +1487,7 @@ "dtype: object" ] }, - "execution_count": 59, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" }, @@ -1434,9 +1496,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 59;\n", - " var nbb_unformatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(str, coerce=True),\\n }\\n)\\nvalidated = schema.validate(fruits)\\nvalidated.dtypes\";\n", - " var nbb_formatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"name\\\": Column(str, Check.isin(available_fruits)),\\n \\\"store\\\": Column(str, Check.isin(nearby_stores)),\\n \\\"price\\\": Column(str, coerce=True),\\n }\\n)\\nvalidated = schema.validate(fruits)\\nvalidated.dtypes\";\n", + " var nbb_cell_id = 38;\n", + " var nbb_unformatted_code = \"fruits = pd.DataFrame(\\n {\\n \\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema(\\n {\\n \\\"price\\\": Column(str, coerce=True)\\n }\\n)\\nvalidated = schema.validate(fruits)\\nvalidated.dtypes\";\n", + " var nbb_formatted_code = \"fruits = pd.DataFrame(\\n {\\n 
\\\"name\\\": [\\\"apple\\\", \\\"banana\\\", \\\"apple\\\", \\\"orange\\\"],\\n \\\"store\\\": [\\\"Aldi\\\", \\\"Walmart\\\", \\\"Walmart\\\", \\\"Aldi\\\"],\\n \\\"price\\\": [2, 1, 3, 4],\\n }\\n)\\n\\nschema = pa.DataFrameSchema({\\\"price\\\": Column(str, coerce=True)})\\nvalidated = schema.validate(fruits)\\nvalidated.dtypes\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -1466,13 +1528,7 @@ " }\n", ")\n", "\n", - "schema = pa.DataFrameSchema(\n", - " {\n", - " \"name\": Column(str, Check.isin(available_fruits)),\n", - " \"store\": Column(str, Check.isin(nearby_stores)),\n", - " \"price\": Column(str, coerce=True),\n", - " }\n", - ")\n", + "schema = pa.DataFrameSchema({\"price\": Column(str, coerce=True)})\n", "validated = schema.validate(fruits)\n", "validated.dtypes" ] @@ -1882,10 +1938,11 @@ } ], "metadata": { + "hide_input": false, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Data-science", "language": "python", - "name": "python3" + "name": "data-science" }, "language_info": { "codemirror_mode": { From 3152d44883c15400c0acb6223709246bb68a72de Mon Sep 17 00:00:00 2001 From: Khuyen Date: Wed, 30 Mar 2022 13:29:47 -0500 Subject: [PATCH 04/15] edit pytest --- .../pytest/get_started/sentiment.py | 16 ++++++++------ .../pytest/parametrization/process.py | 18 ++++++++------- .../pytest/parametrization/process_fixture.py | 1 - .../pytest/parametrization/sentiment.py | 18 ++++++++++----- .../test_structure_example/src/__init__.py | 0 .../test_structure_example/src/process.py | 19 +++++++++++----- .../test_structure_example/tests/__init__.py | 0 .../tests/test_process.py | 22 +++++++++++++------ 8 files changed, 60 insertions(+), 34 deletions(-) create mode 100644 data_science_tools/pytest/test_structure_example/src/__init__.py create mode 100644 
data_science_tools/pytest/test_structure_example/tests/__init__.py diff --git a/data_science_tools/pytest/get_started/sentiment.py b/data_science_tools/pytest/get_started/sentiment.py index 58c6fc9..25caa6e 100644 --- a/data_science_tools/pytest/get_started/sentiment.py +++ b/data_science_tools/pytest/get_started/sentiment.py @@ -1,14 +1,16 @@ from textblob import TextBlob + def extract_sentiment(text: str): - '''Extract sentiment using textblob. - Polarity is within range [-1, 1]''' + """Extract sentiment using textblob. + Polarity is within range [-1, 1]""" + + text = TextBlob(text) - text = TextBlob(text) + return text.sentiment.polarity - return text.sentiment.polarity -def test_extract_sentiment(): +def test_extract_sentiment(): text = "I think today will be a great day" @@ -16,11 +18,11 @@ def test_extract_sentiment(): assert sentiment > 0 + def test_extract_sentiment_negative(): text = "I do not think this will turn out well" sentiment = extract_sentiment(text) - assert sentiment < 0 - \ No newline at end of file + assert sentiment < 0 \ No newline at end of file diff --git a/data_science_tools/pytest/parametrization/process.py b/data_science_tools/pytest/parametrization/process.py index a63fe2d..2b6abaf 100644 --- a/data_science_tools/pytest/parametrization/process.py +++ b/data_science_tools/pytest/parametrization/process.py @@ -1,18 +1,20 @@ import pytest + + def text_contain_word(word: str, text: str): - '''Find whether the text contains a particular word''' - + """Find whether the text contains a particular word""" + return word in text + testdata = [ - ('There is a duck in this text',True), - ('There is nothing here', False) - ] + ("There is a duck in this text", True), + ("There is nothing here", False)] + -@pytest.mark.parametrize('sample, expected_output', testdata) +@pytest.mark.parametrize("sample, expected_output", testdata) def test_text_contain_word(sample, expected_output): - word = 'duck' + word = "duck" assert text_contain_word(word, sample) 
== expected_output - diff --git a/data_science_tools/pytest/parametrization/process_fixture.py b/data_science_tools/pytest/parametrization/process_fixture.py index 4fef811..1efc2c3 100644 --- a/data_science_tools/pytest/parametrization/process_fixture.py +++ b/data_science_tools/pytest/parametrization/process_fixture.py @@ -19,7 +19,6 @@ def text_contain_word(word: str, text: str): def example_data(): return 'Today I found a duck and I am happy' - def test_extract_sentiment(example_data): sentiment = extract_sentiment(example_data) diff --git a/data_science_tools/pytest/parametrization/sentiment.py b/data_science_tools/pytest/parametrization/sentiment.py index 50ccf62..dfcb23c 100644 --- a/data_science_tools/pytest/parametrization/sentiment.py +++ b/data_science_tools/pytest/parametrization/sentiment.py @@ -1,17 +1,23 @@ from textblob import TextBlob import pytest + def extract_sentiment(text: str): - '''Extract sentiment using textblob. - Polarity is within range [-1, 1]''' + """Extract sentiment using textblob. 
+ Polarity is within range [-1, 1]""" + + text = TextBlob(text) + + return text.sentiment.polarity - text = TextBlob(text) - return text.sentiment.polarity +testdata = [ + "I think today will be a great day", + "I do not think this will turn out well", +] -testdata = ["I think today will be a great day","I do not think this will turn out well"] -@pytest.mark.parametrize('sample', testdata) +@pytest.mark.parametrize("sample", testdata) def test_extract_sentiment(sample): sentiment = extract_sentiment(sample) diff --git a/data_science_tools/pytest/test_structure_example/src/__init__.py b/data_science_tools/pytest/test_structure_example/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_science_tools/pytest/test_structure_example/src/process.py b/data_science_tools/pytest/test_structure_example/src/process.py index 9682fce..fdd7360 100644 --- a/data_science_tools/pytest/test_structure_example/src/process.py +++ b/data_science_tools/pytest/test_structure_example/src/process.py @@ -1,10 +1,19 @@ from textblob import TextBlob + def extract_sentiment(text: str): - '''Extract sentiment using textblob. - Polarity is within range [-1, 1]''' + """Extract sentiment using textblob. 
+ Polarity is within range [-1, 1]""" + + text = TextBlob(text) + + return text.sentiment.polarity + + +def text_contain_word(word: str, text: str): + """Find whether the text contains a particular word""" + + return word in text - text = TextBlob(text) - return text.sentiment.polarity - \ No newline at end of file +testdata = [("There is a duck in this text", True), ("There is nothing here", False)] diff --git a/data_science_tools/pytest/test_structure_example/tests/__init__.py b/data_science_tools/pytest/test_structure_example/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_science_tools/pytest/test_structure_example/tests/test_process.py b/data_science_tools/pytest/test_structure_example/tests/test_process.py index 39c94c2..a04720f 100644 --- a/data_science_tools/pytest/test_structure_example/tests/test_process.py +++ b/data_science_tools/pytest/test_structure_example/tests/test_process.py @@ -1,15 +1,23 @@ -import sys -import os.path -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from src.process import extract_sentiment +from src.process import extract_sentiment, text_contain_word import pytest +@pytest.fixture +def example_data(): + return "Today I found a duck and I am happy" + + def test_extract_sentiment(): - text = 'Today I found a duck and I am happy' + text = "Today I found a duck and I am happy" sentiment = extract_sentiment(text) - assert sentiment > 0 \ No newline at end of file + assert sentiment > 0 + + +def test_text_contain_word(example_data): + + word = "duck" + + assert text_contain_word(word, example_data) == True From 09cfd9afec32ef69b0c4ae7843f5eb20be3c51d6 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Wed, 30 Mar 2022 14:16:21 -0500 Subject: [PATCH 05/15] Create README.md --- data_science_tools/pytest/README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 data_science_tools/pytest/README.md diff --git a/data_science_tools/pytest/README.md 
b/data_science_tools/pytest/README.md new file mode 100644 index 0000000..7844939 --- /dev/null +++ b/data_science_tools/pytest/README.md @@ -0,0 +1,2 @@ +[![View on YouTube](https://img.shields.io/badge/YouTube-Watch%20on%20Youtube-red?logo=youtube)](https://www.youtube.com/channel/UCNMawpMow-lW5d2svGhOEbw) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blue?logo=medium)](https://khuyentran1476.medium.com/) + From e7f34f97c66e3ece15fe8f1aa9d2e2546d2a31f8 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Wed, 30 Mar 2022 14:21:06 -0500 Subject: [PATCH 06/15] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b2034f1..4842b3c 100644 --- a/README.md +++ b/README.md @@ -49,10 +49,10 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # MLOps -| Title | Article | Repository | -| ------------- |:-------------:| :-----:| +| Title | Article | Repository | Video +| ------------- |:-------------:| :-----:| :-----:| |Introduction to DVC: Data Version Control Tool for Machine Learning Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-dvc-data-version-control-tool-for-machine-learning-projects-7cb49c229fe0) | [πŸ”—](https://github.com/khuyentran1401/Machine-learning-pipeline) | -| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | +| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | [πŸ”—](https://github.com/khuyentran1401/hydra_demo) | [πŸ”—](https://youtube.com/playlist?list=PLnK6m_JBRVNoPnqnVrWaYtZ2G4nFTnGze) | Introduction to Weight & Biases: Track and Visualize your Machine Learning 
Experiments in 3 Lines of Code | [πŸ”—](https://towardsdatascience.com/introduction-to-weight-biases-track-and-visualize-your-machine-learning-experiments-in-3-lines-9c9553b0f99d) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/wandb_tracking) | Kedro β€” A Python Framework for Reproducible Data Science Project | [πŸ”—](https://towardsdatascience.com/kedro-a-python-framework-for-reproducible-data-science-project-4d44977d4f04) | [πŸ”—](https://github.com/khuyentran1401/kedro_demo) | Orchestrate a Data Science Project in Python With Prefect | [πŸ”—](https://towardsdatascience.com/orchestrate-a-data-science-project-in-python-with-prefect-e69c61a49074) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/prefect_example) @@ -62,9 +62,9 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # Testing -| Title | Article | Repository | -| ------------- |:-------------:| :-----:| -| Pytest for Data Scientists | [πŸ”—](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) +| Title | Article | Repository | Video +| ------------- |:-------------:| :-----:| :-----:| +| Pytest for Data Scientists | [πŸ”—](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) | [πŸ”—](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoYEer9hBmTNwkYB3gmbOPO) | 4 Lessor-Known Yet Awesome Tips forΒ Pytest | [πŸ”—](https://towardsdatascience.com/4-lessor-known-yet-awesome-tips-for-pytest-2117d8a62d9c) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/advanced_pytest) | Great Expectations: Always Know What to Expect From Your Data | [πŸ”—](https://towardsdatascience.com/great-expectations-always-know-what-to-expect-from-your-data-51214866c24) | 
[πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/great_expectations_example) | Introduction to Schema: A Python Libary to Validate your Data | [πŸ”—](https://towardsdatascience.com/introduction-to-schema-a-python-libary-to-validate-your-data-c6d99e06d56a) | [πŸ”—](https://deepnote.com/launch?url=https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/schema.ipynb) From f15aaabc5cfc36dcbb590c60f55caee38c79744b Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Wed, 30 Mar 2022 14:22:31 -0500 Subject: [PATCH 07/15] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4842b3c..c6220dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-red?logo=medium)](https://khuyentran1476.medium.com/) [![Daily Data Science Tips](https://img.shields.io/badge/Data%20Science%20Simplified-Daily%20Data%20Science%20Tips-green?logo=wordpress)](https://mathdatasimplified.com/) +[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blueviolet?logo=medium)](https://khuyentran1476.medium.com/) [![Daily Data Science Tips](https://img.shields.io/badge/Data%20Science%20Simplified-Daily%20Data%20Science%20Tips-green?logo=wordpress)](https://mathdatasimplified.com/) # [Data Science Topics](https://github.com/khuyentran1401/Data-science) Collection of useful data science topics along with code and articles in my [data science blog](https://medium.com/@khuyentran1476). 
@@ -49,10 +49,10 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # MLOps -| Title | Article | Repository | Video -| ------------- |:-------------:| :-----:| :-----:| +| Title | Article | Repository | +| ------------- |:-------------:| :-----:| |Introduction to DVC: Data Version Control Tool for Machine Learning Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-dvc-data-version-control-tool-for-machine-learning-projects-7cb49c229fe0) | [πŸ”—](https://github.com/khuyentran1401/Machine-learning-pipeline) | -| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | [πŸ”—](https://github.com/khuyentran1401/hydra_demo) | [πŸ”—](https://youtube.com/playlist?list=PLnK6m_JBRVNoPnqnVrWaYtZ2G4nFTnGze) +| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [πŸ”—](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | | Introduction to Weight & Biases: Track and Visualize your Machine Learning Experiments in 3 Lines of Code | [πŸ”—](https://towardsdatascience.com/introduction-to-weight-biases-track-and-visualize-your-machine-learning-experiments-in-3-lines-9c9553b0f99d) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/wandb_tracking) | Kedro β€” A Python Framework for Reproducible Data Science Project | [πŸ”—](https://towardsdatascience.com/kedro-a-python-framework-for-reproducible-data-science-project-4d44977d4f04) | [πŸ”—](https://github.com/khuyentran1401/kedro_demo) | Orchestrate a Data Science Project in Python With Prefect | [πŸ”—](https://towardsdatascience.com/orchestrate-a-data-science-project-in-python-with-prefect-e69c61a49074) | 
[πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/prefect_example) @@ -62,9 +62,9 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # Testing -| Title | Article | Repository | Video -| ------------- |:-------------:| :-----:| :-----:| -| Pytest for Data Scientists | [πŸ”—](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) | [πŸ”—](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoYEer9hBmTNwkYB3gmbOPO) +| Title | Article | Repository | +| ------------- |:-------------:| :-----:| +| Pytest for Data Scientists | [πŸ”—](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) | 4 Lessor-Known Yet Awesome Tips forΒ Pytest | [πŸ”—](https://towardsdatascience.com/4-lessor-known-yet-awesome-tips-for-pytest-2117d8a62d9c) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/advanced_pytest) | Great Expectations: Always Know What to Expect From Your Data | [πŸ”—](https://towardsdatascience.com/great-expectations-always-know-what-to-expect-from-your-data-51214866c24) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/great_expectations_example) | Introduction to Schema: A Python Libary to Validate your Data | [πŸ”—](https://towardsdatascience.com/introduction-to-schema-a-python-libary-to-validate-your-data-c6d99e06d56a) | [πŸ”—](https://deepnote.com/launch?url=https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/schema.ipynb) From 6598f114856a45b28a0f14c16c3386f4f989d4a8 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Wed, 30 Mar 2022 14:25:40 -0500 Subject: [PATCH 08/15] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 
c6220dc..2c2c3af 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blueviolet?logo=medium)](https://khuyentran1476.medium.com/) [![Daily Data Science Tips](https://img.shields.io/badge/Data%20Science%20Simplified-Daily%20Data%20Science%20Tips-green?logo=wordpress)](https://mathdatasimplified.com/) +[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blueviolet?logo=medium)](https://khuyentran1476.medium.com/) [![Daily Data Science Tips](https://img.shields.io/badge/Data%20Science%20Simplified-Daily%20Data%20Science%20Tips-green?logo=wordpress)](https://mathdatasimplified.com/) [![View on YouTube](https://img.shields.io/badge/YouTube-Watch%20on%20Youtube-red?logo=youtube)](https://www.youtube.com/channel/UCNMawpMow-lW5d2svGhOEbw) # [Data Science Topics](https://github.com/khuyentran1401/Data-science) Collection of useful data science topics along with code and articles in my [data science blog](https://medium.com/@khuyentran1476). 
@@ -49,10 +49,10 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # MLOps -| Title | Article | Repository | -| ------------- |:-------------:| :-----:| +| Title | Article | Repository | Video +| ------------- |:-------------:| :-----:| :-----:| |Introduction to DVC: Data Version Control Tool for Machine Learning Projects | [🔗](https://towardsdatascience.com/introduction-to-dvc-data-version-control-tool-for-machine-learning-projects-7cb49c229fe0) | [🔗](https://github.com/khuyentran1401/Machine-learning-pipeline) | -| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [🔗](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | +| Introduction to Hydra.cc: A Powerful Framework to Configure your Data Science Projects | [🔗](https://towardsdatascience.com/introduction-to-hydra-cc-a-powerful-framework-to-configure-your-data-science-projects-ed65713a53c6) | [🔗](https://github.com/khuyentran1401/hydra_demo) | [🔗](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoPnqnVrWaYtZ2G4nFTnGze) | Introduction to Weight & Biases: Track and Visualize your Machine Learning Experiments in 3 Lines of Code | [🔗](https://towardsdatascience.com/introduction-to-weight-biases-track-and-visualize-your-machine-learning-experiments-in-3-lines-9c9553b0f99d) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/wandb_tracking) | Kedro — A Python Framework for Reproducible Data Science Project | [🔗](https://towardsdatascience.com/kedro-a-python-framework-for-reproducible-data-science-project-4d44977d4f04) | [🔗](https://github.com/khuyentran1401/kedro_demo) | Orchestrate a Data Science Project in Python With Prefect | [🔗](https://towardsdatascience.com/orchestrate-a-data-science-project-in-python-with-prefect-e69c61a49074) |
[🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/prefect_example) @@ -62,9 +62,9 @@ git-lfs clone https://github.com/khuyentran1401/Data-science # Testing -| Title | Article | Repository | -| ------------- |:-------------:| :-----:| -| Pytest for Data Scientists | [🔗](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) +| Title | Article | Repository | Video +| ------------- |:-------------:| :-----:| :-----:| +| Pytest for Data Scientists | [🔗](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) | [🔗](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoYEer9hBmTNwkYB3gmbOPO) | 4 Lessor-Known Yet Awesome Tips for Pytest | [🔗](https://towardsdatascience.com/4-lessor-known-yet-awesome-tips-for-pytest-2117d8a62d9c) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/advanced_pytest) | Great Expectations: Always Know What to Expect From Your Data | [🔗](https://towardsdatascience.com/great-expectations-always-know-what-to-expect-from-your-data-51214866c24) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/great_expectations_example) | Introduction to Schema: A Python Libary to Validate your Data | [🔗](https://towardsdatascience.com/introduction-to-schema-a-python-libary-to-validate-your-data-c6d99e06d56a) | [🔗](https://deepnote.com/launch?url=https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/schema.ipynb) From e887f2666a476e94be7bc9c29f8b7f8729deee3b Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Wed, 30 Mar 2022 21:13:05 -0500 Subject: [PATCH 09/15] Update README.md --- data_science_tools/pytest/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/data_science_tools/pytest/README.md b/data_science_tools/pytest/README.md index 7844939..5e8125b 100644 --- a/data_science_tools/pytest/README.md +++ b/data_science_tools/pytest/README.md @@ -1,2 +1,2 @@ -[![View on YouTube](https://img.shields.io/badge/YouTube-Watch%20on%20Youtube-red?logo=youtube)](https://www.youtube.com/channel/UCNMawpMow-lW5d2svGhOEbw) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blue?logo=medium)](https://khuyentran1476.medium.com/) +[![View on YouTube](https://img.shields.io/badge/YouTube-Watch%20on%20Youtube-red?logo=youtube)](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoYEer9hBmTNwkYB3gmbOPO) [![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blue?logo=medium)](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) From d8b3718d312a22a0a1263f8e31f72a46c47f6e93 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Sat, 2 Apr 2022 08:38:11 -0500 Subject: [PATCH 10/15] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2c2c3af..5f8ee18 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ git-lfs clone https://github.com/khuyentran1401/Data-science | Pytest for Data Scientists | [πŸ”—](https://towardsdatascience.com/pytest-for-data-scientists-2990319e55e6) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/pytest) | [πŸ”—](https://www.youtube.com/playlist?list=PLnK6m_JBRVNoYEer9hBmTNwkYB3gmbOPO) | 4 Lessor-Known Yet Awesome Tips forΒ Pytest | [πŸ”—](https://towardsdatascience.com/4-lessor-known-yet-awesome-tips-for-pytest-2117d8a62d9c) | [πŸ”—](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/advanced_pytest) | Great Expectations: Always Know What to Expect From Your Data | [πŸ”—](https://towardsdatascience.com/great-expectations-always-know-what-to-expect-from-your-data-51214866c24) | 
[🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/great_expectations_example) +| Validate Your pandas DataFrame with Pandera | [🔗](https://medium.com/towards-data-science/validate-your-pandas-dataframe-with-pandera-2995910e564) |[🔗](https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/pandera_example/pandera.ipynb) | Introduction to Schema: A Python Libary to Validate your Data | [🔗](https://towardsdatascience.com/introduction-to-schema-a-python-libary-to-validate-your-data-c6d99e06d56a) | [🔗](https://deepnote.com/launch?url=https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/schema.ipynb) | DeepDiff — Recursively Find and Ignore Trivial Differences Using Python | [🔗](https://towardsdatascience.com/deepdiff-recursively-find-and-ignore-trivial-differences-using-python-231a5524f41d) | [🔗](https://github.com/khuyentran1401/Data-science/blob/master/productive_tools/deepdiff_example.ipynb) | Checklist — Behavioral Testing of NLP Models | [🔗](https://towardsdatascience.com/checklist-behavioral-testing-of-nlp-models-491cf11f0238) | [🔗](https://github.com/khuyentran1401/Data-science/blob/master/nlp/checklist/checklist_examples.ipynb) From 9c19423d750eaa95742d227a91c982acca55eab7 Mon Sep 17 00:00:00 2001 From: Khuyen Date: Tue, 5 Apr 2022 14:49:00 -0500 Subject: [PATCH 11/15] revert black example --- productive_tools/precommit_examples/long_function.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/productive_tools/precommit_examples/long_function.py b/productive_tools/precommit_examples/long_function.py index 1a749d3..4290a74 100644 --- a/productive_tools/precommit_examples/long_function.py +++ b/productive_tools/precommit_examples/long_function.py @@ -1,9 +1,2 @@ -def very_long_function( - long_variable_name, - long_variable_name2, - long_variable_name3, - long_variable_name4, - long_variable_name5, -): - """A very long
function""" - pass +def very_long_function(long_variable_name, long_variable_name2, long_variable_name3, long_variable_name4, long_variable_name5): + pass \ No newline at end of file From 5759ba3648d6d5068dae1df688b1f76e8cc0015e Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Thu, 7 Apr 2022 20:48:09 -0500 Subject: [PATCH 12/15] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5f8ee18..d4d6bd8 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ git-lfs clone https://github.com/khuyentran1401/Data-science | DagsHub: a GitHub Supplement for Data Scientists and ML Engineers | [🔗](https://towardsdatascience.com/dagshub-a-github-supplement-for-data-scientists-and-ml-engineers-9ecaf49cc505) | [🔗](https://dagshub.com/khuyentran1401/dagshub-demo) | 4 pre-commit Plugins to Automate Code Reviewing and Formatting in Python | [🔗](https://towardsdatascience.com/4-pre-commit-plugins-to-automate-code-reviewing-and-formatting-in-python-c80c6d2e9f5) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/productive_tools/precommit_examples) | BentoML: Create an ML Powered Prediction Service in Minutes | [🔗](https://towardsdatascience.com/bentoml-create-an-ml-powered-prediction-service-in-minutes-23d135d6ca76) | [🔗](https://github.com/khuyentran1401/customer_segmentation/tree/bentoml_demo) +| How to Structure a Data Science Project for Readability and Transparency | [🔗](https://towardsdatascience.com/how-to-structure-a-data-science-project-for-readability-and-transparency-360c6716800) | [🔗](https://github.com/khuyentran1401/data-science-template) # Testing From 1521a66b2a2a0081d04d616f075cec4099a87d48 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Thu, 7 Apr 2022 20:49:25 -0500 Subject: [PATCH 13/15] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d4d6bd8..27d5a7b 100644 --- a/README.md +++ b/README.md
@@ -247,9 +247,9 @@ git-lfs clone https://github.com/khuyentran1401/Data-science | Understand CSV Files from your Terminal with XSV | [🔗](https://towardsdatascience.com/understand-your-csv-files-from-your-terminal-with-xsv-65255ae67293) | Prettify your Terminal Text With Termcolor and Pyfiglet| [🔗](https://towardsdatascience.com/prettify-your-terminal-text-with-termcolor-and-pyfiglet-880de83fda6b) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/python/prettify_terminal_output) | | Stop Using Print to Debug in Python. Use Icecream Instead | [🔗](https://towardsdatascience.com/stop-using-print-to-debug-in-python-use-icecream-instead-79e17b963fcc) -| Rich: Generate Rich and Beautiful Text in the Terminal with Python | [🔗](https://towardsdatascience.com/rich-generate-rich-and-beautiful-text-in-the-terminal-with-python-541f39abf32e#82f6-71c3ea605859-reply) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/terminal/rich) -| Create a Beautiful Dashboard in your Terminal with Wtfutil | [🔗](https://towardsdatascience.com/create-a-beautiful-dashboard-in-your-terminal-with-wtfutil-573424fe3684#a02c-a765cf58b4fd-reply) | [🔗](https://github.com/khuyentran1401/Data-science/blob/master/terminal/wtf/config.yml) -| 3 Tools to Monitor and Optimize your Linux System | [🔗](https://towardsdatascience.com/3-tools-to-monitor-and-optimize-your-linux-system-c8a46c18d692#3e4d-30c6afc3073a-reply) +| Rich: Generate Rich and Beautiful Text in the Terminal with Python | [🔗](https://towardsdatascience.com/rich-generate-rich-and-beautiful-text-in-the-terminal-with-python-541f39abf32e) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/terminal/rich) +| Create a Beautiful Dashboard in your Terminal with Wtfutil | [🔗](https://towardsdatascience.com/create-a-beautiful-dashboard-in-your-terminal-with-wtfutil-573424fe3684) | [🔗](https://github.com/khuyentran1401/Data-science/blob/master/terminal/wtf/config.yml) +| 3
Tools to Monitor and Optimize your Linux System | [🔗](https://towardsdatascience.com/3-tools-to-monitor-and-optimize-your-linux-system-c8a46c18d692) | Ptpython: A Better Python REPL | [🔗](https://towardsdatascience.com/ptpython-a-better-python-repl-6e21df1eb648) | [🔗](https://gist.github.com/khuyentran1401/b5325ff1f3bfe1e36bf9131a0b8cd388) | fd: a Simple but Powerful Tool to Find and Execute Files on the Command Line | [🔗](https://towardsdatascience.com/fd-a-simple-but-powerful-tool-to-find-and-execute-files-on-the-command-line-602f9af235ad) | Speed Up your Command-Line Navigation with These 3 Tools | [🔗](https://towardsdatascience.com/speed-up-your-command-line-navigation-with-these-3-tools-f90105c9aa2b) From 096b2a1f71e5ba1e734794ff0a57e0112983c1a6 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Fri, 8 Apr 2022 17:09:17 -0500 Subject: [PATCH 14/15] Create README.md --- productive_tools/precommit_examples/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 productive_tools/precommit_examples/README.md diff --git a/productive_tools/precommit_examples/README.md b/productive_tools/precommit_examples/README.md new file mode 100644 index 0000000..2f29bca --- /dev/null +++ b/productive_tools/precommit_examples/README.md @@ -0,0 +1 @@ +[![View on Medium](https://img.shields.io/badge/Medium-View%20on%20Medium-blue?logo=medium)](https://towardsdatascience.com/4-pre-commit-plugins-to-automate-code-reviewing-and-formatting-in-python-c80c6d2e9f5) [![View on YouTube](https://img.shields.io/badge/YouTube-Watch%20on%20Youtube-red?logo=youtube)](https://youtube.com/playlist?list=PLnK6m_JBRVNqskWiXLxx1QRDDng9O8Fsf) From 6b1b55a4e56cc666efeda16947a08113aa5c58cd Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Fri, 8 Apr 2022 17:10:03 -0500 Subject: [PATCH 15/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 27d5a7b..f20fa1c 100644 --- a/README.md +++ b/README.md @@ -57,7
+57,7 @@ git-lfs clone https://github.com/khuyentran1401/Data-science | Kedro — A Python Framework for Reproducible Data Science Project | [🔗](https://towardsdatascience.com/kedro-a-python-framework-for-reproducible-data-science-project-4d44977d4f04) | [🔗](https://github.com/khuyentran1401/kedro_demo) | Orchestrate a Data Science Project in Python With Prefect | [🔗](https://towardsdatascience.com/orchestrate-a-data-science-project-in-python-with-prefect-e69c61a49074) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/data_science_tools/prefect_example) | DagsHub: a GitHub Supplement for Data Scientists and ML Engineers | [🔗](https://towardsdatascience.com/dagshub-a-github-supplement-for-data-scientists-and-ml-engineers-9ecaf49cc505) | [🔗](https://dagshub.com/khuyentran1401/dagshub-demo) -| 4 pre-commit Plugins to Automate Code Reviewing and Formatting in Python | [🔗](https://towardsdatascience.com/4-pre-commit-plugins-to-automate-code-reviewing-and-formatting-in-python-c80c6d2e9f5) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/productive_tools/precommit_examples) +| 4 pre-commit Plugins to Automate Code Reviewing and Formatting in Python | [🔗](https://towardsdatascience.com/4-pre-commit-plugins-to-automate-code-reviewing-and-formatting-in-python-c80c6d2e9f5) | [🔗](https://github.com/khuyentran1401/Data-science/tree/master/productive_tools/precommit_examples) | [🔗](https://youtube.com/playlist?list=PLnK6m_JBRVNqskWiXLxx1QRDDng9O8Fsf) | BentoML: Create an ML Powered Prediction Service in Minutes | [🔗](https://towardsdatascience.com/bentoml-create-an-ml-powered-prediction-service-in-minutes-23d135d6ca76) | [🔗](https://github.com/khuyentran1401/customer_segmentation/tree/bentoml_demo) | How to Structure a Data Science Project for Readability and Transparency | [🔗](https://towardsdatascience.com/how-to-structure-a-data-science-project-for-readability-and-transparency-360c6716800) |
[🔗](https://github.com/khuyentran1401/data-science-template)