diff --git a/.gitignore b/.gitignore index 911404f..92b0f4a 100644 --- a/.gitignore +++ b/.gitignore @@ -207,7 +207,7 @@ fabric.properties # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser -notebooks/RetailHero-data/* +notebooks/content/* notebooks/catboost_info notebooks/*.tmp diff --git a/Readme.rst b/Readme.rst index 8dfdb3e..1826fd5 100644 --- a/Readme.rst +++ b/Readme.rst @@ -11,15 +11,18 @@ .. |Docs| image:: https://readthedocs.org/projects/scikit-uplift/badge/?version=latest .. _Docs: https://scikit-uplift.readthedocs.io/en/latest/ -.. _RU: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb -.. _EN: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb - .. |Open In Colab1| image:: https://colab.research.google.com/assets/colab-badge.svg .. _Open In Colab1: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb .. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg .. _Open In Colab2: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb - + +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab3: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb + +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab4: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb + .. _scikit-uplift.readthedocs.io: https://scikit-uplift.readthedocs.io/en/latest/ .. _Part 1: https://habr.com/ru/company/ru_mts/blog/485980/ .. _Part 2: https://habr.com/ru/company/ru_mts/blog/485976/ @@ -50,9 +53,13 @@ More about uplift modelling problem read in russian on habr.com: `Part 1`_ and ` * Applying any estimator adheres to scikit-learn conventions; +* All approaches can be used in sklearn.pipeline (see the example: `EN <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`__ |Open In Colab3|_, `RU <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`__ |Open In Colab4|_); + * Almost all implemented approaches solve both the problem of classification and regression; -* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model. +* A lot of metrics (such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model; + +* Useful graphs for analyzing the built model. Installation ------------- @@ -89,7 +96,7 @@ And if you now point your browser to ``_build/html/index.html``, you should see Quick Start ----------- -See the **RetailHero tutorial notebook** (`EN`_ |Open In Colab1|_, `RU`_ |Open In Colab2|_) for details. +See the **RetailHero tutorial notebook** (`EN <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb>`__ |Open In Colab1|_, `RU <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb>`__ |Open In Colab2|_) for details. **Train and predict uplift model** @@ -157,7 +164,7 @@ Important links - Official source code repo: https://github.com/maks-sh/scikit-uplift/ - Issue tracker: https://github.com/maks-sh/scikit-uplift/issues - +- Release History: https://scikit-uplift.readthedocs.io/en/latest/changelog.html =============== diff --git a/docs/changelog.md b/docs/changelog.md index f5ca79a..977495d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,24 @@ * πŸ”¨ something that previously didn’t work as documented – or according to reasonable expectations – should now work. 
* ❗️ you will need to change your code to have the same effect in the future; or a feature will be removed in the future. +## Version 0.1.1 + +### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) + +* πŸ’₯ Add [plot_uplift_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_treatment_balance_curve). + +### [sklift.metrics](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html) + +* πŸ’₯ Add [response_rate_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.response_rate_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug with import [uplift_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.qini_auc_score). +* πŸ“ Fix typos in docstrings. + +### Miscellaneous + +* πŸ’₯ Add tutorial ["Example of usage model from sklift.models in sklearn.pipeline"](https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb). +* πŸ“ Add link to Release History in main Readme.rst. + ## Version 0.1.0 ### [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) @@ -19,6 +37,7 @@ * πŸ’₯ Add [treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.treatment_balance_curve) by [@spiaz](https://github.com/spiaz). * ❗️ The metrics `auuc` and `auqc` are now respectively renamed to [uplift_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.qini_auc_score). So, `auuc` and `auqc` will be removed in 0.2.0. +* ❗️ Add a new parameter `strategy` in [uplift_at_k](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_at_k). ### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) diff --git a/docs/index.rst b/docs/index.rst index 0973c70..343979e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,12 @@ .. _Part 1: https://habr.com/ru/company/ru_mts/blog/485980/ .. _Part 2: https://habr.com/ru/company/ru_mts/blog/485976/ +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab3: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb + +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab4: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb + ************** scikit-uplift ************** @@ -18,9 +24,13 @@ Features * Applying any estimator adheres to scikit-learn conventions; +* All approaches can be used in sklearn.pipeline (see the example: `EN <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`_ |Open In Colab3|_, `RU <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`_ |Open In Colab4|_); + * Almost all implemented approaches solve both the problem of classification and regression; -* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model. 
+* A lot of metrics (such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model; + +* Useful graphs for analyzing the built model. **The package currently supports the following methods:** diff --git a/docs/tutorials.rst b/docs/tutorials.rst index db202f0..ba09bdc 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -4,6 +4,11 @@ .. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb +.. _Example of usage model from sklift.models in sklearn.pipeline: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb ********** Tutorials ********** @@ -12,10 +17,12 @@ Tutorials Basic ######## -It is better to start scikit-uplift from the basic tutorial. +It is better to start exploring scikit-uplift with the basic tutorials. * `The overview of the basic approaches to solving the Uplift Modeling problem`_ - * In Englsih: `nbviewer `_ | `github `_ |Open In Colab1| - * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| - + * In English: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb>`__ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb>`__ |Open In Colab1| + * In Russian: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb>`__ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb>`__ |Open In Colab2| +* `Example of usage model from sklift.models in sklearn.pipeline`_ + * In English: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`__ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`__ |Open In Colab3| + * In Russian: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`__ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`__ |Open In Colab4| diff --git a/notebooks/Readme.rst b/notebooks/Readme.rst new file mode 100644 index 0000000..c962e1a --- /dev/null +++ b/notebooks/Readme.rst @@ -0,0 +1,28 @@ +.. _The overview of the basic approaches to solving the Uplift Modeling problem: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb +.. |Open In Colab1| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb +.. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb + +.. _Example of usage model from sklift.models in sklearn.pipeline: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb + +********** +Tutorials +********** + +Basic +######## + +It is better to start exploring scikit-uplift with the basic tutorials. 
+ +* `The overview of the basic approaches to solving the Uplift Modeling problem`_ + * In English: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb>`_ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb>`_ |Open In Colab1| + * In Russian: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb>`_ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb>`_ |Open In Colab2| + +* `Example of usage model from sklift.models in sklearn.pipeline`_ + * In English: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`_ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb>`_ |Open In Colab3| + * In Russian: `nbviewer <https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`_ | `github <https://github.com/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb>`_ |Open In Colab4| diff --git a/notebooks/RetailHero.ipynb b/notebooks/RetailHero.ipynb index 837c80e..e2c424c 100644 --- a/notebooks/RetailHero.ipynb +++ b/notebooks/RetailHero.ipynb @@ -107,8 +107,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:07.634319Z", - "start_time": "2020-04-16T20:41:07.630711Z" + "end_time": "2020-04-28T09:24:37.492036Z", + "start_time": "2020-04-28T09:24:37.488584Z" }, "colab": { "base_uri": "https://localhost:8080/", }, "colab_type": "code", "outputs": [], "source": [ "import urllib.request\n", "\n", "url = 'https://drive.google.com/u/0/uc?id=1fkxNmihuS15kk0PP0QcphL_Z3_z8LLeb&export=download'\n", "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", "!unzip /content/retail_hero.zip\n", - "!pip install scikit-uplift==0.1.0 catboost=0.22" + "!pip install scikit-uplift==0.1.1 catboost==0.22" ] }, { @@ -144,8 +144,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:10.715080Z", - "start_time": "2020-04-16T20:41:07.644388Z" + "end_time": "2020-04-28T09:24:40.783897Z", + "start_time": "2020-04-28T09:24:37.503470Z" }, "colab": {}, "colab_type": "code", @@ -196,8 +196,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:11.873021Z", - "start_time": "2020-04-16T20:41:10.717336Z" + "end_time": "2020-04-28T09:24:42.100096Z", + "start_time": "2020-04-28T09:24:40.786498Z" }, "colab": {}, "colab_type": "code", @@ -251,8 +251,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:14.061983Z", - "start_time": "2020-04-16T20:41:11.875565Z" + "end_time": "2020-04-28T09:24:44.483576Z", + "start_time": "2020-04-28T09:24:42.102707Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -313,8 +313,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:14.084652Z", - "start_time": "2020-04-16T20:41:14.064998Z" + "end_time": "2020-04-28T09:24:44.511016Z", + "start_time": "2020-04-28T09:24:44.486035Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -484,8 +484,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:15.254843Z", - "start_time": "2020-04-16T20:41:14.087893Z" + "end_time": "2020-04-28T09:24:45.715602Z", + "start_time": "2020-04-28T09:24:44.514353Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -545,8 +545,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:17.230517Z", - "start_time": "2020-04-16T20:41:15.257156Z" + "end_time": "2020-04-28T09:24:47.941480Z", + "start_time": "2020-04-28T09:24:45.719641Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -632,8 +632,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:19.361999Z", - "start_time": "2020-04-16T20:41:17.233889Z" + "end_time": "2020-04-28T09:24:50.571779Z", + "start_time": "2020-04-28T09:24:47.944822Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -694,8 +694,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:21.519463Z", - "start_time": "2020-04-16T20:41:19.366080Z" + "end_time": "2020-04-28T09:24:52.942803Z", + "start_time": "2020-04-28T09:24:50.576741Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -758,8 +758,8 
@@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:21.536036Z", - "start_time": "2020-04-16T20:41:21.522709Z" + "end_time": "2020-04-28T09:24:52.964396Z", + "start_time": "2020-04-28T09:24:52.945544Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -858,8 +858,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:24.150095Z", - "start_time": "2020-04-16T20:41:21.539068Z" + "end_time": "2020-04-28T09:24:56.505700Z", + "start_time": "2020-04-28T09:24:53.019392Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -906,8 +906,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:24.171639Z", - "start_time": "2020-04-16T20:41:24.153353Z" + "end_time": "2020-04-28T09:24:56.560018Z", + "start_time": "2020-04-28T09:24:56.508541Z" }, "colab": { "base_uri": "https://localhost:8080/", diff --git a/notebooks/RetailHero_EN.ipynb b/notebooks/RetailHero_EN.ipynb index 6c7d5e3..45eacf9 100644 --- a/notebooks/RetailHero_EN.ipynb +++ b/notebooks/RetailHero_EN.ipynb @@ -18,7 +18,7 @@ " SCIKIT-UPLIFT REPO | \n", " SCIKIT-UPLIFT DOCS\n", "
\n", - " RUSSIAN VERSION\n", + " RUSSIAN VERSION\n", "" ] }, @@ -98,8 +98,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:13.495345Z", - "start_time": "2020-04-16T20:45:13.491423Z" + "end_time": "2020-04-28T09:26:59.973637Z", + "start_time": "2020-04-28T09:26:59.969856Z" }, "colab": { "base_uri": "https://localhost:8080/", }, "colab_type": "code", }, "outputs": [], "source": [ - "# import urllib.request\n", + "import urllib.request\n", "\n", - "# url = 'https://drive.google.com/u/0/uc?id=1fkxNmihuS15kk0PP0QcphL_Z3_z8LLeb&export=download'\n", - "# urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", + "url = 'https://drive.google.com/u/0/uc?id=1fkxNmihuS15kk0PP0QcphL_Z3_z8LLeb&export=download'\n", + "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", - "# !unzip /content/retail_hero.zip\n", - "# !pip install scikit-uplift==0.1.0 catboost==0.22" + "!unzip /content/retail_hero.zip\n", + "!pip install scikit-uplift==0.1.1 catboost==0.22" ] }, { @@ -135,8 +135,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:16.518699Z", - "start_time": "2020-04-16T20:45:13.505025Z" + "end_time": "2020-04-28T09:27:03.353098Z", + "start_time": "2020-04-28T09:26:59.984369Z" }, "colab": {}, "colab_type": "code", @@ -151,9 +151,9 @@ "\n", "\n", "# reading data\n", - "df_clients = pd.read_csv('./RetailHero-data/clients.csv', index_col='client_id')\n", - "df_train = pd.read_csv('./RetailHero-data/uplift_train.csv', index_col='client_id')\n", - "df_test = pd.read_csv('./RetailHero-data/uplift_test.csv', index_col='client_id')\n", + "df_clients = pd.read_csv('/content/uplift_data/clients.csv', index_col='client_id')\n", + "df_train = pd.read_csv('/content/uplift_data/uplift_train.csv', index_col='client_id')\n", + "df_test = pd.read_csv('/content/uplift_data/uplift_test.csv', index_col='client_id')\n", "\n", "# extracting features\n", "df_features = df_clients.copy()\n", @@ -187,8 +187,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:17.756422Z", - "start_time": "2020-04-16T20:45:16.521360Z" + "end_time": "2020-04-28T09:27:04.563554Z", + "start_time": "2020-04-28T09:27:03.355432Z" }, "colab": {}, "colab_type": "code", @@ -241,8 +241,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:19.899785Z", - "start_time": "2020-04-16T20:45:17.759288Z" + "end_time": "2020-04-28T09:27:06.789462Z", + "start_time": "2020-04-28T09:27:04.570306Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -305,8 +305,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:19.921897Z", - "start_time": "2020-04-16T20:45:19.902457Z" + "end_time": "2020-04-28T09:27:06.813310Z", + "start_time": "2020-04-28T09:27:06.792837Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -476,8 +476,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:21.009561Z", - "start_time": "2020-04-16T20:45:19.925398Z" + "end_time": "2020-04-28T09:27:07.960855Z", + "start_time": "2020-04-28T09:27:06.816440Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -538,8 +538,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:22.998801Z", - "start_time": "2020-04-16T20:45:21.011706Z" + "end_time": "2020-04-28T09:27:10.062915Z", + "start_time": "2020-04-28T09:27:07.972200Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -627,8 +627,8 @@ "execution_count": 8, 
"metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:25.105146Z", - "start_time": "2020-04-16T20:45:23.001150Z" + "end_time": "2020-04-28T09:27:12.177386Z", + "start_time": "2020-04-28T09:27:10.065653Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -689,8 +689,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:27.262646Z", - "start_time": "2020-04-16T20:45:25.108914Z" + "end_time": "2020-04-28T09:27:14.459477Z", + "start_time": "2020-04-28T09:27:12.181749Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -753,8 +753,8 @@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:27.280762Z", - "start_time": "2020-04-16T20:45:27.265440Z" + "end_time": "2020-04-28T09:27:14.480794Z", + "start_time": "2020-04-28T09:27:14.463302Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -853,8 +853,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:30.063722Z", - "start_time": "2020-04-16T20:45:27.284197Z" + "end_time": "2020-04-28T09:27:17.097949Z", + "start_time": "2020-04-28T09:27:14.483691Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -901,8 +901,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:30.101502Z", - "start_time": "2020-04-16T20:45:30.067156Z" + "end_time": "2020-04-28T09:27:17.138755Z", + "start_time": "2020-04-28T09:27:17.101433Z" }, "colab": { "base_uri": "https://localhost:8080/", diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb new file mode 100644 index 0000000..5627162 --- /dev/null +++ b/notebooks/pipeline_usage_EN.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of usage model from sklift.models in sklearn.pipeline\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS\n", + "
\n", + " RUSSIAN VERSION\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T12:44:35.435852Z", + "start_time": "2020-04-26T12:44:35.239050Z" + } + }, + "source": [ + "This is a simple example on how to use [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) with [sklearn.pipeline](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).\n", + "\n", + "The data is taken from [MineThatData E-Mail Analytics And Data Mining Challenge dataset by Kevin Hillstrom](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test:\n", + "* 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise.\n", + "* 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise.\n", + "* 1/3 were randomly chosen to not receive an e-mail campaign.\n", + "\n", + "During a period of two weeks following the e-mail campaign, results were tracked. The task is to tell the world if the Mens or Womens e-mail campaign was successful.\n", + "\n", + "The full description of the dataset can be found at the [link](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "Firstly, install the necessary libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:10.044500Z", + "start_time": "2020-04-28T09:14:10.037045Z" + } + }, + "outputs": [], + "source": [ + "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Secondly, load the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:14.101729Z", + "start_time": "2020-04-28T09:14:10.048276Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./content/Hilstorm.csv', )" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import urllib.request\n", + "import pandas as pd\n", + "\n", + "\n", + "csv_path = '/content/Hilstorm.csv'\n", + "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", + "urllib.request.urlretrieve(url, csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For simplicity of the example, we will leave only two user segments:\n", + "* those who were sent an e-mail advertising campaign with women's products;\n", + "* those who were not sent out the ad campaign.\n", + "\n", + "We will use the `visit` variable as the target variable." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:14.638574Z", + "start_time": "2020-04-28T09:14:14.106920Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of the dataset before processing: (64000, 12)\n", + "Shape of the dataset after processing: (42693, 10)\n" + ] + }, + { + "data": { + "text/html": [ + "
(stripped HTML rendering of dataset.head(); the same rows appear in the text/plain output below)" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie channel \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n", + "\n", + " visit treatment \n", + "0 0 1 \n", + "1 0 0 \n", + "2 0 1 \n", + "4 0 1 \n", + "5 1 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "%matplotlib inline\n", + "\n", + "dataset = pd.read_csv(csv_path)\n", + "print(f'Shape of the dataset before processing: {dataset.shape}')\n", + "dataset = dataset[dataset['segment']!='Mens E-Mail']\n", + "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0\n", + "})\n", + "\n", + "dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n", + "print(f'Shape of the dataset after processing: {dataset.shape}')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the data into training and validation samples:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:15.419422Z", + "start_time": "2020-04-28T09:14:14.642252Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", + "\n", + "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", + "y_tr = Xyt_tr['visit']\n", + "treat_tr = Xyt_tr['treatment']\n", + "\n", + "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", + "y_val = Xyt_val['visit']\n", + "treat_val = Xyt_val['treatment']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select categorical features:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:15.432871Z", + "start_time": "2020-04-28T09:14:15.421629Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['history_segment', 'zip_code', 'channel']\n" + ] + } + ], + "source": [ + "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()\n", + "print(cat_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the necessary objects and combine them into a pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:15.562848Z", + "start_time": "2020-04-28T09:14:15.437086Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from category_encoders import CatBoostEncoder\n", + "from sklift.models import ClassTransformation\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "encoder = CatBoostEncoder(cols=cat_cols)\n", + "estimator = XGBClassifier(max_depth=2, random_state=42)\n", + "ct = ClassTransformation(estimator=estimator)\n", + "\n", + "my_pipeline = Pipeline([\n", + " ('encoder', encoder),\n", + " ('model', ct)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the pipeline as usual, passing the treatment column to the model step as the parameter `model__treatment`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:16.816735Z", + "start_time": "2020-04-28T09:14:15.568337Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", + " self._final_estimator.fit(Xt, y, **fit_params)\n" + ] + } + ], + "source": [ + "my_pipeline = my_pipeline.fit(\n", + " X=X_tr,\n", + " y=y_tr,\n", + " model__treatment=treat_tr\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T18:07:44.970856Z", + "start_time": "2020-04-26T18:07:44.964624Z" + } + }, + "source": [ + "Predict the uplift and calculate the uplift@30%" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:14:16.960683Z", + "start_time": "2020-04-28T09:14:16.819117Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@30%: 0.0661\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "\n", + "uplift_predictions = my_pipeline.predict(X_val)\n", + "\n", + "uplift_30 = uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')\n", + "print(f'uplift@30%: {uplift_30:.4f}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb new file mode 100644 index 0000000..5e306b5 --- /dev/null +++ b/notebooks/pipeline_usage_RU.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ΠŸΡ€ΠΈΠΌΠ΅Ρ€ использованиС ΠΏΠΎΠ΄Ρ…ΠΎΠ΄ΠΎΠ² ΠΈΠ· sklift.models Π² sklearn.pipeline\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS\n", + "
\n", + " ENGLISH VERSION\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π’ Π΄Π°Π½Π½ΠΎΠΌ Π½ΠΎΡƒΡ‚Π±ΡƒΠΊΠ΅ рассмотрим простой ΠΏΡ€ΠΈΠΌΠ΅Ρ€ примСнСния ΠΎΠ΄Π½ΠΎΠ³ΠΎ ΠΈΠ· ΠΏΠΎΠ΄Ρ…ΠΎΠ΄ΠΎΠ² прогнозирования uplift Π² [sklearn.pipeline](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).\n", + "\n", + "Π”Π°Π½Π½Ρ‹Π΅ для ΠΏΡ€ΠΈΠΌΠ΅Ρ€Π° взяты ΠΈΠ· [MineThatData E-Mail Analytics And Data Mining Challenge dataset by Kevin Hillstrom](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html). Π­Ρ‚ΠΎΡ‚ Π½Π°Π±ΠΎΡ€ Π΄Π°Π½Π½Ρ‹Ρ… содСрТит 64 000 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ², ΠΊΠΎΡ‚ΠΎΡ€Ρ‹Π΅ Π² послСдний Ρ€Π°Π· ΡΠΎΠ²Π΅Ρ€ΡˆΠ°Π»ΠΈ ΠΏΠΎΠΊΡƒΠΏΠΊΠΈ Π² Ρ‚Π΅Ρ‡Π΅Π½ΠΈΠ΅ Π΄Π²Π΅Π½Π°Π΄Ρ†Π°Ρ‚ΠΈ мСсяцСв. Π‘Ρ€Π΅Π΄ΠΈ ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»Π° ΠΏΡ€ΠΎΠ²Π΅Π΄Π΅Π½Π° рСкламная кампания с ΠΏΠΎΠΌΠΎΡ‰ΡŒΡŽ email рассылки:\n", + "\n", + "* 1/3 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»ΠΈ Π²Ρ‹Π±Ρ€Π°Π½Ρ‹ случайным ΠΎΠ±Ρ€Π°Π·ΠΎΠΌ для получСния элСктронного письма, Ρ€Π΅ΠΊΠ»Π°ΠΌΠΈΡ€ΡƒΡŽΡ‰Π΅Π³ΠΎ ΠΌΡƒΠΆΡΠΊΡƒΡŽ ΠΏΡ€ΠΎΠ΄ΡƒΠΊΡ†ΠΈΡŽ;\n", + "* 1/3 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»ΠΈ Π²Ρ‹Π±Ρ€Π°Π½Ρ‹ случайным ΠΎΠ±Ρ€Π°Π·ΠΎΠΌ для получСния элСктронного письма, Ρ€Π΅ΠΊΠ»Π°ΠΌΠΈΡ€ΡƒΡŽΡ‰Π΅Π³ΠΎ ΠΆΠ΅Π½ΡΠΊΡƒΡŽ ΠΏΡ€ΠΎΠ΄ΡƒΠΊΡ†ΠΈΡŽ;\n", + "* Π‘ ΠΎΡΡ‚Π°Π²ΡˆΠ΅ΠΉΡΡ 1/3 ΠΊΠΎΠΌΠΌΡƒΠ½ΠΈΠΊΠ°Ρ†ΠΈΡŽ Π½Π΅ ΠΏΡ€ΠΎΠ²ΠΎΠ΄ΠΈΠ»ΠΈ.\n", + "\n", + "Для ΠΊΠ°ΠΆΠ΄ΠΎΠ³ΠΎ ΠΊΠ»ΠΈΠ΅Π½Ρ‚Π° ΠΈΠ· Π²Ρ‹Π±ΠΎΡ€ΠΊΠΈ Π·Π°ΠΌΠ΅Ρ€ΠΈΠ»ΠΈ Ρ„Π°ΠΊΡ‚ ΠΏΠ΅Ρ€Π΅Ρ…ΠΎΠ΄Π° ΠΏΠΎ ссылкС Π² письмС, Ρ„Π°ΠΊΡ‚ ΡΠΎΠ²Π΅Ρ€ΡˆΠ΅Π½ΠΈΡ ΠΏΠΎΠΊΡƒΠΏΠΊΠΈ ΠΈ сумму Ρ‚Ρ€Π°Ρ‚ Π·Π° Π΄Π²Π΅ Π½Π΅Π΄Π΅Π»ΠΈ, слСдущими послС получСния письма.\n", + "\n", + "ПолноС описаниС датасСта ΠΌΠΎΠΆΠ½ΠΎΠΉ Π½Π°ΠΉΡ‚ΠΈ ΠΏΠΎ [ссылкС](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "Установим Π½Π΅ΠΎΠ±Ρ…ΠΎΠ΄ΠΈΠΌΡ‹Π΅ Π±ΠΈΠ±Π»ΠΈΠΎΡ‚Π΅ΠΊΠΈ:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:06.767010Z", + "start_time": "2020-04-28T09:23:06.762871Z" + } + }, + "outputs": [], + "source": [ + "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T14:28:36.188277Z", + "start_time": "2020-04-26T14:28:36.106561Z" + } + }, + "source": [ + "Π—Π°Π³Ρ€ΡƒΠ·ΠΈΠΌ Π΄Π°Π½Π½Ρ‹Π΅:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:10.820274Z", + "start_time": "2020-04-28T09:23:06.776057Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./content/Hilstorm.csv', )" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import urllib.request\n", + "import pandas as pd\n", + "\n", + "\n", + "csv_path = '/content/Hilstorm.csv'\n", + "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", + "urllib.request.urlretrieve(url, csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для простоты ΠΏΡ€ΠΈΠΌΠ΅Ρ€Π° оставим Ρ‚ΠΎΠ»ΡŒΠΊΠΎ Π΄Π²Π° сСгмСнта ΠΏΠΎΠ»ΡŒΠ·ΠΎΠ²Π°Ρ‚Π΅Π»Π΅ΠΉ:\n", + "* Ρ‚Π΅ΠΌ, ΠΊΠΎΠΌΡƒ Ρ€Π°ΡΡΡ‹Π»Π°Π»Π°ΡΡŒ ΠΏΠΎ элСктронной ΠΏΠΎΡ‡Ρ‚Π΅ рСкламная кампания с участиСм ТСнских Ρ‚ΠΎΠ²Π°Ρ€ΠΎΠ²;\n", + "* Ρ‚Π΅ΠΌ, ΠΊΠΎΠΌΡƒ Π½Π΅ Ρ€Π°ΡΡΡ‹Π»Π°Π»Π°ΡΡŒ рСкламная кампания.\n", + "\n", + "Π’ качСствС Ρ†Π΅Π»Π΅Π²ΠΎΠΉ ΠΏΠ΅Ρ€Π΅ΠΌΠ΅Π½Π½ΠΎΠΉ Π±ΡƒΠ΄Π΅ΠΌ ΠΈΡΠΏΠΎΠ»ΡŒΠ·ΠΎΠ²Π°Ρ‚ΡŒ ΠΏΠ΅Ρ€Π΅ΠΌΠ΅Π½Π½ΡƒΡŽ `visit`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:11.319111Z", + "start_time": "2020-04-28T09:23:10.823302Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Π Π°Π·ΠΌΠ΅Ρ€ датасСта Π΄ΠΎ ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: (64000, 12)\n", + "Π Π°Π·ΠΌΠ΅Ρ€ датасСта послС ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: (42693, 10)\n" + ] + }, + { + "data": { + "text/html": [ + "
(stripped HTML rendering of dataset.head(); the same rows appear in the text/plain output below)" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie channel \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n", + "\n", + " visit treatment \n", + "0 0 1 \n", + "1 0 0 \n", + "2 0 1 \n", + "4 0 1 \n", + "5 1 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd; pd.set_option('display.max_columns', None)\n", + "\n", + "\n", + "%matplotlib inline\n", + "\n", + "dataset = pd.read_csv(csv_path)\n", + "print(f'Π Π°Π·ΠΌΠ΅Ρ€ датасСта Π΄ΠΎ ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: {dataset.shape}')\n", + "dataset = dataset[dataset['segment']!='Mens E-Mail']\n", + "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0\n", + "})\n", + "\n", + "dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n", + "print(f'Π Π°Π·ΠΌΠ΅Ρ€ датасСта послС ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: {dataset.shape}')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π Π°Π·ΠΎΠ±ΡŒΠ΅ΠΌ всС Π΄Π°Π½Π½Ρ‹Π΅ Π½Π° ΠΎΠ±ΡƒΡ‡Π°ΡŽΡ‰ΡƒΡŽ ΠΈ Π²Π°Π»ΠΈΠ΄Π°Ρ†ΠΈΠΎΠ½Π½ΡƒΡŽ Π²Ρ‹Π±ΠΎΡ€ΠΊΠΈ:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:11.919143Z", + "start_time": "2020-04-28T09:23:11.324058Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", + "\n", + "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", + "y_tr = Xyt_tr['visit']\n", + "treat_tr = Xyt_tr['treatment']\n", + "\n", + "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", + "y_val = Xyt_val['visit']\n", + "treat_val = Xyt_val['treatment']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π’Ρ‹Π΄Π΅Π»ΠΈΠΌ ΠΊΠ°Ρ‚Π΅Π³ΠΎΡ€ΠΈΠ°Π»ΡŒΠ½Ρ‹Π΅ ΠΏΡ€ΠΈΠ·Π½Π°ΠΊΠΈ:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:11.936024Z", + "start_time": "2020-04-28T09:23:11.921716Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['history_segment', 'zip_code', 'channel']\n" + ] + } + ], + "source": [ + "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()\n", + "print(cat_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π‘ΠΎΠ·Π΄Π°Π΄ΠΈΠΌ Π½ΡƒΠΆΠ½Ρ‹Π΅ ΠΎΠ±ΡŠΠ΅ΠΊΡ‚Ρ‹ ΠΈ объСдиним ΠΈΡ… Π² pipeline."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:12.019176Z", + "start_time": "2020-04-28T09:23:11.939728Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from category_encoders import CatBoostEncoder\n", + "from sklift.models import ClassTransformation\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "encoder = CatBoostEncoder(cols=cat_cols)\n", + "estimator = XGBClassifier(max_depth=2, random_state=42)\n", + "ct = ClassTransformation(estimator=estimator)\n", + "\n", + "my_pipeline = Pipeline([\n", + " ('encoder', encoder),\n", + " ('model', ct)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T18:02:52.236917Z", + "start_time": "2020-04-26T18:02:52.110138Z" + } + }, + "source": [ + "ΠžΠ±ΡƒΡ‡Π°Ρ‚ΡŒ pipeline Π±ΡƒΠ΄Π΅ΠΌ ΠΊΠ°ΠΊ ΠΎΠ±Ρ‹Ρ‡Π½ΠΎ, Π½ΠΎ ΠΊΠΎΠ»ΠΎΠ½ΠΊΡƒ treatment Π΄ΠΎΠ±Π°Π²ΠΈΠΌ ΠΊΠ°ΠΊ ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€ шага model: `model__treatment`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:13.244343Z", + "start_time": "2020-04-28T09:23:12.021559Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", + " self._final_estimator.fit(Xt, y, **fit_params)\n" + ] + } + ], + "source": [ + "my_pipeline = my_pipeline.fit(\n", + " X=X_tr,\n", + " y=y_tr,\n", + " model__treatment=treat_tr\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ΠŸΡ€Π΅Π΄ΡΠΊΠ°ΠΆΠ΅ΠΌ uplift ΠΈ посчитаСм uplift@30%" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-28T09:23:13.383577Z", + "start_time": "2020-04-28T09:23:13.246513Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@30%: 0.0661\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "\n", + "uplift_predictions = my_pipeline.predict(X_val)\n", + "\n", + "uplift_30 = uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')\n", + "print(f'uplift@30%: {uplift_30:.4f}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sklift/__init__.py b/sklift/__init__.py index b794fd4..df9144c 100644 --- a/sklift/__init__.py +++ b/sklift/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = '0.1.1' diff --git a/sklift/metrics/__init__.py b/sklift/metrics/__init__.py index de53391..097ba43 100644 --- a/sklift/metrics/__init__.py +++ b/sklift/metrics/__init__.py @@ -1,3 +1,9 @@ -from .metrics import uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve +from .metrics import ( + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, response_rate_by_percentile, treatment_balance_curve, + uplift_auc_score, qini_auc_score +) -__all__ = [uplift_curve, auuc, qini_curve, 
auqc, uplift_at_k, treatment_balance_curve] \ No newline at end of file +__all__ = [ + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, response_rate_by_percentile, treatment_balance_curve, + uplift_auc_score, qini_auc_score +] diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index c54b2c9..efbf185 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -12,7 +12,7 @@ def uplift_curve(y_true, uplift, treatment): area under the Uplift Curve, see :func:`uplift_auc_score`. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -48,7 +48,7 @@ def uplift_curve(y_true, uplift, treatment): num_ctrl = num_all - num_trmnt y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices] - curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) -\ + curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) - np.divide(y_ctrl, num_ctrl, out=np.zeros_like(y_ctrl), where=num_ctrl != 0)) * num_all if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0: @@ -67,7 +67,7 @@ def qini_curve(y_true, uplift, treatment): area under the Qini Curve, see :func:`qini_auc_score`. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -120,7 +120,7 @@ def uplift_auc_score(y_true, uplift, treatment): """Compute Area Under the Uplift Curve from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -136,7 +136,7 @@ def auuc(y_true, uplift, treatment): """Compute Area Under the Uplift Curve from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -160,7 +160,7 @@ def qini_auc_score(y_true, uplift, treatment): """Compute Area Under the Qini Curve (aka Qini coefficient) from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -176,7 +176,7 @@ def auqc(y_true, uplift, treatment): """Compute Area Under the Qini Curve (aka Qini coefficient) from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -200,12 +200,12 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): """Compute uplift at first k percentage of the total sample. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. 
k (float or int): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the computation of uplift. If int, represents the absolute number of samples. - strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Defaults to 'first'. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. * ``'overall'``: The first step is taking the first k observations of all test data ordered by uplift prediction @@ -237,7 +237,7 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): ) n_samples = len(y_true) - order = np.argsort(uplift)[::-1] + order = np.argsort(uplift, kind='mergesort')[::-1] _, treatment_counts = np.unique(treatment, return_counts=True) n_samples_ctrl = treatment_counts[0] n_samples_trmnt = treatment_counts[1] @@ -247,8 +247,8 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): if (k_type == 'i' and (k >= n_samples or k <= 0) or k_type == 'f' and (k <= 0 or k >= 1)): raise ValueError(f'k={k} should be either positive and smaller' - ' than the number of samples {n_samples} or a float in the ' - '(0, 1) range') + f' than the number of samples {n_samples} or a float in the ' + f'(0, 1) range') if k_type not in ('i', 'f'): raise ValueError(f'Invalid value for k: {k_type}') @@ -289,6 +289,86 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): return score_trmnt - score_ctrl +def response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins=10): + """Compute response rate (target mean in the control or treatment group) at each percentile. + + Args: + y_true (1d array-like): Correct (true) target values. + uplift (1d array-like): Predicted uplift, as returned by a model. + treatment (1d array-like): Treatment labels. + group (string, ['treatment', 'control']): Group type for computing response rate: treatment or control. + * ``'treatment'``: + Values equal 1 in the treatment column. + * ``'control'``: + Values equal 0 in the treatment column. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: + The first step is taking the first k observations of all test data ordered by uplift prediction + (overall both groups - control and treatment) and conversions in treatment and control groups + calculated only on them. Then the difference between these conversions is calculated. + * ``'by_group'``: + Separately calculates conversions in top k observations in each group (control and treatment) + sorted by uplift predictions. Then the difference between these conversions is calculated + bins (int): Determines the number of bins (and relative percentile) in the test data. + + Returns: + array: Response rate at each percentile for control or treatment group + array: Variance of the response rate at each percentile + """ + + group_types = ['treatment', 'control'] + strategy_methods = ['overall', 'by_group'] + + n_samples = len(y_true) + check_consistent_length(y_true, uplift, treatment) + + if group not in group_types: + raise ValueError(f'Response rate supports only group types in {group_types},' + f' got {group}.') + + if strategy not in strategy_methods: + raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' + f' got {strategy}.') + + if not isinstance(bins, int) or bins <= 0: + raise ValueError(f'bins should be positive integer.' 
+ f' Invalid value bins: {bins}') + + if bins >= n_samples: + raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + + if bins == 1: + warnings.warn(f'You will get the only one bin of {n_samples} samples' + f' which is the length of y_true.' + f'\nPlease consider using uplift_at_k function instead', + UserWarning) + + y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) + order = np.argsort(uplift, kind='mergesort')[::-1] + + if group == 'treatment': + trmnt_flag = 1 + else: # group == 'control' + trmnt_flag = 0 + + if strategy == 'overall': + y_true_bin = np.array_split(y_true[order], bins) + trmnt_bin = np.array_split(treatment[order], bins) + + group_size = np.array([len(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) + response_rate = np.array([np.mean(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) + + else: # strategy == 'by_group' + y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag], bins) + + group_size = np.array([len(y) for y in y_bin]) + response_rate = np.array([np.mean(y) for y in y_bin]) + + variance = np.multiply(response_rate, np.divide((1 - response_rate), group_size)) + + return response_rate, variance + + def treatment_balance_curve(uplift, treatment, winsize): """Compute the treatment balance curve: proportion of treatment group in the ordered predictions. diff --git a/sklift/viz/__init__.py b/sklift/viz/__init__.py index f00ab0c..10601ab 100644 --- a/sklift/viz/__init__.py +++ b/sklift/viz/__init__.py @@ -1,3 +1,3 @@ -from .base import plot_uplift_preds, plot_uplift_qini_curves +from .base import plot_uplift_preds, plot_uplift_qini_curves, plot_uplift_by_percentile, plot_treatment_balance_curve -__all__ = [plot_uplift_preds, plot_uplift_qini_curves] +__all__ = [plot_uplift_preds, plot_uplift_qini_curves, plot_uplift_by_percentile, plot_treatment_balance_curve] diff --git a/sklift/viz/base.py b/sklift/viz/base.py index b1307cc..2d2bb75 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -1,6 +1,8 @@ +import warnings import matplotlib.pyplot as plt import numpy as np +from sklearn.utils.validation import check_consistent_length -from ..metrics import uplift_curve, auuc, qini_curve, auqc, treatment_balance_curve +from ..metrics import uplift_curve, auuc, qini_curve, auqc, response_rate_by_percentile, treatment_balance_curve def plot_uplift_preds(trmnt_preds, ctrl_preds, log=False, bins=100): @@ -89,12 +91,12 @@ axes[0].plot(x_up_perfect, y_up_perfect, label='Perfect', color='red') axes[1].plot(x_qi_perfect, y_qi_perfect, label='Perfect', color='red') - axes[0].legend() + axes[0].legend(loc='upper left') axes[0].set_title(f'Uplift curve: AUUC={auuc(y_true, uplift, treatment):.2f}') axes[0].set_xlabel('Number targeted') axes[0].set_ylabel('Relative gain: treatment - control') - axes[1].legend() + axes[1].legend(loc='upper left') axes[1].set_title(f'Qini curve: AUQC={auqc(y_true, uplift, treatment):.2f}') axes[1].set_xlabel('Number targeted') axes[1].set_ylabel('Number of incremental outcome') @@ -102,6 +104,82 @@ return axes +def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, bins=10): + """Plot Uplift score at each percentile, + Treatment response rate (target mean in the treatment group) + and Control response rate (target mean in the control group) at each percentile. 
+ + Args: + y_true (1d array-like): Correct (true) target values. + uplift (1d array-like): Predicted uplift, as returned by a model. + treatment (1d array-like): Treatment labels. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: + The first step is taking the first k observations of all test data ordered by uplift prediction + (overall both groups - control and treatment) and conversions in treatment and control groups + calculated only on them. Then the difference between these conversions is calculated. + * ``'by_group'``: + Separately calculates conversions in top k observations in each group (control and treatment) + sorted by uplift predictions. Then the difference between these conversions is calculated + bins (int): Determines the number of bins (and relative percentile) in the test data. + + Returns: + Axes with the uplift and response rate curves by percentile. + """ + + strategy_methods = ['overall', 'by_group'] + + n_samples = len(y_true) + check_consistent_length(y_true, uplift, treatment) + + if strategy not in strategy_methods: + raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' + f' got {strategy}.') + + if not isinstance(bins, int) or bins <= 0: + raise ValueError(f'bins should be positive integer.' + f' Invalid value bins: {bins}') + + if bins >= n_samples: + raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + + if bins == 1: + warnings.warn(f'You will get the only one bin of {n_samples} samples' + f' which is the length of y_true.' + f'\nPlease consider using uplift_at_k function instead', + UserWarning) + + rspns_rate_trmnt, var_trmnt = response_rate_by_percentile(y_true, uplift, + treatment, group='treatment', + strategy=strategy, bins=bins) + + rspns_rate_ctrl, var_ctrl = response_rate_by_percentile(y_true, uplift, + treatment, group='control', + strategy=strategy, bins=bins) + + uplift_score, uplift_variance = np.subtract(rspns_rate_trmnt, rspns_rate_ctrl), np.add(var_trmnt, var_ctrl) + + percentiles = [p * 100 / bins for p in range(1, bins + 1)] + + _, axes = plt.subplots(ncols=1, nrows=1, figsize=(8, 6)) + + axes.errorbar(percentiles, uplift_score, yerr=np.sqrt(uplift_variance), + linewidth=2, color='red', label='uplift') + axes.errorbar(percentiles, rspns_rate_trmnt, yerr=np.sqrt(var_trmnt), + linewidth=2, color='forestgreen', label='treatment\nresponse rate') + axes.errorbar(percentiles, rspns_rate_ctrl, yerr=np.sqrt(var_ctrl), + linewidth=2, color='orange', label='control\nresponse rate') + axes.fill_between(percentiles, rspns_rate_ctrl, rspns_rate_trmnt, alpha=0.1, color='red') + + axes.set_xticks(percentiles) + axes.legend(loc='upper right') + axes.set_title('Uplift by percentile') + axes.set_xlabel('Percentile') + axes.set_ylabel('Uplift = treatment response rate - control response rate') + + return axes + + def plot_treatment_balance_curve(uplift, treatment, random=True, winsize=0.1): """Plot Treatment Balance curve. @@ -132,6 +210,6 @@ axes.legend() axes.set_title('Treatment balance curve') axes.set_xlabel('Percentage targeted') - axes.set_ylabel('Balance: treatment / (treatment + control') + axes.set_ylabel('Balance: treatment / (treatment + control)') return axes
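
For readers skimming the diff: the pipeline tutorials above rely on sklearn.pipeline's "<step name>__<parameter name>" convention for routing fit parameters to a single step. Below is a minimal sketch of that pattern, not part of the diff itself; the synthetic data and the stock scikit-learn estimator (LogisticRegression in place of the notebooks' XGBClassifier and CatBoostEncoder) are assumptions for illustration, while `ClassTransformation` and the `model__treatment` routing are exactly what the notebooks use.

```python
# Sketch: a sklift model as the final Pipeline step (pattern from pipeline_usage_EN.ipynb).
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklift.models import ClassTransformation

rng = np.random.RandomState(42)
X = rng.normal(size=(1000, 4))               # synthetic features
treatment = rng.binomial(1, 0.5, size=1000)  # roughly balanced random assignment
# synthetic binary outcome with a small positive treatment effect
y = rng.binomial(1, 0.3 + 0.1 * treatment * (X[:, 0] > 0))

my_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', ClassTransformation(estimator=LogisticRegression())),
])

# "model__treatment" tells Pipeline.fit to pass treatment=... to fit() of the
# step named "model"; the scaler step never sees it. As in the notebooks,
# ClassTransformation may warn if the treatment groups are not exactly balanced.
my_pipeline = my_pipeline.fit(X, y, model__treatment=treatment)

uplift = my_pipeline.predict(X)  # predicted uplift per observation
```

The same routing works for any fit parameter of any named step, which is why the notebooks can keep an encoder in front of the model without special handling.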
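A companion sketch for the 0.1.1 additions introduced in this diff (`response_rate_by_percentile`, `plot_uplift_by_percentile`, and `uplift_at_k` with its `strategy` parameter), reusing `y`, `uplift`, and `treatment` from the sketch above. The per-bin variance returned by `response_rate_by_percentile` is the binomial-proportion variance p * (1 - p) / n computed in `metrics.py`, and the plot's error bars are the square roots of the summed variances.

```python
import matplotlib.pyplot as plt

from sklift.metrics import response_rate_by_percentile, uplift_at_k
from sklift.viz import plot_uplift_by_percentile

# Response rate (mean of y) and its variance p*(1-p)/n per decile in the
# treatment group; 'overall' ranks all rows together by predicted uplift.
rate_trmnt, var_trmnt = response_rate_by_percentile(
    y, uplift, treatment, group='treatment', strategy='overall', bins=10)

# The same quantities for the control group.
rate_ctrl, var_ctrl = response_rate_by_percentile(
    y, uplift, treatment, group='control', strategy='overall', bins=10)

# Per-decile uplift is the difference of the two response rates; the plot
# draws uplift and both response rates with error bars and returns the axes.
axes = plot_uplift_by_percentile(y, uplift, treatment, strategy='overall', bins=10)
plt.show()

# uplift@30% with the 'overall' strategy, as in the tutorial notebooks.
print(f"uplift@30%: {uplift_at_k(y, uplift, treatment, strategy='overall', k=0.3):.4f}")
```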