From 864f358a1c6abdb4cab680c5cfaf834e90d48395 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 13:38:01 +0300 Subject: [PATCH 01/20] :hammer: Fix import bug with uplift_auc_score and qini_auc_score --- sklift/metrics/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklift/metrics/__init__.py b/sklift/metrics/__init__.py index de53391..b4623c5 100644 --- a/sklift/metrics/__init__.py +++ b/sklift/metrics/__init__.py @@ -1,3 +1,9 @@ -from .metrics import uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve +from .metrics import ( + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve, + uplift_auc_score, qini_auc_score +) -__all__ = [uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve] \ No newline at end of file +__all__ = [ + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve, + uplift_auc_score, qini_auc_score +] From 4c22a3d1569c9abe99df862ebe47ddc20dbc5d1a Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 13:44:44 +0300 Subject: [PATCH 02/20] :hammer: Fix import bug plot_treatment_balance_curve --- sklift/viz/__init__.py | 4 ++-- sklift/viz/base.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklift/viz/__init__.py b/sklift/viz/__init__.py index f00ab0c..8ed7d5c 100644 --- a/sklift/viz/__init__.py +++ b/sklift/viz/__init__.py @@ -1,3 +1,3 @@ -from .base import plot_uplift_preds, plot_uplift_qini_curves +from .base import plot_uplift_preds, plot_uplift_qini_curves, plot_treatment_balance_curve -__all__ = [plot_uplift_preds, plot_uplift_qini_curves] +__all__ = [plot_uplift_preds, plot_uplift_qini_curves, plot_treatment_balance_curve] diff --git a/sklift/viz/base.py b/sklift/viz/base.py index b1307cc..bd4e648 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -132,6 +132,6 @@ def plot_treatment_balance_curve(uplift, treatment, random=True, winsize=0.1): 
axes.legend() axes.set_title('Treatment balance curve') axes.set_xlabel('Percentage targeted') - axes.set_ylabel('Balance: treatment / (treatment + control') + axes.set_ylabel('Balance: treatment / (treatment + control)') return axes From aa9ef3de55fd14bd37aa624e9f3b22b2393b8a6d Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 13:54:49 +0300 Subject: [PATCH 03/20] :memo: Fix docstring in uplift_at_k and add mergesort in np.argsort --- sklift/metrics/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index c54b2c9..f2ab22a 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -205,7 +205,7 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): treatment (1d array-like): Treatment labels. k (float or int): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the computation of uplift. If int, represents the absolute number of samples. - strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Defaults to 'first'. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. 
* ``'overall'``: The first step is taking the first k observations of all test data ordered by uplift prediction @@ -237,7 +237,7 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): ) n_samples = len(y_true) - order = np.argsort(uplift)[::-1] + order = np.argsort(uplift, kind='mergesort')[::-1] _, treatment_counts = np.unique(treatment, return_counts=True) n_samples_ctrl = treatment_counts[0] n_samples_trmnt = treatment_counts[1] From 58faf71a1132eb9efe76316060bf00084388a599 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 14:12:00 +0300 Subject: [PATCH 04/20] :memo: Fix y_true description in docstrings --- sklift/metrics/metrics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index f2ab22a..4008b10 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -12,7 +12,7 @@ def uplift_curve(y_true, uplift, treatment): area under the Uplift Curve, see :func:`uplift_auc_score`. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -48,7 +48,7 @@ def uplift_curve(y_true, uplift, treatment): num_ctrl = num_all - num_trmnt y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices] - curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) -\ + curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) - np.divide(y_ctrl, num_ctrl, out=np.zeros_like(y_ctrl), where=num_ctrl != 0)) * num_all if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0: @@ -67,7 +67,7 @@ def qini_curve(y_true, uplift, treatment): area under the Qini Curve, see :func:`qini_auc_score`. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. 
uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -120,7 +120,7 @@ def uplift_auc_score(y_true, uplift, treatment): """Compute Area Under the Uplift Curve from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -136,7 +136,7 @@ def auuc(y_true, uplift, treatment): """Compute Area Under the Uplift Curve from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -160,7 +160,7 @@ def qini_auc_score(y_true, uplift, treatment): """Compute Area Under the Qini Curve (aka Qini coefficient) from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -176,7 +176,7 @@ def auqc(y_true, uplift, treatment): """Compute Area Under the Qini Curve (aka Qini coefficient) from prediction scores. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. @@ -200,7 +200,7 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): """Compute uplift at first k percentage of the total sample. Args: - y_true (1d array-like): Ground truth (correct) labels. + y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. 
k (float or int): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset From af6b6340a4769405748e492a963249e926ad0019 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 15:08:30 +0300 Subject: [PATCH 05/20] :memo: Fix f-string in raise in uplift_at_k --- sklift/metrics/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index 4008b10..68b7540 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -247,8 +247,8 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): if (k_type == 'i' and (k >= n_samples or k <= 0) or k_type == 'f' and (k <= 0 or k >= 1)): raise ValueError(f'k={k} should be either positive and smaller' - ' than the number of samples {n_samples} or a float in the ' - '(0, 1) range') + f' than the number of samples {n_samples} or a float in the ' + f'(0, 1) range') if k_type not in ('i', 'f'): raise ValueError(f'Invalid value for k: {k_type}') From 462d0ec7f41ffd4eebb3e5bfe78facacd2657605 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 15:11:24 +0300 Subject: [PATCH 06/20] :memo: Fix legend location in plot_uplift_qini_curves --- sklift/viz/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index bd4e648..c7e3d9f 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -89,12 +89,12 @@ def plot_uplift_qini_curves(y_true, uplift, treatment, random=True, perfect=Fals axes[0].plot(x_up_perfect, y_up_perfect, label='Perfect', color='red') axes[1].plot(x_qi_perfect, y_qi_perfect, label='Perfect', color='red') - axes[0].legend() + axes[0].legend(loc='upper left') axes[0].set_title(f'Uplift curve: AUUC={auuc(y_true, uplift, treatment):.2f}') axes[0].set_xlabel('Number targeted') axes[0].set_ylabel('Relative gain: treatment - control') - axes[1].legend() + axes[1].legend(loc='upper left') 
axes[1].set_title(f'Qini curve: AUQC={auqc(y_true, uplift, treatment):.2f}') axes[1].set_xlabel('Number targeted') axes[1].set_ylabel('Number of incremental outcome') From 524c9b200457c3561378db09a868f7494af53a99 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 15:17:07 +0300 Subject: [PATCH 07/20] :memo: Add changes in changelog and add link to Realease History in main Readme --- Readme.rst | 2 +- docs/changelog.md | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Readme.rst b/Readme.rst index 8dfdb3e..5ea3992 100644 --- a/Readme.rst +++ b/Readme.rst @@ -157,7 +157,7 @@ Important links - Official source code repo: https://github.com/maks-sh/scikit-uplift/ - Issue tracker: https://github.com/maks-sh/scikit-uplift/issues - +- Release History: https://scikit-uplift.readthedocs.io/en/latest/changelog.html =============== diff --git a/docs/changelog.md b/docs/changelog.md index f5ca79a..63d687c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,21 @@ * πŸ”¨ something that previously didn’t work as documentated – or according to reasonable expectations – should now work. * ❗️ you will need to change your code to have the same effect in the future; or a feature will be removed in the future. +## Version 0.1.1 + +### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) + +* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_treatment_balance_curve) + +### [sklift.metrics](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html) + +* πŸ”¨ Fix bug with import [uplift_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://scikit-uplift.readthedocs.io/en/latest/metrics.html#sklift.metrics.metrics.qini_auc_score). 
+* πŸ“ Fix typos in docstrings + +### Miscellaneous + +* πŸ“ Add link to Release History in main Readme.md + ## Version 0.1.0 ### [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) From 17fa1a350323969417b2f103093a317f1101e208 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 15:22:31 +0300 Subject: [PATCH 08/20] :memo: Add Readme page in notebooks folder --- notebooks/Readme.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 notebooks/Readme.rst diff --git a/notebooks/Readme.rst b/notebooks/Readme.rst new file mode 100644 index 0000000..db202f0 --- /dev/null +++ b/notebooks/Readme.rst @@ -0,0 +1,21 @@ +.. _The overview of the basic approaches to solving the Uplift Modeling problem: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb +.. |Open In Colab1| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb +.. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb + + +********** +Tutorials +********** + +Basic +######## + +It is better to start scikit-uplift from the basic tutorial. 
+ +* `The overview of the basic approaches to solving the Uplift Modeling problem`_ + * In Englsih: `nbviewer `_ | `github `_ |Open In Colab1| + * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| + + From 458ef8ac71bf2792443a41bd607d5ff6474ed63e Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sun, 26 Apr 2020 15:28:51 +0300 Subject: [PATCH 09/20] :boom: start tutorial of pipeline usage --- notebooks/pipeline_usage.ipynb | 506 +++++++++++++++++++++++++++++++++ 1 file changed, 506 insertions(+) create mode 100644 notebooks/pipeline_usage.ipynb diff --git a/notebooks/pipeline_usage.ipynb b/notebooks/pipeline_usage.ipynb new file mode 100644 index 0000000..7181382 --- /dev/null +++ b/notebooks/pipeline_usage.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:35.309471Z", + "start_time": "2020-04-22T21:39:34.114404Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(64000, 12)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannelsegmentvisitconversionspend
0102) $100 - $200142.4410Surburban0PhoneWomens E-Mail000.0
163) $200 - $350329.0811Rural1WebNo E-Mail000.0
272) $100 - $200180.6501Surburban1WebWomens E-Mail000.0
395) $500 - $750675.8310Rural1WebMens E-Mail000.0
421) $0 - $10045.3410Urban0WebWomens E-Mail000.0
\n", + "
" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie channel \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", + "3 9 5) $500 - $750 675.83 1 0 Rural 1 Web \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", + "\n", + " segment visit conversion spend \n", + "0 Womens E-Mail 0 0 0.0 \n", + "1 No E-Mail 0 0 0.0 \n", + "2 Womens E-Mail 0 0 0.0 \n", + "3 Mens E-Mail 0 0 0.0 \n", + "4 Womens E-Mail 0 0 0.0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd; pd.set_option('display.max_columns', None)\n", + "\n", + "\n", + "%matplotlib inline\n", + "\n", + "hillstrom = pd.read_csv('/Users/Maksim/Desktop/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')\n", + "print(hillstrom.shape)\n", + "hillstrom.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:35.384713Z", + "start_time": "2020-04-22T21:39:35.313129Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(18781, 9)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexing.py:844: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[key] = _infer_fill_value(value)\n", + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexing.py:965: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy 
of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[item] = s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyhistory_segmenthistorymenswomenszip_codenewbievisittreatment
0102) $100 - $200142.4410Surburban001
562) $100 - $200134.8301Surburban011
693) $200 - $350280.2010Surburban101
791) $0 - $10046.4201Urban001
1075) $500 - $750548.9101Urban111
\n", + "
" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie visit \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 0 \n", + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 1 \n", + "6 9 3) $200 - $350 280.20 1 0 Surburban 1 0 \n", + "7 9 1) $0 - $100 46.42 0 1 Urban 0 0 \n", + "10 7 5) $500 - $750 548.91 0 1 Urban 1 1 \n", + "\n", + " treatment \n", + "0 1 \n", + "5 1 \n", + "6 1 \n", + "7 1 \n", + "10 1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = hillstrom[hillstrom['segment']!='Mens E-Mail']\n", + "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0\n", + "})\n", + "\n", + "\n", + "dataset = dataset[dataset['channel']=='Phone']\n", + "\n", + "dataset = dataset.drop(['segment', 'channel', 'conversion', 'spend'], axis=1)\n", + "print(dataset.shape)\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:36.171492Z", + "start_time": "2020-04-22T21:39:35.388090Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", + "\n", + "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", + "y_tr = Xyt_tr['visit']\n", + "treat_tr = Xyt_tr['treatment']\n", + "\n", + "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", + "y_val = Xyt_val['visit']\n", + "treat_val = Xyt_val['treatment']\n", + "\n", + "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:36.255395Z", + "start_time": "2020-04-22T21:39:36.174570Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from category_encoders import CatBoostEncoder\n", + 
"from sklift.models import ClassTransformation\n", + "from xgboost import XGBClassifier\n", + "\n", + "encoder = CatBoostEncoder(cols=cat_cols)\n", + "estimator = XGBClassifier(max_depth=2, random_state=42)\n", + "ct = ClassTransformation(estimator=estimator)\n", + "\n", + "my_pipeline = Pipeline([\n", + " ('encoder', encoder),\n", + " ('model', ct)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:36.824301Z", + "start_time": "2020-04-22T21:39:36.258334Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", + " self._final_estimator.fit(Xt, y, **fit_params)\n" + ] + }, + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('encoder',\n", + " CatBoostEncoder(a=1, cols=['history_segment', 'zip_code'],\n", + " drop_invariant=False, handle_missing='value',\n", + " handle_unknown='value', random_state=None,\n", + " return_df=True, sigma=None, verbose=0)),\n", + " ('model',\n", + " ClassTransformation(estimator=XGBClassifier(base_score=0.5,\n", + " booster=None,\n", + " colsample_bylevel=1,\n", + " colsample_bynode=1,\n", + " colsample_by...\n", + " interaction_constraints=None,\n", + " learning_rate=0.300000012,\n", + " max_delta_step=0,\n", + " max_depth=2,\n", + " min_child_weight=1,\n", + " missing=nan,\n", + " monotone_constraints=None,\n", + " n_estimators=100,\n", + " n_jobs=0,\n", + " num_parallel_tree=1,\n", + " objective='binary:logistic',\n", + " random_state=42,\n", + " reg_alpha=0,\n", + " reg_lambda=1,\n", + " scale_pos_weight=1,\n", + " subsample=1,\n", + " tree_method=None,\n", + " validate_parameters=False,\n", + " verbosity=None)))],\n", + " verbose=False)" + ] + }, + "execution_count": 5, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_pipeline.fit(\n", + " X=X_tr,\n", + " y=y_tr,\n", + " model__treatment=treat_tr\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-22T21:39:36.889844Z", + "start_time": "2020-04-22T21:39:36.827707Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.04904281488369654" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "uplift_predictions = my_pipeline.predict(X_val)\n", + "\n", + "uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ce941e13d122d280e4cbbf358596e3eb75fd090f Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Mon, 27 Apr 2020 22:22:27 +0300 Subject: [PATCH 10/20] :boom: Add Tutorial of usage sklift.models in sklearn.pipeline --- notebooks/pipeline_usage_EN.ipynb | 464 ++++++++++++++++++++++++++++++ notebooks/pipeline_usage_RU.ipynb | 463 +++++++++++++++++++++++++++++ 2 files changed, 927 insertions(+) create mode 100644 notebooks/pipeline_usage_EN.ipynb create mode 100644 notebooks/pipeline_usage_RU.ipynb diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb new file mode 100644 index 0000000..0aab882 --- /dev/null +++ b/notebooks/pipeline_usage_EN.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of usage model from sklift.models in 
sklearn.pipeline\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS\n", + "
\n", + " RUSSIAN VERSION\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T12:44:35.435852Z", + "start_time": "2020-04-26T12:44:35.239050Z" + } + }, + "source": [ + "This is a simple example on how to use [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) with [sklearn.pipeline](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).\n", + "\n", + "The data is taken from [MineThatData E-Mail Analytics And Data Mining Challenge dataset by Kevin Hillstrom](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test:\n", + "* 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise.\n", + "* 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise.\n", + "* 1/3 were randomly chosen to not receive an e-mail campaign.\n", + "\n", + "During a period of two weeks following the e-mail campaign, results were tracked. 
The task is to tell the world if the Mens or Womens e-mail campaign was successful.\n", + "\n", + "The full description of the dataset can be found at the [link](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "Firstly, install the necessary libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:23.787100Z", + "start_time": "2020-04-27T19:15:23.782965Z" + } + }, + "outputs": [], + "source": [ + "!pip install scikit-uplift==0.1.0 xgboost" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Secondly, load the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:29.425545Z", + "start_time": "2020-04-27T19:15:23.800862Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./content/Hilstorm.csv', )" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import urllib.request\n", + "import pandas as pd; pd.set_option('display.max_columns', None)\n", + "\n", + "\n", + "csv_path = './content/Hilstorm.csv'\n", + "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", + "urllib.request.urlretrieve(url, csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For simplicity of the example, we will leave only two user segments:\n", + "* those who were sent an e-mail advertising campaign with women's products;\n", + "* those who were not sent out the ad campaign.\n", + "\n", + "We will use the `visit` variable as the target variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:29.971490Z", + "start_time": "2020-04-27T19:15:29.429579Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of the dataset before processing: (64000, 12)\n", + "Shape of the dataset after processing: (42693, 10)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannelvisittreatment
0102) $100 - $200142.4410Surburban0Phone01
163) $200 - $350329.0811Rural1Web00
272) $100 - $200180.6501Surburban1Web01
421) $0 - $10045.3410Urban0Web01
562) $100 - $200134.8301Surburban0Phone11
\n", + "
" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie channel \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n", + "\n", + " visit treatment \n", + "0 0 1 \n", + "1 0 0 \n", + "2 0 1 \n", + "4 0 1 \n", + "5 1 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "%matplotlib inline\n", + "\n", + "dataset = pd.read_csv(csv_path)\n", + "print(f'Shape of the dataset before processing: {dataset.shape}')\n", + "dataset = dataset[dataset['segment']!='Mens E-Mail']\n", + "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0\n", + "})\n", + "\n", + "dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n", + "print(f'Shape of the dataset after processing: {dataset.shape}')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Divide all the data into a training and validation sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:30.688735Z", + "start_time": "2020-04-27T19:15:29.976209Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", + "\n", + "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", + "y_tr = Xyt_tr['visit']\n", + "treat_tr = Xyt_tr['treatment']\n", + "\n", + "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", + "y_val = Xyt_val['visit']\n", + "treat_val = Xyt_val['treatment']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select 
categorical features:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:30.706714Z", + "start_time": "2020-04-27T19:15:30.691607Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['history_segment', 'zip_code', 'channel']\n" + ] + } + ], + "source": [ + "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()\n", + "print(cat_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the necessary objects and combining them into a pipieline:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:30.784120Z", + "start_time": "2020-04-27T19:15:30.710542Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from category_encoders import CatBoostEncoder\n", + "from sklift.models import ClassTransformation\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "encoder = CatBoostEncoder(cols=cat_cols)\n", + "estimator = XGBClassifier(max_depth=2, random_state=42)\n", + "ct = ClassTransformation(estimator=estimator)\n", + "\n", + "my_pipeline = Pipeline([\n", + " ('encoder', encoder),\n", + " ('model', ct)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train pipeline as usual, but adding the treatment column in the step model as a parameter `model__treatment`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:31.921960Z", + "start_time": "2020-04-27T19:15:30.787124Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. 
Current sample size is unbalanced.\n", + " self._final_estimator.fit(Xt, y, **fit_params)\n" + ] + } + ], + "source": [ + "my_pipeline = my_pipeline.fit(\n", + " X=X_tr,\n", + " y=y_tr,\n", + " model__treatment=treat_tr\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T18:07:44.970856Z", + "start_time": "2020-04-26T18:07:44.964624Z" + } + }, + "source": [ + "Predict the uplift and calculate the uplift@30%" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:15:32.063373Z", + "start_time": "2020-04-27T19:15:31.924138Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@30%: 0.0660\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "\n", + "uplift_predictions = my_pipeline.predict(X_val)\n", + "\n", + "uplift_30 = uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')\n", + "print(f'uplift@30%: {uplift_30:.4f}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb new file mode 100644 index 0000000..4b56607 --- /dev/null +++ b/notebooks/pipeline_usage_RU.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ΠŸΡ€ΠΈΠΌΠ΅Ρ€ использованиС ΠΏΠΎΠ΄Ρ…ΠΎΠ΄ΠΎΠ² ΠΈΠ· sklift.models Π² sklearn.pipeline\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS\n", + "
\n", + " ENGLISH VERSION\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π’ Π΄Π°Π½Π½ΠΎΠΌ Π½ΠΎΡƒΡ‚Π±ΡƒΠΊΠ΅ рассмотрим простой ΠΏΡ€ΠΈΠΌΠ΅Ρ€ примСнСния ΠΎΠ΄Π½ΠΎΠ³ΠΎ ΠΈΠ· ΠΏΠΎΠ΄Ρ…ΠΎΠ΄ΠΎΠ² прогнозирования uplift Π² [sklearn.pipeline](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).\n", + "\n", + "Π”Π°Π½Π½Ρ‹Π΅ для ΠΏΡ€ΠΈΠΌΠ΅Ρ€Π° взяты ΠΈΠ· [MineThatData E-Mail Analytics And Data Mining Challenge dataset by Kevin Hillstrom](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html). Π­Ρ‚ΠΎΡ‚ Π½Π°Π±ΠΎΡ€ Π΄Π°Π½Π½Ρ‹Ρ… содСрТит 64 000 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ², ΠΊΠΎΡ‚ΠΎΡ€Ρ‹Π΅ Π² послСдний Ρ€Π°Π· ΡΠΎΠ²Π΅Ρ€ΡˆΠ°Π»ΠΈ ΠΏΠΎΠΊΡƒΠΏΠΊΠΈ Π² Ρ‚Π΅Ρ‡Π΅Π½ΠΈΠ΅ Π΄Π²Π΅Π½Π°Π΄Ρ†Π°Ρ‚ΠΈ мСсяцСв. Π‘Ρ€Π΅Π΄ΠΈ ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»Π° ΠΏΡ€ΠΎΠ²Π΅Π΄Π΅Π½Π° рСкламная кампания с ΠΏΠΎΠΌΠΎΡ‰ΡŒΡŽ email рассылки:\n", + "\n", + "* 1/3 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»ΠΈ Π²Ρ‹Π±Ρ€Π°Π½Ρ‹ случайным ΠΎΠ±Ρ€Π°Π·ΠΎΠΌ для получСния элСктронного письма, Ρ€Π΅ΠΊΠ»Π°ΠΌΠΈΡ€ΡƒΡŽΡ‰Π΅Π³ΠΎ ΠΌΡƒΠΆΡΠΊΡƒΡŽ ΠΏΡ€ΠΎΠ΄ΡƒΠΊΡ†ΠΈΡŽ;\n", + "* 1/3 ΠΊΠ»ΠΈΠ΅Π½Ρ‚ΠΎΠ² Π±Ρ‹Π»ΠΈ Π²Ρ‹Π±Ρ€Π°Π½Ρ‹ случайным ΠΎΠ±Ρ€Π°Π·ΠΎΠΌ для получСния элСктронного письма, Ρ€Π΅ΠΊΠ»Π°ΠΌΠΈΡ€ΡƒΡŽΡ‰Π΅Π³ΠΎ ΠΆΠ΅Π½ΡΠΊΡƒΡŽ ΠΏΡ€ΠΎΠ΄ΡƒΠΊΡ†ΠΈΡŽ;\n", + "* Π‘ ΠΎΡΡ‚Π°Π²ΡˆΠ΅ΠΉΡΡ 1/3 ΠΊΠΎΠΌΠΌΡƒΠ½ΠΈΠΊΠ°Ρ†ΠΈΡŽ Π½Π΅ ΠΏΡ€ΠΎΠ²ΠΎΠ΄ΠΈΠ»ΠΈ.\n", + "\n", + "Для ΠΊΠ°ΠΆΠ΄ΠΎΠ³ΠΎ ΠΊΠ»ΠΈΠ΅Π½Ρ‚Π° ΠΈΠ· Π²Ρ‹Π±ΠΎΡ€ΠΊΠΈ Π·Π°ΠΌΠ΅Ρ€ΠΈΠ»ΠΈ Ρ„Π°ΠΊΡ‚ ΠΏΠ΅Ρ€Π΅Ρ…ΠΎΠ΄Π° ΠΏΠΎ ссылкС Π² письмС, Ρ„Π°ΠΊΡ‚ ΡΠΎΠ²Π΅Ρ€ΡˆΠ΅Π½ΠΈΡ ΠΏΠΎΠΊΡƒΠΏΠΊΠΈ ΠΈ сумму Ρ‚Ρ€Π°Ρ‚ Π·Π° Π΄Π²Π΅ Π½Π΅Π΄Π΅Π»ΠΈ, слСдущими послС получСния письма.\n", + "\n", + "ПолноС описаниС датасСта ΠΌΠΎΠΆΠ½ΠΎΠΉ Π½Π°ΠΉΡ‚ΠΈ ΠΏΠΎ [ссылкС](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).\n", + "\n", + "Установим Π½Π΅ΠΎΠ±Ρ…ΠΎΠ΄ΠΈΠΌΡ‹Π΅ Π±ΠΈΠ±Π»ΠΈΠΎΡ‚Π΅ΠΊΠΈ:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:10:58.787183Z", + "start_time": "2020-04-27T19:10:58.780531Z" + } + }, + "outputs": [], + "source": [ 
+ "!pip install scikit-uplift==0.1.0 xgboost" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T14:28:36.188277Z", + "start_time": "2020-04-26T14:28:36.106561Z" + } + }, + "source": [ + "Π—Π°Π³Ρ€ΡƒΠ·ΠΈΠΌ Π΄Π°Π½Π½Ρ‹Π΅:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:04.610210Z", + "start_time": "2020-04-27T19:10:58.796242Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./content/Hilstorm.csv', )" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import urllib.request\n", + "import pandas as pd\n", + "\n", + "\n", + "csv_path = './content/Hilstorm.csv'\n", + "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", + "urllib.request.urlretrieve(url, csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для простоты ΠΏΡ€ΠΈΠΌΠ΅Ρ€Π° оставим Ρ‚ΠΎΠ»ΡŒΠΊΠΎ Π΄Π²Π° сСгмСнта ΠΏΠΎΠ»ΡŒΠ·ΠΎΠ²Π°Ρ‚Π΅Π»Π΅ΠΉ:\n", + "* Ρ‚Π΅ΠΌ, ΠΊΠΎΠΌΡƒ Ρ€Π°ΡΡΡ‹Π»Π°Π»Π°ΡΡŒ ΠΏΠΎ элСктронной ΠΏΠΎΡ‡Ρ‚Π΅ рСкламная кампания с участиСм ТСнских Ρ‚ΠΎΠ²Π°Ρ€ΠΎΠ²;\n", + "* Ρ‚Π΅ΠΌ, ΠΊΠΎΠΌΡƒ Π½Π΅ Ρ€Π°ΡΡΡ‹Π»Π°Π»Π°ΡΡŒ рСкламная кампания.\n", + "\n", + "Π’ качСствС Ρ†Π΅Π»Π΅Π²ΠΎΠΉ ΠΏΠ΅Ρ€Π΅ΠΌΠ΅Π½Π½ΠΎΠΉ Π±ΡƒΠ΄Π΅ΠΌ ΠΈΡΠΏΠΎΠ»ΡŒΠ·ΠΎΠ²Π°Ρ‚ΡŒ ΠΏΠ΅Ρ€Π΅ΠΌΠ΅Π½Π½ΡƒΡŽ `visit`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:05.200695Z", + "start_time": "2020-04-27T19:11:04.614828Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Π Π°Π·ΠΌΠ΅Ρ€ датасСта Π΄ΠΎ ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: (64000, 12)\n", + "Π Π°Π·ΠΌΠ΅Ρ€ датасСта послС ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: (42693, 10)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannelvisittreatment
0102) $100 - $200142.4410Surburban0Phone01
163) $200 - $350329.0811Rural1Web00
272) $100 - $200180.6501Surburban1Web01
421) $0 - $10045.3410Urban0Web01
562) $100 - $200134.8301Surburban0Phone11
\n", + "
" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie channel \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n", + "\n", + " visit treatment \n", + "0 0 1 \n", + "1 0 0 \n", + "2 0 1 \n", + "4 0 1 \n", + "5 1 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd; pd.set_option('display.max_columns', None)\n", + "\n", + "\n", + "%matplotlib inline\n", + "\n", + "dataset = pd.read_csv(csv_path)\n", + "print(f'Π Π°Π·ΠΌΠ΅Ρ€ датасСта Π΄ΠΎ ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: {dataset.shape}')\n", + "dataset = dataset[dataset['segment']!='Mens E-Mail']\n", + "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0\n", + "})\n", + "\n", + "dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n", + "print(f'Π Π°Π·ΠΌΠ΅Ρ€ датасСта послС ΠΎΠ±Ρ€Π°Π±ΠΎΡ‚ΠΊΠΈ: {dataset.shape}')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "РазобъСм всС Π΄Π°Π½Π½Ρ‹Π΅ Π½Π° ΠΎΠ±ΡƒΡ‡Π°ΡŽΡ‰ΡƒΡŽ ΠΈ Π²Π°Π»ΠΈΠ΄Π°Ρ†ΠΈΠΎΠ½Π½ΡƒΡŽ Π²Ρ‹Π±ΠΎΡ€ΠΊΡƒ:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:05.963783Z", + "start_time": "2020-04-27T19:11:05.205409Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", + "\n", + "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", + "y_tr = Xyt_tr['visit']\n", + "treat_tr = Xyt_tr['treatment']\n", + "\n", + "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", + "y_val = Xyt_val['visit']\n", + "treat_val = 
Xyt_val['treatment']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select categorical features:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:05.982444Z", + "start_time": "2020-04-27T19:11:05.966573Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['history_segment', 'zip_code', 'channel']\n" + ] + } + ], + "source": [ + "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()\n", + "print(cat_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Π‘ΠΎΠ·Π΄Π°Π΄ΠΈΠΌ Π½ΡƒΠΆΠ½Ρ‹Π΅ ΠΎΠ±ΡŠΠ΅ΠΊΡ‚Ρ‹ ΠΈ объСдиним ΠΈΡ… Π² pipieline." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:06.071221Z", + "start_time": "2020-04-27T19:11:05.984825Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from category_encoders import CatBoostEncoder\n", + "from sklift.models import ClassTransformation\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "encoder = CatBoostEncoder(cols=cat_cols)\n", + "estimator = XGBClassifier(max_depth=2, random_state=42)\n", + "ct = ClassTransformation(estimator=estimator)\n", + "\n", + "my_pipeline = Pipeline([\n", + " ('encoder', encoder),\n", + " ('model', ct)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-26T18:02:52.236917Z", + "start_time": "2020-04-26T18:02:52.110138Z" + } + }, + "source": [ + "ΠžΠ±ΡƒΡ‡Π°Ρ‚ΡŒ pipeline Π±ΡƒΠ΄Π΅ΠΌ ΠΊΠ°ΠΊ ΠΎΠ±Ρ‹Ρ‡Π½ΠΎ, Π½ΠΎ ΠΊΠΎΠ»ΠΎΠ½ΠΊΡƒ treatment Π΄ΠΎΠ±Π°Π²ΠΈΠΌ ΠΊΠ°ΠΊ ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€ шага model: `model__treatment`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:07.235200Z", + "start_time": "2020-04-27T19:11:06.076210Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", + " self._final_estimator.fit(Xt, y, **fit_params)\n" + ] + } + ], + "source": [ + "my_pipeline = my_pipeline.fit(\n", + " X=X_tr,\n", + " y=y_tr,\n", + " model__treatment=treat_tr\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ΠŸΡ€Π΅Π΄ΡΠΊΠ°ΠΆΠ΅ΠΌ uplift ΠΈ посчитаСм uplift@30%" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-04-27T19:11:07.391911Z", + "start_time": "2020-04-27T19:11:07.238581Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@30%: 0.0660\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "\n", + "uplift_predictions = my_pipeline.predict(X_val)\n", + "\n", + "uplift_30 = uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')\n", + "print(f'uplift@30%: {uplift_30:.4f}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1962dccfa89e1313e0816093f75f03872172c93f Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Mon, 27 Apr 2020 22:23:49 +0300 Subject: [PATCH 11/20] :memo: Change 
directory for data in notebook folder --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 911404f..92b0f4a 100644 --- a/.gitignore +++ b/.gitignore @@ -207,7 +207,7 @@ fabric.properties # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser -notebooks/RetailHero-data/* +notebooks/content/* notebooks/catboost_info notebooks/*.tmp From 0e75889d1b4593b4322f8e289bc2e60c5f66dc35 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Mon, 27 Apr 2020 22:32:18 +0300 Subject: [PATCH 12/20] :memo: Add tutorial into docs --- docs/tutorials.rst | 11 +- notebooks/pipeline_usage.ipynb | 506 --------------------------------- 2 files changed, 9 insertions(+), 508 deletions(-) delete mode 100644 notebooks/pipeline_usage.ipynb diff --git a/docs/tutorials.rst b/docs/tutorials.rst index db202f0..2cf49cf 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -4,6 +4,11 @@ .. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb +.. _Example of usage model from sklift.models in sklearn.pipeline: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb ********** Tutorials @@ -12,10 +17,12 @@ Tutorials Basic ######## -It is better to start scikit-uplift from the basic tutorial. +It is better to start scikit-uplift from the basic tutorials. 
* `The overview of the basic approaches to solving the Uplift Modeling problem`_ * In Englsih: `nbviewer `_ | `github `_ |Open In Colab1| * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| - +* `Example of usage model from sklift.models in sklearn.pipeline`_ + * In Englsih: `nbviewer `_ | `github `_ |Open In Colab3| + * In Russian: `nbviewer `_ | `github `_ |Open In Colab4| diff --git a/notebooks/pipeline_usage.ipynb b/notebooks/pipeline_usage.ipynb deleted file mode 100644 index 7181382..0000000 --- a/notebooks/pipeline_usage.ipynb +++ /dev/null @@ -1,506 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:35.309471Z", - "start_time": "2020-04-22T21:39:34.114404Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(64000, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannelsegmentvisitconversionspend
0102) $100 - $200142.4410Surburban0PhoneWomens E-Mail000.0
163) $200 - $350329.0811Rural1WebNo E-Mail000.0
272) $100 - $200180.6501Surburban1WebWomens E-Mail000.0
395) $500 - $750675.8310Rural1WebMens E-Mail000.0
421) $0 - $10045.3410Urban0WebWomens E-Mail000.0
\n", - "
" - ], - "text/plain": [ - " recency history_segment history mens womens zip_code newbie channel \\\n", - "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", - "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", - "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", - "3 9 5) $500 - $750 675.83 1 0 Rural 1 Web \n", - "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", - "\n", - " segment visit conversion spend \n", - "0 Womens E-Mail 0 0 0.0 \n", - "1 No E-Mail 0 0 0.0 \n", - "2 Womens E-Mail 0 0 0.0 \n", - "3 Mens E-Mail 0 0 0.0 \n", - "4 Womens E-Mail 0 0 0.0 " - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd; pd.set_option('display.max_columns', None)\n", - "\n", - "\n", - "%matplotlib inline\n", - "\n", - "hillstrom = pd.read_csv('/Users/Maksim/Desktop/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')\n", - "print(hillstrom.shape)\n", - "hillstrom.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:35.384713Z", - "start_time": "2020-04-22T21:39:35.313129Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(18781, 9)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexing.py:844: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " self.obj[key] = _infer_fill_value(value)\n", - "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexing.py:965: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy 
of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " self.obj[item] = s\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
recencyhistory_segmenthistorymenswomenszip_codenewbievisittreatment
0102) $100 - $200142.4410Surburban001
562) $100 - $200134.8301Surburban011
693) $200 - $350280.2010Surburban101
791) $0 - $10046.4201Urban001
1075) $500 - $750548.9101Urban111
\n", - "
" - ], - "text/plain": [ - " recency history_segment history mens womens zip_code newbie visit \\\n", - "0 10 2) $100 - $200 142.44 1 0 Surburban 0 0 \n", - "5 6 2) $100 - $200 134.83 0 1 Surburban 0 1 \n", - "6 9 3) $200 - $350 280.20 1 0 Surburban 1 0 \n", - "7 9 1) $0 - $100 46.42 0 1 Urban 0 0 \n", - "10 7 5) $500 - $750 548.91 0 1 Urban 1 1 \n", - "\n", - " treatment \n", - "0 1 \n", - "5 1 \n", - "6 1 \n", - "7 1 \n", - "10 1 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = hillstrom[hillstrom['segment']!='Mens E-Mail']\n", - "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", - " 'Womens E-Mail': 1,\n", - " 'No E-Mail': 0\n", - "})\n", - "\n", - "\n", - "dataset = dataset[dataset['channel']=='Phone']\n", - "\n", - "dataset = dataset.drop(['segment', 'channel', 'conversion', 'spend'], axis=1)\n", - "print(dataset.shape)\n", - "dataset.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:36.171492Z", - "start_time": "2020-04-22T21:39:35.388090Z" - } - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", - "\n", - "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", - "y_tr = Xyt_tr['visit']\n", - "treat_tr = Xyt_tr['treatment']\n", - "\n", - "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", - "y_val = Xyt_val['visit']\n", - "treat_val = Xyt_val['treatment']\n", - "\n", - "cat_cols = X_tr.select_dtypes(include='object').columns.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:36.255395Z", - "start_time": "2020-04-22T21:39:36.174570Z" - } - }, - "outputs": [], - "source": [ - "from sklearn.pipeline import Pipeline\n", - "from category_encoders import CatBoostEncoder\n", - 
"from sklift.models import ClassTransformation\n", - "from xgboost import XGBClassifier\n", - "\n", - "encoder = CatBoostEncoder(cols=cat_cols)\n", - "estimator = XGBClassifier(max_depth=2, random_state=42)\n", - "ct = ClassTransformation(estimator=estimator)\n", - "\n", - "my_pipeline = Pipeline([\n", - " ('encoder', encoder),\n", - " ('model', ct)\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:36.824301Z", - "start_time": "2020-04-22T21:39:36.258334Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/pipeline.py:354: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", - " self._final_estimator.fit(Xt, y, **fit_params)\n" - ] - }, - { - "data": { - "text/plain": [ - "Pipeline(memory=None,\n", - " steps=[('encoder',\n", - " CatBoostEncoder(a=1, cols=['history_segment', 'zip_code'],\n", - " drop_invariant=False, handle_missing='value',\n", - " handle_unknown='value', random_state=None,\n", - " return_df=True, sigma=None, verbose=0)),\n", - " ('model',\n", - " ClassTransformation(estimator=XGBClassifier(base_score=0.5,\n", - " booster=None,\n", - " colsample_bylevel=1,\n", - " colsample_bynode=1,\n", - " colsample_by...\n", - " interaction_constraints=None,\n", - " learning_rate=0.300000012,\n", - " max_delta_step=0,\n", - " max_depth=2,\n", - " min_child_weight=1,\n", - " missing=nan,\n", - " monotone_constraints=None,\n", - " n_estimators=100,\n", - " n_jobs=0,\n", - " num_parallel_tree=1,\n", - " objective='binary:logistic',\n", - " random_state=42,\n", - " reg_alpha=0,\n", - " reg_lambda=1,\n", - " scale_pos_weight=1,\n", - " subsample=1,\n", - " tree_method=None,\n", - " validate_parameters=False,\n", - " verbosity=None)))],\n", - " verbose=False)" - ] - }, - "execution_count": 5, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_pipeline.fit(\n", - " X=X_tr,\n", - " y=y_tr,\n", - " model__treatment=treat_tr\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-04-22T21:39:36.889844Z", - "start_time": "2020-04-22T21:39:36.827707Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.04904281488369654" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklift.metrics import uplift_at_k\n", - "\n", - "uplift_predictions = my_pipeline.predict(X_val)\n", - "\n", - "uplift_at_k(y_val, uplift_predictions, treat_val, strategy='overall')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 40fc68534edf33ce0b3824e8d9e7612978c9da6d Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Mon, 27 Apr 2020 22:53:14 +0300 Subject: [PATCH 13/20] :memo: Add links in Readme and docs --- Readme.rst | 19 +++++++++++++------ docs/index.rst | 12 +++++++++++- docs/tutorials.rst | 4 ++-- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Readme.rst b/Readme.rst index 8dfdb3e..089deaf 100644 --- a/Readme.rst +++ b/Readme.rst @@ -11,15 +11,18 @@ .. |Docs| image:: https://readthedocs.org/projects/scikit-uplift/badge/?version=latest .. _Docs: https://scikit-uplift.readthedocs.io/en/latest/ -.. _RU: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb -.. _EN: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb - .. 
|Open In Colab1| image:: https://colab.research.google.com/assets/colab-badge.svg .. _Open In Colab1: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb .. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg .. _Open In Colab2: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb - + +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab3: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb + +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab4: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb + .. _scikit-uplift.readthedocs.io: https://scikit-uplift.readthedocs.io/en/latest/ .. _Part 1: https://habr.com/ru/company/ru_mts/blog/485980/ .. _Part 2: https://habr.com/ru/company/ru_mts/blog/485976/ @@ -50,9 +53,13 @@ More about uplift modelling problem read in russian on habr.com: `Part 1`_ and ` * Applying any estimator adheres to scikit-learn conventions; +* All approaches can be used in sklearn.pipeline (see example (`EN `_ |Open In Colab3|_, `RU `_ |Open In Colab4|_)) + * Almost all implemented approaches solve both the problem of classification and regression; -* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model. +* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model; + +* Useful graphs for analyzing the built model. Installation ------------- @@ -89,7 +96,7 @@ And if you now point your browser to ``_build/html/index.html``, you should see Quick Start ----------- -See the **RetailHero tutorial notebook** (`EN`_ |Open In Colab1|_, `RU`_ |Open In Colab2|_) for details. 
+See the **RetailHero tutorial notebook** (`EN `_ |Open In Colab1|_, `RU `_ |Open In Colab2|_) for details. **Train and predict uplift model** diff --git a/docs/index.rst b/docs/index.rst index 0973c70..343979e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,12 @@ .. _Part 1: https://habr.com/ru/company/ru_mts/blog/485980/ .. _Part 2: https://habr.com/ru/company/ru_mts/blog/485976/ +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab3: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb + +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg +.. _Open In Colab4: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb + ************** scikit-uplift ************** @@ -18,9 +24,13 @@ Features * Applying any estimator adheres to scikit-learn conventions; +* All approaches can be used in sklearn.pipeline (see example (`EN `_ |Open In Colab3|_, `RU `_ |Open In Colab4|_)) + * Almost all implemented approaches solve both the problem of classification and regression; -* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model. +* A lot of metrics (Such as *Area Under Uplift Curve* or *Area Under Qini Curve*) are implemented to evaluate your uplift model; + +* Useful graphs for analyzing the built model. **The package currently supports the following methods:** diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 2cf49cf..c962e1a 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -20,9 +20,9 @@ Basic It is better to start scikit-uplift from the basic tutorials. 
* `The overview of the basic approaches to solving the Uplift Modeling problem`_ - * In Englsih: `nbviewer `_ | `github `_ |Open In Colab1| + * In English: `nbviewer `_ | `github `_ |Open In Colab1| * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| * `Example of usage model from sklift.models in sklearn.pipeline`_ - * In Englsih: `nbviewer `_ | `github `_ |Open In Colab3| + * In English: `nbviewer `_ | `github `_ |Open In Colab3| * In Russian: `nbviewer `_ | `github `_ |Open In Colab4| From 5545015e318b52a1f2a75eeb4ca108c1e5b3fb3e Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Mon, 27 Apr 2020 23:02:32 +0300 Subject: [PATCH 14/20] :memo: Add new tutorial in Readme in notebooks folder --- notebooks/Readme.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/notebooks/Readme.rst b/notebooks/Readme.rst index db202f0..c962e1a 100644 --- a/notebooks/Readme.rst +++ b/notebooks/Readme.rst @@ -4,6 +4,11 @@ .. |Open In Colab2| image:: https://colab.research.google.com/assets/colab-badge.svg :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero.ipynb +.. _Example of usage model from sklift.models in sklearn.pipeline: https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab3| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb +.. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb ********** Tutorials @@ -12,10 +17,12 @@ Tutorials Basic ######## -It is better to start scikit-uplift from the basic tutorial. +It is better to start scikit-uplift from the basic tutorials. 
* `The overview of the basic approaches to solving the Uplift Modeling problem`_ - * In Englsih: `nbviewer `_ | `github `_ |Open In Colab1| + * In English: `nbviewer `_ | `github `_ |Open In Colab1| * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| - +* `Example of usage model from sklift.models in sklearn.pipeline`_ + * In English: `nbviewer `_ | `github `_ |Open In Colab3| + * In Russian: `nbviewer `_ | `github `_ |Open In Colab4| From 1a121c3bff505c6ad5f3d6667cd5bcb2523557d0 Mon Sep 17 00:00:00 2001 From: Irina Elisova Date: Tue, 28 Apr 2020 11:05:48 +0300 Subject: [PATCH 15/20] Add plot_uplift_by_percentile function to viz (#10) * :chart_with_upwards_trend: Add response_rate_by_percentile function to metrics * :rocket: Add plot_uplift_by_percentile to viz --- sklift/metrics/__init__.py | 4 +- sklift/metrics/metrics.py | 80 ++++++++++++++++++++++++++++++++++++++ sklift/viz/__init__.py | 4 +- sklift/viz/base.py | 78 ++++++++++++++++++++++++++++++++++++- 4 files changed, 161 insertions(+), 5 deletions(-) diff --git a/sklift/metrics/__init__.py b/sklift/metrics/__init__.py index b4623c5..097ba43 100644 --- a/sklift/metrics/__init__.py +++ b/sklift/metrics/__init__.py @@ -1,9 +1,9 @@ from .metrics import ( - uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve, + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, response_rate_by_percentile, treatment_balance_curve, uplift_auc_score, qini_auc_score ) __all__ = [ - uplift_curve, auuc, qini_curve, auqc, uplift_at_k, treatment_balance_curve, + uplift_curve, auuc, qini_curve, auqc, uplift_at_k, response_rate_by_percentile, treatment_balance_curve, uplift_auc_score, qini_auc_score ] diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index 68b7540..efbf185 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -289,6 +289,86 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): return score_trmnt - score_ctrl +def response_rate_by_percentile(y_true, 
uplift, treatment, group, strategy, bins=10): + """Compute response rate (target mean in the control or treatment group) at each percentile. + + Args: + y_true (1d array-like): Correct (true) target values. + uplift (1d array-like): Predicted uplift, as returned by a model. + treatment (1d array-like): Treatment labels. + group (string, ['treatment', 'control']): Group type for computing response rate: treatment or control. + * ``'treatment'``: + Values equal 1 in the treatment column. + * ``'control'``: + Values equal 0 in the treatment column. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: + The first step is taking the first k observations of all test data ordered by uplift prediction + (overall both groups - control and treatment) and conversions in treatment and control groups + calculated only on them. Then the difference between these conversions is calculated. + * ``'by_group'``: + Separately calculates conversions in top k observations in each group (control and treatment) + sorted by uplift predictions. Then the difference between these conversions is calculated + bins (int): Determines the number of bins (and relative percentile) in the test data. + + Returns: + array: Response rate at each percentile for control or treatment group + array: Variance of the response rate at each percentile + """ + + group_types = ['treatment', 'control'] + strategy_methods = ['overall', 'by_group'] + + n_samples = len(y_true) + check_consistent_length(y_true, uplift, treatment) + + if group not in group_types: + raise ValueError(f'Response rate supports only group types in {group_types},' + f' got {group}.') + + if strategy not in strategy_methods: + raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' + f' got {strategy}.') + + if not isinstance(bins, int) or bins <= 0: + raise ValueError(f'bins should be positive integer.' 
+ f' Invalid value bins: {bins}') + + if bins >= n_samples: + raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + + if bins == 1: + warnings.warn(f'You will get the only one bin of {n_samples} samples' + f' which is the length of y_true.' + f'\nPlease consider using uplift_at_k function instead', + UserWarning) + + y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) + order = np.argsort(uplift, kind='mergesort')[::-1] + + if group == 'treatment': + trmnt_flag = 1 + else: # group == 'control' + trmnt_flag = 0 + + if strategy == 'overall': + y_true_bin = np.array_split(y_true[order], bins) + trmnt_bin = np.array_split(treatment[order], bins) + + group_size = np.array([len(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) + response_rate = np.array([np.mean(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) + + else: # strategy == 'by_group' + y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag], bins) + + group_size = np.array([len(y) for y in y_bin]) + response_rate = np.array([np.mean(y) for y in y_bin]) + + variance = np.multiply(response_rate, np.divide((1 - response_rate), group_size)) + + return response_rate, variance + + def treatment_balance_curve(uplift, treatment, winsize): """Compute the treatment balance curve: proportion of treatment group in the ordered predictions. 
diff --git a/sklift/viz/__init__.py b/sklift/viz/__init__.py index 8ed7d5c..10601ab 100644 --- a/sklift/viz/__init__.py +++ b/sklift/viz/__init__.py @@ -1,3 +1,3 @@ -from .base import plot_uplift_preds, plot_uplift_qini_curves, plot_treatment_balance_curve +from .base import plot_uplift_preds, plot_uplift_qini_curves, plot_uplift_by_percentile, plot_treatment_balance_curve -__all__ = [plot_uplift_preds, plot_uplift_qini_curves, plot_treatment_balance_curve] +__all__ = [plot_uplift_preds, plot_uplift_qini_curves, plot_uplift_by_percentile, plot_treatment_balance_curve] diff --git a/sklift/viz/base.py b/sklift/viz/base.py index c7e3d9f..2d2bb75 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -1,6 +1,6 @@ import matplotlib.pyplot as plt import numpy as np -from ..metrics import uplift_curve, auuc, qini_curve, auqc, treatment_balance_curve +from ..metrics import uplift_curve, auuc, qini_curve, auqc, response_rate_by_percentile, treatment_balance_curve def plot_uplift_preds(trmnt_preds, ctrl_preds, log=False, bins=100): @@ -102,6 +102,82 @@ def plot_uplift_qini_curves(y_true, uplift, treatment, random=True, perfect=Fals return axes +def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, bins=10): + """Plot Uplift score at each percentile, + Treatment response rate (target mean in the treatment group) + and Control response rate (target mean in the control group) at each percentile. + + Args: + y_true (1d array-like): Correct (true) target values. + uplift (1d array-like): Predicted uplift, as returned by a model. + treatment (1d array-like): Treatment labels. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: + The first step is taking the first k observations of all test data ordered by uplift prediction + (overall both groups - control and treatment) and conversions in treatment and control groups + calculated only on them.
Then the difference between these conversions is calculated. + * ``'by_group'``: + Separately calculates conversions in top k observations in each group (control and treatment) + sorted by uplift predictions. Then the difference between these conversions is calculated + bins (int): Determines the number of bins (and relative percentile) in the test data. + + Returns: + Object that stores computed values. + """ + + strategy_methods = ['overall', 'by_group'] + + n_samples = len(y_true) + check_consistent_length(y_true, uplift, treatment) + + if strategy not in strategy_methods: + raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' + f' got {strategy}.') + + if not isinstance(bins, int) or bins <= 0: + raise ValueError(f'bins should be positive integer.' + f' Invalid value bins: {bins}') + + if bins >= n_samples: + raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + + if bins == 1: + warnings.warn(f'You will get the only one bin of {n_samples} samples' + f' which is the length of y_true.' 
+ f'\nPlease consider using uplift_at_k function instead', + UserWarning) + + rspns_rate_trmnt, var_trmnt = response_rate_by_percentile(y_true, uplift, + treatment, group='treatment', + strategy=strategy, bins=bins) + + rspns_rate_ctrl, var_ctrl = response_rate_by_percentile(y_true, uplift, + treatment, group='control', + strategy=strategy, bins=bins) + + uplift_score, uplift_variance = np.subtract(rspns_rate_trmnt, rspns_rate_ctrl), np.add(var_trmnt, var_ctrl) + + percentiles = [p * 100 / bins for p in range(1, bins + 1)] + + _, axes = plt.subplots(ncols=1, nrows=1, figsize=(8, 6)) + + axes.errorbar(percentiles, uplift_score, yerr=np.sqrt(uplift_variance), + linewidth=2, color='red', label='uplift') + axes.errorbar(percentiles, rspns_rate_trmnt, yerr=np.sqrt(var_trmnt), + linewidth=2, color='forestgreen', label='treatment\nresponse rate') + axes.errorbar(percentiles, rspns_rate_ctrl, yerr=np.sqrt(var_ctrl), + linewidth=2, color='orange', label='control\nresponse rate') + axes.fill_between(percentiles, rspns_rate_ctrl, rspns_rate_trmnt, alpha=0.1, color='red') + + axes.set_xticks(percentiles) + axes.legend(loc='upper right') + axes.set_title('Uplift by percentile') + axes.set_xlabel('Percentile') + axes.set_ylabel('Uplift = treatment response rate - control response rate') + + return axes + + def plot_treatment_balance_curve(uplift, treatment, random=True, winsize=0.1): """Plot Treatment Balance curve. 
From 6f00a61e1c27e43f511d4cc7ee909aa879ee566d Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 11:12:46 +0300 Subject: [PATCH 16/20] :memo: Add changes in changelog --- docs/changelog.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 63d687c..977495d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -12,16 +12,19 @@ ### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) -* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_treatment_balance_curve) +* πŸ’₯ Add [plot_uplift_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_treatment_balance_curve). ### [sklift.metrics](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html) +* πŸ’₯ Add [response_rate_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.metrics.metrics.response_rate_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). * πŸ”¨ Fix bug with import [uplift_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://scikit-uplift.readthedocs.io/en/latest/metrics.html#sklift.metrics.metrics.qini_auc_score). -* πŸ“ Fix typos in docstrings +* πŸ“ Fix typos in docstrings. ### Miscellaneous -* πŸ“ Add link to Release History in main Readme.md +* πŸ’₯ Add tutorial ["Example of usage model from sklift.models in sklearn.pipeline"](https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_EN.ipynb). +* πŸ“ Add link to Release History in main Readme.md. 
## Version 0.1.0 @@ -34,6 +37,7 @@ * πŸ’₯ Add [treatment_balance_curve](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.treatment_balance_curve) by [@spiaz](https://github.com/spiaz). * ❗️ The metrics `auuc` and `auqc` are now respectively renamed to [uplift_auc_score](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://scikit-uplift.readthedocs.io/en/latest/metrics.html#sklift.metrics.metrics.qini_auc_score). So, `auuc` and `auqc` will be removed in 0.2.0. +* ❗️ Add a new parameter `strategy` in [uplift_at_k](https://scikit-uplift.readthedocs.io/en/latest/metrics.html#sklift.metrics.metrics.uplift_at_k). ### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) From 10d453ec8c5c5081197d33ad44f862b1c98c72ca Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 11:52:28 +0300 Subject: [PATCH 17/20] :memo: Fix header in notebooks --- notebooks/RetailHero_EN.ipynb | 2 +- notebooks/pipeline_usage_EN.ipynb | 10 +++++----- notebooks/pipeline_usage_RU.ipynb | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/notebooks/RetailHero_EN.ipynb b/notebooks/RetailHero_EN.ipynb index 6c7d5e3..3f3e6b8 100644 --- a/notebooks/RetailHero_EN.ipynb +++ b/notebooks/RetailHero_EN.ipynb @@ -18,7 +18,7 @@ " SCIKIT-UPLIFT REPO | \n", " SCIKIT-UPLIFT DOCS\n", "
\n", - " RUSSIAN VERSION\n", + " RUSSIAN VERSION\n", "" ] }, diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb index 0aab882..e350312 100644 --- a/notebooks/pipeline_usage_EN.ipynb +++ b/notebooks/pipeline_usage_EN.ipynb @@ -15,7 +15,7 @@ " SCIKIT-UPLIFT REPO | \n", " SCIKIT-UPLIFT DOCS\n", "
\n", - " RUSSIAN VERSION\n", + " RUSSIAN VERSION\n", "\n", "" ] @@ -56,7 +56,7 @@ }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost" + "!pip install scikit-uplift==0.1.0 xgboost category_encoders" ] }, { @@ -89,10 +89,10 @@ ], "source": [ "import urllib.request\n", - "import pandas as pd; pd.set_option('display.max_columns', None)\n", + "import pandas as pd\n", "\n", "\n", - "csv_path = './content/Hilstorm.csv'\n", + "csv_path = '/content/Hilstorm.csv'\n", "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", "urllib.request.urlretrieve(url, csv_path)" ] @@ -369,7 +369,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Train pipeline as usual, but adding the treatment column in the step model as a parameter `model__treatment`." + "Train pipeline as usual, but adding the treatment column in the step model as a parameter `model__treatment`." ] }, { diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb index 4b56607..9637145 100644 --- a/notebooks/pipeline_usage_RU.ipynb +++ b/notebooks/pipeline_usage_RU.ipynb @@ -15,7 +15,7 @@ " SCIKIT-UPLIFT REPO | \n", " SCIKIT-UPLIFT DOCS\n", "
\n", - " ENGLISH VERSION\n", + " ENGLISH VERSION\n", "\n", "" ] @@ -50,7 +50,7 @@ }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost" + "!pip install scikit-uplift==0.1.0 xgboost category_encoders" ] }, { @@ -91,7 +91,7 @@ "import pandas as pd\n", "\n", "\n", - "csv_path = './content/Hilstorm.csv'\n", + "csv_path = '/content/Hilstorm.csv'\n", "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", "urllib.request.urlretrieve(url, csv_path)" ] @@ -373,7 +373,7 @@ } }, "source": [ - "ΠžΠ±ΡƒΡ‡Π°Ρ‚ΡŒ pipeline Π±ΡƒΠ΄Π΅ΠΌ ΠΊΠ°ΠΊ ΠΎΠ±Ρ‹Ρ‡Π½ΠΎ, Π½ΠΎ ΠΊΠΎΠ»ΠΎΠ½ΠΊΡƒ treatment Π΄ΠΎΠ±Π°Π²ΠΈΠΌ ΠΊΠ°ΠΊ ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€ шага model: `model__treatment`." + "ΠžΠ±ΡƒΡ‡Π°Ρ‚ΡŒ pipeline Π±ΡƒΠ΄Π΅ΠΌ ΠΊΠ°ΠΊ ΠΎΠ±Ρ‹Ρ‡Π½ΠΎ, Π½ΠΎ ΠΊΠΎΠ»ΠΎΠ½ΠΊΡƒ treatment Π΄ΠΎΠ±Π°Π²ΠΈΠΌ ΠΊΠ°ΠΊ ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€ шага model: `model__treatment`." ] }, { From 474380bd0b7cc6ea9654a7963a38d81388ee3a00 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 12:05:11 +0300 Subject: [PATCH 18/20] :memo: Add versions of packages in tutorial notebook --- notebooks/pipeline_usage_EN.ipynb | 2 +- notebooks/pipeline_usage_RU.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb index e350312..ddfb9a7 100644 --- a/notebooks/pipeline_usage_EN.ipynb +++ b/notebooks/pipeline_usage_EN.ipynb @@ -56,7 +56,7 @@ }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost category_encoders" + "!pip install scikit-uplift==0.1.0 xgboost==1.0.2 category_encoders==2.1.0" ] }, { diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb index 9637145..9284aa3 100644 --- a/notebooks/pipeline_usage_RU.ipynb +++ b/notebooks/pipeline_usage_RU.ipynb @@ -50,7 +50,7 @@ }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost category_encoders" + "!pip install scikit-uplift==0.1.0 xgboost==1.0.2 
category_encoders==2.1.0" ] }, { From 8accf41a8c11e6a06cf26d730094f11cb1487b2d Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 13:02:56 +0300 Subject: [PATCH 19/20] :rocket: Bump version number to 0.1.1 --- notebooks/RetailHero.ipynb | 50 ++++++++++++------------ notebooks/RetailHero_EN.ipynb | 64 +++++++++++++++---------------- notebooks/pipeline_usage_EN.ipynb | 38 +++++++++--------- notebooks/pipeline_usage_RU.ipynb | 38 +++++++++--------- sklift/__init__.py | 2 +- 5 files changed, 96 insertions(+), 96 deletions(-) diff --git a/notebooks/RetailHero.ipynb b/notebooks/RetailHero.ipynb index 837c80e..e2c424c 100644 --- a/notebooks/RetailHero.ipynb +++ b/notebooks/RetailHero.ipynb @@ -107,8 +107,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:07.634319Z", - "start_time": "2020-04-16T20:41:07.630711Z" + "end_time": "2020-04-28T09:24:37.492036Z", + "start_time": "2020-04-28T09:24:37.488584Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -126,7 +126,7 @@ "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", "!unzip /content/retail_hero.zip\n", - "!pip install scikit-uplift==0.1.0 catboost=0.22" + "!pip install scikit-uplift==0.1.1 catboost=0.22" ] }, { @@ -144,8 +144,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:10.715080Z", - "start_time": "2020-04-16T20:41:07.644388Z" + "end_time": "2020-04-28T09:24:40.783897Z", + "start_time": "2020-04-28T09:24:37.503470Z" }, "colab": {}, "colab_type": "code", @@ -196,8 +196,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:11.873021Z", - "start_time": "2020-04-16T20:41:10.717336Z" + "end_time": "2020-04-28T09:24:42.100096Z", + "start_time": "2020-04-28T09:24:40.786498Z" }, "colab": {}, "colab_type": "code", @@ -251,8 +251,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:14.061983Z", - "start_time": 
"2020-04-16T20:41:11.875565Z" + "end_time": "2020-04-28T09:24:44.483576Z", + "start_time": "2020-04-28T09:24:42.102707Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -313,8 +313,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:14.084652Z", - "start_time": "2020-04-16T20:41:14.064998Z" + "end_time": "2020-04-28T09:24:44.511016Z", + "start_time": "2020-04-28T09:24:44.486035Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -484,8 +484,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:15.254843Z", - "start_time": "2020-04-16T20:41:14.087893Z" + "end_time": "2020-04-28T09:24:45.715602Z", + "start_time": "2020-04-28T09:24:44.514353Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -545,8 +545,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:17.230517Z", - "start_time": "2020-04-16T20:41:15.257156Z" + "end_time": "2020-04-28T09:24:47.941480Z", + "start_time": "2020-04-28T09:24:45.719641Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -632,8 +632,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:19.361999Z", - "start_time": "2020-04-16T20:41:17.233889Z" + "end_time": "2020-04-28T09:24:50.571779Z", + "start_time": "2020-04-28T09:24:47.944822Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -694,8 +694,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:21.519463Z", - "start_time": "2020-04-16T20:41:19.366080Z" + "end_time": "2020-04-28T09:24:52.942803Z", + "start_time": "2020-04-28T09:24:50.576741Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -758,8 +758,8 @@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:21.536036Z", - "start_time": "2020-04-16T20:41:21.522709Z" + "end_time": "2020-04-28T09:24:52.964396Z", + "start_time": "2020-04-28T09:24:52.945544Z" }, "colab": { "base_uri": 
"https://localhost:8080/", @@ -858,8 +858,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:24.150095Z", - "start_time": "2020-04-16T20:41:21.539068Z" + "end_time": "2020-04-28T09:24:56.505700Z", + "start_time": "2020-04-28T09:24:53.019392Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -906,8 +906,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:41:24.171639Z", - "start_time": "2020-04-16T20:41:24.153353Z" + "end_time": "2020-04-28T09:24:56.560018Z", + "start_time": "2020-04-28T09:24:56.508541Z" }, "colab": { "base_uri": "https://localhost:8080/", diff --git a/notebooks/RetailHero_EN.ipynb b/notebooks/RetailHero_EN.ipynb index 3f3e6b8..45eacf9 100644 --- a/notebooks/RetailHero_EN.ipynb +++ b/notebooks/RetailHero_EN.ipynb @@ -98,8 +98,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:13.495345Z", - "start_time": "2020-04-16T20:45:13.491423Z" + "end_time": "2020-04-28T09:26:59.973637Z", + "start_time": "2020-04-28T09:26:59.969856Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -111,13 +111,13 @@ }, "outputs": [], "source": [ - "# import urllib.request\n", + "import urllib.request\n", "\n", - "# url = 'https://drive.google.com/u/0/uc?id=1fkxNmihuS15kk0PP0QcphL_Z3_z8LLeb&export=download'\n", - "# urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", + "url = 'https://drive.google.com/u/0/uc?id=1fkxNmihuS15kk0PP0QcphL_Z3_z8LLeb&export=download'\n", + "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", - "# !unzip /content/retail_hero.zip\n", - "# !pip install scikit-uplift==0.1.0 catboost==0.22" + "!unzip /content/retail_hero.zip\n", + "!pip install scikit-uplift==0.1.1 catboost=0.22" ] }, { @@ -135,8 +135,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:16.518699Z", - "start_time": "2020-04-16T20:45:13.505025Z" + "end_time": "2020-04-28T09:27:03.353098Z", + 
"start_time": "2020-04-28T09:26:59.984369Z" }, "colab": {}, "colab_type": "code", @@ -151,9 +151,9 @@ "\n", "\n", "# reading data\n", - "df_clients = pd.read_csv('./RetailHero-data/clients.csv', index_col='client_id')\n", - "df_train = pd.read_csv('./RetailHero-data/uplift_train.csv', index_col='client_id')\n", - "df_test = pd.read_csv('./RetailHero-data/uplift_test.csv', index_col='client_id')\n", + "df_clients = pd.read_csv('/content/uplift_data/clients.csv', index_col='client_id')\n", + "df_train = pd.read_csv('/content/uplift_data/uplift_train.csv', index_col='client_id')\n", + "df_test = pd.read_csv('/content/uplift_data/uplift_test.csv', index_col='client_id')\n", "\n", "# extracting features\n", "df_features = df_clients.copy()\n", @@ -187,8 +187,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:17.756422Z", - "start_time": "2020-04-16T20:45:16.521360Z" + "end_time": "2020-04-28T09:27:04.563554Z", + "start_time": "2020-04-28T09:27:03.355432Z" }, "colab": {}, "colab_type": "code", @@ -241,8 +241,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:19.899785Z", - "start_time": "2020-04-16T20:45:17.759288Z" + "end_time": "2020-04-28T09:27:06.789462Z", + "start_time": "2020-04-28T09:27:04.570306Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -305,8 +305,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:19.921897Z", - "start_time": "2020-04-16T20:45:19.902457Z" + "end_time": "2020-04-28T09:27:06.813310Z", + "start_time": "2020-04-28T09:27:06.792837Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -476,8 +476,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:21.009561Z", - "start_time": "2020-04-16T20:45:19.925398Z" + "end_time": "2020-04-28T09:27:07.960855Z", + "start_time": "2020-04-28T09:27:06.816440Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -538,8 +538,8 @@ 
"execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:22.998801Z", - "start_time": "2020-04-16T20:45:21.011706Z" + "end_time": "2020-04-28T09:27:10.062915Z", + "start_time": "2020-04-28T09:27:07.972200Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -627,8 +627,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:25.105146Z", - "start_time": "2020-04-16T20:45:23.001150Z" + "end_time": "2020-04-28T09:27:12.177386Z", + "start_time": "2020-04-28T09:27:10.065653Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -689,8 +689,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:27.262646Z", - "start_time": "2020-04-16T20:45:25.108914Z" + "end_time": "2020-04-28T09:27:14.459477Z", + "start_time": "2020-04-28T09:27:12.181749Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -753,8 +753,8 @@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:27.280762Z", - "start_time": "2020-04-16T20:45:27.265440Z" + "end_time": "2020-04-28T09:27:14.480794Z", + "start_time": "2020-04-28T09:27:14.463302Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -853,8 +853,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:30.063722Z", - "start_time": "2020-04-16T20:45:27.284197Z" + "end_time": "2020-04-28T09:27:17.097949Z", + "start_time": "2020-04-28T09:27:14.483691Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -901,8 +901,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-16T20:45:30.101502Z", - "start_time": "2020-04-16T20:45:30.067156Z" + "end_time": "2020-04-28T09:27:17.138755Z", + "start_time": "2020-04-28T09:27:17.101433Z" }, "colab": { "base_uri": "https://localhost:8080/", diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb index ddfb9a7..5627162 100644 --- a/notebooks/pipeline_usage_EN.ipynb +++ 
b/notebooks/pipeline_usage_EN.ipynb @@ -50,13 +50,13 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:23.787100Z", - "start_time": "2020-04-27T19:15:23.782965Z" + "end_time": "2020-04-28T09:14:10.044500Z", + "start_time": "2020-04-28T09:14:10.037045Z" } }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost==1.0.2 category_encoders==2.1.0" + "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" ] }, { @@ -71,15 +71,15 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:29.425545Z", - "start_time": "2020-04-27T19:15:23.800862Z" + "end_time": "2020-04-28T09:14:14.101729Z", + "start_time": "2020-04-28T09:14:10.048276Z" } }, "outputs": [ { "data": { "text/plain": [ - "('./content/Hilstorm.csv', )" + "('./content/Hilstorm.csv', )" ] }, "execution_count": 2, @@ -113,8 +113,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:29.971490Z", - "start_time": "2020-04-27T19:15:29.429579Z" + "end_time": "2020-04-28T09:14:14.638574Z", + "start_time": "2020-04-28T09:14:14.106920Z" } }, "outputs": [ @@ -281,8 +281,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:30.688735Z", - "start_time": "2020-04-27T19:15:29.976209Z" + "end_time": "2020-04-28T09:14:15.419422Z", + "start_time": "2020-04-28T09:14:14.642252Z" } }, "outputs": [], @@ -313,8 +313,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:30.706714Z", - "start_time": "2020-04-27T19:15:30.691607Z" + "end_time": "2020-04-28T09:14:15.432871Z", + "start_time": "2020-04-28T09:14:15.421629Z" } }, "outputs": [ @@ -343,8 +343,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:30.784120Z", - "start_time": "2020-04-27T19:15:30.710542Z" + "end_time": "2020-04-28T09:14:15.562848Z", + "start_time": "2020-04-28T09:14:15.437086Z" } }, "outputs": [], @@ -377,8 +377,8 @@ 
"execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:31.921960Z", - "start_time": "2020-04-27T19:15:30.787124Z" + "end_time": "2020-04-28T09:14:16.816735Z", + "start_time": "2020-04-28T09:14:15.568337Z" } }, "outputs": [ @@ -416,8 +416,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:15:32.063373Z", - "start_time": "2020-04-27T19:15:31.924138Z" + "end_time": "2020-04-28T09:14:16.960683Z", + "start_time": "2020-04-28T09:14:16.819117Z" } }, "outputs": [ @@ -425,7 +425,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "uplift@30%: 0.0660\n" + "uplift@30%: 0.0661\n" ] } ], diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb index 9284aa3..5e306b5 100644 --- a/notebooks/pipeline_usage_RU.ipynb +++ b/notebooks/pipeline_usage_RU.ipynb @@ -44,13 +44,13 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:10:58.787183Z", - "start_time": "2020-04-27T19:10:58.780531Z" + "end_time": "2020-04-28T09:23:06.767010Z", + "start_time": "2020-04-28T09:23:06.762871Z" } }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.0 xgboost==1.0.2 category_encoders==2.1.0" + "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" ] }, { @@ -70,15 +70,15 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:04.610210Z", - "start_time": "2020-04-27T19:10:58.796242Z" + "end_time": "2020-04-28T09:23:10.820274Z", + "start_time": "2020-04-28T09:23:06.776057Z" } }, "outputs": [ { "data": { "text/plain": [ - "('./content/Hilstorm.csv', )" + "('./content/Hilstorm.csv', )" ] }, "execution_count": 2, @@ -112,8 +112,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:05.200695Z", - "start_time": "2020-04-27T19:11:04.614828Z" + "end_time": "2020-04-28T09:23:11.319111Z", + "start_time": "2020-04-28T09:23:10.823302Z" } }, "outputs": [ @@ -280,8 +280,8 @@ 
"execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:05.963783Z", - "start_time": "2020-04-27T19:11:05.205409Z" + "end_time": "2020-04-28T09:23:11.919143Z", + "start_time": "2020-04-28T09:23:11.324058Z" } }, "outputs": [], @@ -312,8 +312,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:05.982444Z", - "start_time": "2020-04-27T19:11:05.966573Z" + "end_time": "2020-04-28T09:23:11.936024Z", + "start_time": "2020-04-28T09:23:11.921716Z" } }, "outputs": [ @@ -342,8 +342,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:06.071221Z", - "start_time": "2020-04-27T19:11:05.984825Z" + "end_time": "2020-04-28T09:23:12.019176Z", + "start_time": "2020-04-28T09:23:11.939728Z" } }, "outputs": [], @@ -381,8 +381,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:07.235200Z", - "start_time": "2020-04-27T19:11:06.076210Z" + "end_time": "2020-04-28T09:23:13.244343Z", + "start_time": "2020-04-28T09:23:12.021559Z" } }, "outputs": [ @@ -415,8 +415,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-27T19:11:07.391911Z", - "start_time": "2020-04-27T19:11:07.238581Z" + "end_time": "2020-04-28T09:23:13.383577Z", + "start_time": "2020-04-28T09:23:13.246513Z" } }, "outputs": [ @@ -424,7 +424,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "uplift@30%: 0.0660\n" + "uplift@30%: 0.0661\n" ] } ], diff --git a/sklift/__init__.py b/sklift/__init__.py index b794fd4..df9144c 100644 --- a/sklift/__init__.py +++ b/sklift/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = '0.1.1' From 921923a7cf8f716e90b28669dbf3f8b1bd96f71f Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 13:40:03 +0300 Subject: [PATCH 20/20] :memo: Fix links in Readme --- Readme.rst | 4 ++-- docs/tutorials.rst | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Readme.rst b/Readme.rst 
index 4517f72..1826fd5 100644 --- a/Readme.rst +++ b/Readme.rst @@ -53,7 +53,7 @@ More about uplift modelling problem read in russian on habr.com: `Part 1`_ and ` * Applying any estimator adheres to scikit-learn conventions; -* All approaches can be used in sklearn.pipeline (see example (`EN `_ |Open In Colab3|_, `RU `_ |Open In Colab4|_)) +* All approaches can be used in sklearn.pipeline (see example (`EN `__ |Open In Colab3|_, `RU `__ |Open In Colab4|_)) * Almost all implemented approaches solve both the problem of classification and regression; @@ -96,7 +96,7 @@ And if you now point your browser to ``_build/html/index.html``, you should see Quick Start ----------- -See the **RetailHero tutorial notebook** (`EN `_ |Open In Colab1|_, `RU `_ |Open In Colab2|_) for details. +See the **RetailHero tutorial notebook** (`EN `__ |Open In Colab1|_, `RU `__ |Open In Colab2|_) for details. **Train and predict uplift model** diff --git a/docs/tutorials.rst b/docs/tutorials.rst index c962e1a..ba09bdc 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -20,9 +20,9 @@ Basic It is better to start scikit-uplift from the basic tutorials. * `The overview of the basic approaches to solving the Uplift Modeling problem`_ - * In English: `nbviewer `_ | `github `_ |Open In Colab1| - * In Russian: `nbviewer `_ | `github `_ |Open In Colab2| + * In English: `nbviewer `__ | `github `__ |Open In Colab1| + * In Russian: `nbviewer `__ | `github `__ |Open In Colab2| * `Example of usage model from sklift.models in sklearn.pipeline`_ - * In English: `nbviewer `_ | `github `_ |Open In Colab3| - * In Russian: `nbviewer `_ | `github `_ |Open In Colab4| + * In English: `nbviewer `__ | `github `__ |Open In Colab3| + * In Russian: `nbviewer `__ | `github `__ |Open In Colab4|