From 486a02cb822784c17096c2082c051008f41871c8 Mon Sep 17 00:00:00 2001 From: Khuyen Tran Date: Mon, 27 Sep 2021 13:59:53 +0000 Subject: [PATCH] add deepnote badge --- .../shapey_values/shapey_values.ipynb | 171 ++++++++++-------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/data_science_tools/shapey_values/shapey_values.ipynb b/data_science_tools/shapey_values/shapey_values.ipynb index f64890e..9e2fda0 100644 --- a/data_science_tools/shapey_values/shapey_values.ipynb +++ b/data_science_tools/shapey_values/shapey_values.ipynb @@ -1,19 +1,47 @@ { "cells": [ + { + "cell_type": "markdown", + "source": "[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science)", + "metadata": { + "tags": [], + "cell_id": "00001-be9f3386-11a5-4276-86bb-237b087fb766", + "deepnote_cell_type": "markdown" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": "[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/shapey_values/shapey_values.ipynb)", + "metadata": { + "tags": [], + "cell_id": "00002-a832cf54-2f4b-458c-be81-9401baf5369a", + "deepnote_cell_type": "code" + }, + "outputs": [], + "execution_count": null + }, { "cell_type": "code", "metadata": { "cell_id": "00000-69f65e90-a458-46b2-8a1a-e5791c1d455f", "deepnote_to_be_reexecuted": false, "source_hash": "e47dc800", - "execution_start": 1632428483538, - "execution_millis": 9918, - "output_cleared": true, + "execution_start": 1632429001960, + "execution_millis": 4741, + "output_cleared": false, "deepnote_cell_type": "code" }, "source": "!pip install shap patsy yellowbrick xgboost", - "execution_count": 16, - "outputs": [] + "execution_count": null, + "outputs": [ + { + "name": "stdout", + "text": "Requirement already satisfied: shap in /root/venv/lib/python3.7/site-packages (0.39.0)\nRequirement already satisfied: patsy in /root/venv/lib/python3.7/site-packages (0.5.1)\nRequirement already satisfied: yellowbrick in /root/venv/lib/python3.7/site-packages (1.3.post1)\nRequirement already satisfied: xgboost in /root/venv/lib/python3.7/site-packages (1.4.2)\nRequirement already satisfied: slicer==0.0.7 in /root/venv/lib/python3.7/site-packages (from shap) (0.0.7)\nRequirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.7.1)\nRequirement already satisfied: cloudpickle in /root/venv/lib/python3.7/site-packages (from shap) (2.0.0)\nRequirement already satisfied: tqdm>4.25.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (4.62.3)\nRequirement already satisfied: numba in /root/venv/lib/python3.7/site-packages (from shap) (0.54.0)\nRequirement already satisfied: scikit-learn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (0.24.2)\nRequirement already satisfied: pandas in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.2.5)\nRequirement already satisfied: numpy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.19.5)\nRequirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy) (1.16.0)\nRequirement already satisfied: cycler>=0.10.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yellowbrick) (0.10.0)\nRequirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yellowbrick) (3.4.3)\nRequirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (8.3.2)\nRequirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.3.2)\nRequirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)\nRequirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.4.7)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->shap) (2.2.0)\nRequirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->shap) (1.0.1)\nRequirement already satisfied: setuptools in /root/venv/lib/python3.7/site-packages (from numba->shap) (58.0.4)\nRequirement already satisfied: llvmlite<0.38,>=0.37.0rc1 in /root/venv/lib/python3.7/site-packages (from numba->shap) (0.37.0)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas->shap) (2021.1)\n", + "output_type": "stream" + } + ] }, { "cell_type": "markdown", @@ -33,12 +61,12 @@ "cell_id": "00002-fc321b6a-8b47-46f0-b15d-ee5f2f4d3c96", "deepnote_to_be_reexecuted": false, "source_hash": "2963a52d", - "execution_start": 1632428409806, - "execution_millis": 2, + "execution_start": 1632429010988, + "execution_millis": 6, "deepnote_cell_type": "code" }, "source": "revenues = {\n \"no_ad\": 150,\n \"social_media\": 300,\n \"google_advertising\": 200,\n \"email_marketing\": 350,\n \"social_media + google_advertising\": 320,\n \"social_media + email_marketing\": 400,\n \"google_advertising + email_marketing\": 350,\n \"social_media + google_advertising + email_marketing\": 450,\n}", - "execution_count": 2, + "execution_count": null, "outputs": [] }, { @@ -51,12 +79,12 @@ "cell_id": "00003-c0c45629-a801-4097-8406-4ed04ddc0b3f", "deepnote_to_be_reexecuted": false, "source_hash": "f1d7eda4", - "execution_start": 1632428411379, - "execution_millis": 4, + "execution_start": 1632429011836, + "execution_millis": 1, "deepnote_cell_type": "code" }, "source": "weights = [1 / 3, 1 / 6, 1 / 6, 1 / 3]", - "execution_count": 3, + "execution_count": null, "outputs": [] }, { @@ -125,16 +153,16 @@ "cell_id": "00011-61c80ac6-ab73-496e-93b4-9fd761e1edcd", "deepnote_to_be_reexecuted": false, "source_hash": "a711feba", - "execution_start": 1632428414172, - "execution_millis": 10, + "execution_start": 1632429014639, + "execution_millis": 12, "deepnote_cell_type": "code" }, "source": "google_advertising_contribution = [\n revenues[\"google_advertising\"] - revenues[\"no_ad\"],\n revenues[\"social_media + google_advertising\"] - revenues[\"social_media\"],\n revenues[\"google_advertising + email_marketing\"] - revenues[\"email_marketing\"],\n revenues[\"social_media + google_advertising + email_marketing\"]\n - revenues[\"social_media + email_marketing\"],\n]\ngoogle_advertising_contribution", - "execution_count": 4, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 4, + "execution_count": 36, "data": { "text/plain": "[50, 20, 0, 50]" }, @@ -152,16 +180,16 @@ "cell_id": "00012-ed27e98e-97c9-4584-ad11-4778b7ccb978", "deepnote_to_be_reexecuted": false, "source_hash": "c420f8cf", - "execution_start": 1632428415799, - "execution_millis": 7, + "execution_start": 1632429020332, + "execution_millis": 17, "deepnote_cell_type": "code" }, "source": "google_advertising_total_contribution = sum(\n [\n weight * contribution\n for weight, contribution in zip(weights, google_advertising_contribution)\n ]\n)\ngoogle_advertising_total_contribution", - "execution_count": 5, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 5, + "execution_count": 37, "data": { "text/plain": "36.66666666666666" }, @@ -187,16 +215,16 @@ "cell_id": "00014-031154fe-ae81-4523-832e-9be22dcbed96", "deepnote_to_be_reexecuted": false, "source_hash": "92cf01a", - "execution_start": 1632428417899, - "execution_millis": 12, + "execution_start": 1632429022420, + "execution_millis": 9, "deepnote_cell_type": "code" }, "source": "social_media_contribution = [\n revenues[\"social_media\"] - revenues[\"no_ad\"],\n revenues[\"social_media + google_advertising\"] - revenues[\"google_advertising\"],\n revenues[\"social_media + email_marketing\"] - revenues[\"email_marketing\"],\n revenues[\"social_media + google_advertising + email_marketing\"]\n - revenues[\"google_advertising + email_marketing\"],\n]\nsocial_media_contribution", - "execution_count": 6, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 6, + "execution_count": 38, "data": { "text/plain": "[150, 120, 50, 100]" }, @@ -214,16 +242,16 @@ "cell_id": "00015-fabcffdb-e46b-4c42-b54e-5f6ecbe15030", "deepnote_to_be_reexecuted": false, "source_hash": "938df26a", - "execution_start": 1632428419327, - "execution_millis": 8, + "execution_start": 1632429023733, + "execution_millis": 9, "deepnote_cell_type": "code" }, "source": "social_media_total_contribution = sum(\n [\n weight * contribution\n for weight, contribution in zip(weights, social_media_contribution)\n ]\n)\nsocial_media_total_contribution", - "execution_count": 7, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 7, + "execution_count": 39, "data": { "text/plain": "111.66666666666666" }, @@ -249,16 +277,16 @@ "cell_id": "00017-36e759d9-0651-41ff-a3cf-f1a20c359fa3", "deepnote_to_be_reexecuted": false, "source_hash": "110e924b", - "execution_start": 1632428420421, - "execution_millis": 11, + "execution_start": 1632429025534, + "execution_millis": 9, "deepnote_cell_type": "code" }, "source": "email_marketing_contribution = [\n revenues[\"email_marketing\"] - revenues[\"no_ad\"],\n revenues[\"google_advertising + email_marketing\"] - revenues[\"google_advertising\"],\n revenues[\"social_media + email_marketing\"] - revenues[\"social_media\"],\n revenues[\"social_media + google_advertising + email_marketing\"]\n - revenues[\"social_media + google_advertising\"],\n]\nemail_marketing_contribution", - "execution_count": 8, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 8, + "execution_count": 40, "data": { "text/plain": "[200, 150, 100, 130]" }, @@ -276,16 +304,16 @@ "cell_id": "00018-31aee598-ec02-4390-b885-d95a86e347ed", "deepnote_to_be_reexecuted": false, "source_hash": "33b80ff3", - "execution_start": 1632428422260, - "execution_millis": 10, + "execution_start": 1632429026444, + "execution_millis": 30, "deepnote_cell_type": "code" }, "source": "email_marketing_contribution = sum(\n [\n weight * contribution\n for weight, contribution in zip(weights, email_marketing_contribution)\n ]\n)\nemail_marketing_contribution", - "execution_count": 9, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 9, + "execution_count": 41, "data": { "text/plain": "151.66666666666663" }, @@ -311,16 +339,16 @@ "cell_id": "00020-a517d47a-8225-4da3-b172-fc66cf3f362c", "deepnote_to_be_reexecuted": false, "source_hash": "30fa839e", - "execution_start": 1632428424439, - "execution_millis": 17, + "execution_start": 1632429027380, + "execution_millis": 79, "deepnote_cell_type": "code" }, "source": "(\n google_advertising_total_contribution\n + social_media_total_contribution\n + email_marketing_contribution\n)", - "execution_count": 10, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 10, + "execution_count": 42, "data": { "text/plain": "299.99999999999994" }, @@ -354,16 +382,16 @@ "cell_id": "00023-f8d752e0-3efd-45b9-b6ba-060d423e2426", "deepnote_to_be_reexecuted": false, "source_hash": "32406bb2", - "execution_start": 1632428463896, - "execution_millis": 708, + "execution_start": 1632429028540, + "execution_millis": 197, "deepnote_cell_type": "code" }, "source": "import pandas as pd\n\ndata = pd.read_csv(\"/datasets/advertising/advertising.csv\")\ndata.columns = data.columns.map(lambda row: \"_\".join(row.lower().split(\" \")))\ndata.head(10)", - "execution_count": 11, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 11, + "execution_count": 43, "data": { "application/vnd.deepnote.dataframe.v2+json": { "row_count": 10, @@ -985,12 +1013,12 @@ "cell_id": "00024-a4314779-7341-4e80-b10b-54a44948bbcd", "deepnote_to_be_reexecuted": false, "source_hash": "c5c5aae5", - "execution_start": 1632428467704, - "execution_millis": 450, + "execution_start": 1632429030929, + "execution_millis": 0, "deepnote_cell_type": "code" }, "source": "from patsy import dmatrices\n\ny, X = dmatrices(\n \"clicked_on_ad ~ daily_time_spent_on_site + age + area_income + daily_internet_usage + male -1\",\n data=data,\n)", - "execution_count": 12, + "execution_count": null, "outputs": [] }, { @@ -1003,16 +1031,16 @@ "cell_id": "00025-1c2560b7-4677-4553-b9b7-f744d103d926", "deepnote_to_be_reexecuted": false, "source_hash": "866bb102", - "execution_start": 1632428469154, - "execution_millis": 48, + "execution_start": 1632429032065, + "execution_millis": 486, "deepnote_cell_type": "code" }, "source": "X_frame = pd.DataFrame(data=X, columns=X.design_info.column_names)\nX_frame", - "execution_count": 13, + "execution_count": null, "outputs": [ { "output_type": "execute_result", - "execution_count": 13, + "execution_count": 45, "data": { "application/vnd.deepnote.dataframe.v2+json": { "row_count": 1000, @@ -2955,12 +2983,12 @@ "cell_id": "00026-5200954f-ed6b-4d3a-bba6-bf929f27f5c8", "deepnote_to_be_reexecuted": false, "source_hash": "c12fb403", - "execution_start": 1632428470650, - "execution_millis": 2931, + "execution_start": 1632429032370, + "execution_millis": 2, "deepnote_cell_type": "code" }, "source": "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)", - "execution_count": 14, + "execution_count": null, "outputs": [] }, { @@ -2978,7 +3006,7 @@ "deepnote_cell_type": "code" }, "source": "import xgboost\nimport shap\nfrom sklearn.metrics import f1_score\n\nmodel = xgboost.XGBClassifier().fit(X_train, y_train)\n\nprediction = model.predict(X_test)\nf1 = f1_score(y_test, prediction)\nf1", - "execution_count": 17, + "execution_count": null, "outputs": [ { "name": "stderr", @@ -3010,7 +3038,7 @@ "deepnote_cell_type": "code" }, "source": "from yellowbrick.classifier import ClassificationReport\n\nvisualizer = ClassificationReport(model)\nvisualizer.fit(X_train, y_train)\nvisualizer.score(X_test, y_test)\nvisualizer.show()", - "execution_count": 18, + "execution_count": null, "outputs": [ { "name": "stderr", @@ -3063,7 +3091,7 @@ "deepnote_cell_type": "code" }, "source": "explainer = shap.Explainer(model)\nshap_values = explainer(X_frame)", - "execution_count": 19, + "execution_count": null, "outputs": [ { "name": "stderr", @@ -3087,7 +3115,7 @@ "deepnote_cell_type": "code" }, "source": "shap.summary_plot(shap_values, X)", - "execution_count": 20, + "execution_count": null, "outputs": [ { "name": "stderr", @@ -3133,7 +3161,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.waterfall(shap_values[0])", - "execution_count": 21, + "execution_count": null, "outputs": [ { "data": { @@ -3173,7 +3201,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.waterfall(shap_values[1])", - "execution_count": 22, + "execution_count": null, "outputs": [ { "data": { @@ -3205,7 +3233,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.waterfall(shap_values[1])", - "execution_count": 23, + "execution_count": null, "outputs": [ { "data": { @@ -3245,7 +3273,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"daily_internet_usage\"])", - "execution_count": 24, + "execution_count": null, "outputs": [ { "data": { @@ -3277,7 +3305,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"daily_internet_usage\"], color=shap_values)", - "execution_count": 25, + "execution_count": null, "outputs": [ { "data": { @@ -3309,7 +3337,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"daily_time_spent_on_site\"], color=shap_values)", - "execution_count": 26, + "execution_count": null, "outputs": [ { "data": { @@ -3341,7 +3369,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"area_income\"], color=shap_values)", - "execution_count": 27, + "execution_count": null, "outputs": [ { "data": { @@ -3373,7 +3401,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"age\"], color=shap_values)", - "execution_count": 28, + "execution_count": null, "outputs": [ { "data": { @@ -3405,7 +3433,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.scatter(shap_values[:, \"male\"], color=shap_values)", - "execution_count": 29, + "execution_count": null, "outputs": [ { "data": { @@ -3424,12 +3452,11 @@ }, { "cell_type": "markdown", + "source": "## SHAP Bar Plot", "metadata": { "cell_id": "00044-bc3e0d5d-c297-4329-8ce2-550e28301e0b", "deepnote_cell_type": "markdown" - }, - "source": "## SHAP Bar Plot", - "execution_count": null + } }, { "cell_type": "code", @@ -3446,7 +3473,7 @@ "deepnote_cell_type": "code" }, "source": "shap.plots.bar(shap_values)", - "execution_count": 30, + "execution_count": null, "outputs": [ { "data": { @@ -3486,7 +3513,7 @@ "deepnote_cell_type": "code" }, "source": "shap_interaction_values = explainer.shap_interaction_values(X)", - "execution_count": 31, + "execution_count": null, "outputs": [ { "name": "stderr", @@ -3510,7 +3537,7 @@ "deepnote_cell_type": "code" }, "source": "shap.summary_plot(shap_interaction_values, X_frame)", - "execution_count": 32, + "execution_count": null, "outputs": [ { "data": {