Add files via upload

mzav · Mar 6, 2018 · df1bb95 · df1bb95
1 parent f3b6f75
commit df1bb95
Showing 1 changed file with 147 additions and 0 deletions.
diff --git a/sport/hw1/benchmarks/Tushin_Kirill_0.259_benchmark.ipynb b/sport/hw1/benchmarks/Tushin_Kirill_0.259_benchmark.ipynb
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "from lightgbm import LGBMClassifier\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "pd.set_option('display.max_columns', 300)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(model , data , columns):\n",
+    "    data = data.copy()\n",
+    "    predict_answer = {}\n",
+    "    for col in ['work' , 'home']:\n",
+    "        data['predict_' + col] = model[col].predict_proba(data[columns])[:,1]\n",
+    "        tmp = data.groupby(['customer_id' , 'pos_address_lat' , 'pos_address_lon'])[['predict_' + col]].max()\n",
+    "        tmp = tmp.groupby(['customer_id']).idxmax()['predict_' + col].values\n",
+    "        \n",
+    "        predict = [np.array(x) for x in tmp]\n",
+    "        predict = pd.DataFrame(predict , columns=['customer_id' , col + '_predict_lat' ,  col + '_predict_lon'])\n",
+    "        predict_answer[col] = predict.convert_objects(convert_numeric=True)\n",
+    "        \n",
+    "    return pd.merge(predict_answer['work'] , predict_answer['home'] , on='customer_id')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = pd.read_csv(\"data/train_set.csv.gz\", compression=\"gzip\").rename(columns={\"pos_adress_lat\": \"pos_address_lat\",\"pos_adress_lon\": \"pos_address_lon\"})\n",
+    "test = pd.read_csv(\"data/test_set.csv.gz\", compression=\"gzip\")\n",
+    "sample = pd.read_csv('data/sample.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_mcc(mcc):\n",
+    "    if type(mcc) == int:\n",
+    "        return mcc\n",
+    "    mcc = mcc.split(',')\n",
+    "    if len(mcc) == 1:\n",
+    "        return int(mcc[0])\n",
+    "    else:\n",
+    "        return 1000*int(mcc[0]) + int(mcc[1])\n",
+    "test['mcc'] = test['mcc'].apply(clean_mcc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Target\n",
+    "train['target_home'] = (np.sqrt((train['home_add_lat'] - train['pos_address_lat']) ** 2 + (train['home_add_lon'] - train['pos_address_lon']) ** 2) < 0.02).astype('int8')\n",
+    "train['target_work'] = (np.sqrt((train['work_add_lat'] - train['pos_address_lat']) ** 2 + (train['work_add_lon'] - train['pos_address_lon']) ** 2) < 0.02).astype('int8')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "good_col = ['amount', 'atm_address_lat', 'atm_address_lon', 'currency', 'mcc', 'pos_address_lat','pos_address_lon']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 50.2 s, sys: 237 ms, total: 50.4 s\n",
+      "Wall time: 6.78 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "model_home = LGBMClassifier(n_jobs=-1)\n",
+    "model_work = LGBMClassifier(n_jobs=-1)\n",
+    "model_home.fit(train[good_col] , train['target_home'])\n",
+    "model_work.fit(train[good_col] , train['target_work'])\n",
+    "\n",
+    "model = {'home':model_home , 'work':model_work}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predict_test = sample.merge(predict(model , test , good_col) , how='left').drop(sample.columns[1:] , axis=1)\n",
+    "predict_test = predict_test.fillna(predict_test.median())\n",
+    "predict_test.columns = sample.columns\n",
+    "predict_test.to_csv('first.csv' , index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}