diff --git a/code/Ensemble/power_ensemble.ipynb b/code/Ensemble/power_ensemble.ipynb
new file mode 100644
index 0000000..af41d1e
--- /dev/null
+++ b/code/Ensemble/power_ensemble.ipynb
@@ -0,0 +1,68 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "343ee127-66d1-48cd-a207-3f5ddba915cc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['/opt/ml/code/output/pjh_0.7312_0.7911.csv', '/opt/ml/code/output/output6_19_17_30.csv', '/opt/ml/code/output/hk_auc8285_acc7554.csv', '/opt/ml/code/output/output_7975.csv']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Power-mean ensemble: raise every model's predictions to POWER, then average them row-wise.\n",
+ "from glob import glob\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "\n",
+ "%matplotlib inline\n",
+ "\n",
+ "output_path = \"/opt/ml/code/output/cross_validation/output.csv\"\n",
+ "csv_file_path_list = glob(\"/opt/ml/code/output/*.csv\")\n",
+ "print(csv_file_path_list)\n",
+ "# Fail with a clear message instead of an IndexError when nothing matched the pattern.\n",
+ "if not csv_file_path_list:\n",
+ "    raise FileNotFoundError(\"no prediction CSV files found in /opt/ml/code/output\")\n",
+ "\n",
+ "POWER = 1/4\n",
+ "\n",
+ "# Read each submission once and place the transformed predictions side by side.\n",
+ "# A single concat keeps the result two-dimensional even when only one file matched,\n",
+ "# so the row-wise mean below is always valid.\n",
+ "powered_predictions = [\n",
+ "    pd.read_csv(csv_file_path)[\"prediction\"] ** POWER\n",
+ "    for csv_file_path in csv_file_path_list\n",
+ "]\n",
+ "result = pd.concat(powered_predictions, axis=1)\n",
+ "\n",
+ "# Row-wise mean across models; the row position is written out as the \"id\" column.\n",
+ "result = pd.DataFrame(result.mean(axis=1)).reset_index().rename(columns = {0:\"prediction\", \"index\":\"id\"})\n",
+ "result.to_csv(output_path, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/code/Ensemble/stacking.ipynb b/code/Ensemble/stacking.ipynb
new file mode 100644
index 0000000..b0456b0
--- /dev/null
+++ b/code/Ensemble/stacking.ipynb
@@ -0,0 +1,2116 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "3f275051-6f26-4e07-bdd6-0a03c50f8789",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "from datetime import datetime\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from tqdm import tqdm\n",
+ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n",
+ "from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD\n",
+ "tqdm.pandas()\n",
+ "\n",
+ "\n",
+ "def timestamp(df):\n",
+ "    \"\"\"Add categorical ``year`` and ``month`` columns derived from ``Timestamp``.\n",
+ "\n",
+ "    The frame is modified in place and returned.\n",
+ "    \"\"\"\n",
+ "    calendar_parts = (\"year\", \"month\")\n",
+ "    # First pull each calendar component out of the Timestamp values ...\n",
+ "    for part in calendar_parts:\n",
+ "        df[part] = df[\"Timestamp\"].apply(lambda ts, part=part: getattr(ts, part))\n",
+ "    # ... then cast both new columns to category.\n",
+ "    for part in calendar_parts:\n",
+ "        df[part] = df[part].astype(\"category\")\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def assessmentItemID(df):\n",
+ "    \"\"\"Cast ``assessmentItemID`` to category and derive two columns from its text.\n",
+ "\n",
+ "    ``question_num`` is the integer value of the last two characters of the id and\n",
+ "    ``question_class`` is the character at position 2. The frame is modified in\n",
+ "    place and returned.\n",
+ "    \"\"\"\n",
+ "    item_ids = df[\"assessmentItemID\"].astype(\"category\")\n",
+ "    df[\"assessmentItemID\"] = item_ids\n",
+ "    df[\"question_num\"] = item_ids.apply(lambda item_id: int(item_id[-2:]))\n",
+ "    # question_num is intentionally left as produced above (no category cast).\n",
+ "    df[\"question_class\"] = item_ids.apply(lambda item_id: item_id[2])\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def KnowledgeTag_relative(df):\n",
+ "    \"\"\"Add running statistics per KnowledgeTag.\n",
+ "\n",
+ "    For every row, looking only at earlier rows (ordered by Timestamp) that share\n",
+ "    the same KnowledgeTag, adds the number of such rows, the sum of their\n",
+ "    ``answercode`` and the ratio of the two (0 when there is no earlier row).\n",
+ "    \"\"\"\n",
+ "    # The sorted view keeps the original index, so results align back onto df by label.\n",
+ "    answers_by_tag = df.sort_values(by=[\"KnowledgeTag\", \"Timestamp\"]).groupby(\"KnowledgeTag\")[\"answercode\"]\n",
+ "    df[\"KnowledgeTag_total_answer\"] = answers_by_tag.cumcount()\n",
+ "    previous_correct = answers_by_tag.transform(lambda x: x.cumsum().shift(1))\n",
+ "    df[\"KnowledgeTag_correct_answer\"] = previous_correct.fillna(0)\n",
+ "    accuracy = df[\"KnowledgeTag_correct_answer\"] / df[\"KnowledgeTag_total_answer\"]\n",
+ "    df[\"KnowledgeTag_acc\"] = accuracy.fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_KnowledgeTag_relative(df):\n",
+ "    \"\"\"Add each user's running statistics per KnowledgeTag.\n",
+ "\n",
+ "    For every row, looking only at the same user's earlier rows (ordered by\n",
+ "    Timestamp) with the same KnowledgeTag, adds the number of such rows, the sum\n",
+ "    of their ``answercode`` and the ratio of the two (0 when there is no earlier\n",
+ "    row). The frame is modified in place and returned.\n",
+ "    \"\"\"\n",
+ "    # Keep the original index on the sorted view: the grouped results are assigned\n",
+ "    # back to df by index label, so resetting the index here would misalign them.\n",
+ "    ordered = df.sort_values(by=[\"userID\", \"Timestamp\"])\n",
+ "    # Group on both keys so the statistics are per user AND tag, not per tag only.\n",
+ "    answers_by_user_tag = ordered.groupby([\"userID\", \"KnowledgeTag\"])[\"answercode\"]\n",
+ "    df[\"userID_KnowledgeTag_total_answer\"] = answers_by_user_tag.cumcount()\n",
+ "    df[\"userID_KnowledgeTag_correct_answer\"] = answers_by_user_tag.transform(lambda x: x.cumsum().shift(1)).fillna(0)\n",
+ "    df[\"userID_KnowledgeTag_acc\"] = (df[\"userID_KnowledgeTag_correct_answer\"] / df[\"userID_KnowledgeTag_total_answer\"]).fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def assessmentItemID_relative(df):\n",
+ "    \"\"\"Add running statistics per assessmentItemID.\n",
+ "\n",
+ "    For every row, looking only at earlier rows (ordered by Timestamp) of the same\n",
+ "    item, adds the number of such rows, the sum of their ``answercode`` and the\n",
+ "    ratio of the two (0 when there is no earlier row).\n",
+ "    \"\"\"\n",
+ "    # The sorted view keeps the original index, so results align back onto df by label.\n",
+ "    answers_by_item = df.sort_values(by=[\"assessmentItemID\", \"Timestamp\"]).groupby(\"assessmentItemID\")[\"answercode\"]\n",
+ "    df[\"assessmentItemID_total_answer\"] = answers_by_item.cumcount()\n",
+ "    previous_correct = answers_by_item.transform(lambda x: x.cumsum().shift(1))\n",
+ "    df[\"assessmentItemID_correct_answer\"] = previous_correct.fillna(0)\n",
+ "    accuracy = df[\"assessmentItemID_correct_answer\"] / df[\"assessmentItemID_total_answer\"]\n",
+ "    df[\"assessmentItemID_acc\"] = accuracy.fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def question_class_relative(df):\n",
+ "    \"\"\"Add running statistics per question_class.\n",
+ "\n",
+ "    Sorts df in place by question_class and Timestamp, then for every row adds the\n",
+ "    sum of ``answercode`` over earlier rows of the same class, the number of those\n",
+ "    rows and the ratio of the two (0 when there is no earlier row).\n",
+ "    \"\"\"\n",
+ "    # question_class is created by assessmentItemID(); no function named\n",
+ "    # question_class exists in this notebook, so calling it raised NameError.\n",
+ "    if \"question_class\" not in df.columns:\n",
+ "        df = assessmentItemID(df)\n",
+ "    df.sort_values(by=[\"question_class\", \"Timestamp\"], inplace=True)\n",
+ "    answers_by_class = df.groupby(\"question_class\")[\"answercode\"]\n",
+ "    df[\"question_class_correct_answer\"] = answers_by_class.transform(lambda x: x.cumsum().shift(1)).fillna(0)\n",
+ "    df[\"question_class_total_answer\"] = answers_by_class.cumcount()\n",
+ "    df[\"question_class_acc\"] = (df[\"question_class_correct_answer\"] / df[\"question_class_total_answer\"]).fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_question_class_relative(df):\n",
+ "    \"\"\"Add each user's running statistics per question_class.\n",
+ "\n",
+ "    Sorts df in place by a temporary ``<userID>_<question_class>`` key and\n",
+ "    Timestamp, then for every row adds the sum of ``answercode`` over earlier rows\n",
+ "    with the same key, the number of those rows and the ratio of the two (0 when\n",
+ "    there is no earlier row). The temporary key column is removed before returning.\n",
+ "    \"\"\"\n",
+ "    # question_class is created by assessmentItemID(); no function named\n",
+ "    # question_class exists in this notebook, so calling it raised NameError.\n",
+ "    if \"question_class\" not in df.columns:\n",
+ "        df = assessmentItemID(df)\n",
+ "    # Temporary grouping key, built column-wise instead of with a row-by-row apply.\n",
+ "    df[\"userID_question_class\"] = df[\"userID\"].astype(str) + \"_\" + df[\"question_class\"].astype(str)\n",
+ "    # Order chronologically inside each key.\n",
+ "    df.sort_values(by=[\"userID_question_class\", \"Timestamp\"], inplace=True)\n",
+ "    answers_by_key = df.groupby(\"userID_question_class\")[\"answercode\"]\n",
+ "    df[\"userID_question_class_correct_answer\"] = answers_by_key.transform(lambda x: x.cumsum().shift(1)).fillna(0)\n",
+ "    df[\"userID_question_class_total_answer\"] = answers_by_key.cumcount()\n",
+ "    df[\"userID_question_class_acc\"] = (df[\"userID_question_class_correct_answer\"] / df[\"userID_question_class_total_answer\"]).fillna(0)\n",
+ "    # Drop the temporary key.\n",
+ "    df.drop(\"userID_question_class\", axis=1, inplace=True)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def question_num_relative(df):\n",
+ "    \"\"\"Add running statistics per question_num.\n",
+ "\n",
+ "    Sorts df in place by question_num and Timestamp, then for every row adds the\n",
+ "    sum of ``answercode`` over earlier rows with the same question_num, the number\n",
+ "    of those rows and the ratio of the two (0 when there is no earlier row).\n",
+ "    \"\"\"\n",
+ "    # question_num is created by assessmentItemID(); no function named\n",
+ "    # question_class exists in this notebook, so calling it raised NameError.\n",
+ "    if \"question_num\" not in df.columns:\n",
+ "        df = assessmentItemID(df)\n",
+ "    df.sort_values(by=[\"question_num\", \"Timestamp\"], inplace=True)\n",
+ "    answers_by_num = df.groupby(\"question_num\")[\"answercode\"]\n",
+ "    df[\"question_num_correct_answer\"] = answers_by_num.transform(lambda x: x.cumsum().shift(1)).fillna(0)\n",
+ "    df[\"question_num_total_answer\"] = answers_by_num.cumcount()\n",
+ "    df[\"question_num_acc\"] = (df[\"question_num_correct_answer\"] / df[\"question_num_total_answer\"]).fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_question_num_relative(df):\n",
+ "    \"\"\"Add each user's running statistics per question_num.\n",
+ "\n",
+ "    Sorts df in place by a temporary ``<userID>_<question_num>`` key and Timestamp,\n",
+ "    then for every row adds the sum of ``answercode`` over earlier rows with the\n",
+ "    same key, the number of those rows and the ratio of the two (0 when there is\n",
+ "    no earlier row). The temporary key column is removed before returning.\n",
+ "    \"\"\"\n",
+ "    # question_num is created by assessmentItemID(); no function named\n",
+ "    # question_class exists in this notebook, so calling it raised NameError.\n",
+ "    if \"question_num\" not in df.columns:\n",
+ "        df = assessmentItemID(df)\n",
+ "    # Temporary grouping key, built column-wise instead of with a row-by-row apply.\n",
+ "    df[\"userID_question_num\"] = df[\"userID\"].astype(str) + \"_\" + df[\"question_num\"].astype(str)\n",
+ "    # Order chronologically inside each key.\n",
+ "    df.sort_values(by=[\"userID_question_num\", \"Timestamp\"], inplace=True)\n",
+ "    answers_by_key = df.groupby(\"userID_question_num\")[\"answercode\"]\n",
+ "    df[\"userID_question_num_correct_answer\"] = answers_by_key.transform(lambda x: x.cumsum().shift(1)).fillna(0)\n",
+ "    df[\"userID_question_num_total_answer\"] = answers_by_key.cumcount()\n",
+ "    df[\"userID_question_num_acc\"] = (df[\"userID_question_num_correct_answer\"] / df[\"userID_question_num_total_answer\"]).fillna(0)\n",
+ "    # Drop the temporary key.\n",
+ "    df.drop(\"userID_question_num\", axis=1, inplace=True)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_relative(df):\n",
+ "    \"\"\"Add running statistics per user.\n",
+ "\n",
+ "    Sorts df in place by userID and Timestamp, then for every row adds the sum of\n",
+ "    ``answercode`` over the user's earlier rows, the number of those rows and the\n",
+ "    ratio of the two (0 when there is no earlier row).\n",
+ "    \"\"\"\n",
+ "    # Chronological order inside each user.\n",
+ "    df.sort_values(by=[\"userID\", \"Timestamp\"], inplace=True)\n",
+ "    answers_by_user = df.groupby(\"userID\")[\"answercode\"]\n",
+ "    previous_correct = answers_by_user.transform(lambda x: x.cumsum().shift(1))\n",
+ "    df[\"userID_correct_answer\"] = previous_correct.fillna(0)\n",
+ "    df[\"userID_total_answer\"] = answers_by_user.cumcount()\n",
+ "    accuracy = df[\"userID_correct_answer\"] / df[\"userID_total_answer\"]\n",
+ "    df[\"userID_acc\"] = accuracy.fillna(0)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_acc_rolling(df, window=5):\n",
+ "    \"\"\"Add ``userID_acc_rolling_<window>``: per-user rolling mean of ``userID_acc``.\n",
+ "\n",
+ "    The rolling mean is undefined for the first ``window - 1`` rows of each user;\n",
+ "    those rows receive the user's median of the defined rolling values instead.\n",
+ "    ``userID_acc`` is computed first when it is missing. Returns a new frame\n",
+ "    (the result of a merge), sorted by userID and Timestamp.\n",
+ "    \"\"\"\n",
+ "    # The rolling mean needs the per-user accuracy column.\n",
+ "    if \"userID_acc\" not in df.columns:\n",
+ "        df = userID_relative(df)\n",
+ "    # Chronological order inside each user.\n",
+ "    df.sort_values(by=[\"userID\", \"Timestamp\"], inplace=True)\n",
+ "\n",
+ "    # The grouped result comes back ordered by user and then by row order, which\n",
+ "    # matches df after the sort above, so the raw values can be assigned directly.\n",
+ "    df[\"userID_acc_rolling\"] = df.groupby([\"userID\"])[\"userID_acc\"].rolling(window).mean().values\n",
+ "\n",
+ "    # Attach each user's median; the merge suffixes the two same-named columns\n",
+ "    # with _x (row value) and _y (user median).\n",
+ "    user_median = df.groupby(\"userID\")[\"userID_acc_rolling\"].median()\n",
+ "    df = pd.merge(df, user_median, on=[\"userID\"], how=\"left\")\n",
+ "\n",
+ "    # Fill the undefined leading rows with the median in one vectorized step\n",
+ "    # (no string sentinel, no row-by-row apply).\n",
+ "    df[\"userID_acc_rolling_\" + str(window)] = df[\"userID_acc_rolling_x\"].fillna(df[\"userID_acc_rolling_y\"])\n",
+ "    df.drop([\"userID_acc_rolling_x\", \"userID_acc_rolling_y\"], axis=1, inplace=True)\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def feature_dimension_reduction(df, kind=\"lda\"):\n",
+ "    \"\"\"Reduce each (count, correct, accuracy) triple to one component.\n",
+ "\n",
+ "    For every feature group a column ``<group>_<kind>`` is added, holding the\n",
+ "    single component produced by the chosen model; ``all_data_<kind>`` is then\n",
+ "    computed from those eight new columns. ``answercode`` is passed as ``y`` to\n",
+ "    every ``fit_transform`` call.\n",
+ "\n",
+ "    kind: one of \"lda\", \"pca\", \"kpca\", \"kpca_rbf\", \"kpca_poly\", \"svd\".\n",
+ "        Any other value returns df without adding reduced columns.\n",
+ "    \"\"\"\n",
+ "    # Make sure every statistics column used below exists.\n",
+ "    prerequisites = [\n",
+ "        (\"assessmentItemID_total_answer\", assessmentItemID_relative),\n",
+ "        (\"KnowledgeTag_total_answer\", KnowledgeTag_relative),\n",
+ "        (\"question_class_correct_answer\", question_class_relative),\n",
+ "        (\"userID_question_class_correct_answer\", userID_question_class_relative),\n",
+ "        (\"userID_KnowledgeTag_total_answer\", userID_KnowledgeTag_relative),\n",
+ "        (\"question_num_correct_answer\", question_num_relative),\n",
+ "        (\"userID_question_num_correct_answer\", userID_question_num_relative),\n",
+ "        (\"userID_correct_answer\", userID_relative),\n",
+ "    ]\n",
+ "    for required_column, builder in prerequisites:\n",
+ "        if required_column not in df.columns:\n",
+ "            df = builder(df)\n",
+ "\n",
+ "    model_factories = {\n",
+ "        \"lda\": lambda: LDA(n_components=1),\n",
+ "        \"pca\": lambda: PCA(n_components=1),\n",
+ "        \"kpca\": lambda: KernelPCA(n_components=1),\n",
+ "        \"kpca_rbf\": lambda: KernelPCA(n_components=1, kernel=\"rbf\"),\n",
+ "        \"kpca_poly\": lambda: KernelPCA(n_components=1, kernel=\"poly\"),\n",
+ "        \"svd\": lambda: TruncatedSVD(n_components=1),\n",
+ "    }\n",
+ "    if kind not in model_factories:\n",
+ "        return df\n",
+ "    model = model_factories[kind]()\n",
+ "\n",
+ "    y = df[\"answercode\"]\n",
+ "\n",
+ "    # (output prefix, input columns); column order inside each group is kept as is.\n",
+ "    feature_groups = [\n",
+ "        (\"KnowledgeTag\", [\"KnowledgeTag_total_answer\", \"KnowledgeTag_correct_answer\", \"KnowledgeTag_acc\"]),\n",
+ "        (\"userID_KnowledgeTag\", [\"userID_KnowledgeTag_total_answer\", \"userID_KnowledgeTag_correct_answer\", \"userID_KnowledgeTag_acc\"]),\n",
+ "        (\"assessmentItemID\", [\"assessmentItemID_total_answer\", \"assessmentItemID_correct_answer\", \"assessmentItemID_acc\"]),\n",
+ "        (\"question_class\", [\"question_class_correct_answer\", \"question_class_total_answer\", \"question_class_acc\"]),\n",
+ "        (\"userID_question_class\", [\"userID_question_class_correct_answer\", \"userID_question_class_total_answer\", \"userID_question_class_acc\"]),\n",
+ "        (\"question_num\", [\"question_num_correct_answer\", \"question_num_total_answer\", \"question_num_acc\"]),\n",
+ "        (\"userID_question_num\", [\"userID_question_num_correct_answer\", \"userID_question_num_total_answer\", \"userID_question_num_acc\"]),\n",
+ "        (\"userID\", [\"userID_correct_answer\", \"userID_total_answer\", \"userID_acc\"]),\n",
+ "    ]\n",
+ "    reduced_columns = []\n",
+ "    for prefix, columns in feature_groups:\n",
+ "        target = prefix + \"_\" + kind\n",
+ "        # The same estimator object is refitted for every group.\n",
+ "        df[target] = model.fit_transform(df[columns].fillna(0), y)\n",
+ "        reduced_columns.append(target)\n",
+ "\n",
+ "    # Combine the eight reduced columns by name; taking the last eight columns by\n",
+ "    # position picks the wrong ones when the targets already existed in df.\n",
+ "    df[\"all_data_\" + kind] = model.fit_transform(df[reduced_columns], y)\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_elapsed_median(df, max_time=600):\n",
+ "    \"\"\"Add ``userID_elapsed_median``: seconds until the same user's next row.\n",
+ "\n",
+ "    Gaps of ``max_time`` seconds or more, and each user's last row (which has no\n",
+ "    next row), are replaced by that user's median of the remaining gaps.\n",
+ "    Returns a new frame (the result of a merge), sorted by userID and Timestamp.\n",
+ "    \"\"\"\n",
+ "    # Chronological order inside each user.\n",
+ "    df.sort_values(by=[\"userID\", \"Timestamp\"], inplace=True)\n",
+ "\n",
+ "    # diff() gives the gap to the previous row of the same user; shifting up by one\n",
+ "    # turns it into the gap to the next row and leaves each user's last row empty.\n",
+ "    diff = df.loc[:, [\"userID\", \"Timestamp\"]].groupby(\"userID\").diff().shift(-1)\n",
+ "    seconds = diff[\"Timestamp\"].dt.total_seconds()\n",
+ "    # Keep only gaps shorter than max_time; everything else becomes missing.\n",
+ "    df[\"userID_elapsed_median\"] = seconds.where(seconds < max_time)\n",
+ "\n",
+ "    # Attach each user's median gap; the merge suffixes the two same-named columns\n",
+ "    # with _x (row value) and _y (user median).\n",
+ "    user_median = df.groupby(\"userID\")[\"userID_elapsed_median\"].median()\n",
+ "    df = pd.merge(df, user_median, on=[\"userID\"], how=\"left\")\n",
+ "\n",
+ "    # Fill the missing gaps with the median in one vectorized step\n",
+ "    # (no string sentinel, no row-by-row apply).\n",
+ "    df[\"userID_elapsed_median\"] = df[\"userID_elapsed_median_x\"].fillna(df[\"userID_elapsed_median_y\"])\n",
+ "    df.drop([\"userID_elapsed_median_x\", \"userID_elapsed_median_y\"], axis=1, inplace=True)\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def userID_elapsed_median_rolling(df, window=10):\n",
+ "    \"\"\"Add ``userID_elapsed_median_rolling_<window>``: per-user rolling mean of ``userID_elapsed_median``.\n",
+ "\n",
+ "    The rolling mean is undefined for the first ``window - 1`` rows of each user;\n",
+ "    those rows receive the user's median of the defined rolling values instead.\n",
+ "    ``userID_elapsed_median`` is computed first when it is missing. Returns a new\n",
+ "    frame (the result of a merge), sorted by userID and Timestamp.\n",
+ "    \"\"\"\n",
+ "    # The rolling mean needs the per-row elapsed time.\n",
+ "    if 'userID_elapsed_median' not in df.columns:\n",
+ "        df = userID_elapsed_median(df)\n",
+ "    # Chronological order inside each user.\n",
+ "    df.sort_values(by=[\"userID\", \"Timestamp\"], inplace=True)\n",
+ "\n",
+ "    # The grouped result comes back ordered by user and then by row order, which\n",
+ "    # matches df after the sort above, so the raw values can be assigned directly.\n",
+ "    df['userID_elapsed_median_rolling'] = df.groupby(['userID'])['userID_elapsed_median'].rolling(window).mean().values\n",
+ "\n",
+ "    # Attach each user's median; the merge suffixes the two same-named columns\n",
+ "    # with _x (row value) and _y (user median).\n",
+ "    user_median = df.groupby('userID')['userID_elapsed_median_rolling'].median()\n",
+ "    df = pd.merge(df, user_median, on=[\"userID\"], how=\"left\")\n",
+ "\n",
+ "    # Fill the undefined leading rows with the median in one vectorized step\n",
+ "    # (no string sentinel, no row-by-row apply).\n",
+ "    df['userID_elapsed_median_rolling_' + str(window)] = df['userID_elapsed_median_rolling_x'].fillna(df['userID_elapsed_median_rolling_y'])\n",
+ "    df.drop(['userID_elapsed_median_rolling_x', 'userID_elapsed_median_rolling_y'], axis=1, inplace=True)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def assessmentItemID_time_relative(df):\n",
+ "    \"\"\"Add the per-item median and mean of ``userID_elapsed_median``.\n",
+ "\n",
+ "    Creates ``assessmentItemID_time_median`` and ``assessmentItemID_time_mean`` by\n",
+ "    aggregating over all rows of each assessmentItemID and mapping the result back\n",
+ "    onto every row. ``userID_elapsed_median`` is computed first when it is missing.\n",
+ "    \"\"\"\n",
+ "    # The aggregation needs the per-row elapsed time.\n",
+ "    if 'userID_elapsed_median' not in df.columns:\n",
+ "        df = userID_elapsed_median(df)\n",
+ "    snapshot = df.copy()\n",
+ "    # {'median': {item: value, ...}, 'mean': {item: value, ...}}\n",
+ "    stats_by_item = snapshot.groupby('assessmentItemID')['userID_elapsed_median'].agg(['median', 'mean']).to_dict()\n",
+ "    # Look each row's item up in the per-item tables.\n",
+ "    for stat in ('median', 'mean'):\n",
+ "        df['assessmentItemID_time_' + stat] = snapshot['assessmentItemID'].map(stats_by_item[stat])\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def assessmentItemID_elapsed_median(df, max_time=600):\n",
+ "    \"\"\"Add ``assessmentItemID_elapsed_median``: seconds until the next row of the same item.\n",
+ "\n",
+ "    Rows are ordered by assessmentItemID and Timestamp. Gaps of ``max_time``\n",
+ "    seconds or more, and each item's last row (which has no next row), are\n",
+ "    replaced by that item's median of the remaining gaps. Returns a new frame\n",
+ "    (the result of a merge).\n",
+ "    \"\"\"\n",
+ "    # Chronological order inside each item.\n",
+ "    df.sort_values(by=[\"assessmentItemID\", \"Timestamp\"], inplace=True)\n",
+ "\n",
+ "    # diff() gives the gap to the previous row of the same item; shifting up by one\n",
+ "    # turns it into the gap to the next row and leaves each item's last row empty.\n",
+ "    diff = df.loc[:, [\"assessmentItemID\", \"Timestamp\"]].groupby(\"assessmentItemID\").diff().shift(-1)\n",
+ "    seconds = diff[\"Timestamp\"].dt.total_seconds()\n",
+ "    # Keep only gaps shorter than max_time; everything else becomes missing.\n",
+ "    df[\"assessmentItemID_elapsed_median\"] = seconds.where(seconds < max_time)\n",
+ "\n",
+ "    # Attach each item's median gap; the merge suffixes the two same-named columns\n",
+ "    # with _x (row value) and _y (item median).\n",
+ "    item_median = df.groupby(\"assessmentItemID\")[\"assessmentItemID_elapsed_median\"].median()\n",
+ "    df = pd.merge(df, item_median, on=[\"assessmentItemID\"], how=\"left\")\n",
+ "\n",
+ "    # Fill the missing gaps with the median in one vectorized step\n",
+ "    # (no string sentinel, no row-by-row apply).\n",
+ "    df[\"assessmentItemID_elapsed_median\"] = df[\"assessmentItemID_elapsed_median_x\"].fillna(df[\"assessmentItemID_elapsed_median_y\"])\n",
+ "    df.drop([\"assessmentItemID_elapsed_median_x\", \"assessmentItemID_elapsed_median_y\"], axis=1, inplace=True)\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "def assessmentItemID_elapsed_median_rolling(df, window=10):\n",
+ "    \"\"\"Add ``assessmentItemID_elapsed_median_rolling_<window>``: per-item rolling mean of ``assessmentItemID_elapsed_median``.\n",
+ "\n",
+ "    The rolling mean is undefined for the first ``window - 1`` rows of each item;\n",
+ "    those rows receive the item's median of the defined rolling values instead.\n",
+ "    ``assessmentItemID_elapsed_median`` is computed first when it is missing.\n",
+ "    Returns a new frame (the result of a merge).\n",
+ "    \"\"\"\n",
+ "    # The rolling mean needs the per-row elapsed time.\n",
+ "    if 'assessmentItemID_elapsed_median' not in df.columns:\n",
+ "        df = assessmentItemID_elapsed_median(df)\n",
+ "    # Chronological order inside each item.\n",
+ "    df.sort_values(by=[\"assessmentItemID\", \"Timestamp\"], inplace=True)\n",
+ "\n",
+ "    # The raw grouped values are assigned positionally, relying on the sort above\n",
+ "    # to put df in the same order as the grouped result.\n",
+ "    df['assessmentItemID_elapsed_median_rolling'] = df.groupby(['assessmentItemID'])['assessmentItemID_elapsed_median'].rolling(window).mean().values\n",
+ "\n",
+ "    # Attach each item's median; the merge suffixes the two same-named columns\n",
+ "    # with _x (row value) and _y (item median).\n",
+ "    item_median = df.groupby('assessmentItemID')['assessmentItemID_elapsed_median_rolling'].median()\n",
+ "    df = pd.merge(df, item_median, on=[\"assessmentItemID\"], how=\"left\")\n",
+ "\n",
+ "    # Fill the undefined leading rows with the median in one vectorized step\n",
+ "    # (no string sentinel, no row-by-row apply).\n",
+ "    df['assessmentItemID_elapsed_median_rolling_' + str(window)] = df['assessmentItemID_elapsed_median_rolling_x'].fillna(df['assessmentItemID_elapsed_median_rolling_y'])\n",
+ "    df.drop(['assessmentItemID_elapsed_median_rolling_x', 'assessmentItemID_elapsed_median_rolling_y'], axis=1, inplace=True)\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "# User가 해당 문제를 풀어본 경험 Feature\n",
+ "def userID_assessmentItemID_experience(df):\n",
+ "    \"\"\"Flag rows where the user has an earlier row for the same assessmentItemID.\n",
+ "\n",
+ "    Returns a copy of df sorted by userID and Timestamp with a fresh index and a\n",
+ "    0/1 column ``userID_assessmentItemID_experience``.\n",
+ "    \"\"\"\n",
+ "    # Chronological order inside each user.\n",
+ "    df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)\n",
+ "\n",
+ "    # Number of earlier rows with the same (user, item) pair ...\n",
+ "    earlier_attempts = df.groupby([\"userID\", \"assessmentItemID\"])['assessmentItemID'].cumcount()\n",
+ "    df[\"userID_assessmentItemID_experience\"] = earlier_attempts\n",
+ "    # ... collapsed to a yes/no flag.\n",
+ "    df['userID_assessmentItemID_experience'] = (df['userID_assessmentItemID_experience'] > 0).astype('int64')\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "# User가 해당 test를 풀어본 경험 Feature\n",
+ "def userID_testid_experience(df):\n",
+ "    \"\"\"Flag rows where the user has an earlier row for the same testId.\n",
+ "\n",
+ "    Returns a copy of df sorted by userID and Timestamp with a fresh index and a\n",
+ "    0/1 column ``userID_testid_experience``.\n",
+ "    \"\"\"\n",
+ "    # Chronological order inside each user.\n",
+ "    df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)\n",
+ "\n",
+ "    # Number of earlier rows with the same (user, test) pair ...\n",
+ "    earlier_rows = df.groupby([\"userID\", \"testId\"])['testId'].cumcount()\n",
+ "    df[\"userID_testid_experience\"] = earlier_rows\n",
+ "    # ... collapsed to a yes/no flag.\n",
+ "    df['userID_testid_experience'] = (df['userID_testid_experience'] > 0).astype('int64')\n",
+ "    return df\n",
+ " \n",
+ "\n",
+ "def feature_engineering(df):\n",
+ "    \"\"\"Run the feature pipeline on df and return the resulting frame.\n",
+ "\n",
+ "    Each step prints its label and then replaces df with the step's return value.\n",
+ "    \"\"\"\n",
+ "    # (label to print, step) pairs, executed in this order.\n",
+ "    cumulative_steps = [\n",
+ "        (\"assessmentItemID 관련 feature\", assessmentItemID),\n",
+ "        (\"KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\", KnowledgeTag_relative),\n",
+ "        (\"userID, KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\", userID_KnowledgeTag_relative),\n",
+ "        (\"assessmentItemID별 누적 풀이 수, 정답 수, 정답률\", assessmentItemID_relative),\n",
+ "        (\"question class별 누적 풀이 수, 정답 수, 정답률\", question_class_relative),\n",
+ "        (\"userID_question_class별 누적 풀이 수, 정답 수, 정답률\", userID_question_class_relative),\n",
+ "        (\"question num별 누적 풀이 수, 정답 수, 정답률\", question_num_relative),\n",
+ "        (\"userID_question_num별 누적 풀이 수, 정답 수, 정답률\", userID_question_num_relative),\n",
+ "        (\"user 별 누적 풀이 수, 정답 수, 정답률\", userID_relative),\n",
+ "    ]\n",
+ "    for label, step in cumulative_steps:\n",
+ "        print(label)\n",
+ "        df = step(df)\n",
+ "\n",
+ "    # NOTE(review): the label printed here mentions the rolling mean of user_acc,\n",
+ "    # but the step that runs is the rolling mean of the elapsed time - confirm intent.\n",
+ "    print(\"userID별 정답률(user_acc)의 이동 평균 및 중앙값\")\n",
+ "    for window in [5]:\n",
+ "        print(window)\n",
+ "        df = userID_elapsed_median_rolling(df, window=window)\n",
+ "\n",
+ "    print(\"feature_dimension_reduction\")\n",
+ "    for kind in [\"lda\"]:\n",
+ "        print(kind)\n",
+ "        df = feature_dimension_reduction(df, kind=kind)\n",
+ "\n",
+ "    experience_steps = [\n",
+ "        (\"User가 해당 문제를 풀어본 경험 Feature\", userID_assessmentItemID_experience),\n",
+ "        (\"User가 해당 test를 풀어본 경험 Feature\", userID_testid_experience),\n",
+ "    ]\n",
+ "    for label, step in experience_steps:\n",
+ "        print(label)\n",
+ "        df = step(df)\n",
+ "\n",
+ "    return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "361973a7-1012-42e3-a373-aecf34fda7cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "import pickle\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "import random\n",
+ "\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "import random\n",
+ "import warnings\n",
+ "import lightgbm as lgb\n",
+ "from wandb.lightgbm import wandb_callback\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "import numpy as np\n",
+ "import random\n",
+ "from matplotlib import pylab as plt\n",
+ "from datetime import datetime\n",
+ "import wandb\n",
+ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n",
+ "\n",
+ "%matplotlib inline\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e025187d-7857-457e-8ae8-74d9e514e83c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2266586, 6)\n",
+ "(260114, 6)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the raw train and test logs (original setup).\n",
+ "data_dir = '/opt/ml/input/data/train_dataset'\n",
+ "train_csv_file_path = os.path.join(data_dir, 'train_data.csv')\n",
+ "test_csv_file_path = os.path.join(data_dir, 'test_data.csv')\n",
+ "\n",
+ "train_df = pd.read_csv(train_csv_file_path, parse_dates=['Timestamp'])\n",
+ "print(train_df.shape)\n",
+ "\n",
+ "# All test rows are kept here, including the ones with answerCode == -1.\n",
+ "test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])\n",
+ "print(test_df.shape)\n",
+ "\n",
+ "# Previously produced predictions; userID is attached from the last row of each\n",
+ "# run of identical userID values in test_df.\n",
+ "ikyo = pd.read_csv(\"/opt/ml/code/output/cross_validation/output_8253_1.csv\")\n",
+ "is_last_row_of_user = test_df[\"userID\"] != test_df['userID'].shift(-1)\n",
+ "ikyo[\"userID\"] = test_df[is_last_row_of_user].reset_index()[\"userID\"]\n",
+ "\n",
+ "# Train and test stacked into one frame; answercode starts as a copy of answerCode.\n",
+ "df = pd.concat([train_df, test_df], ignore_index=True)\n",
+ "df[\"answercode\"] = df[\"answerCode\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "02dbf323-0c49-408d-a0b6-7b34cecc767d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pseudo-label the rows still to be predicted (answercode == -1) with the earlier\n",
+ "# model's predictions, then binarize the whole column at 0.5.\n",
+ "to_predict = df[\"answercode\"] == -1\n",
+ "# The column is about to hold probabilities, so make it float explicitly.\n",
+ "df[\"answercode\"] = df[\"answercode\"].astype(float)\n",
+ "# Address the column by name and pass raw values, so the assignment depends neither\n",
+ "# on answercode being the last column nor on how the prediction index lines up\n",
+ "# with df's row labels; a length mismatch raises instead of passing silently.\n",
+ "df.loc[to_predict, \"answercode\"] = ikyo[\"prediction\"].to_numpy()\n",
+ "\n",
+ "df[\"answercode\"] = (df[\"answercode\"] >= 0.5).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "365e3097-5b0d-42b0-ac89-c533c36ccc40",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "assessmentItemID 관련 feature\n",
+ "KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID, KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "assessmentItemID별 누적 풀이 수, 정답 수, 정답률\n",
+ "question class별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_class별 누적 풀이 수, 정답 수, 정답률\n",
+ "question num별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_num별 누적 풀이 수, 정답 수, 정답률\n",
+ "user 별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID별 정답률(user_acc)의 이동 평균 및 중앙값\n",
+ "5\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2526700/2526700 [00:25<00:00, 101013.30it/s]\n",
+ "100%|██████████| 2526700/2526700 [00:52<00:00, 48185.37it/s]\n",
+ "100%|██████████| 2526700/2526700 [00:52<00:00, 48056.70it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "feature_dimension_reduction\n",
+ "lda\n",
+ "User가 해당 문제를 풀어본 경험 Feature\n",
+ "User가 해당 test를 풀어본 경험 Feature\n",
+ "CPU times: user 5min 27s, sys: 35.2 s, total: 6min 2s\n",
+ "Wall time: 5min 54s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " answercode | \n",
+ " question_num | \n",
+ " question_class | \n",
+ " KnowledgeTag_total_answer | \n",
+ " ... | \n",
+ " userID_KnowledgeTag_lda | \n",
+ " assessmentItemID_lda | \n",
+ " question_class_lda | \n",
+ " userID_question_class_lda | \n",
+ " question_num_lda | \n",
+ " userID_question_num_lda | \n",
+ " userID_lda | \n",
+ " all_data_lda | \n",
+ " userID_assessmentItemID_experience | \n",
+ " userID_testid_experience | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " A060001001 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:11 | \n",
+ " 7224 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 365 | \n",
+ " ... | \n",
+ " 0.898323 | \n",
+ " -1.767781 | \n",
+ " -1.035837 | \n",
+ " 2.586445 | \n",
+ " -1.474883 | \n",
+ " 1.89388 | \n",
+ " 3.207732 | \n",
+ " 0.922117 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A060001002 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:14 | \n",
+ " 7225 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 1743 | \n",
+ " ... | \n",
+ " 0.898323 | \n",
+ " -1.666476 | \n",
+ " -1.035911 | \n",
+ " -1.372008 | \n",
+ " -1.063197 | \n",
+ " 1.89388 | \n",
+ " -1.728488 | \n",
+ " -1.931817 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 46 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
+ "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
+ "\n",
+ " KnowledgeTag answercode question_num question_class \\\n",
+ "0 7224 1 1 6 \n",
+ "1 7225 1 2 6 \n",
+ "\n",
+ " KnowledgeTag_total_answer ... userID_KnowledgeTag_lda \\\n",
+ "0 365 ... 0.898323 \n",
+ "1 1743 ... 0.898323 \n",
+ "\n",
+ " assessmentItemID_lda question_class_lda userID_question_class_lda \\\n",
+ "0 -1.767781 -1.035837 2.586445 \n",
+ "1 -1.666476 -1.035911 -1.372008 \n",
+ "\n",
+ " question_num_lda userID_question_num_lda userID_lda all_data_lda \\\n",
+ "0 -1.474883 1.89388 3.207732 0.922117 \n",
+ "1 -1.063197 1.89388 -1.728488 -1.931817 \n",
+ "\n",
+ " userID_assessmentItemID_experience userID_testid_experience \n",
+ "0 0 0 \n",
+ "1 0 1 \n",
+ "\n",
+ "[2 rows x 46 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "test = feature_engineering(df)\n",
+ "test.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "16521383-1ef6-4a99-a717-93f200c1f4a9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "kpca\n",
+ "kpca_rbf\n",
+ "kpca_poly\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Keep only the rows to be predicted. Take an explicit copy: the reduction step\n",
+ "# below adds columns, which should not happen on a filtered slice of the full frame.\n",
+ "test = test[test[\"answerCode\"] == -1].copy()\n",
+ "\n",
+ "# Kernel-PCA variants are fitted on this subset only.\n",
+ "dimension_reduction_type = [\"kpca\", \"kpca_rbf\", \"kpca_poly\"]\n",
+ "for kind in dimension_reduction_type:\n",
+ "    print(kind)\n",
+ "    test = feature_dimension_reduction(test, kind=kind)\n",
+ "\n",
+ "def type_change(df):\n",
+ "    \"\"\"Cast userID, testId, question_class and KnowledgeTag to ``category`` in place and return df.\"\"\"\n",
+ "    for column in (\"userID\", \"testId\", \"question_class\", \"KnowledgeTag\"):\n",
+ "        df[column] = df[column].astype(\"category\")\n",
+ "    return df\n",
+ "\n",
+ "test = type_change(test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "d4a0ea6a-4c2f-48c3-a2b7-4b076ff8564d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2266586, 6)\n",
+ "(259370, 6)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the raw train and test logs (original setup).\n",
+ "data_dir = '/opt/ml/input/data/train_dataset'\n",
+ "train_csv_file_path = os.path.join(data_dir, 'train_data.csv')\n",
+ "test_csv_file_path = os.path.join(data_dir, 'test_data.csv')\n",
+ "\n",
+ "train_df = pd.read_csv(train_csv_file_path, parse_dates=['Timestamp'])\n",
+ "print(train_df.shape)\n",
+ "\n",
+ "# This time only test rows with a known answer (answerCode > -1) are kept.\n",
+ "test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])\n",
+ "has_known_answer = test_df[\"answerCode\"] > -1\n",
+ "test_df = test_df[has_known_answer]\n",
+ "print(test_df.shape)\n",
+ "\n",
+ "# Train and test stacked into one frame; answercode starts as a copy of answerCode.\n",
+ "df = pd.concat([train_df, test_df], ignore_index=True)\n",
+ "df[\"answercode\"] = df[\"answerCode\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "b7862dbd-83c3-4d1a-a488-89f8096f633c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "assessmentItemID 관련 feature\n",
+ "KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID, KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "assessmentItemID별 누적 풀이 수, 정답 수, 정답률\n",
+ "question class별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_class별 누적 풀이 수, 정답 수, 정답률\n",
+ "question num별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_num별 누적 풀이 수, 정답 수, 정답률\n",
+ "user 별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID별 정답률(user_acc)의 이동 평균 및 중앙값\n",
+ "5\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:24<00:00, 101797.44it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:53<00:00, 47070.07it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:52<00:00, 47690.07it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "feature_dimension_reduction\n",
+ "lda\n",
+ "User가 해당 문제를 풀어본 경험 Feature\n",
+ "User가 해당 test를 풀어본 경험 Feature\n",
+ "CPU times: user 5min 29s, sys: 34.6 s, total: 6min 3s\n",
+ "Wall time: 5min 55s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " answercode | \n",
+ " question_num | \n",
+ " question_class | \n",
+ " KnowledgeTag_total_answer | \n",
+ " ... | \n",
+ " userID_KnowledgeTag_lda | \n",
+ " assessmentItemID_lda | \n",
+ " question_class_lda | \n",
+ " userID_question_class_lda | \n",
+ " question_num_lda | \n",
+ " userID_question_num_lda | \n",
+ " userID_lda | \n",
+ " all_data_lda | \n",
+ " userID_assessmentItemID_experience | \n",
+ " userID_testid_experience | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " A060001001 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:11 | \n",
+ " 7224 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 365 | \n",
+ " ... | \n",
+ " 1.833508 | \n",
+ " -1.767737 | \n",
+ " -1.035569 | \n",
+ " 2.586450 | \n",
+ " -1.474702 | \n",
+ " 1.894006 | \n",
+ " 3.208150 | \n",
+ " 0.935909 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A060001002 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:14 | \n",
+ " 7225 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 1743 | \n",
+ " ... | \n",
+ " 1.833508 | \n",
+ " -1.666415 | \n",
+ " -1.035643 | \n",
+ " -1.371945 | \n",
+ " -1.062984 | \n",
+ " 1.894006 | \n",
+ " -1.728447 | \n",
+ " -1.917765 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 46 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
+ "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
+ "\n",
+ " KnowledgeTag answercode question_num question_class \\\n",
+ "0 7224 1 1 6 \n",
+ "1 7225 1 2 6 \n",
+ "\n",
+ " KnowledgeTag_total_answer ... userID_KnowledgeTag_lda \\\n",
+ "0 365 ... 1.833508 \n",
+ "1 1743 ... 1.833508 \n",
+ "\n",
+ " assessmentItemID_lda question_class_lda userID_question_class_lda \\\n",
+ "0 -1.767737 -1.035569 2.586450 \n",
+ "1 -1.666415 -1.035643 -1.371945 \n",
+ "\n",
+ " question_num_lda userID_question_num_lda userID_lda all_data_lda \\\n",
+ "0 -1.474702 1.894006 3.208150 0.935909 \n",
+ "1 -1.062984 1.894006 -1.728447 -1.917765 \n",
+ "\n",
+ " userID_assessmentItemID_experience userID_testid_experience \n",
+ "0 0 0 \n",
+ "1 0 1 \n",
+ "\n",
+ "[2 rows x 46 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "df = feature_engineering(df)\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "f5782c50-4c8f-465d-bfce-c0b4bb045363",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the rows whose userID differs from the next row's userID,\n",
+ "# i.e. the last row of each consecutive userID run. reset_index() keeps\n",
+ "# the previous index as an \"index\" column.\n",
+ "next_user = df['userID'].shift(-1)\n",
+ "df = df[df[\"userID\"] != next_user].reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9a5468b7-d36c-4d32-b814-8b69b8385e78",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "kpca\n",
+ "kpca_rbf\n",
+ "kpca_poly\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run feature_dimension_reduction once per reduction kind, printing each kind first.\n",
+ "dimension_reduction_type = [\"kpca\", \"kpca_rbf\", \"kpca_poly\"]\n",
+ "for kind in dimension_reduction_type:\n",
+ "    print(kind)\n",
+ "    df = feature_dimension_reduction(df, kind=kind)\n",
+ "\n",
+ "\n",
+ "def type_change(df):\n",
+ "    \"\"\"Cast the ID-like columns of ``df`` to the pandas ``category`` dtype.\n",
+ "\n",
+ "    The columns are reassigned on the frame that is passed in, and the\n",
+ "    same frame is returned.\n",
+ "    \"\"\"\n",
+ "    for column in (\"userID\", \"testId\", \"question_class\", \"KnowledgeTag\"):\n",
+ "        df[column] = df[column].astype(\"category\")\n",
+ "    return df\n",
+ "\n",
+ "\n",
+ "df = type_change(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "2a22d996-a7cf-4f54-a2c6-916a25932794",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pickle\n",
+ "\n",
+ "# Write both frames to disk, then read them straight back.\n",
+ "# NOTE(review): `test` is not assigned in the cells visible above - confirm it\n",
+ "# is created by an earlier cell before running this one.\n",
+ "train_pickle_path = 'train_data.pickle'\n",
+ "test_pickle_path = 'test_data.pickle'\n",
+ "\n",
+ "# save\n",
+ "with open(train_pickle_path, 'wb') as f:\n",
+ "    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)\n",
+ "with open(test_pickle_path, 'wb') as f:\n",
+ "    pickle.dump(test, f, pickle.HIGHEST_PROTOCOL)\n",
+ "\n",
+ "# load\n",
+ "with open(train_pickle_path, 'rb') as f:\n",
+ "    df = pickle.load(f)\n",
+ "with open(test_pickle_path, 'rb') as f:\n",
+ "    test = pickle.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "be9bb2d7-a276-4773-b5bd-af3e1d3ac3e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "7438\n",
+ "744\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "7439"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 종헌\n",
+ "# Jongheon (pjh): validation and test probability files\n",
+ "pjh_columns = {\"userid\": \"userID\", \"prediction\": \"pjh_pred\"}\n",
+ "train_df = pd.read_csv(\"/opt/ml/code/output/oof/pjh_valid_proba.csv\").rename(columns=pjh_columns)\n",
+ "test_df = pd.read_csv(\"/opt/ml/code/output/oof/pjh_test_proba.csv\").rename(columns=pjh_columns)\n",
+ "\n",
+ "# Left-join the feature frames onto the prediction rows by userID.\n",
+ "train = train_df.merge(df, on=\"userID\", how=\"left\")\n",
+ "test = test_df.merge(test, on=\"userID\", how=\"left\")\n",
+ "\n",
+ "# Distinct userIDs in the train file, in the test file, and in their union.\n",
+ "train_users = set(train_df[\"userID\"].unique())\n",
+ "test_users = set(test_df[\"userID\"].unique())\n",
+ "print(len(train_users))\n",
+ "print(len(test_users))\n",
+ "len(train_users | test_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "b88ce8a7-aa0c-40ca-97d3-b76f8a1de049",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "7442\n",
+ "744\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "7442"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 익효\n",
+ "# Ikhyo (jik): stacking predictions for train and test\n",
+ "train_df = pd.read_csv(\"/opt/ml/code/output/oof/stacking_jih.csv\")\n",
+ "train_df = train_df.rename(columns={\"id\": \"userID\", \"pred\": \"jik_pred\"})\n",
+ "test_df = pd.read_csv(\"/opt/ml/code/output/oof/test.csv\")\n",
+ "test_df = test_df.rename(columns={\"id\": \"userID\", \"prediction\": \"jik_pred\"})\n",
+ "\n",
+ "# Right join: every row already in `train` is kept.\n",
+ "train = train_df.merge(train, on=\"userID\", how=\"right\")\n",
+ "# NOTE(review): concat(axis=1) aligns rows on the index, not on userID, and it\n",
+ "# adds a second \"userID\" column to `test` - confirm both files share row order.\n",
+ "test = pd.concat([test, test_df], axis=1)\n",
+ "\n",
+ "# Distinct userIDs in the train file, in the test file, and in their union.\n",
+ "train_users = set(train_df[\"userID\"].unique())\n",
+ "test_users = set(test_df[\"userID\"].unique())\n",
+ "print(len(train_users))\n",
+ "print(len(test_users))\n",
+ "len(train_users | test_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "e0a7b524-ef54-454c-bdce-8d8a003ce59c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 태양\n",
+ "# Taeyang (sun): five pickled prediction files plus a test CSV\n",
+ "import pickle\n",
+ "\n",
+ "fold_frames = []\n",
+ "for i in range(5):\n",
+ "    with open('/opt/ml/code/output/oof/7975/oof_sun_' + str(i) + '.pickle', 'rb') as f:\n",
+ "        a = pickle.load(f)\n",
+ "    # a[0] becomes the index (renamed to userID), a[1] the values (renamed to sun_pred).\n",
+ "    fold_frame = pd.DataFrame(a[1], a[0]).reset_index()\n",
+ "    fold_frames.append(fold_frame.rename(columns = {\"index\": \"userID\", 0: \"sun_pred\"}))\n",
+ "\n",
+ "# Stack the per-file frames in file order.\n",
+ "result = pd.concat(fold_frames)\n",
+ "\n",
+ "test_df = pd.read_csv(\"/opt/ml/code/output/oof/sun_test.csv\").rename(columns={\"id\": \"userID\", \"prediction\": \"sun_pred\"})\n",
+ "\n",
+ "train = train.merge(result, on=\"userID\", how=\"left\")\n",
+ "test = pd.concat([test, test_df], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "98ca0434-b228-40a9-97d4-3309de7f28c8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "7438\n",
+ "744\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "7438"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 재희\n",
+ "# Jaehee (rjh): LGBM validation and test probability files\n",
+ "rjh_columns = {\"id\": \"userID\", \"prediction\": \"rjh_pred\"}\n",
+ "train_df = pd.read_csv(\"/opt/ml/code/output/oof/LGBM_8073_valid_proba.csv\").rename(columns=rjh_columns)\n",
+ "test_df = pd.read_csv(\"/opt/ml/code/output/oof/LGBM_8073_test_proba.csv\").rename(columns=rjh_columns)\n",
+ "\n",
+ "# Left join: the result keeps the rows of this prediction file.\n",
+ "train = train_df.merge(train, on=\"userID\", how=\"left\")\n",
+ "# Column-wise concat; rows are matched on the index.\n",
+ "test = pd.concat([test, test_df], axis=1)\n",
+ "\n",
+ "# Distinct userIDs in the train file, in the test file, and in their union.\n",
+ "train_users = set(train_df[\"userID\"].unique())\n",
+ "test_users = set(test_df[\"userID\"].unique())\n",
+ "print(len(train_users))\n",
+ "print(len(test_users))\n",
+ "len(train_users | test_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "afcd91e8-da50-4f72-91e8-8388b2f57f0b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "7442\n",
+ "744\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "7442"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 수지\n",
+ "# Suji (osj): validation and test probability files\n",
+ "osj_columns = {\"id\": \"userID\", \"prediction\": \"osj_pred\"}\n",
+ "train_df = pd.read_csv(\"/opt/ml/code/output/oof/suz_0611_valid_proba.csv\").rename(columns=osj_columns)\n",
+ "test_df = pd.read_csv(\"/opt/ml/code/output/oof/suz_0611_test_proba.csv\").rename(columns=osj_columns)\n",
+ "\n",
+ "# Right join: every row already in `train` is kept.\n",
+ "train = train_df.merge(train, on=\"userID\", how=\"right\")\n",
+ "# Column-wise concat; rows are matched on the index.\n",
+ "test = pd.concat([test, test_df], axis=1)\n",
+ "\n",
+ "# Distinct userIDs in the train file, in the test file, and in their union.\n",
+ "train_users = set(train_df[\"userID\"].unique())\n",
+ "test_users = set(test_df[\"userID\"].unique())\n",
+ "print(len(train_users))\n",
+ "print(len(test_users))\n",
+ "len(train_users | test_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "474f051a-50a8-4dd1-846b-d2b6ef4fe43c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def add_sun_jik_blends(frame):\n",
+ "    \"\"\"Add the four sun/jik blend columns to ``frame`` and return it.\n",
+ "\n",
+ "    Every blend is computed from the frame's own ``sun_pred`` and\n",
+ "    ``jik_pred`` columns. The previous copy-pasted version built the two\n",
+ "    power blends of ``test`` from ``train[\"jik_pred\"]``, which mixed train\n",
+ "    predictions into the test features through index alignment.\n",
+ "    \"\"\"\n",
+ "    sun = frame[\"sun_pred\"]\n",
+ "    jik = frame[\"jik_pred\"]\n",
+ "    # Plain mean (row-wise mean over the two columns).\n",
+ "    frame[\"sun_jik_pred\"] = frame[[\"sun_pred\", \"jik_pred\"]].mean(axis=1)\n",
+ "    # Weighted mean with jik counted twice.\n",
+ "    frame[\"sun_jik_jik_pred\"] = (sun + jik * 2) / 3\n",
+ "    # Means of the predictions raised to the powers 4 and 0.25.\n",
+ "    frame[\"sun_jik_power_4_pred\"] = (sun ** 4 + jik ** 4) / 2\n",
+ "    frame[\"sun_jik_power_025_pred\"] = (sun ** 0.25 + jik ** 0.25) / 2\n",
+ "    return frame\n",
+ "\n",
+ "\n",
+ "train = add_sun_jik_blends(train)\n",
+ "test = add_sun_jik_blends(test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "bb8522ca-30ac-4625-b1fd-7a82cfcefd33",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "jik_pred\n",
+ "0.8312157477051288\n",
+ "pjh_pred\n",
+ "0.8135325617865522\n",
+ "sun_pred\n",
+ "0.798833670827161\n",
+ "rjh_pred\n",
+ "0.8342890834350734\n",
+ "osj_pred\n",
+ "0.8435961414556462\n",
+ "sun_jik_pred\n",
+ "0.8325791098846328\n",
+ "sun_jik_jik_pred\n",
+ "0.8348436616285727\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the ROC-AUC of each prediction column against answerCode;\n",
+ "# missing predictions are replaced by 0.5 before scoring.\n",
+ "columns = [\"jik_pred\", \"pjh_pred\", \"sun_pred\", \"rjh_pred\", \"osj_pred\", \"sun_jik_pred\", \"sun_jik_jik_pred\"]\n",
+ "\n",
+ "for column in columns:\n",
+ "    print(column)\n",
+ "    column_auc = roc_auc_score(train[\"answerCode\"], train[column].fillna(0.5))\n",
+ "    print(column_auc)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "840161bd-7513-4533-9678-1308734bd16b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['userID', 'osj_pred', 'Timestamp_x', 'rjh_pred', 'Unnamed: 0_x',\n",
+ " 'timestamp', 'jik_pred', 'next_userID', 'Unnamed: 0_y', 'pjh_pred',\n",
+ " 'index', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp_y',\n",
+ " 'KnowledgeTag', 'answercode', 'question_num', 'question_class',\n",
+ " 'KnowledgeTag_total_answer', 'KnowledgeTag_correct_answer',\n",
+ " 'KnowledgeTag_acc', 'userID_KnowledgeTag_total_answer',\n",
+ " 'userID_KnowledgeTag_correct_answer', 'userID_KnowledgeTag_acc',\n",
+ " 'assessmentItemID_total_answer', 'assessmentItemID_correct_answer',\n",
+ " 'assessmentItemID_acc', 'question_class_correct_answer',\n",
+ " 'question_class_total_answer', 'question_class_acc',\n",
+ " 'userID_question_class_correct_answer',\n",
+ " 'userID_question_class_total_answer', 'userID_question_class_acc',\n",
+ " 'question_num_correct_answer', 'question_num_total_answer',\n",
+ " 'question_num_acc', 'userID_question_num_correct_answer',\n",
+ " 'userID_question_num_total_answer', 'userID_question_num_acc',\n",
+ " 'userID_correct_answer', 'userID_total_answer', 'userID_acc',\n",
+ " 'userID_elapsed_median', 'userID_elapsed_median_rolling_5',\n",
+ " 'KnowledgeTag_lda', 'userID_KnowledgeTag_lda', 'assessmentItemID_lda',\n",
+ " 'question_class_lda', 'userID_question_class_lda', 'question_num_lda',\n",
+ " 'userID_question_num_lda', 'userID_lda', 'all_data_lda',\n",
+ " 'userID_assessmentItemID_experience', 'userID_testid_experience',\n",
+ " 'KnowledgeTag_kpca', 'userID_KnowledgeTag_kpca',\n",
+ " 'assessmentItemID_kpca', 'question_class_kpca',\n",
+ " 'userID_question_class_kpca', 'question_num_kpca',\n",
+ " 'userID_question_num_kpca', 'userID_kpca', 'all_data_kpca',\n",
+ " 'KnowledgeTag_kpca_rbf', 'userID_KnowledgeTag_kpca_rbf',\n",
+ " 'assessmentItemID_kpca_rbf', 'question_class_kpca_rbf',\n",
+ " 'userID_question_class_kpca_rbf', 'question_num_kpca_rbf',\n",
+ " 'userID_question_num_kpca_rbf', 'userID_kpca_rbf', 'all_data_kpca_rbf',\n",
+ " 'KnowledgeTag_kpca_poly', 'userID_KnowledgeTag_kpca_poly',\n",
+ " 'assessmentItemID_kpca_poly', 'question_class_kpca_poly',\n",
+ " 'userID_question_class_kpca_poly', 'question_num_kpca_poly',\n",
+ " 'userID_question_num_kpca_poly', 'userID_kpca_poly',\n",
+ " 'all_data_kpca_poly', 'sun_pred', 'sun_jik_pred', 'sun_jik_jik_pred',\n",
+ " 'sun_jik_power_4_pred', 'sun_jik_power_025_pred'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "cf57df92-4dbe-4afc-a0ef-1207aa316fbf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'userID', 'pjh_pred', 'assessmentItemID', 'testId', 'answerCode',\n",
+ " 'Timestamp', 'KnowledgeTag', 'answercode', 'question_num',\n",
+ " 'question_class', 'KnowledgeTag_total_answer',\n",
+ " 'KnowledgeTag_correct_answer', 'KnowledgeTag_acc',\n",
+ " 'userID_KnowledgeTag_total_answer',\n",
+ " 'userID_KnowledgeTag_correct_answer', 'userID_KnowledgeTag_acc',\n",
+ " 'assessmentItemID_total_answer', 'assessmentItemID_correct_answer',\n",
+ " 'assessmentItemID_acc', 'question_class_correct_answer',\n",
+ " 'question_class_total_answer', 'question_class_acc',\n",
+ " 'userID_question_class_correct_answer',\n",
+ " 'userID_question_class_total_answer', 'userID_question_class_acc',\n",
+ " 'question_num_correct_answer', 'question_num_total_answer',\n",
+ " 'question_num_acc', 'userID_question_num_correct_answer',\n",
+ " 'userID_question_num_total_answer', 'userID_question_num_acc',\n",
+ " 'userID_correct_answer', 'userID_total_answer', 'userID_acc',\n",
+ " 'userID_elapsed_median', 'userID_elapsed_median_rolling_5',\n",
+ " 'KnowledgeTag_lda', 'userID_KnowledgeTag_lda', 'assessmentItemID_lda',\n",
+ " 'question_class_lda', 'userID_question_class_lda', 'question_num_lda',\n",
+ " 'userID_question_num_lda', 'userID_lda', 'all_data_lda',\n",
+ " 'userID_assessmentItemID_experience', 'userID_testid_experience',\n",
+ " 'KnowledgeTag_kpca', 'userID_KnowledgeTag_kpca',\n",
+ " 'assessmentItemID_kpca', 'question_class_kpca',\n",
+ " 'userID_question_class_kpca', 'question_num_kpca',\n",
+ " 'userID_question_num_kpca', 'userID_kpca', 'all_data_kpca',\n",
+ " 'KnowledgeTag_kpca_rbf', 'userID_KnowledgeTag_kpca_rbf',\n",
+ " 'assessmentItemID_kpca_rbf', 'question_class_kpca_rbf',\n",
+ " 'userID_question_class_kpca_rbf', 'question_num_kpca_rbf',\n",
+ " 'userID_question_num_kpca_rbf', 'userID_kpca_rbf', 'all_data_kpca_rbf',\n",
+ " 'KnowledgeTag_kpca_poly', 'userID_KnowledgeTag_kpca_poly',\n",
+ " 'assessmentItemID_kpca_poly', 'question_class_kpca_poly',\n",
+ " 'userID_question_class_kpca_poly', 'question_num_kpca_poly',\n",
+ " 'userID_question_num_kpca_poly', 'userID_kpca_poly',\n",
+ " 'all_data_kpca_poly', 'userID', 'jik_pred', 'userID', 'sun_pred',\n",
+ " 'userID', 'rjh_pred', 'userID', 'osj_pred', 'sun_jik_pred',\n",
+ " 'sun_jik_jik_pred', 'sun_jik_power_4_pred', 'sun_jik_power_025_pred'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "8933cd38-a186-4420-99bb-20f5bc7cdf7f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " assessmentItemID_total_answer | \n",
+ " assessmentItemID_correct_answer | \n",
+ " assessmentItemID_acc | \n",
+ " userID_question_class_correct_answer | \n",
+ " userID_question_class_total_answer | \n",
+ " userID_question_class_acc | \n",
+ " jik_pred | \n",
+ " sun_pred | \n",
+ " sun_jik_pred | \n",
+ " sun_jik_jik_pred | \n",
+ " ... | \n",
+ " assessmentItemID_kpca_poly | \n",
+ " question_class_kpca_poly | \n",
+ " userID_question_class_kpca_poly | \n",
+ " question_num_kpca_poly | \n",
+ " userID_question_num_kpca_poly | \n",
+ " userID_kpca_poly | \n",
+ " userID_assessmentItemID_experience | \n",
+ " userID_testid_experience | \n",
+ " userID_elapsed_median_rolling_5 | \n",
+ " answerCode | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 372 | \n",
+ " 56.0 | \n",
+ " 0.150538 | \n",
+ " 170.0 | \n",
+ " 362 | \n",
+ " 0.469613 | \n",
+ " 3.760731e-02 | \n",
+ " 0.142654 | \n",
+ " 0.090131 | \n",
+ " 0.072623 | \n",
+ " ... | \n",
+ " 2.455014e+06 | \n",
+ " 2.618637e+15 | \n",
+ " 1.758173e+06 | \n",
+ " -4.844229e+15 | \n",
+ " 159301.609307 | \n",
+ " 6.685003e+07 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 56.8 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 3.0 | \n",
+ " 0.750000 | \n",
+ " 317.0 | \n",
+ " 351 | \n",
+ " 0.903134 | \n",
+ " 9.827468e-01 | \n",
+ " 0.843784 | \n",
+ " 0.913266 | \n",
+ " 0.936426 | \n",
+ " ... | \n",
+ " -5.107819e+06 | \n",
+ " 6.979407e+15 | \n",
+ " 1.008129e+07 | \n",
+ " -5.357891e+15 | \n",
+ " 70712.012754 | \n",
+ " 2.877639e+08 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 75.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 200 | \n",
+ " 26.0 | \n",
+ " 0.130000 | \n",
+ " 13.0 | \n",
+ " 97 | \n",
+ " 0.134021 | \n",
+ " 2.442704e-09 | \n",
+ " 0.041673 | \n",
+ " 0.020836 | \n",
+ " 0.013891 | \n",
+ " ... | \n",
+ " -3.975562e+06 | \n",
+ " -6.092281e+14 | \n",
+ " -9.804004e+06 | \n",
+ " -7.394932e+15 | \n",
+ " -108789.834221 | \n",
+ " -5.691442e+07 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7.4 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 249 | \n",
+ " 106.0 | \n",
+ " 0.425703 | \n",
+ " 564.0 | \n",
+ " 860 | \n",
+ " 0.655814 | \n",
+ " 7.093197e-01 | \n",
+ " 0.427248 | \n",
+ " 0.568284 | \n",
+ " 0.615296 | \n",
+ " ... | \n",
+ " -1.502617e+06 | \n",
+ " -1.918543e+15 | \n",
+ " 1.987392e+08 | \n",
+ " -7.330267e+15 | \n",
+ " 156034.046756 | \n",
+ " 3.195926e+08 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 40.2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 145 | \n",
+ " 97.0 | \n",
+ " 0.668966 | \n",
+ " 298.0 | \n",
+ " 424 | \n",
+ " 0.702830 | \n",
+ " 9.948128e-01 | \n",
+ " 0.761487 | \n",
+ " 0.878150 | \n",
+ " 0.917037 | \n",
+ " ... | \n",
+ " -4.086328e+06 | \n",
+ " -5.063882e+14 | \n",
+ " 1.684814e+07 | \n",
+ " -7.092266e+15 | \n",
+ " -89662.971786 | \n",
+ " 4.041229e+07 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 30.2 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 51 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " assessmentItemID_total_answer assessmentItemID_correct_answer \\\n",
+ "0 372 56.0 \n",
+ "1 4 3.0 \n",
+ "2 200 26.0 \n",
+ "3 249 106.0 \n",
+ "4 145 97.0 \n",
+ "\n",
+ " assessmentItemID_acc userID_question_class_correct_answer \\\n",
+ "0 0.150538 170.0 \n",
+ "1 0.750000 317.0 \n",
+ "2 0.130000 13.0 \n",
+ "3 0.425703 564.0 \n",
+ "4 0.668966 298.0 \n",
+ "\n",
+ " userID_question_class_total_answer userID_question_class_acc \\\n",
+ "0 362 0.469613 \n",
+ "1 351 0.903134 \n",
+ "2 97 0.134021 \n",
+ "3 860 0.655814 \n",
+ "4 424 0.702830 \n",
+ "\n",
+ " jik_pred sun_pred sun_jik_pred sun_jik_jik_pred ... \\\n",
+ "0 3.760731e-02 0.142654 0.090131 0.072623 ... \n",
+ "1 9.827468e-01 0.843784 0.913266 0.936426 ... \n",
+ "2 2.442704e-09 0.041673 0.020836 0.013891 ... \n",
+ "3 7.093197e-01 0.427248 0.568284 0.615296 ... \n",
+ "4 9.948128e-01 0.761487 0.878150 0.917037 ... \n",
+ "\n",
+ " assessmentItemID_kpca_poly question_class_kpca_poly \\\n",
+ "0 2.455014e+06 2.618637e+15 \n",
+ "1 -5.107819e+06 6.979407e+15 \n",
+ "2 -3.975562e+06 -6.092281e+14 \n",
+ "3 -1.502617e+06 -1.918543e+15 \n",
+ "4 -4.086328e+06 -5.063882e+14 \n",
+ "\n",
+ " userID_question_class_kpca_poly question_num_kpca_poly \\\n",
+ "0 1.758173e+06 -4.844229e+15 \n",
+ "1 1.008129e+07 -5.357891e+15 \n",
+ "2 -9.804004e+06 -7.394932e+15 \n",
+ "3 1.987392e+08 -7.330267e+15 \n",
+ "4 1.684814e+07 -7.092266e+15 \n",
+ "\n",
+ " userID_question_num_kpca_poly userID_kpca_poly \\\n",
+ "0 159301.609307 6.685003e+07 \n",
+ "1 70712.012754 2.877639e+08 \n",
+ "2 -108789.834221 -5.691442e+07 \n",
+ "3 156034.046756 3.195926e+08 \n",
+ "4 -89662.971786 4.041229e+07 \n",
+ "\n",
+ " userID_assessmentItemID_experience userID_testid_experience \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "\n",
+ " userID_elapsed_median_rolling_5 answerCode \n",
+ "0 56.8 0 \n",
+ "1 75.0 1 \n",
+ "2 7.4 0 \n",
+ "3 40.2 0 \n",
+ "4 30.2 1 \n",
+ "\n",
+ "[5 rows x 51 columns]"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Column groups that are concatenated into FEATS below.\n",
+ "count_feats = [\n",
+ "    'assessmentItemID_total_answer', 'assessmentItemID_correct_answer', 'assessmentItemID_acc',\n",
+ "    'userID_question_class_correct_answer', 'userID_question_class_total_answer', 'userID_question_class_acc',\n",
+ "]\n",
+ "pred_feats = [\n",
+ "    \"jik_pred\", \"sun_pred\", \"sun_jik_pred\", 'sun_jik_jik_pred', 'sun_jik_power_4_pred', 'sun_jik_power_025_pred',\n",
+ "]\n",
+ "reduction_bases = [\n",
+ "    'KnowledgeTag', 'userID_KnowledgeTag', 'assessmentItemID',\n",
+ "    'question_class', 'userID_question_class', 'question_num',\n",
+ "    'userID_question_num', 'userID', 'all_data',\n",
+ "]\n",
+ "# One '<base>_<kind>' column per base and reduction kind, grouped by kind.\n",
+ "# 'all_data_kpca_poly' is the only combination left out of the selection.\n",
+ "reduced_feats = [\n",
+ "    f\"{base}_{suffix}\"\n",
+ "    for suffix in (\"lda\", \"kpca\", \"kpca_rbf\", \"kpca_poly\")\n",
+ "    for base in reduction_bases\n",
+ "    if (base, suffix) != (\"all_data\", \"kpca_poly\")\n",
+ "]\n",
+ "tail_feats = [\n",
+ "    'userID_assessmentItemID_experience', 'userID_testid_experience',\n",
+ "    'userID_elapsed_median_rolling_5',\n",
+ "    \"answerCode\",\n",
+ "]\n",
+ "FEATS = count_feats + pred_feats + reduced_feats + tail_feats\n",
+ "train[FEATS].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "663e7453-5611-4bca-89d0-a5d8f038ea79",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " assessmentItemID_total_answer | \n",
+ " assessmentItemID_correct_answer | \n",
+ " assessmentItemID_acc | \n",
+ " userID_question_class_correct_answer | \n",
+ " userID_question_class_total_answer | \n",
+ " userID_question_class_acc | \n",
+ " jik_pred | \n",
+ " sun_pred | \n",
+ " sun_jik_pred | \n",
+ " sun_jik_jik_pred | \n",
+ " ... | \n",
+ " assessmentItemID_kpca_poly | \n",
+ " question_class_kpca_poly | \n",
+ " userID_question_class_kpca_poly | \n",
+ " question_num_kpca_poly | \n",
+ " userID_question_num_kpca_poly | \n",
+ " userID_kpca_poly | \n",
+ " userID_assessmentItemID_experience | \n",
+ " userID_testid_experience | \n",
+ " userID_elapsed_median_rolling_5 | \n",
+ " answerCode | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 249 | \n",
+ " 133.0 | \n",
+ " 0.534137 | \n",
+ " 564.0 | \n",
+ " 861 | \n",
+ " 0.655052 | \n",
+ " 0.805032 | \n",
+ " 0.533959 | \n",
+ " 0.669495 | \n",
+ " 0.714674 | \n",
+ " ... | \n",
+ " -1.002197e+06 | \n",
+ " -1.805907e+15 | \n",
+ " 1.998720e+08 | \n",
+ " -7.105437e+15 | \n",
+ " -92059.906678 | \n",
+ " 3.206844e+08 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 27.6 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 145 | \n",
+ " 89.0 | \n",
+ " 0.613793 | \n",
+ " 299.0 | \n",
+ " 425 | \n",
+ " 0.703529 | \n",
+ " 0.982029 | \n",
+ " 0.786134 | \n",
+ " 0.884081 | \n",
+ " 0.916730 | \n",
+ " ... | \n",
+ " -4.318186e+06 | \n",
+ " -3.626281e+14 | \n",
+ " 1.826659e+07 | \n",
+ " -7.051564e+15 | \n",
+ " -78865.563502 | \n",
+ " 4.107684e+07 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 29.8 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 248 | \n",
+ " 92.0 | \n",
+ " 0.370968 | \n",
+ " 191.0 | \n",
+ " 489 | \n",
+ " 0.390593 | \n",
+ " 0.112553 | \n",
+ " 0.243223 | \n",
+ " 0.177888 | \n",
+ " 0.156110 | \n",
+ " ... | \n",
+ " -1.990065e+06 | \n",
+ " -3.676959e+14 | \n",
+ " 1.623048e+07 | \n",
+ " -7.051510e+15 | \n",
+ " -68835.666071 | \n",
+ " 7.288832e+08 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 9.4 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 31 | \n",
+ " 6.0 | \n",
+ " 0.193548 | \n",
+ " 381.0 | \n",
+ " 412 | \n",
+ " 0.924757 | \n",
+ " 0.920797 | \n",
+ " 0.824487 | \n",
+ " 0.872642 | \n",
+ " 0.888694 | \n",
+ " ... | \n",
+ " -5.258445e+06 | \n",
+ " 7.097086e+15 | \n",
+ " 2.478480e+07 | \n",
+ " -4.803164e+15 | \n",
+ " 638936.156149 | \n",
+ " 7.610829e+08 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 72.6 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 236 | \n",
+ " 75.0 | \n",
+ " 0.317797 | \n",
+ " 273.0 | \n",
+ " 334 | \n",
+ " 0.817365 | \n",
+ " 0.070350 | \n",
+ " 0.375657 | \n",
+ " 0.223003 | \n",
+ " 0.172119 | \n",
+ " ... | \n",
+ " -2.676155e+06 | \n",
+ " -1.184679e+15 | \n",
+ " 6.711042e+06 | \n",
+ " -6.647451e+15 | \n",
+ " -84211.032967 | \n",
+ " -4.139170e+07 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 21.1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 51 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " assessmentItemID_total_answer assessmentItemID_correct_answer \\\n",
+ "0 249 133.0 \n",
+ "1 145 89.0 \n",
+ "2 248 92.0 \n",
+ "3 31 6.0 \n",
+ "4 236 75.0 \n",
+ "\n",
+ " assessmentItemID_acc userID_question_class_correct_answer \\\n",
+ "0 0.534137 564.0 \n",
+ "1 0.613793 299.0 \n",
+ "2 0.370968 191.0 \n",
+ "3 0.193548 381.0 \n",
+ "4 0.317797 273.0 \n",
+ "\n",
+ " userID_question_class_total_answer userID_question_class_acc jik_pred \\\n",
+ "0 861 0.655052 0.805032 \n",
+ "1 425 0.703529 0.982029 \n",
+ "2 489 0.390593 0.112553 \n",
+ "3 412 0.924757 0.920797 \n",
+ "4 334 0.817365 0.070350 \n",
+ "\n",
+ " sun_pred sun_jik_pred sun_jik_jik_pred ... assessmentItemID_kpca_poly \\\n",
+ "0 0.533959 0.669495 0.714674 ... -1.002197e+06 \n",
+ "1 0.786134 0.884081 0.916730 ... -4.318186e+06 \n",
+ "2 0.243223 0.177888 0.156110 ... -1.990065e+06 \n",
+ "3 0.824487 0.872642 0.888694 ... -5.258445e+06 \n",
+ "4 0.375657 0.223003 0.172119 ... -2.676155e+06 \n",
+ "\n",
+ " question_class_kpca_poly userID_question_class_kpca_poly \\\n",
+ "0 -1.805907e+15 1.998720e+08 \n",
+ "1 -3.626281e+14 1.826659e+07 \n",
+ "2 -3.676959e+14 1.623048e+07 \n",
+ "3 7.097086e+15 2.478480e+07 \n",
+ "4 -1.184679e+15 6.711042e+06 \n",
+ "\n",
+ " question_num_kpca_poly userID_question_num_kpca_poly userID_kpca_poly \\\n",
+ "0 -7.105437e+15 -92059.906678 3.206844e+08 \n",
+ "1 -7.051564e+15 -78865.563502 4.107684e+07 \n",
+ "2 -7.051510e+15 -68835.666071 7.288832e+08 \n",
+ "3 -4.803164e+15 638936.156149 7.610829e+08 \n",
+ "4 -6.647451e+15 -84211.032967 -4.139170e+07 \n",
+ "\n",
+ " userID_assessmentItemID_experience userID_testid_experience \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "\n",
+ " userID_elapsed_median_rolling_5 answerCode \n",
+ "0 27.6 -1 \n",
+ "1 29.8 -1 \n",
+ "2 9.4 -1 \n",
+ "3 72.6 -1 \n",
+ "4 21.1 -1 \n",
+ "\n",
+ "[5 rows x 51 columns]"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test[FEATS].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "3acb66b8-bb65-42c9-8cbf-f86c91baadec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Restrict both frames to the FEATS columns (answerCode included).\n",
+ "train, test = train[FEATS], test[FEATS]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "c8561608-c1f9-4eff-bd9e-4ef787c8e577",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import roc_auc_score\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "# xgboost 관련\n",
+ "from xgboost import XGBClassifier\n",
+ "from xgboost import plot_importance\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from bayes_opt import BayesianOptimization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "21cbcbe2-f111-4417-ab31-6f76e80b4660",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "==============================\n",
+ "{'booster': 'dart', 'learning_rate': 0.08837, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 3, 'min_child_weight': 5.7, 'gamma': 4.7, 'subsample': 0.8, 'colsample_bytree': 0.65, 'random_state': 2021}\n",
+ "==============================\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "def set_params():\n",
+ "    \"\"\"Build, print and return the parameter dict passed to ``xgb.train``.\"\"\"\n",
+ "    params = {\n",
+ "        \"booster\": \"dart\",  # gbdt, dart, goss\n",
+ "        \"learning_rate\": 0.08837,  # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3\n",
+ "        \"objective\": \"binary:logistic\",\n",
+ "        \"eval_metric\": \"auc\",  # binary_logloss, rmse, huber, auc\n",
+ "        \"max_depth\": 3,  # -1\n",
+ "        \"min_child_weight\": 5.7,  # 20 100 ~ 1000: set to hundreds or thousands\n",
+ "        \"gamma\": 4.7,  # 0.0\n",
+ "        \"subsample\": 0.8,  # 0.0\n",
+ "        \"colsample_bytree\": 0.65,  # 0.0\n",
+ "        \"random_state\": 2021,\n",
+ "    }\n",
+ "\n",
+ "    # Show the chosen parameters between two separator lines.\n",
+ "    separator = \"=\"*30\n",
+ "    print(separator)\n",
+ "    print(params)\n",
+ "    print(separator)\n",
+ "    print()\n",
+ "\n",
+ "    return params\n",
+ "\n",
+ "\n",
+ "params = set_params()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "735474d8-860f-4cb2-8e69-674d6786455b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "==============================\n",
+ "{'booster': 'dart', 'learning_rate': 0.08837, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 3, 'min_child_weight': 5.7, 'gamma': 4.7, 'subsample': 0.8, 'colsample_bytree': 0.65, 'random_state': 2021}\n",
+ "==============================\n",
+ "\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "VALID AUC : 0.8424596602127828 ACC : 0.7627688172043011\n",
+ "\n",
+ "VALID AUC : 0.8551062656505664 ACC : 0.7762096774193549\n",
+ "\n",
+ "VALID AUC : 0.8385358753921116 ACC : 0.7614247311827957\n",
+ "\n",
+ "VALID AUC : 0.8485941375626889 ACC : 0.7659717552118359\n",
+ "\n",
+ "VALID AUC : 0.8427353212753246 ACC : 0.7558843308675185\n",
+ "\n",
+ "0.8454862520186948\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import xgboost as xgb\n",
+ "\n",
+ "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)\n",
+ "\n",
+ "params = set_params()\n",
+ "\n",
+ "auc_score = 0\n",
+ "\n",
+ "final_preds = []\n",
+ "\n",
+ "print(\"@\"*50)\n",
+ "print(\"start\")\n",
+ "print(\"@\"*50)\n",
+ "\n",
+ "for fold, (train_index, test_index) in enumerate(skf.split(train, train[\"answerCode\"])):\n",
+ "\n",
+ " temp_train = train.iloc[train_index,:]\n",
+ " temp_valid = train.iloc[test_index,:]\n",
+ "\n",
+ " # X, y 값 분리\n",
+ " y_train = temp_train[\"answerCode\"]\n",
+ " train_df = temp_train.drop([\"answerCode\"], axis=1)\n",
+ "\n",
+ " y_test = temp_valid[\"answerCode\"]\n",
+ " test_df = temp_valid.drop([\"answerCode\"], axis=1)\n",
+ "\n",
+ " D_train = xgb.DMatrix(train_df, label=y_train)\n",
+ " D_test = xgb.DMatrix(test_df, label=y_test)\n",
+ " \n",
+ " y_final = test[\"answerCode\"]\n",
+ " final = test.drop([\"answerCode\"], axis=1)\n",
+ " D_final = xgb.DMatrix(final, label=y_final)\n",
+ "\n",
+ " model = xgb.train(params, D_train, num_boost_round=100)\n",
+ "\n",
+ " preds = model.predict(D_test)\n",
+ "\n",
+ " acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))\n",
+ " auc = roc_auc_score(y_test, preds)\n",
+ "\n",
+ " print(f'VALID AUC : {auc} ACC : {acc}\\n')\n",
+ " \n",
+ " final_preds.append(model.predict(D_final))\n",
+ " \n",
+ " fig,ax = plt.subplots(figsize=(10,8))\n",
+ " plot_importance(model, ax=ax, max_num_features = 50, height=.4)\n",
+ "\n",
+ " auc_score += auc\n",
+ "\n",
+ "print(auc_score / 5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "1142144d-fc11-45fb-bf1b-f6ebbd6be3f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = pd.DataFrame(np.array(final_preds).mean(axis=0)).reset_index().rename(columns = {0:\"prediction\", \"index\":\"id\"})\n",
+ "result.to_csv(\"stacking.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "0b348f3a-0d8a-45da-af18-3b7602645578",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQnElEQVR4nO3df4wcd3nH8fdDAsXNpQ5gWFkm5agaUCOfCM0pBfFH9whUKUgEVISIAMUi5RAtCIlTpYiqalqKFNQa/kIqRkGxKuBIIZSIQFGU+rCogPYMgcsPUUIwNFdkN+AYjqa0B0//2HFtH7s3s3f7w9+990s63e53Z2efPDv+ZG72uzORmUiSyvOkcRcgSdoaA1ySCmWAS1KhDHBJKpQBLkmFuniUL7Znz56cnp6uXe6nP/0pl1xyyfALKoC96LAPHfbhrJ3Ui2PHjj2Wmc/cOD7SAJ+enmZ5ebl2uaWlJdrt9vALKoC96LAPHfbhrJ3Ui4j4XrdxD6FIUqEMcEkqlAEuSYUywCWpUAa4JBXKAJekQhngklQoA1ySCmWAS1KhRvpNTPVn+ua7WZhZ58DNd583fvzWV46pIkkXEvfAJalQBrgkFcoAl6RCGeCSVCgDXJIKZYBLUqEMcEkqlAEuSYUywCWpUAa4JBXKAJekQhngklQoA1ySCmWAS1KhagM8Ip4aEf8SEd+IiAci4i+q8edGxFcj4uGI+EREPGX45UqSzmiyB/4z4KWZ+QLgKuC6iHgR8D7gA5n5m8Ap4KahVSlJ+iW1AZ4da9XdJ1c/CbwU+GQ1fhh49TAKlCR11+gYeERcFBH3ASeBe4DvAI9n5nq1yKPAvqFUKEnqKjKz+cIRlwGfBv4MuL06fEJEXA58PjP3d3nOPDAP0Gq1rl5cXKx9nbW1NaamphrXNalWVk/T2gUnnjh/fGbf7vEUNEZuEx324ayd1Iu5ubljmTm7cbyva2Jm5uMRcQR4MXBZRFxc7YU/G1jt8ZxDwCGA2dnZbLfbta+ztLREk+Um3YHqmpgHV85/m46/oT2egsbIbaLDPpxlL5rNQnlmtedNROwCXg48BBwBXlstdiPwmSHVKEnqoske+F7gcERcRCfw78jMz0bEg8BiRPwV8HXgtiHWKUnaoDbAM/ObwAu7jD8CXDOMoiRJ9fwmpiQVqq8PMXVhmL757q7jx2995YgrkTRO7oFLUqEMcEkqlAEuSYUywCWpUAa4JBWq+FkozsiQtFO5By5JhTLAJalQBrgkFcoAl6RCGeCSVKjiZ6FciPqdGdNreUnajHvgklQoA1ySCmWAS1KhDHBJKpQBLkmFMsAlqVAGuCQVygCXpEIZ4JJUqNoAj4jLI+JIRDwYEQ9ExDur8VsiYjUi7qt+XjH8ciVJZzT5Kv06sJCZX4uIS4FjEXFP9dgHMvNvhleeJKmX2gDPzB8AP6hu/yQiHgL2DbswSdLmIjObLxwxDRwF9gPvAg4APwaW6eyln+rynHlgHqDVal29uLhY+zpra2tMTU01qmll9XTX8Zl9uxs9fxh61bQVrV1w4olmy47zv3nY+tkmJpl9OGsn9WJubu5YZs5uHG8c4BExBXwReG9m3hkRLeAxIIH3AHsz882brWN2djaXl5drX2tpaYl2u92orgvxmpiDPLvgwsw6B1eanTRykq8D2s82Mcnsw1k7qRcR0TXAG81CiYgnA58CPpqZdwJk5onM/Hlm/gL4MHDNIAuWJG2uySyUAG4DHsrM958zvvecxV4D3D/48iRJvTT52/wlwJuAlYi4rxp7N3BDRFxF5xDKceCtQ6hPktRDk1koXwKiy0OfG3w5kqSmirmkmpcdk6Tz+VV6SSqUAS5JhTLAJalQBrgkFcoAl6RCFTMLRfUuxNMK9FJSrdKFyj1wSSqUAS5JhTLAJalQBrgkFcoAl6RCGeCSVCgDXJIKZYBLUqEMcEkqlAEuSYUywCWpUAa4JBXKk1lpqLwUnjQ87oFL
UqEMcEkqVG2AR8TlEXEkIh6MiAci4p3V+NMj4p6I+Hb1+2nDL1eSdEaTPfB1YCEzrwReBPxxRFwJ3Azcm5lXAPdW9yVJI1Ib4Jn5g8z8WnX7J8BDwD7geuBwtdhh4NVDqlGS1EVkZvOFI6aBo8B+4PuZeVk1HsCpM/c3PGcemAdotVpXLy4u1r7O2toaU1NT542trJ5uXCfAzL7dfS0/SP3WupnWLjjxxPbWUVIvetXabZvYbP3j/G8epl592Il2Ui/m5uaOZebsxvHGAR4RU8AXgfdm5p0R8fi5gR0RpzJz0+Pgs7Ozuby8XPtaS0tLtNvt88b6nY42zmsrDnLq3MLMOgdXtjfbs6Re9Kq12zax2fon9dqavfqwE+2kXkRE1wBvNAslIp4MfAr4aGbeWQ2fiIi91eN7gZODKlaSVK/JLJQAbgMeysz3n/PQXcCN1e0bgc8MvjxJUi9N/jZ/CfAmYCUi7qvG3g3cCtwRETcB3wNeN5QKJUld1QZ4Zn4JiB4PXzvYciRJTflNTEkqlCez2gZP1CRpnNwDl6RCGeCSVCgDXJIKZYBLUqEMcEkqlLNQdoCddr6QSdHtfVuYWac9+lJ0gXIPXJIKZYBLUqEMcEkqlAEuSYUywCWpUBM7C8WZF/XskVQ298AlqVAGuCQVygCXpEIZ4JJUKANckgo1sbNQetnsKjq9Zl945R0NgrN+NGjugUtSoQxwSSqUAS5JhaoN8Ij4SEScjIj7zxm7JSJWI+K+6ucVwy1TkrRRkz3w24Hruox/IDOvqn4+N9iyJEl1agM8M48CPxpBLZKkPkRm1i8UMQ18NjP3V/dvAQ4APwaWgYXMPNXjufPAPECr1bp6cXGx9vXW1taYmpo6b2xl9XTt8yZRaxeceGK0rzmzb3fPx3q9D72e0+/71ms93baJrdQzToPoXWsXPOvp/fW63/dmUMsPW69tYhLNzc0dy8zZjeNbDfAW8BiQwHuAvZn55rr1zM7O5vLycu3rLS0t0W63zxvbqXOxF2bWObgy2un6m81L7ncuc7/vW6/1dNsmtlLPOA2idwsz67zjDdcPbf2DXH7Yem0Tkygiugb4lmahZOaJzPx5Zv4C+DBwzXYLlCT1Z0sBHhF7z7n7GuD+XstKkoaj9m/ziPg40Ab2RMSjwJ8D7Yi4is4hlOPAW4dXoiSpm9oAz8wbugzfNoRaJEl92HEns1KZVlZPc6CPD0RL/4BOasKv0ktSoQxwSSqUAS5JhTLAJalQBrgkFcpZKNKY9Xu6gZ16Wgn9MvfAJalQBrgkFcoAl6RCGeCSVCgDXJIK5SwU/ZJxznLo9doLM8Ndv7Zus556Lpnhcg9ckgplgEtSoQxwSSqUAS5JhTLAJalQBrgkFcpphNIWlTIlsZQ6YXCXttspl8hzD1ySCmWAS1KhagM8Ij4SEScj4v5zxp4eEfdExLer308bbpmSpI2a7IHfDly3Yexm4N7MvAK4t7ovSRqh2gDPzKPAjzYMXw8crm4fBl492LIkSXUiM+sXipgGPpuZ+6v7j2fmZdXtAE6dud/lufPAPECr1bp6cXGx9vXW1taYmpo6b2xl9XTt8yZRaxeceGLcVYzfuPows293z8fGsU2Wtj306l+/veu2nm45Ubf+zd7PC9nc3NyxzJzdOL7taYSZmRHR8/8CmXkIOAQwOzub7Xa7dp1LS0tsXO5AQVOhBmlhZp2DK872HFcfjr+h3fOxcWyTpW0PvfrXb++6radbTtStf7P3s0RbnYVyIiL2AlS/Tw6uJElSE1sN8LuAG6vbNwKfGUw5kqSmmkwj/DjwZeD5EfFoRNwE3Aq8PCK+Dbysui9JGqHag2mZeUOPh64dcC2SpD6U82mIpB2r27lNFmbWaY++lAuKX6WXpEIZ4JJUKANckgplgEtSoQxwSSqUAS5JhXIaobSJki5HthP1+/5M2qXW3AOXpEIZ4JJUKANckgplgEtSoQxwSSqUAS5JhTLAJalQBrgkFcoA
l6RCGeCSVCgDXJIKZYBLUqEMcEkqlAEuSYXa1ulkI+I48BPg58B6Zs4OoihJUr1BnA98LjMfG8B6JEl98BCKJBUqMnPrT474LnAKSOBDmXmoyzLzwDxAq9W6enFxsXa9a2trTE1NnTe2snp6y3WWrLULTjwx7irGzz502IezBtmLmX27B7OiIZmbmzvW7RD1dgN8X2auRsSzgHuAd2Tm0V7Lz87O5vLycu16l5aWaLfb543t1EtbLcysc3DFK9/Zhw77cNYge3GhX1ItIroG+LYOoWTmavX7JPBp4JrtrE+S1NyWAzwiLomIS8/cBn4PuH9QhUmSNredvz9awKcj4sx6PpaZ/ziQqiRJtbYc4Jn5CPCCAdYiSeqD0wglqVB+nC1px+t3ltuFMmvFPXBJKpQBLkmFMsAlqVAGuCQVygCXpEI5C0WS+tRr1sqoZ6e4By5JhTLAJalQBrgkFcoAl6RCGeCSVChnoUjSgGx2TpVhzFBxD1ySCmWAS1KhDHBJKpQBLkmFMsAlqVAGuCQVygCXpEIZ4JJUKANckgq1rQCPiOsi4lsR8XBE3DyooiRJ9bYc4BFxEfBB4PeBK4EbIuLKQRUmSdrcdvbArwEezsxHMvN/gEXg+sGUJUmqE5m5tSdGvBa4LjP/sLr/JuB3MvPtG5abB+aru88HvtVg9XuAx7ZU2OSxFx32ocM+nLWTevGczHzmxsGhn40wMw8Bh/p5TkQsZ+bskEoqir3osA8d9uEse7G9QyirwOXn3H92NSZJGoHtBPi/AldExHMj4inA64G7BlOWJKnOlg+hZOZ6RLwd+AJwEfCRzHxgQHX1dchlwtmLDvvQYR/O2vG92PKHmJKk8fKbmJJUKANckgo11gCv+yp+RPxKRHyievyrETE9hjKHrkEf3hURD0bENyPi3oh4zjjqHIWmp2eIiD+IiIyIiZxG1qQPEfG6art4ICI+NuoaR6XBv49fj4gjEfH16t/IK8ZR51hk5lh+6Hzw+R3gN4CnAN8ArtywzB8Bf1vdfj3wiXHVO+Y+zAG/Wt1+2yT2oWkvquUuBY4CXwFmx133mLaJK4CvA0+r7j9r3HWPsReHgLdVt68Ejo+77lH9jHMPvMlX8a8HDle3PwlcGxExwhpHobYPmXkkM/+ruvsVOnPuJ1HT0zO8B3gf8N+jLG6EmvThLcAHM/MUQGaeHHGNo9KkFwn8WnV7N/AfI6xvrMYZ4PuAfz/n/qPVWNdlMnMdOA08YyTVjU6TPpzrJuDzQ61ofGp7ERG/DVyemXePsrARa7JNPA94XkT8c0R8JSKuG1l1o9WkF7cAb4yIR4HPAe8YTWnjN/Sv0mtwIuKNwCzwu+OuZRwi4knA+4EDYy7lQnAxncMobTp/kR2NiJnMfHycRY3JDcDtmXkwIl4M/F1E7M/MX4y7sGEb5x54k6/i//8yEXExnT+PfjiS6kan0SkJIuJlwJ8Cr8rMn42otlGr68WlwH5gKSKOAy8C7prADzKbbBOPAndl5v9m5neBf6MT6JOmSS9uAu4AyMwvA0+lc6KriTfOAG/yVfy7gBur268F/imrTyomSG0fIuKFwIfohPekHuuEml5k5unM3JOZ05k5TefzgFdl5vJ4yh2aJv82/oHO3jcRsYfOIZVHRljjqDTpxfeBawEi4rfoBPh/jrTKMRlbgFfHtM98Ff8h4I7MfCAi/jIiXlUtdhvwjIh4GHgXMHFX/WnYh78GpoC/j4j7ImIizznTsBcTr2EfvgD8MCIeBI4Af5KZk/bXadNeLABviYhvAB8HDkzgjl5XfpVekgrlNzElqVAGuCQVygCXpEIZ4JJUKANckgplgEtSoQxwSSrU/wHG5fPywIQKFwAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "result[\"prediction\"].hist(bins=50)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/code/LGBM/lgbm_baseline.ipynb b/code/LGBM/lgbm_baseline.ipynb
new file mode 100644
index 0000000..01c8578
--- /dev/null
+++ b/code/LGBM/lgbm_baseline.ipynb
@@ -0,0 +1,2771 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## LGBM Baseline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-05-24T09:49:29.375544Z",
+ "start_time": "2021-05-24T09:49:28.999092Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "import random\n",
+ "import warnings\n",
+ "import lightgbm as lgb\n",
+ "from wandb.lightgbm import wandb_callback\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from tqdm import tqdm\n",
+ "import numpy as np\n",
+ "import random\n",
+ "from matplotlib import pylab as plt\n",
+ "from lgbm_function import inference, set_params, custom_train_test_split\n",
+ "from feature_engineering import feature_engineering_sun\n",
+ "from bayes_opt import BayesianOptimization\n",
+ "from datetime import datetime\n",
+ "import wandb\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n",
+ "\n",
+ "%matplotlib inline\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Load & Preprocessing\n",
+ "- validation data answer의 정보가 모델에 학습되는 것을 방지\n",
+ "- inference 단계와 동일하게 user mean을 기준으로 random하게 값을 지정\n",
+ "- train할 때 마지막 random하게 지정한 값은 제외\n",
+ "- test, validation의 마지막 값을 제외한 모든 행을 학습에 사용"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-05-24T09:49:29.678737Z",
+ "start_time": "2021-05-24T09:49:29.376581Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2266586, 8)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " next_userID | \n",
+ " is_test_data | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " A060001001 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:11 | \n",
+ " 7224 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A060001002 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:14 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " A060001003 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:22 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " A060001004 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:29 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " A060001005 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:36 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
+ "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
+ "2 0 A060001003 A060000001 1 2020-03-24 00:17:22 \n",
+ "3 0 A060001004 A060000001 1 2020-03-24 00:17:29 \n",
+ "4 0 A060001005 A060000001 1 2020-03-24 00:17:36 \n",
+ "\n",
+ " KnowledgeTag next_userID is_test_data \n",
+ "0 7224 0.0 False \n",
+ "1 7225 0.0 False \n",
+ "2 7225 0.0 False \n",
+ "3 7225 0.0 False \n",
+ "4 7225 0.0 False "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_dir = '/opt/ml/input/data/train_dataset'\n",
+ "train_csv_file_path = os.path.join(data_dir, 'train_data.csv')\n",
+ "train_df = pd.read_csv(train_csv_file_path, parse_dates=['Timestamp'])\n",
+ "train_df[\"next_userID\"] = train_df['userID'].shift(-1)\n",
+ "train_df[\"is_test_data\"] = False\n",
+ "print(train_df.shape)\n",
+ "train_df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(259370, 8)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " next_userID | \n",
+ " is_test_data | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3 | \n",
+ " A050023001 | \n",
+ " A050000023 | \n",
+ " 1 | \n",
+ " 2020-01-09 10:56:31 | \n",
+ " 2626 | \n",
+ " 3.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " A050023002 | \n",
+ " A050000023 | \n",
+ " 1 | \n",
+ " 2020-01-09 10:56:57 | \n",
+ " 2626 | \n",
+ " 3.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " A050023003 | \n",
+ " A050000023 | \n",
+ " 0 | \n",
+ " 2020-01-09 10:58:31 | \n",
+ " 2625 | \n",
+ " 3.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " A050023004 | \n",
+ " A050000023 | \n",
+ " 0 | \n",
+ " 2020-01-09 10:58:36 | \n",
+ " 2625 | \n",
+ " 3.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " A050023006 | \n",
+ " A050000023 | \n",
+ " 0 | \n",
+ " 2020-01-09 10:58:43 | \n",
+ " 2623 | \n",
+ " 3.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 3 A050023001 A050000023 1 2020-01-09 10:56:31 \n",
+ "1 3 A050023002 A050000023 1 2020-01-09 10:56:57 \n",
+ "2 3 A050023003 A050000023 0 2020-01-09 10:58:31 \n",
+ "3 3 A050023004 A050000023 0 2020-01-09 10:58:36 \n",
+ "4 3 A050023006 A050000023 0 2020-01-09 10:58:43 \n",
+ "\n",
+ " KnowledgeTag next_userID is_test_data \n",
+ "0 2626 3.0 False \n",
+ "1 2626 3.0 False \n",
+ "2 2625 3.0 False \n",
+ "3 2625 3.0 False \n",
+ "4 2623 3.0 False "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_csv_file_path = os.path.join(data_dir, 'test_data.csv')\n",
+ "test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])\n",
+ "test_df = test_df[test_df[\"answerCode\"] > -1]\n",
+ "test_df[\"next_userID\"] = test_df['userID'].shift(-5)\n",
+ "test_df[\"is_test_data\"] = test_df[[\"userID\", \"next_userID\"]].apply(lambda data: False if data[\"userID\"] == data[\"next_userID\"] else True, axis=1)\n",
+ "print(test_df.shape)\n",
+ "test_df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2525956, 8)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.concat([train_df, test_df], ignore_index=True)\n",
+ "del(train_df)\n",
+ "del(test_df)\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 43.2 s, sys: 836 ms, total: 44 s\n",
+ "Wall time: 44.1 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "def random_answering(data):\n",
+ " if data[\"is_test_data\"]:\n",
+ " return 1 if random.random() < 0.5 else 0\n",
+ " else:\n",
+ " return data[\"answerCode\"]\n",
+ "\n",
+ "df[\"answercode\"] = df.apply(random_answering, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Feature Engineering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-05-24T09:49:29.683739Z",
+ "start_time": "2021-05-24T09:49:28.981Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Timestamp 관련 feature\n",
+ "assessmentItemID 관련 feature\n",
+ "KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID, KnowledgeTag별 누적 풀이 수, 정답 수, 정답률\n",
+ "assessmentItemID별 누적 풀이 수, 정답 수, 정답률\n",
+ "question class별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_class별 누적 풀이 수, 정답 수, 정답률\n",
+ "question num별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID_question_num별 누적 풀이 수, 정답 수, 정답률\n",
+ "user 별 누적 풀이 수, 정답 수, 정답률\n",
+ "userID별 timestamp 중앙값\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:26<00:00, 95144.91it/s] \n",
+ "100%|██████████| 2525956/2525956 [00:55<00:00, 45765.68it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "문제별 풀이 시간의 중앙값&평균값\n",
+ "userID별 정답률(user_acc)의 이동 평균 및 중앙값\n",
+ "5\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:55<00:00, 45491.33it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:55<00:00, 45293.70it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:55<00:00, 45780.82it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:56<00:00, 44982.34it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "15\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:54<00:00, 46207.33it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:58<00:00, 43198.60it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "30\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2525956/2525956 [00:57<00:00, 43599.19it/s]\n",
+ "100%|██████████| 2525956/2525956 [00:59<00:00, 42402.29it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "feature_dimension_reduction\n",
+ "lda\n",
+ "svd\n",
+ "User가 해당 문제를 풀어본 경험 Feature\n",
+ "User가 해당 test를 풀어본 경험 Feature\n",
+ "CPU times: user 17min 58s, sys: 2min 32s, total: 20min 31s\n",
+ "Wall time: 19min 41s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " next_userID | \n",
+ " is_test_data | \n",
+ " answercode | \n",
+ " year | \n",
+ " ... | \n",
+ " userID_KnowledgeTag_svd | \n",
+ " assessmentItemID_svd | \n",
+ " question_class_svd | \n",
+ " userID_question_class_svd | \n",
+ " question_num_svd | \n",
+ " userID_question_num_svd | \n",
+ " userID_svd | \n",
+ " all_data_svd | \n",
+ " userID_assessmentItemID_experience | \n",
+ " userID_testid_experience | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " A060001001 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:11 | \n",
+ " 7224 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 225.635792 | \n",
+ " 39868.971351 | \n",
+ " 0.000000 | \n",
+ " 55488.827014 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 68304.216652 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A060001002 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:14 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 223.963244 | \n",
+ " 39870.359260 | \n",
+ " 1.395334 | \n",
+ " 54423.735554 | \n",
+ " 0.0 | \n",
+ " 1.392196 | \n",
+ " 67470.716893 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 68 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
+ "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
+ "\n",
+ " KnowledgeTag next_userID is_test_data answercode year ... \\\n",
+ "0 7224 0.0 False 1 2020 ... \n",
+ "1 7225 0.0 False 1 2020 ... \n",
+ "\n",
+ " userID_KnowledgeTag_svd assessmentItemID_svd question_class_svd \\\n",
+ "0 0.0 225.635792 39868.971351 \n",
+ "1 0.0 223.963244 39870.359260 \n",
+ "\n",
+ " userID_question_class_svd question_num_svd userID_question_num_svd \\\n",
+ "0 0.000000 55488.827014 0.0 \n",
+ "1 1.395334 54423.735554 0.0 \n",
+ "\n",
+ " userID_svd all_data_svd userID_assessmentItemID_experience \\\n",
+ "0 0.000000 68304.216652 0 \n",
+ "1 1.392196 67470.716893 0 \n",
+ "\n",
+ " userID_testid_experience \n",
+ "0 0 \n",
+ "1 1 \n",
+ "\n",
+ "[2 rows x 68 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "df = feature_engineering_sun(df)\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',\n",
+ " 'KnowledgeTag', 'next_userID', 'is_test_data', 'answercode', 'year',\n",
+ " 'month', 'question_num', 'question_class', 'KnowledgeTag_total_answer',\n",
+ " 'KnowledgeTag_correct_answer', 'KnowledgeTag_acc',\n",
+ " 'userID_KnowledgeTag_total_answer',\n",
+ " 'userID_KnowledgeTag_correct_answer', 'userID_KnowledgeTag_acc',\n",
+ " 'assessmentItemID_total_answer', 'assessmentItemID_correct_answer',\n",
+ " 'assessmentItemID_acc', 'question_class_correct_answer',\n",
+ " 'question_class_total_answer', 'question_class_acc',\n",
+ " 'userID_question_class_total_answer',\n",
+ " 'userID_question_class_correct_answer', 'userID_question_class_acc',\n",
+ " 'question_num_correct_answer', 'question_num_total_answer',\n",
+ " 'question_num_acc', 'userID_question_num_total_answer',\n",
+ " 'userID_question_num_correct_answer', 'userID_question_num_acc',\n",
+ " 'userID_correct_answer', 'userID_total_answer', 'userID_acc',\n",
+ " 'userID_elapsed_median', 'assessmentItemID_time_median',\n",
+ " 'assessmentItemID_time_mean', 'userID_acc_rolling_5',\n",
+ " 'userID_elapsed_median_rolling_5', 'userID_acc_rolling_10',\n",
+ " 'userID_elapsed_median_rolling_10', 'userID_acc_rolling_15',\n",
+ " 'userID_elapsed_median_rolling_15', 'userID_acc_rolling_30',\n",
+ " 'userID_elapsed_median_rolling_30', 'KnowledgeTag_lda',\n",
+ " 'userID_KnowledgeTag_lda', 'assessmentItemID_lda', 'question_class_lda',\n",
+ " 'userID_question_class_lda', 'question_num_lda',\n",
+ " 'userID_question_num_lda', 'userID_lda', 'all_data_lda',\n",
+ " 'KnowledgeTag_svd', 'userID_KnowledgeTag_svd', 'assessmentItemID_svd',\n",
+ " 'question_class_svd', 'userID_question_class_svd', 'question_num_svd',\n",
+ " 'userID_question_num_svd', 'userID_svd', 'all_data_svd',\n",
+ " 'userID_assessmentItemID_experience', 'userID_testid_experience'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userID | \n",
+ " assessmentItemID | \n",
+ " testId | \n",
+ " answerCode | \n",
+ " Timestamp | \n",
+ " KnowledgeTag | \n",
+ " next_userID | \n",
+ " is_test_data | \n",
+ " answercode | \n",
+ " year | \n",
+ " ... | \n",
+ " pred_userID_KnowledgeTag_svd | \n",
+ " pred_assessmentItemID_svd | \n",
+ " pred_question_class_svd | \n",
+ " pred_userID_question_class_svd | \n",
+ " pred_question_num_svd | \n",
+ " pred_userID_question_num_svd | \n",
+ " pred_userID_svd | \n",
+ " pred_all_data_svd | \n",
+ " pred_userID_assessmentItemID_experience | \n",
+ " pred_userID_testid_experience | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " A060001001 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:11 | \n",
+ " 7224 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A060001002 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:14 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 225.635792 | \n",
+ " 39868.971351 | \n",
+ " 0.000000 | \n",
+ " 55488.827014 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 68304.216652 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " A060001003 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:22 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 223.963244 | \n",
+ " 39870.359260 | \n",
+ " 1.395334 | \n",
+ " 54423.735554 | \n",
+ " 0.000000 | \n",
+ " 1.392196 | \n",
+ " 67470.716893 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " A060001004 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:29 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 1.471256 | \n",
+ " 220.618148 | \n",
+ " 39871.747169 | \n",
+ " 2.788897 | \n",
+ " 53791.418093 | \n",
+ " 0.000000 | \n",
+ " 2.783401 | \n",
+ " 66968.120221 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " A060001005 | \n",
+ " A060000001 | \n",
+ " 1 | \n",
+ " 2020-03-24 00:17:36 | \n",
+ " 7225 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 2.865134 | \n",
+ " 225.635792 | \n",
+ " 39873.135078 | \n",
+ " 4.182461 | \n",
+ " 52641.660567 | \n",
+ " 0.000000 | \n",
+ " 4.174606 | \n",
+ " 66053.545087 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2525951 | \n",
+ " 7441 | \n",
+ " A030071005 | \n",
+ " A030000071 | \n",
+ " 0 | \n",
+ " 2020-06-05 06:50:21 | \n",
+ " 438 | \n",
+ " 7441.0 | \n",
+ " False | \n",
+ " 0 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 3.025986 | \n",
+ " 106.779681 | \n",
+ " 112921.194514 | \n",
+ " 3.028496 | \n",
+ " 148815.437899 | \n",
+ " 0.000000 | \n",
+ " 3.036800 | \n",
+ " 186821.500958 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2525952 | \n",
+ " 7441 | \n",
+ " A040165001 | \n",
+ " A040000165 | \n",
+ " 1 | \n",
+ " 2020-08-21 01:06:39 | \n",
+ " 8836 | \n",
+ " 7441.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 3.822695 | \n",
+ " 98.974257 | \n",
+ " 112922.024215 | \n",
+ " 3.845519 | \n",
+ " 141310.766711 | \n",
+ " 0.000000 | \n",
+ " 3.859350 | \n",
+ " 180846.830330 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2525953 | \n",
+ " 7441 | \n",
+ " A040165002 | \n",
+ " A040000165 | \n",
+ " 1 | \n",
+ " 2020-08-21 01:06:50 | \n",
+ " 8836 | \n",
+ " 7441.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 117.348169 | \n",
+ " 219522.603095 | \n",
+ " 0.000000 | \n",
+ " 309990.171028 | \n",
+ " 0.810791 | \n",
+ " 4.681933 | \n",
+ " 379630.758839 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2525954 | \n",
+ " 7441 | \n",
+ " A040165003 | \n",
+ " A040000165 | \n",
+ " 1 | \n",
+ " 2020-08-21 01:07:36 | \n",
+ " 8836 | \n",
+ " 7441.0 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 1.471256 | \n",
+ " 116.233111 | \n",
+ " 219523.991004 | \n",
+ " 1.395334 | \n",
+ " 305243.442158 | \n",
+ " 0.810791 | \n",
+ " 6.073270 | \n",
+ " 375852.292570 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2525955 | \n",
+ " 7441 | \n",
+ " A040165004 | \n",
+ " A040000165 | \n",
+ " 1 | \n",
+ " 2020-08-21 01:08:49 | \n",
+ " 8836 | \n",
+ " NaN | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2020 | \n",
+ " ... | \n",
+ " 2.865134 | \n",
+ " 120.693342 | \n",
+ " 219526.766822 | \n",
+ " 2.788897 | \n",
+ " 301028.514945 | \n",
+ " 1.402592 | \n",
+ " 7.464570 | \n",
+ " 372498.086417 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2525956 rows × 136 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userID assessmentItemID testId answerCode Timestamp \\\n",
+ "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
+ "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
+ "2 0 A060001003 A060000001 1 2020-03-24 00:17:22 \n",
+ "3 0 A060001004 A060000001 1 2020-03-24 00:17:29 \n",
+ "4 0 A060001005 A060000001 1 2020-03-24 00:17:36 \n",
+ "... ... ... ... ... ... \n",
+ "2525951 7441 A030071005 A030000071 0 2020-06-05 06:50:21 \n",
+ "2525952 7441 A040165001 A040000165 1 2020-08-21 01:06:39 \n",
+ "2525953 7441 A040165002 A040000165 1 2020-08-21 01:06:50 \n",
+ "2525954 7441 A040165003 A040000165 1 2020-08-21 01:07:36 \n",
+ "2525955 7441 A040165004 A040000165 1 2020-08-21 01:08:49 \n",
+ "\n",
+ " KnowledgeTag next_userID is_test_data answercode year ... \\\n",
+ "0 7224 0.0 False 1 2020 ... \n",
+ "1 7225 0.0 False 1 2020 ... \n",
+ "2 7225 0.0 False 1 2020 ... \n",
+ "3 7225 0.0 False 1 2020 ... \n",
+ "4 7225 0.0 False 1 2020 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "2525951 438 7441.0 False 0 2020 ... \n",
+ "2525952 8836 7441.0 False 1 2020 ... \n",
+ "2525953 8836 7441.0 False 1 2020 ... \n",
+ "2525954 8836 7441.0 False 1 2020 ... \n",
+ "2525955 8836 NaN False 1 2020 ... \n",
+ "\n",
+ " pred_userID_KnowledgeTag_svd pred_assessmentItemID_svd \\\n",
+ "0 NaN NaN \n",
+ "1 0.000000 225.635792 \n",
+ "2 0.000000 223.963244 \n",
+ "3 1.471256 220.618148 \n",
+ "4 2.865134 225.635792 \n",
+ "... ... ... \n",
+ "2525951 3.025986 106.779681 \n",
+ "2525952 3.822695 98.974257 \n",
+ "2525953 0.000000 117.348169 \n",
+ "2525954 1.471256 116.233111 \n",
+ "2525955 2.865134 120.693342 \n",
+ "\n",
+ " pred_question_class_svd pred_userID_question_class_svd \\\n",
+ "0 NaN NaN \n",
+ "1 39868.971351 0.000000 \n",
+ "2 39870.359260 1.395334 \n",
+ "3 39871.747169 2.788897 \n",
+ "4 39873.135078 4.182461 \n",
+ "... ... ... \n",
+ "2525951 112921.194514 3.028496 \n",
+ "2525952 112922.024215 3.845519 \n",
+ "2525953 219522.603095 0.000000 \n",
+ "2525954 219523.991004 1.395334 \n",
+ "2525955 219526.766822 2.788897 \n",
+ "\n",
+ " pred_question_num_svd pred_userID_question_num_svd pred_userID_svd \\\n",
+ "0 NaN NaN NaN \n",
+ "1 55488.827014 0.000000 0.000000 \n",
+ "2 54423.735554 0.000000 1.392196 \n",
+ "3 53791.418093 0.000000 2.783401 \n",
+ "4 52641.660567 0.000000 4.174606 \n",
+ "... ... ... ... \n",
+ "2525951 148815.437899 0.000000 3.036800 \n",
+ "2525952 141310.766711 0.000000 3.859350 \n",
+ "2525953 309990.171028 0.810791 4.681933 \n",
+ "2525954 305243.442158 0.810791 6.073270 \n",
+ "2525955 301028.514945 1.402592 7.464570 \n",
+ "\n",
+ " pred_all_data_svd pred_userID_assessmentItemID_experience \\\n",
+ "0 NaN NaN \n",
+ "1 68304.216652 0.0 \n",
+ "2 67470.716893 0.0 \n",
+ "3 66968.120221 0.0 \n",
+ "4 66053.545087 0.0 \n",
+ "... ... ... \n",
+ "2525951 186821.500958 0.0 \n",
+ "2525952 180846.830330 0.0 \n",
+ "2525953 379630.758839 0.0 \n",
+ "2525954 375852.292570 0.0 \n",
+ "2525955 372498.086417 0.0 \n",
+ "\n",
+ " pred_userID_testid_experience \n",
+ "0 NaN \n",
+ "1 0.0 \n",
+ "2 1.0 \n",
+ "3 1.0 \n",
+ "4 1.0 \n",
+ "... ... \n",
+ "2525951 1.0 \n",
+ "2525952 1.0 \n",
+ "2525953 0.0 \n",
+ "2525954 1.0 \n",
+ "2525955 1.0 \n",
+ "\n",
+ "[2525956 rows x 136 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Lag every column of df by one row; the lagged copies are prefixed with \"pred_\".\n",
+ "# NOTE(review): shift(1) runs over the whole frame (no groupby), so it presumably\n",
+ "# carries the last row of one user into the first row of the next - confirm intended.\n",
+ "new_df = df.shift(1)\n",
+ "new_df.columns = [\"pred_\" + col for col in new_df.columns]\n",
+ "final_df = pd.concat([df, new_df], axis=1)\n",
+ "\n",
+ "# Drop the intermediate frames from the notebook namespace.\n",
+ "del new_df, df\n",
+ "\n",
+ "final_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature columns passed to LightGBM (order is significant: it is the column order of the training matrix).\n",
+ "FEATS = [\n",
+ "    # assessmentItemID_* acc / correct_answer / total_answer\n",
+ "    'assessmentItemID_acc',\n",
+ "    'assessmentItemID_correct_answer',\n",
+ "    'assessmentItemID_total_answer',\n",
+ "    # userID_question_class_* acc / correct_answer / total_answer\n",
+ "    'userID_question_class_acc',\n",
+ "    'userID_question_class_correct_answer',\n",
+ "    'userID_question_class_total_answer',\n",
+ "    # question_class_* acc / correct_answer / total_answer\n",
+ "    'question_class_acc',\n",
+ "    'question_class_correct_answer',\n",
+ "    'question_class_total_answer',\n",
+ "    # *_experience columns\n",
+ "    'userID_testid_experience',\n",
+ "    'userID_assessmentItemID_experience',\n",
+ "    # *_lda columns\n",
+ "    'assessmentItemID_lda',\n",
+ "    'userID_question_class_lda',\n",
+ "    'question_class_lda',\n",
+ "    'question_num_lda',\n",
+ "    'userID_lda',\n",
+ "    'KnowledgeTag_lda',\n",
+ "    'userID_KnowledgeTag_lda',\n",
+ "    'all_data_lda',\n",
+ "    # *_svd columns\n",
+ "    'assessmentItemID_svd',\n",
+ "    'userID_question_class_svd',\n",
+ "    'question_class_svd',\n",
+ "    'question_num_svd',\n",
+ "    'userID_svd',\n",
+ "    'KnowledgeTag_svd',\n",
+ "    'userID_KnowledgeTag_svd',\n",
+ "    'all_data_svd',\n",
+ "    # userID_elapsed_median_rolling_* columns\n",
+ "    'userID_elapsed_median_rolling_5',\n",
+ "    'userID_elapsed_median_rolling_10',\n",
+ "    'userID_elapsed_median_rolling_15',\n",
+ "    'userID_elapsed_median_rolling_30',\n",
+ "    # raw identifier columns\n",
+ "    'assessmentItemID',\n",
+ "    'testId',\n",
+ "    'question_class',\n",
+ "    'question_num',\n",
+ "    'userID',\n",
+ "    'KnowledgeTag',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 15.5 s, sys: 18.8 s, total: 34.3 s\n",
+ "Wall time: 34.3 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Split the data per user\n",
+ "train_lst, test_lst = custom_train_test_split(final_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "==============================\n",
+ "{'boosting_type': 'dart', 'learning_rate': 0.05, 'objective': 'binary', 'metric': 'auc', 'num_iterations': 100, 'max_depth': -1, 'num_leaves': 127, 'min_data_in_leaf': 100, 'max_bin': 256, 'bagging_fraction': 0.7, 'feature_fraction': 0.7, 'lambda_l1': 0.1, 'lambda_l2': 0.1}\n",
+ "==============================\n",
+ "\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "0 번째 fold\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "==============================\n",
+ "train, test shape\n",
+ "(2524526, 135) (1430, 135)\n",
+ "==============================\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33msunnight9507\u001b[0m (use `wandb login --relogin` to force relogin)\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.10.32 is available! To upgrade, please run:\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " Tracking run with wandb version 0.10.30
\n",
+ " Syncing run logical-plasma-2576 to Weights & Biases (Documentation).
\n",
+ " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
+ " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/y4gklll1
\n",
+ " Run data is saved locally in /opt/ml/code/wandb/run-20210619_172258-y4gklll1
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LightGBM] [Info] Number of positive: 1652912, number of negative: 871614\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.588170 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 25009\n",
+ "[LightGBM] [Info] Number of data points in the train set: 2524526, number of used features: 37\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654742 -> initscore=0.639947\n",
+ "[LightGBM] [Info] Start training from score 0.639947\n",
+ "[100]\ttraining's auc: 0.848456\tvalid_1's auc: 0.800057\n",
+ "VALID AUC : 0.8000565033823552 ACC : 0.727972027972028\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "0 번째 fold\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "==============================\n",
+ "train, test shape\n",
+ "(2524412, 135) (1544, 135)\n",
+ "==============================\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Finishing last run (ID:y4gklll1) before initializing another..."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
Waiting for W&B process to finish, PID 18230
Program ended successfully."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find user logs for this run at: /opt/ml/code/wandb/run-20210619_172258-y4gklll1/logs/debug.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find internal logs for this run at: /opt/ml/code/wandb/run-20210619_172258-y4gklll1/logs/debug-internal.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run summary:
\n",
+ "training_auc | 0.84846 |
valid_1_auc | 0.80006 |
_runtime | 101 |
_timestamp | 1624123479 |
_step | 99 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run history:
\n",
+ "training_auc | ▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████ |
valid_1_auc | ▁▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████ |
_runtime | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_timestamp | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███ |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Synced logical-plasma-2576: https://wandb.ai/team-ikyo/P4-DKT/runs/y4gklll1
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "...Successfully finished last run (ID:y4gklll1). Initializing new run:
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.10.32 is available! To upgrade, please run:\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " Tracking run with wandb version 0.10.30
\n",
+ " Syncing run solar-meadow-2577 to Weights & Biases (Documentation).
\n",
+ " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
+ " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/sfd7mank
\n",
+ " Run data is saved locally in /opt/ml/code/wandb/run-20210619_172446-sfd7mank
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LightGBM] [Info] Number of positive: 1652830, number of negative: 871582\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.483986 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 24994\n",
+ "[LightGBM] [Info] Number of data points in the train set: 2524412, number of used features: 37\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654739 -> initscore=0.639934\n",
+ "[LightGBM] [Info] Start training from score 0.639934\n",
+ "[100]\ttraining's auc: 0.848449\tvalid_1's auc: 0.804981\n",
+ "VALID AUC : 0.8049809663840158 ACC : 0.7299222797927462\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "0 번째 fold\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "==============================\n",
+ "train, test shape\n",
+ "(2524490, 135) (1466, 135)\n",
+ "==============================\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Finishing last run (ID:sfd7mank) before initializing another..."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
Waiting for W&B process to finish, PID 18269
Program ended successfully."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find user logs for this run at: /opt/ml/code/wandb/run-20210619_172446-sfd7mank/logs/debug.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find internal logs for this run at: /opt/ml/code/wandb/run-20210619_172446-sfd7mank/logs/debug-internal.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run summary:
\n",
+ "training_auc | 0.84845 |
valid_1_auc | 0.80498 |
_runtime | 100 |
_timestamp | 1624123590 |
_step | 99 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run history:
\n",
+ "training_auc | ▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████ |
valid_1_auc | ▁▂▃▃▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████ |
_runtime | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_timestamp | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███ |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Synced solar-meadow-2577: https://wandb.ai/team-ikyo/P4-DKT/runs/sfd7mank
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "...Successfully finished last run (ID:sfd7mank). Initializing new run:
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.10.32 is available! To upgrade, please run:\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " Tracking run with wandb version 0.10.30
\n",
+ " Syncing run olive-bush-2578 to Weights & Biases (Documentation).
\n",
+ " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
+ " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/txgdytuh
\n",
+ " Run data is saved locally in /opt/ml/code/wandb/run-20210619_172637-txgdytuh
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LightGBM] [Info] Number of positive: 1652874, number of negative: 871616\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.543932 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 25000\n",
+ "[LightGBM] [Info] Number of data points in the train set: 2524490, number of used features: 37\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654736 -> initscore=0.639922\n",
+ "[LightGBM] [Info] Start training from score 0.639922\n",
+ "[100]\ttraining's auc: 0.848478\tvalid_1's auc: 0.813116\n",
+ "VALID AUC : 0.8131155387686989 ACC : 0.732605729877217\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "0 번째 fold\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "==============================\n",
+ "train, test shape\n",
+ "(2524448, 135) (1508, 135)\n",
+ "==============================\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Finishing last run (ID:txgdytuh) before initializing another..."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
Waiting for W&B process to finish, PID 18302
Program ended successfully."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find user logs for this run at: /opt/ml/code/wandb/run-20210619_172637-txgdytuh/logs/debug.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find internal logs for this run at: /opt/ml/code/wandb/run-20210619_172637-txgdytuh/logs/debug-internal.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run summary:
\n",
+ "training_auc | 0.84848 |
valid_1_auc | 0.81312 |
_runtime | 104 |
_timestamp | 1624123704 |
_step | 99 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run history:
\n",
+ "training_auc | ▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████ |
valid_1_auc | ▁▂▃▄▄▄▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇▇██████████████████ |
_runtime | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_timestamp | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███ |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Synced olive-bush-2578: https://wandb.ai/team-ikyo/P4-DKT/runs/txgdytuh
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "...Successfully finished last run (ID:txgdytuh). Initializing new run:
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.10.32 is available! To upgrade, please run:\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " Tracking run with wandb version 0.10.30
\n",
+ " Syncing run leafy-disco-2579 to Weights & Biases (Documentation).
\n",
+ " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
+ " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/ow74wc8y
\n",
+ " Run data is saved locally in /opt/ml/code/wandb/run-20210619_172831-ow74wc8y
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LightGBM] [Info] Number of positive: 1652870, number of negative: 871578\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.425213 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 25008\n",
+ "[LightGBM] [Info] Number of data points in the train set: 2524448, number of used features: 37\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654745 -> initscore=0.639963\n",
+ "[LightGBM] [Info] Start training from score 0.639963\n",
+ "[100]\ttraining's auc: 0.848424\tvalid_1's auc: 0.837722\n",
+ "VALID AUC : 0.8377216952857799 ACC : 0.7672413793103449\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "0 번째 fold\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "==============================\n",
+ "train, test shape\n",
+ "(2524466, 135) (1490, 135)\n",
+ "==============================\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Finishing last run (ID:ow74wc8y) before initializing another..."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
Waiting for W&B process to finish, PID 18335
Program ended successfully."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find user logs for this run at: /opt/ml/code/wandb/run-20210619_172831-ow74wc8y/logs/debug.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find internal logs for this run at: /opt/ml/code/wandb/run-20210619_172831-ow74wc8y/logs/debug-internal.log
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run summary:
\n",
+ "training_auc | 0.84842 |
valid_1_auc | 0.83772 |
_runtime | 101 |
_timestamp | 1624123815 |
_step | 99 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run history:
\n",
+ "training_auc | ▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████ |
valid_1_auc | ▁▄▄▅▄▄▆▅▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████ |
_runtime | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_timestamp | ▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██ |
_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███ |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Synced leafy-disco-2579: https://wandb.ai/team-ikyo/P4-DKT/runs/ow74wc8y
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "...Successfully finished last run (ID:ow74wc8y). Initializing new run:
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.10.32 is available! To upgrade, please run:\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " Tracking run with wandb version 0.10.30
\n",
+ " Syncing run serene-sun-2580 to Weights & Biases (Documentation).
\n",
+ " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
+ " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/48bggk8q
\n",
+ " Run data is saved locally in /opt/ml/code/wandb/run-20210619_173022-48bggk8q
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LightGBM] [Info] Number of positive: 1652841, number of negative: 871625\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.599690 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 24994\n",
+ "[LightGBM] [Info] Number of data points in the train set: 2524466, number of used features: 37\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[100]\ttraining's auc: 0.848487\tvalid_1's auc: 0.809136\n",
+ "VALID AUC : 0.8091360507079913 ACC : 0.723489932885906\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# # Best features\n",
+ "# FEATS = ['assessmentItemID_acc', 'assessmentItemID_correct_answer', 'assessmentItemID_total_answer',\n",
+ "#          'userID_question_class_acc', 'userID_question_class_correct_answer', 'userID_question_class_total_answer',\n",
+ "#          'question_class', 'userID', 'assessmentItemID']\n",
+ "\n",
+ "# def learning_rate_decay(current_iter):\n",
+ "#     lr = 1e-1 * (.999 ** (current_iter % 50))\n",
+ "#     return lr\n",
+ "\n",
+ "# set parameters\n",
+ "params = set_params()\n",
+ "\n",
+ "# Train one LightGBM model per (train, test) pair and log each fold as its own W&B run.\n",
+ "# After the loop, `model`, `test` and `y_test` refer to the last fold.\n",
+ "for fold_num, (train_df, test_df) in enumerate(zip(train_lst, test_lst)):\n",
+ "    # fold_num comes from enumerate and must not be reset here; otherwise every\n",
+ "    # fold is printed and named in W&B as fold 0.\n",
+ "    print(\"@\"*50)\n",
+ "    print(fold_num, \"번째 fold\")\n",
+ "    print(\"@\"*50)\n",
+ "\n",
+ "    # split features (X) and target (y)\n",
+ "    y_train = train_df[\"answerCode\"]\n",
+ "    train = train_df.drop([\"answerCode\"], axis=1)\n",
+ "\n",
+ "    y_test = test_df[\"answerCode\"]\n",
+ "    test = test_df.drop([\"answerCode\"], axis=1)\n",
+ "\n",
+ "    print(\"=\"*30)\n",
+ "    print(\"train, test shape\")\n",
+ "    print(train.shape, test.shape)\n",
+ "    print(\"=\"*30)\n",
+ "    print()\n",
+ "\n",
+ "    lgb_train = lgb.Dataset(train[FEATS], y_train)\n",
+ "    lgb_test = lgb.Dataset(test[FEATS], y_test)\n",
+ "\n",
+ "    # one W&B run per fold; the run name carries the fold index and a month/day/hour/minute stamp\n",
+ "    now = datetime.now()\n",
+ "    wandb.init(project='P4-DKT', config=params, entity=\"team-ikyo\")\n",
+ "    wandb.run.name = f\"sun-lgbm-fold{fold_num} time: {now.month} {now.day} {now.hour} {now.minute}\"\n",
+ "\n",
+ "    # train\n",
+ "    model = lgb.train(params,\n",
+ "                      lgb_train,\n",
+ "                      valid_sets = [lgb_train, lgb_test],\n",
+ "                      verbose_eval = 100,\n",
+ "                      callbacks=[wandb_callback()])\n",
+ "                      # lgb.reset_parameter(learning_rate = learning_rate_decay)])\n",
+ "\n",
+ "    # evaluate on the held-out rows of this fold\n",
+ "    preds = model.predict(test[FEATS])\n",
+ "    acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))\n",
+ "    auc = roc_auc_score(y_test, preds)\n",
+ "\n",
+ "    print(f'VALID AUC : {auc} ACC : {acc}\\n')\n",
+ "\n",
+ "    # show feature importance\n",
+ "    fig, ax = plt.subplots(figsize=(6,12))\n",
+ "    lgb.plot_importance(model, max_num_features=100, height=0.8, ax=ax)\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Permutation importance"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def permutation_importance(model, X_val, y_val, metric, threshold=0.001, minimize=True, verbose=True,\n",
+ "                           skip_cols=('assessmentItemID', 'testId', 'question_class', 'question_num',\n",
+ "                                      'userID', 'KnowledgeTag'),\n",
+ "                           random_state=None):\n",
+ "    \"\"\"Score each feature by re-evaluating `metric` after shuffling that feature's column.\n",
+ "\n",
+ "    Args:\n",
+ "        model: fitted model exposing ``predict(X)``.\n",
+ "        X_val: validation features (DataFrame). It is copied, never modified.\n",
+ "        y_val: validation targets, passed as the first argument to ``metric``.\n",
+ "        metric: callable ``metric(y_true, y_pred)`` returning a number.\n",
+ "        threshold: a feature is reported as bad when shuffling it does not worsen\n",
+ "            the base score by at least this amount.\n",
+ "        minimize: True when a lower metric is better, False when higher is better.\n",
+ "        verbose: print the base score and the score obtained for each column.\n",
+ "        skip_cols: column names that are never shuffled or scored.\n",
+ "        random_state: None uses numpy's global RNG; an int seed makes the\n",
+ "            shuffles reproducible.\n",
+ "\n",
+ "    Returns:\n",
+ "        (results, bad_features): ``results`` maps 'base_score' and every scored\n",
+ "        column to its metric value; ``bad_features`` lists the columns judged bad.\n",
+ "    \"\"\"\n",
+ "    rng = np.random if random_state is None else np.random.RandomState(random_state)\n",
+ "    skipped = set(skip_cols)\n",
+ "\n",
+ "    # Work on a private copy so the caller's frame (often a slice) is never touched,\n",
+ "    # even if predict() or metric() raises half-way through.\n",
+ "    X_work = X_val.copy()\n",
+ "\n",
+ "    results = {'base_score': metric(y_val, model.predict(X_work))}\n",
+ "    if verbose:\n",
+ "        print(f'Base score {results[\"base_score\"]:.5}')\n",
+ "\n",
+ "    for col in X_work.columns:\n",
+ "        if col in skipped:\n",
+ "            continue\n",
+ "\n",
+ "        saved_col = X_work[col].copy()\n",
+ "        X_work[col] = rng.permutation(X_work[col])\n",
+ "        results[col] = metric(y_val, model.predict(X_work))\n",
+ "        # restore the column before shuffling the next one\n",
+ "        X_work[col] = saved_col\n",
+ "\n",
+ "        if verbose:\n",
+ "            print(f'column: {col} - {results[col]:.5}')\n",
+ "\n",
+ "    base_score = results['base_score']\n",
+ "    # 'base_score' is excluded up front instead of being removed afterwards, which\n",
+ "    # raised ValueError whenever threshold <= 0 kept it out of the list.\n",
+ "    feature_scores = {name: score for name, score in results.items() if name != 'base_score'}\n",
+ "    if minimize:\n",
+ "        bad_features = [name for name, score in feature_scores.items() if score < base_score + threshold]\n",
+ "    else:\n",
+ "        bad_features = [name for name, score in feature_scores.items() if score > base_score - threshold]\n",
+ "\n",
+ "    return results, bad_features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Base score 0.80914\n",
+ "column: assessmentItemID_acc - 0.80782\n",
+ "column: assessmentItemID_correct_answer - 0.80947\n",
+ "column: assessmentItemID_total_answer - 0.80899\n",
+ "column: userID_question_class_acc - 0.80577\n",
+ "column: userID_question_class_correct_answer - 0.8078\n",
+ "column: userID_question_class_total_answer - 0.80901\n",
+ "column: question_class_acc - 0.80777\n",
+ "column: question_class_correct_answer - 0.80914\n",
+ "column: question_class_total_answer - 0.80914\n",
+ "column: userID_testid_experience - 0.80914\n",
+ "column: userID_assessmentItemID_experience - 0.8089\n",
+ "column: assessmentItemID_lda - 0.80734\n",
+ "column: userID_question_class_lda - 0.80877\n",
+ "column: question_class_lda - 0.80767\n",
+ "column: question_num_lda - 0.80914\n",
+ "column: userID_lda - 0.80905\n",
+ "column: KnowledgeTag_lda - 0.80907\n",
+ "column: userID_KnowledgeTag_lda - 0.80394\n",
+ "column: all_data_lda - 0.62124\n",
+ "column: assessmentItemID_svd - 0.80932\n",
+ "column: userID_question_class_svd - 0.8087\n",
+ "column: question_class_svd - 0.80914\n",
+ "column: question_num_svd - 0.80914\n",
+ "column: userID_svd - 0.80914\n",
+ "column: KnowledgeTag_svd - 0.80913\n",
+ "column: userID_KnowledgeTag_svd - 0.8079\n",
+ "column: all_data_svd - 0.80914\n",
+ "column: userID_elapsed_median_rolling_5 - 0.78766\n",
+ "column: userID_elapsed_median_rolling_10 - 0.80861\n",
+ "column: userID_elapsed_median_rolling_15 - 0.80908\n",
+ "column: userID_elapsed_median_rolling_30 - 0.80914\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC is a higher-is-better metric, hence minimize=False.\n",
+ "results, bad_features = permutation_importance(\n",
+ "    model,\n",
+ "    test[FEATS],\n",
+ "    y_test,\n",
+ "    metric=roc_auc_score,\n",
+ "    minimize=False,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['assessmentItemID_correct_answer',\n",
+ " 'assessmentItemID_total_answer',\n",
+ " 'userID_question_class_total_answer',\n",
+ " 'question_class_correct_answer',\n",
+ " 'question_class_total_answer',\n",
+ " 'userID_testid_experience',\n",
+ " 'userID_assessmentItemID_experience',\n",
+ " 'userID_question_class_lda',\n",
+ " 'question_num_lda',\n",
+ " 'userID_lda',\n",
+ " 'KnowledgeTag_lda',\n",
+ " 'assessmentItemID_svd',\n",
+ " 'userID_question_class_svd',\n",
+ " 'question_class_svd',\n",
+ " 'question_num_svd',\n",
+ " 'userID_svd',\n",
+ " 'KnowledgeTag_svd',\n",
+ " 'all_data_svd',\n",
+ " 'userID_elapsed_median_rolling_10',\n",
+ " 'userID_elapsed_median_rolling_15',\n",
+ " 'userID_elapsed_median_rolling_30']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['userID_question_class_correct_answer',\n",
+ " 'userID_elapsed_median_rolling_5',\n",
+ " 'question_class',\n",
+ " 'assessmentItemID_lda',\n",
+ " 'userID_question_class_acc',\n",
+ " 'question_class_acc',\n",
+ " 'question_class_lda',\n",
+ " 'testId',\n",
+ " 'all_data_lda',\n",
+ " 'assessmentItemID_acc',\n",
+ " 'assessmentItemID',\n",
+ " 'userID_KnowledgeTag_lda',\n",
+ " 'KnowledgeTag',\n",
+ " 'userID',\n",
+ " 'question_num',\n",
+ " 'userID_KnowledgeTag_svd']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep the features that were not flagged, in their original FEATS order.\n",
+ "# A set difference would return them in an order that changes between kernel\n",
+ "# sessions (string hashing is randomised), making the column order fed to\n",
+ "# LightGBM non-reproducible.\n",
+ "new_FEATS = [feat for feat in FEATS if feat not in bad_features]\n",
+ "\n",
+ "new_FEATS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### AutoML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_optuna(num_leaves, min_data_in_leaf, max_bin, bagging_fraction, feature_fraction, lambda_l1, lambda_l2,\n",
+ "                 train_df=train_df):\n",
+ "    \"\"\"Objective for the hyper-parameter search: mean validation AUC over stratified folds.\n",
+ "\n",
+ "    Args:\n",
+ "        num_leaves, min_data_in_leaf, max_bin: LightGBM parameters; truncated to int\n",
+ "            because the optimizer proposes floats.\n",
+ "        bagging_fraction, feature_fraction, lambda_l1, lambda_l2: LightGBM parameters\n",
+ "            used as given.\n",
+ "        train_df: frame holding the `new_FEATS` columns and the \"answerCode\" target.\n",
+ "            The default is bound when this cell runs, i.e. to whatever the global\n",
+ "            `train_df` is at that moment.\n",
+ "\n",
+ "    Returns:\n",
+ "        Mean AUC over the validation folds.\n",
+ "    \"\"\"\n",
+ "    n_splits = 3\n",
+ "    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)\n",
+ "\n",
+ "    params = {\n",
+ "        \"boosting_type\": \"dart\",  # gbdt, dart, goss\n",
+ "        \"learning_rate\": 5e-2,  # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3\n",
+ "        \"objective\": \"binary\",\n",
+ "        \"metric\": \"auc\",  # binary_logloss, rmse, huber, auc\n",
+ "        \"num_iterations\": 10,  # 100\n",
+ "        \"max_depth\": -1,  # -1\n",
+ "        \"num_leaves\": int(num_leaves),  # 31; ideally num_leaves should be <= 2 ^ (max_depth)\n",
+ "        \"min_data_in_leaf\": int(min_data_in_leaf),  # 20; 100 ~ 1000, usually set to hundreds or thousands\n",
+ "        \"max_bin\": int(max_bin),  # 256\n",
+ "        \"bagging_fraction\": bagging_fraction,  # 1.0\n",
+ "        \"feature_fraction\": feature_fraction,  # 1.0\n",
+ "        \"lambda_l1\": lambda_l1,  # 0.0\n",
+ "        \"lambda_l2\": lambda_l2,  # 0.0\n",
+ "        \"random_state\": 2021,\n",
+ "    }\n",
+ "\n",
+ "    print(\"@\"*50)\n",
+ "    print(\"start\")\n",
+ "    print(\"@\"*50)\n",
+ "\n",
+ "    auc_score = 0\n",
+ "    target = train_df[\"answerCode\"]\n",
+ "\n",
+ "    for train_index, test_index in skf.split(train_df, target):\n",
+ "        temp_train = train_df.iloc[train_index, :]\n",
+ "        temp_valid = train_df.iloc[test_index, :]\n",
+ "\n",
+ "        y_train = target.iloc[train_index]\n",
+ "        y_test = target.iloc[test_index]\n",
+ "\n",
+ "        lgb_train = lgb.Dataset(temp_train[new_FEATS], y_train)\n",
+ "        lgb_test = lgb.Dataset(temp_valid[new_FEATS], y_test)\n",
+ "\n",
+ "        # train\n",
+ "        model = lgb.train(params,\n",
+ "                          lgb_train,\n",
+ "                          valid_sets = [lgb_train, lgb_test],\n",
+ "                          verbose_eval = 100,\n",
+ "                          callbacks=[wandb_callback()])\n",
+ "\n",
+ "        preds = model.predict(temp_valid[new_FEATS])\n",
+ "        auc_score += roc_auc_score(y_test, preds)\n",
+ "\n",
+ "    # Average over the folds actually run (the sum used to be divided by 5\n",
+ "    # although only 3 folds are evaluated).\n",
+ "    mean_auc = auc_score / n_splits\n",
+ "    print(mean_auc)\n",
+ "    return mean_auc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Search ranges as (lower, upper) per hyperparameter name accepted by train_optuna.\n",
+ "lgbm_pbounds = {'num_leaves': (16, 512),\n",
+ "                'min_data_in_leaf': (20, 1000), 'max_bin': (10, 256),\n",
+ "                'bagging_fraction': (0.5, 1), 'feature_fraction': (0.5, 1),\n",
+ "                'lambda_l1': (0, 10), 'lambda_l2': (0, 10)}\n",
+ "\n",
+ "lgbm_bo = BayesianOptimization(train_optuna, lgbm_pbounds)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "| iter | target | baggin... | featur... | lambda_l1 | lambda_l2 | max_bin | min_da... | num_le... |\n",
+ "-------------------------------------------------------------------------------------------------------------\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032849 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19315\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.262100 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19297\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035968 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19309\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.4991880736397416\n",
+ "| \u001b[0m 1 \u001b[0m | \u001b[0m 0.4992 \u001b[0m | \u001b[0m 0.6942 \u001b[0m | \u001b[0m 0.8675 \u001b[0m | \u001b[0m 7.646 \u001b[0m | \u001b[0m 6.016 \u001b[0m | \u001b[0m 176.7 \u001b[0m | \u001b[0m 620.6 \u001b[0m | \u001b[0m 256.4 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.150602 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19485\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226611 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19467\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107780 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19479\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.4990461546350316\n",
+ "| \u001b[0m 2 \u001b[0m | \u001b[0m 0.499 \u001b[0m | \u001b[0m 0.5351 \u001b[0m | \u001b[0m 0.6165 \u001b[0m | \u001b[0m 4.278 \u001b[0m | \u001b[0m 7.662 \u001b[0m | \u001b[0m 193.4 \u001b[0m | \u001b[0m 945.2 \u001b[0m | \u001b[0m 183.7 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034947 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19255\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.271151 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19237\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032964 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19249\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49917224286305384\n",
+ "| \u001b[0m 3 \u001b[0m | \u001b[0m 0.4992 \u001b[0m | \u001b[0m 0.9048 \u001b[0m | \u001b[0m 0.9592 \u001b[0m | \u001b[0m 5.62 \u001b[0m | \u001b[0m 7.988 \u001b[0m | \u001b[0m 170.4 \u001b[0m | \u001b[0m 618.5 \u001b[0m | \u001b[0m 262.8 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023693 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19305\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021775 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19287\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023475 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19299\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49937466830404\n",
+ "| \u001b[95m 4 \u001b[0m | \u001b[95m 0.4994 \u001b[0m | \u001b[95m 0.9172 \u001b[0m | \u001b[95m 0.5705 \u001b[0m | \u001b[95m 7.658 \u001b[0m | \u001b[95m 3.864 \u001b[0m | \u001b[95m 175.3 \u001b[0m | \u001b[95m 627.9 \u001b[0m | \u001b[95m 259.7 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035023 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19015\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033781 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 18997\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031771 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19009\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49824766561616196\n",
+ "| \u001b[0m 5 \u001b[0m | \u001b[0m 0.4982 \u001b[0m | \u001b[0m 0.6968 \u001b[0m | \u001b[0m 0.9719 \u001b[0m | \u001b[0m 5.153 \u001b[0m | \u001b[0m 1.699 \u001b[0m | \u001b[0m 146.9 \u001b[0m | \u001b[0m 46.22 \u001b[0m | \u001b[0m 243.6 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022291 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19315\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021259 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19297\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023667 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19309\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.4993547741350028\n",
+ "| \u001b[0m 6 \u001b[0m | \u001b[0m 0.4994 \u001b[0m | \u001b[0m 0.7699 \u001b[0m | \u001b[0m 0.5432 \u001b[0m | \u001b[0m 5.242 \u001b[0m | \u001b[0m 8.487 \u001b[0m | \u001b[0m 176.8 \u001b[0m | \u001b[0m 650.2 \u001b[0m | \u001b[0m 256.2 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087377 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19385\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090262 seconds.\n",
+ "You can set `force_col_wise=true` to remove the overhead.\n",
+ "[LightGBM] [Info] Total Bins 19367\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029710 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19379\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49966067721580476\n",
+ "| \u001b[95m 7 \u001b[0m | \u001b[95m 0.4997 \u001b[0m | \u001b[95m 0.7796 \u001b[0m | \u001b[95m 0.6905 \u001b[0m | \u001b[95m 5.743 \u001b[0m | \u001b[95m 9.019 \u001b[0m | \u001b[95m 183.7 \u001b[0m | \u001b[95m 646.5 \u001b[0m | \u001b[95m 282.9 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043650 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19665\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032418 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19647\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033105 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19659\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.4986258538546828\n",
+ "| \u001b[0m 8 \u001b[0m | \u001b[0m 0.4986 \u001b[0m | \u001b[0m 0.7325 \u001b[0m | \u001b[0m 0.9752 \u001b[0m | \u001b[0m 5.997 \u001b[0m | \u001b[0m 3.319 \u001b[0m | \u001b[0m 211.3 \u001b[0m | \u001b[0m 648.8 \u001b[0m | \u001b[0m 294.8 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033679 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19335\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033990 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19317\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035500 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19329\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49920670655196353\n",
+ "| \u001b[0m 9 \u001b[0m | \u001b[0m 0.4992 \u001b[0m | \u001b[0m 0.9761 \u001b[0m | \u001b[0m 0.8682 \u001b[0m | \u001b[0m 5.045 \u001b[0m | \u001b[0m 4.903 \u001b[0m | \u001b[0m 178.1 \u001b[0m | \u001b[0m 643.5 \u001b[0m | \u001b[0m 247.0 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022415 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19325\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021907 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19307\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023632 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19319\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.499337872789756\n",
+ "| \u001b[0m 10 \u001b[0m | \u001b[0m 0.4993 \u001b[0m | \u001b[0m 0.9536 \u001b[0m | \u001b[0m 0.5449 \u001b[0m | \u001b[0m 6.762 \u001b[0m | \u001b[0m 1.704 \u001b[0m | \u001b[0m 177.4 \u001b[0m | \u001b[0m 640.2 \u001b[0m | \u001b[0m 246.4 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034827 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19265\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033483 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19247\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032816 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19259\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.49926421899546386\n",
+ "| \u001b[0m 11 \u001b[0m | \u001b[0m 0.4993 \u001b[0m | \u001b[0m 0.6679 \u001b[0m | \u001b[0m 0.8229 \u001b[0m | \u001b[0m 7.179 \u001b[0m | \u001b[0m 9.145 \u001b[0m | \u001b[0m 171.1 \u001b[0m | \u001b[0m 651.4 \u001b[0m | \u001b[0m 271.2 \u001b[0m |\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "start\n",
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031930 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19335\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581083\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033664 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19317\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682977, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639892\n",
+ "[LightGBM] [Info] Start training from score 0.639892\n",
+ "[LightGBM] [Info] Number of positive: 1101894, number of negative: 581084\n",
+ "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035002 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 19329\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1682978, number of used features: 16\n",
+ "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654729 -> initscore=0.639890\n",
+ "[LightGBM] [Info] Start training from score 0.639890\n",
+ "0.499341976800152\n",
+ "| \u001b[0m 12 \u001b[0m | \u001b[0m 0.4993 \u001b[0m | \u001b[0m 0.5206 \u001b[0m | \u001b[0m 0.9012 \u001b[0m | \u001b[0m 0.6585 \u001b[0m | \u001b[0m 7.539 \u001b[0m | \u001b[0m 178.8 \u001b[0m | \u001b[0m 635.5 \u001b[0m | \u001b[0m 294.3 \u001b[0m |\n",
+ "=============================================================================================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "lgbm_bo.maximize(acq='ei', n_iter=10, init_points=2)  # init_points + n_iter = 12 evaluations of train_optuna"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "writing prediction : output/output6_19_17_30.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "total_preds = model.predict(test[FEATS])\n",
+ "\n",
+ "# Save predictions as an id,prediction CSV under output/, named from the month/day/hour/minute of `now`.\n",
+ "output_dir = 'output/'\n",
+ "stamp = \"_\".join(str(part) for part in (now.month, now.day, now.hour, now.minute))\n",
+ "write_path = os.path.join(output_dir, \"output\" + stamp + \".csv\")\n",
+ "if not os.path.exists(output_dir):\n",
+ "    os.makedirs(output_dir)\n",
+ "with open(write_path, 'w', encoding='utf8') as w:\n",
+ "    print(\"writing prediction : {}\".format(write_path))\n",
+ "    w.write(\"id,prediction\\n\")\n",
+ "    w.writelines('{},{}\\n'.format(row_id, p) for row_id, p in enumerate(total_preds))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": false,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/code/baseline/lgbm_function.py b/code/LGBM/lgbm_function.py
similarity index 66%
rename from code/baseline/lgbm_function.py
rename to code/LGBM/lgbm_function.py
index c9647d3..b37936e 100644
--- a/code/baseline/lgbm_function.py
+++ b/code/LGBM/lgbm_function.py
@@ -5,7 +5,6 @@
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
-from feature_engineering import feature_engineering
import numpy as np
import random
from matplotlib import pylab as plt
@@ -18,22 +17,21 @@ def set_params():
# 3순위 min_data_in_leaf
params = {}
params["boosting_type"] = "dart" # gbdt, dart, goss
- params["learning_rate"] = 1e-1 # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3
+ params["learning_rate"] = 5e-2 # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3
params["objective"] = "binary"
params["metric"] = "auc" # binary_logloss, rmse, huber, auc
- params["num_iterations"] = 300 # 100
- params["max_depth"] = 6 # -1
- params["num_leaves"] = 30 # 31 이상적으로 num_leaves값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다.
- params["min_data_in_leaf"] = 5000 # 20 100 ~ 1000 수백 또는 수천 개로 정하는 것
- params["max_bin"] = 32 # 256
- params["scale_pos_weight"] = 1.1 # 1.1~1.5 data 불균형
- params["tree_learner"] = "serial" # serial, feature, data, voting
- params["early_stopping_rounds"] = 100
- params["bagging_fraction"] = 0.8 # 1.0
- params["feature_fraction"] = 0.5 # 1.0
- params["lambda_l1"] = 1e-1 # 0.0
- params["lambda_l2"] = 1e-1 # 0.0
-
+ params["num_iterations"] = 100 # 100
+ params["max_depth"] = -1 # -1
+ params["num_leaves"] = 127 # 31 이상적으로 num_leaves값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다.
+ params["min_data_in_leaf"] = 100 # 20 100 ~ 1000 수백 또는 수천 개로 정하는 것
+ params["max_bin"] = 256 # 256
+# params["scale_pos_weight"] = 0.9 # 1.1~1.5 data 불균형
+# params["tree_learner"] = "serial" # serial, feature, data, voting
+# params["early_stopping_rounds"] = 50
+ params["bagging_fraction"] = 0.7 # 1.0
+ params["feature_fraction"] = 0.7 # 1.0
+ params["lambda_l1"] = 0.1 # 0.0
+ params["lambda_l2"] = 0.1 # 0.0
print("="*30)
print(params)
@@ -52,7 +50,7 @@ def custom_train_test_split(df, ratio=0.2):
train_lst = []
test_lst = []
- max_train_data_len = 0.2*len(df)
+ max_train_data_len = int(ratio*len(df))
sum_of_train_data = 0
user_ids_lst = []
@@ -69,15 +67,16 @@ def custom_train_test_split(df, ratio=0.2):
user_ids_lst.append(user_ids)
for user_ids in user_ids_lst:
- train_lst.append(df[df['userID'].isin(user_ids) == False])
+ train = df[df['userID'].isin(user_ids) == False]
test = df[df['userID'].isin(user_ids)]
- #test데이터셋은 각 유저의 마지막 interaction만 추출
+ train = pd.concat([train, test[test['userID'] == test['userID'].shift(-1)]])
+ train_lst.append(train)
test_lst.append(test[test['userID'] != test['userID'].shift(-1)])
return train_lst, test_lst
-def inference(FEATS, model, auc, acc, time):
+def inference(FEATS, model, auc, acc, time, test=False):
print("="*30)
print("Start inference")
print("="*30)
@@ -100,11 +99,9 @@ def inference(FEATS, model, auc, acc, time):
test_df = pd.concat([df, test_df])
not_test_df = test_df[test_df["answerCode"] != -1]
- not_test_df = feature_engineering(not_test_df)
not_test_df["is_test"] = False
test_df = test_df[test_df["answerCode"] == -1]
- test_df["question_class"] = test_df["assessmentItemID"].apply(lambda x: x[2])
test_df["is_test"] = True
print("="*30)
@@ -113,19 +110,31 @@ def inference(FEATS, model, auc, acc, time):
print("="*30)
print()
- test_df = pd.merge(test_df, not_test_df[["userID", "user_mean"]].drop_duplicates(), on=["userID"], how="inner")
- test_df = pd.merge(test_df, not_test_df[["question_class", "question_class_mean"]].drop_duplicates(), on=["question_class"], how="inner")
+ not_test_df.sort_values(by=["userID", "Timestamp"], inplace=True)
- def random_answering(data):
- return 1 if random.random() < data["user_mean"] * data["question_class_mean"] else 0
-
- test_df["answerCode"] = test_df[["user_mean", "question_class_mean"]].apply(random_answering, axis=1)
- test_df.drop(["question_class", "user_mean", "question_class_mean"], axis=1, inplace=True)
+ user_mean = not_test_df.groupby(["userID"])["answerCode"].agg(["mean"])
+ user_mean.columns = ["user_mean"]
data = pd.concat([not_test_df, test_df], join="inner")
+
+ df = pd.merge(data, user_mean, on=["userID"], how="left")
+
+ df["next_userID"] = df["userID"].shift(-1)
+
+ def random_answering(data):
+ if data["is_test"]:
+ return 1 if random.random() < 0.5 else 0
+ else:
+ return data["answerCode"]
+
+ df["answercode"] = df.apply(random_answering, axis=1)
# FEATURE ENGINEERING
- data = feature_engineering(data)
+ df = feature_engineering(df)
+
+ new_df = df.shift(1)
+ new_df.columns = ["pred_" + i for i in new_df.columns]
+ data = pd.concat([df, new_df], axis=1)
# TEST DATA
test_df = data[data["is_test"]]
diff --git a/code/Machine Learning/none.txt b/code/Machine Learning/none.txt
deleted file mode 100644
index 8b13789..0000000
--- a/code/Machine Learning/none.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/code/baseline/args.py b/code/baseline/args.py
deleted file mode 100644
index ac9404f..0000000
--- a/code/baseline/args.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import os
-import argparse
-
-
-def parse_args(mode='train'):
- parser = argparse.ArgumentParser()
-
-
- parser.add_argument('--seed', default=42, type=int, help='seed')
- parser.add_argument('--run_name', default='teamikyo', type=str, help='wandb run name')
-
- parser.add_argument('--device', default='cpu', type=str, help='cpu or gpu')
-
- parser.add_argument('--data_dir', default='../data/', type=str, help='data directory')
- parser.add_argument('--asset_dir', default='asset/', type=str, help='data directory')
-
- parser.add_argument('--file_name', default='train_data.csv', type=str, help='train file name')
-
- parser.add_argument('--model_dir', default='models/', type=str, help='model directory')
- parser.add_argument('--model_name', default='model.pt', type=str, help='model file name')
-
- parser.add_argument('--output_dir', default='output/', type=str, help='output directory')
- parser.add_argument('--test_file_name', default='test_data.csv', type=str, help='test file name')
-
- parser.add_argument('--max_seq_len', default=20, type=int, help='max sequence length')
- parser.add_argument('--num_workers', default=1, type=int, help='number of workers')
-
- # 모델
- parser.add_argument('--hidden_dim', default=64, type=int, help='hidden dimension size')
- parser.add_argument('--n_layers', default=2, type=int, help='number of layers')
- parser.add_argument('--n_heads', default=2, type=int, help='number of heads')
- parser.add_argument('--drop_out', default=0.2, type=float, help='drop out rate')
-
- # 훈련
- parser.add_argument('--n_epochs', default=20, type=int, help='number of epochs')
- parser.add_argument('--batch_size', default=64, type=int, help='batch size')
- parser.add_argument('--lr', default=0.0001, type=float, help='learning rate')
- parser.add_argument('--clip_grad', default=10, type=int, help='clip grad')
- parser.add_argument('--patience', default=5, type=int, help='for early stopping')
-
-
- parser.add_argument('--log_steps', default=50, type=int, help='print log per n steps')
-
-
- ### 중요 ###
- parser.add_argument('--model', default='lstm', type=str, help='model type')
- parser.add_argument('--optimizer', default='adam', type=str, help='optimizer type')
- parser.add_argument('--scheduler', default='plateau', type=str, help='scheduler type')
-
- args = parser.parse_args()
-
- return args
diff --git a/code/baseline/baseline.ipynb b/code/baseline/baseline.ipynb
deleted file mode 100644
index 55c7789..0000000
--- a/code/baseline/baseline.ipynb
+++ /dev/null
@@ -1,1950 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Nv5EvIVPnz0y"
- },
- "source": [
- "# LSTM 활용한 베이스라인"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: easydict in /opt/conda/lib/python3.7/site-packages (1.9)\n"
- ]
- }
- ],
- "source": [
- "!pip install easydict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "id": "wtJhitPznz06"
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import os\n",
- "import torch\n",
- "import easydict\n",
- "import numpy as np\n",
- "from sklearn.preprocessing import LabelEncoder\n",
- "import time\n",
- "import datetime\n",
- "from datetime import datetime\n",
- "import random\n",
- "import wandb"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6w3E-ACunz07"
- },
- "source": [
- "## 1. 데이터 로드 및 전처리 컴포넌트"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "id": "od9O-ttAnz08"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "from datetime import datetime\n",
- "import time\n",
- "import tqdm\n",
- "import pandas as pd\n",
- "import random\n",
- "from sklearn.preprocessing import LabelEncoder\n",
- "import numpy as np\n",
- "import torch\n",
- "\n",
- "class Preprocess:\n",
- " def __init__(self,args):\n",
- " self.args = args\n",
- " self.train_data = None\n",
- " self.test_data = None\n",
- " \n",
- "\n",
- " def get_train_data(self):\n",
- " return self.train_data\n",
- "\n",
- " def get_test_data(self):\n",
- " return self.test_data\n",
- "\n",
- " def split_data(self, data, ratio=0.9, shuffle=True, seed=0):\n",
- " \"\"\"\n",
- " split data into two parts with a given ratio.\n",
- " \"\"\"\n",
- " if shuffle:\n",
- " random.seed(seed) # fix to default seed 0\n",
- " random.shuffle(data)\n",
- "\n",
- " size = int(len(data) * ratio)\n",
- " data_1 = data[:size]\n",
- " data_2 = data[size:]\n",
- "\n",
- " return data_1, data_2\n",
- "\n",
- " def __save_labels(self, encoder, name):\n",
- " le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')\n",
- " np.save(le_path, encoder.classes_)\n",
- "\n",
- " def __preprocessing(self, df, is_train = True):\n",
- " cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']\n",
- "\n",
- " if not os.path.exists(self.args.asset_dir):\n",
- " os.makedirs(self.args.asset_dir)\n",
- " \n",
- " for col in cate_cols:\n",
- " \n",
- " \n",
- " le = LabelEncoder()\n",
- " if is_train:\n",
- " #For UNKNOWN class\n",
- " a = df[col].unique().tolist() + ['unknown']\n",
- " le.fit(a)\n",
- " self.__save_labels(le, col)\n",
- " else:\n",
- " label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')\n",
- " le.classes_ = np.load(label_path)\n",
- " \n",
- " df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')\n",
- "\n",
- " #모든 컬럼이 범주형이라고 가정\n",
- " df[col]= df[col].astype(str)\n",
- " test = le.transform(df[col])\n",
- " df[col] = test\n",
- " \n",
- "\n",
- " def convert_time(s):\n",
- " timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())\n",
- " return int(timestamp)\n",
- "\n",
- " df['Timestamp'] = df['Timestamp'].apply(convert_time)\n",
- " \n",
- " return df\n",
- "\n",
- " def __feature_engineering(self, df):\n",
- " #TODO\n",
- " return df\n",
- "\n",
- " def load_data_from_file(self, file_name, is_train=True):\n",
- " csv_file_path = os.path.join(self.args.data_dir, file_name)\n",
- " df = pd.read_csv(csv_file_path)#, nrows=100000)\n",
- " df = self.__feature_engineering(df)\n",
- " df = self.__preprocessing(df, is_train)\n",
- "\n",
- " # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용\n",
- "\n",
- " \n",
- " self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir,'assessmentItemID_classes.npy')))\n",
- " self.args.n_test = len(np.load(os.path.join(self.args.asset_dir,'testId_classes.npy')))\n",
- " self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir,'KnowledgeTag_classes.npy')))\n",
- " \n",
- " df = df.sort_values(by=['userID','Timestamp'], axis=0)\n",
- " columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']\n",
- " group = df[columns].groupby('userID').apply(\n",
- " lambda r: (\n",
- " r['testId'].values, \n",
- " r['assessmentItemID'].values,\n",
- " r['KnowledgeTag'].values,\n",
- " r['answerCode'].values\n",
- " )\n",
- " )\n",
- "\n",
- " return group.values\n",
- "\n",
- " def load_train_data(self, file_name):\n",
- " self.train_data = self.load_data_from_file(file_name)\n",
- "\n",
- " def load_test_data(self, file_name):\n",
- " self.test_data = self.load_data_from_file(file_name, is_train= False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "E-MQhPevnz08"
- },
- "source": [
- "## 2. 데이터 셋 / 데이터 로더"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "id": "h29rn8YNnz09"
- },
- "outputs": [],
- "source": [
- "class DKTDataset(torch.utils.data.Dataset):\n",
- " def __init__(self, data, args):\n",
- " self.data = data\n",
- " self.args = args\n",
- "\n",
- " def __getitem__(self, index):\n",
- " row = self.data[index]\n",
- "\n",
- " # 각 data의 sequence length\n",
- " seq_len = len(row[0])\n",
- "\n",
- " test, question, tag, correct = row[0], row[1], row[2], row[3]\n",
- " \n",
- "\n",
- " cate_cols = [test, question, tag, correct]\n",
- "\n",
- " # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다\n",
- " if seq_len > self.args.max_seq_len:\n",
- " for i, col in enumerate(cate_cols):\n",
- " cate_cols[i] = col[-self.args.max_seq_len:]\n",
- " mask = np.ones(self.args.max_seq_len, dtype=np.int16)\n",
- " else:\n",
- " mask = np.zeros(self.args.max_seq_len, dtype=np.int16)\n",
- " mask[-seq_len:] = 1\n",
- "\n",
- " # mask도 columns 목록에 포함시킴\n",
- " cate_cols.append(mask)\n",
- "\n",
- " # np.array -> torch.tensor 형변환\n",
- " for i, col in enumerate(cate_cols):\n",
- " cate_cols[i] = torch.tensor(col)\n",
- "\n",
- " return cate_cols\n",
- "\n",
- " def __len__(self):\n",
- " return len(self.data)\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "def collate(batch):\n",
- " col_n = len(batch[0])\n",
- " col_list = [[] for _ in range(col_n)]\n",
- " max_seq_len = len(batch[0][-1])\n",
- "\n",
- " \n",
- " # batch의 값들을 각 column끼리 그룹화\n",
- " for row in batch:\n",
- " for i, col in enumerate(row):\n",
- " pre_padded = torch.zeros(max_seq_len)\n",
- " pre_padded[-len(col):] = col\n",
- " col_list[i].append(pre_padded)\n",
- "\n",
- "\n",
- " for i, _ in enumerate(col_list):\n",
- " col_list[i] =torch.stack(col_list[i])\n",
- " \n",
- " return tuple(col_list)\n",
- "\n",
- "\n",
- "def get_loaders(args, train, valid):\n",
- "\n",
- " pin_memory = False\n",
- " train_loader, valid_loader = None, None\n",
- " \n",
- " if train is not None:\n",
- " trainset = DKTDataset(train, args)\n",
- " train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,\n",
- " batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n",
- " if valid is not None:\n",
- " valset = DKTDataset(valid, args)\n",
- " valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,\n",
- " batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n",
- "\n",
- " return train_loader, valid_loader"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "QyiplxY6nz0-"
- },
- "source": [
- "## 3. LSTM 기반의 모델"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "id": "aO72oKAgnz0-"
- },
- "outputs": [],
- "source": [
- "import torch\n",
- "import torch.nn as nn\n",
- "import torch.nn.functional as F \n",
- "import numpy as np\n",
- "import copy\n",
- "import math\n",
- "\n",
- "try:\n",
- " from transformers.modeling_bert import BertConfig, BertEncoder, BertModel \n",
- "except:\n",
- " from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel \n",
- "\n",
- "class LSTM(nn.Module):\n",
- " def __init__(self, args):\n",
- " super(LSTM, self).__init__()\n",
- " self.args = args\n",
- " self.device = args.device\n",
- "\n",
- " self.hidden_dim = self.args.hidden_dim\n",
- " self.n_layers = self.args.n_layers\n",
- "\n",
- " # Embedding \n",
- " # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)\n",
- " self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)\n",
- " self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)\n",
- " self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)\n",
- " self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)\n",
- "\n",
- " # embedding combination projection\n",
- " self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)\n",
- "\n",
- " self.lstm = nn.LSTM(self.hidden_dim,\n",
- " self.hidden_dim,\n",
- " self.n_layers,\n",
- " batch_first=True)\n",
- " \n",
- " # Fully connected layer\n",
- " self.fc = nn.Linear(self.hidden_dim, 1)\n",
- "\n",
- " self.activation = nn.Sigmoid()\n",
- "\n",
- " def init_hidden(self, batch_size):\n",
- " h = torch.zeros(\n",
- " self.n_layers,\n",
- " batch_size,\n",
- " self.hidden_dim)\n",
- " h = h.to(self.device)\n",
- "\n",
- " c = torch.zeros(\n",
- " self.n_layers,\n",
- " batch_size,\n",
- " self.hidden_dim)\n",
- " c = c.to(self.device)\n",
- "\n",
- " return (h, c)\n",
- "\n",
- " def forward(self, input):\n",
- " test, question, tag, _, mask, interaction, _ = input\n",
- " batch_size = interaction.size(0)\n",
- "\n",
- " # Embedding\n",
- " embed_interaction = self.embedding_interaction(interaction)\n",
- " embed_test = self.embedding_test(test)\n",
- " embed_question = self.embedding_question(question)\n",
- " embed_tag = self.embedding_tag(tag)\n",
- "\n",
- " embed = torch.cat([embed_interaction,\n",
- " embed_test,\n",
- " embed_question,\n",
- " embed_tag,], 2)\n",
- "\n",
- " X = self.comb_proj(embed)\n",
- "\n",
- " hidden = self.init_hidden(batch_size)\n",
- " out, hidden = self.lstm(X, hidden)\n",
- " out = out.contiguous().view(batch_size, -1, self.hidden_dim)\n",
- "\n",
- " out = self.fc(out)\n",
- " preds = self.activation(out).view(batch_size, -1)\n",
- "\n",
- " return preds\n",
- " \n",
- "class LSTMATTN(nn.Module):\n",
- "\n",
- " def __init__(self, args):\n",
- " super(LSTMATTN, self).__init__()\n",
- " self.args = args\n",
- " self.device = args.device\n",
- "\n",
- " self.hidden_dim = self.args.hidden_dim\n",
- " self.n_layers = self.args.n_layers\n",
- " self.n_heads = self.args.n_heads\n",
- " self.drop_out = self.args.drop_out\n",
- "\n",
- " # Embedding \n",
- " # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)\n",
- " self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)\n",
- " self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)\n",
- " self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)\n",
- " self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)\n",
- "\n",
- " # embedding combination projection\n",
- " self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)\n",
- "\n",
- " self.lstm = nn.LSTM(self.hidden_dim,\n",
- " self.hidden_dim,\n",
- " self.n_layers,\n",
- " batch_first=True)\n",
- " \n",
- " self.config = BertConfig( \n",
- " 3, # not used\n",
- " hidden_size=self.hidden_dim,\n",
- " num_hidden_layers=1,\n",
- " num_attention_heads=self.n_heads,\n",
- " intermediate_size=self.hidden_dim,\n",
- " hidden_dropout_prob=self.drop_out,\n",
- " attention_probs_dropout_prob=self.drop_out,\n",
- " )\n",
- " self.attn = BertEncoder(self.config) \n",
- " \n",
- " # Fully connected layer\n",
- " self.fc = nn.Linear(self.hidden_dim, 1)\n",
- "\n",
- " self.activation = nn.Sigmoid()\n",
- "\n",
- " def init_hidden(self, batch_size):\n",
- " h = torch.zeros(\n",
- " self.n_layers,\n",
- " batch_size,\n",
- " self.hidden_dim)\n",
- " h = h.to(self.device)\n",
- "\n",
- " c = torch.zeros(\n",
- " self.n_layers,\n",
- " batch_size,\n",
- " self.hidden_dim)\n",
- " c = c.to(self.device)\n",
- "\n",
- " return (h, c)\n",
- "\n",
- " def forward(self, input):\n",
- "\n",
- " test, question, tag, _, mask, interaction, _ = input\n",
- "\n",
- " batch_size = interaction.size(0)\n",
- "\n",
- " # Embedding\n",
- "\n",
- " embed_interaction = self.embedding_interaction(interaction)\n",
- " embed_test = self.embedding_test(test)\n",
- " embed_question = self.embedding_question(question)\n",
- " embed_tag = self.embedding_tag(tag)\n",
- " \n",
- "\n",
- " embed = torch.cat([embed_interaction,\n",
- " embed_test,\n",
- " embed_question,\n",
- " embed_tag,], 2)\n",
- "\n",
- " X = self.comb_proj(embed)\n",
- "\n",
- " hidden = self.init_hidden(batch_size)\n",
- " out, hidden = self.lstm(X, hidden)\n",
- " out = out.contiguous().view(batch_size, -1, self.hidden_dim)\n",
- " \n",
- " extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)\n",
- " extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)\n",
- " extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\n",
- " head_mask = [None] * self.n_layers\n",
- " \n",
- " encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) \n",
- " sequence_output = encoded_layers[-1]\n",
- " \n",
- " out = self.fc(sequence_output)\n",
- "\n",
- " preds = self.activation(out).view(batch_size, -1)\n",
- "\n",
- " return preds\n",
- "\n",
- "\n",
- "class Bert(nn.Module):\n",
- "\n",
- " def __init__(self, args):\n",
- " super(Bert, self).__init__()\n",
- " self.args = args\n",
- " self.device = args.device\n",
- "\n",
- " # Defining some parameters\n",
- " self.hidden_dim = self.args.hidden_dim\n",
- " self.n_layers = self.args.n_layers\n",
- "\n",
- " # Embedding \n",
- " # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)\n",
- " self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)\n",
- " self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)\n",
- " self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)\n",
- " self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)\n",
- "\n",
- " # embedding combination projection\n",
- " self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)\n",
- "\n",
- " # Bert config\n",
- " self.config = BertConfig( \n",
- " 3, # not used\n",
- " hidden_size=self.hidden_dim,\n",
- " num_hidden_layers=self.args.n_layers,\n",
- " num_attention_heads=self.args.n_heads,\n",
- " max_position_embeddings=self.args.max_seq_len \n",
- " )\n",
- "\n",
- " # Defining the layers\n",
- " # Bert Layer\n",
- " self.encoder = BertModel(self.config) \n",
- "\n",
- " # Fully connected layer\n",
- " self.fc = nn.Linear(self.args.hidden_dim, 1)\n",
- " \n",
- " self.activation = nn.Sigmoid()\n",
- "\n",
- "\n",
- " def forward(self, input):\n",
- " test, question, tag, _, mask, interaction, _ = input\n",
- " batch_size = interaction.size(0)\n",
- "\n",
- " # 신나는 embedding\n",
- " \n",
- " embed_interaction = self.embedding_interaction(interaction)\n",
- " embed_test = self.embedding_test(test)\n",
- " embed_question = self.embedding_question(question)\n",
- " embed_tag = self.embedding_tag(tag)\n",
- "\n",
- " embed = torch.cat([embed_interaction,\n",
- " \n",
- " embed_test,\n",
- " embed_question,\n",
- " \n",
- " embed_tag,], 2)\n",
- "\n",
- " X = self.comb_proj(embed)\n",
- "\n",
- " # Bert\n",
- " encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask)\n",
- " out = encoded_layers[0]\n",
- " out = out.contiguous().view(batch_size, -1, self.hidden_dim)\n",
- " out = self.fc(out)\n",
- " preds = self.activation(out).view(batch_size, -1)\n",
- "\n",
- " return preds"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "NEaAa6Prnz0_"
- },
- "source": [
- "## 4. 모델 훈련을 위한 함수들"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "id": "r_wU37QGnz0_"
- },
- "outputs": [],
- "source": [
- "import os, sys\n",
- "\n",
- "import numpy as np\n",
- "\n",
- "import tarfile\n",
- "import torch\n",
- "from torch import nn\n",
- "import torch.nn.functional as F\n",
- "from torch.optim import Adam, AdamW\n",
- "\n",
- "from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
- "\n",
- "from transformers import get_linear_schedule_with_warmup\n",
- "from transformers import get_cosine_with_hard_restarts_schedule_with_warmup\n",
- "\n",
- "from sklearn.metrics import roc_auc_score\n",
- "from sklearn.metrics import accuracy_score\n",
- "import scipy.stats\n",
- "\n",
- "\n",
- "# 훈련을 하기 위한 세팅\n",
- "def get_optimizer(model, args):\n",
- " if args.optimizer == 'adam':\n",
- " optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)\n",
- " if args.optimizer == 'adamW':\n",
- " optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)\n",
- " \n",
- " # 모든 parameter들의 grad값을 0으로 초기화\n",
- " optimizer.zero_grad()\n",
- " \n",
- " return optimizer\n",
- "\n",
- "def get_scheduler(optimizer, args):\n",
- " if args.scheduler == 'plateau':\n",
- " scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)\n",
- " elif args.scheduler == 'linear_warmup':\n",
- " scheduler = get_linear_schedule_with_warmup(optimizer,\n",
- " num_warmup_steps=args.warmup_steps,\n",
- " num_training_steps=args.total_steps)\n",
- " elif args.scheduler == 'cosine_warmup':\n",
- " scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,\n",
- " num_warmup_steps=args.warmup_steps,\n",
- " num_training_steps=args.total_steps)\n",
- " return scheduler\n",
- "\n",
- "def get_criterion(pred, target):\n",
- " loss = nn.BCELoss(reduction=\"none\")\n",
- " return loss(pred, target)\n",
- "\n",
- "def get_metric(targets, preds):\n",
- " auc = roc_auc_score(targets, preds)\n",
- " acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))\n",
- " return auc, acc\n",
- "\n",
- "def get_model(args):\n",
- " \"\"\"\n",
- " Load model and move tensors to a given devices.\n",
- " \"\"\"\n",
- " if args.model == 'lstm': model = LSTM(args)\n",
- " model.to(args.device)\n",
- "\n",
- " return model\n",
- "\n",
- "\n",
- "# 배치 전처리\n",
- "def process_batch(batch, args):\n",
- " test, question, tag, correct, mask = batch\n",
- " \n",
- " # change to float\n",
- " mask = mask.type(torch.FloatTensor)\n",
- " correct = correct.type(torch.FloatTensor)\n",
- "\n",
- " # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용\n",
- " # saint의 경우 decoder에 들어가는 input이다\n",
- " interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.\n",
- " interaction = interaction.roll(shifts=1, dims=1)\n",
- " interaction[:, 0] = 0 # set padding index to the first sequence\n",
- " interaction = (interaction * mask).to(torch.int64)\n",
- " # print(interaction)\n",
- " # exit()\n",
- " # test_id, question_id, tag\n",
- " test = ((test + 1) * mask).to(torch.int64)\n",
- " question = ((question + 1) * mask).to(torch.int64)\n",
- " tag = ((tag + 1) * mask).to(torch.int64)\n",
- "\n",
- " # gather index\n",
- " # 마지막 sequence만 사용하기 위한 index\n",
- " gather_index = torch.tensor(np.count_nonzero(mask, axis=1))\n",
- " gather_index = gather_index.view(-1, 1) - 1\n",
- "\n",
- "\n",
- " # device memory로 이동\n",
- "\n",
- " test = test.to(args.device)\n",
- " question = question.to(args.device)\n",
- "\n",
- "\n",
- " tag = tag.to(args.device)\n",
- " correct = correct.to(args.device)\n",
- " mask = mask.to(args.device)\n",
- "\n",
- " interaction = interaction.to(args.device)\n",
- " gather_index = gather_index.to(args.device)\n",
- "\n",
- " return (test, question,\n",
- " tag, correct, mask,\n",
- " interaction, gather_index)\n",
- "\n",
- "\n",
- "# loss계산하고 parameter update!\n",
- "def compute_loss(preds, targets):\n",
- " \"\"\"\n",
- " Args :\n",
- " preds : (batch_size, max_seq_len)\n",
- " targets : (batch_size, max_seq_len)\n",
- "\n",
- " \"\"\"\n",
- " loss = get_criterion(preds, targets)\n",
- " #마지막 시퀀드에 대한 값만 loss 계산\n",
- " loss = loss[:,-1]\n",
- " loss = torch.mean(loss)\n",
- " return loss\n",
- "\n",
- "def update_params(loss, model, optimizer, args):\n",
- " loss.backward()\n",
- " torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)\n",
- " optimizer.step()\n",
- " optimizer.zero_grad()\n",
- "\n",
- "def save_checkpoint(state, model_dir, model_filename):\n",
- " print('saving model ...')\n",
- " if not os.path.exists(model_dir):\n",
- " os.makedirs(model_dir) \n",
- " torch.save(state, os.path.join(model_dir, model_filename))\n",
- "\n",
- "def load_model(args):\n",
- " model_path = os.path.join(args.model_dir, args.model_name)\n",
- " print(\"Loading Model from:\", model_path)\n",
- " load_state = torch.load(model_path)\n",
- " model = get_model(args)\n",
- "\n",
- " # 1. load model state\n",
- " model.load_state_dict(load_state['state_dict'], strict=True)\n",
- " print(\"Loading Model from:\", model_path, \"...Finished.\")\n",
- " \n",
- " return model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "YO_xFaJYnz1B"
- },
- "source": [
- "## 5. 전체 프로세스를 담당하는 함수들"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "id": "BMiIOHgJnz1D"
- },
- "outputs": [],
- "source": [
- "def run(args, train_data, valid_data):\n",
- " train_loader, valid_loader = get_loaders(args, train_data, valid_data)\n",
- " \n",
- " # only when using warmup scheduler\n",
- " args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)\n",
- " args.warmup_steps = args.total_steps // 10\n",
- " \n",
- " model = get_model(args)\n",
- " optimizer = get_optimizer(model, args)\n",
- " scheduler = get_scheduler(optimizer, args)\n",
- "\n",
- " best_auc = -1\n",
- " early_stopping_counter = 0\n",
- " for epoch in range(args.n_epochs):\n",
- "\n",
- " print(f\"Start Training: Epoch {epoch + 1}\")\n",
- " \n",
- " ### TRAIN\n",
- " train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)\n",
- " \n",
- " ### VALID\n",
- " auc, acc, _, _ = validate(valid_loader, model, args)\n",
- "\n",
- " ### TODO: model save or early stopping\n",
- " wandb.log({\"epoch\": epoch, \"train_loss\": train_loss, \"train_auc\": train_auc, \"train_acc\":train_acc,\n",
- " \"valid_auc\":auc, \"valid_acc\":acc})\n",
- " if auc >= best_auc:\n",
- " best_auc = auc\n",
- " # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.\n",
- " model_to_save = model.module if hasattr(model, 'module') else model\n",
- " save_checkpoint({\n",
- " 'epoch': epoch + 1,\n",
- " 'state_dict': model_to_save.state_dict(),\n",
- " },\n",
- " args.model_dir, 'model.pt',\n",
- " )\n",
- " early_stopping_counter = 0\n",
- " else:\n",
- " early_stopping_counter += 1\n",
- " if early_stopping_counter >= args.patience:\n",
- " print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')\n",
- " break\n",
- "\n",
- " # scheduler\n",
- " if args.scheduler == 'plateau':\n",
- " scheduler.step(best_auc)\n",
- " else:\n",
- " scheduler.step()\n",
- "\n",
- "\n",
- "def train(train_loader, model, optimizer, args):\n",
- " model.train()\n",
- "\n",
- " total_preds = []\n",
- " total_targets = []\n",
- " losses = []\n",
- " for step, batch in enumerate(train_loader):\n",
- " input = process_batch(batch, args)\n",
- " preds = model(input)\n",
- " targets = input[3] # correct\n",
- "\n",
- "\n",
- " loss = compute_loss(preds, targets)\n",
- " update_params(loss, model, optimizer, args)\n",
- "\n",
- " if step % args.log_steps == 0:\n",
- " print(f\"Training steps: {step} Loss: {str(loss.item())}\")\n",
- " \n",
- " # predictions\n",
- " preds = preds[:,-1]\n",
- " targets = targets[:,-1]\n",
- "\n",
- " if args.device == 'cuda':\n",
- " preds = preds.to('cpu').detach().numpy()\n",
- " targets = targets.to('cpu').detach().numpy()\n",
- " else: # cpu\n",
- " preds = preds.detach().numpy()\n",
- " targets = targets.detach().numpy()\n",
- " \n",
- " total_preds.append(preds)\n",
- " total_targets.append(targets)\n",
- " losses.append(loss)\n",
- " \n",
- "\n",
- " total_preds = np.concatenate(total_preds)\n",
- " total_targets = np.concatenate(total_targets)\n",
- "\n",
- " # Train AUC / ACC\n",
- " auc, acc = get_metric(total_targets, total_preds)\n",
- " loss_avg = sum(losses)/len(losses)\n",
- " print(f'TRAIN AUC : {auc} ACC : {acc}')\n",
- " return auc, acc, loss_avg\n",
- " \n",
- "\n",
- "def validate(valid_loader, model, args):\n",
- " model.eval()\n",
- "\n",
- " total_preds = []\n",
- " total_targets = []\n",
- " for step, batch in enumerate(valid_loader):\n",
- " input = process_batch(batch, args)\n",
- "\n",
- " preds = model(input)\n",
- " targets = input[3] # correct\n",
- "\n",
- "\n",
- " # predictions\n",
- " preds = preds[:,-1]\n",
- " targets = targets[:,-1]\n",
- " \n",
- " if args.device == 'cuda':\n",
- " preds = preds.to('cpu').detach().numpy()\n",
- " targets = targets.to('cpu').detach().numpy()\n",
- " else: # cpu\n",
- " preds = preds.detach().numpy()\n",
- " targets = targets.detach().numpy()\n",
- "\n",
- " total_preds.append(preds)\n",
- " total_targets.append(targets)\n",
- "\n",
- " total_preds = np.concatenate(total_preds)\n",
- " total_targets = np.concatenate(total_targets)\n",
- "\n",
- " # Train AUC / ACC\n",
- " auc, acc = get_metric(total_targets, total_preds)\n",
- " \n",
- " print(f'VALID AUC : {auc} ACC : {acc}\\n')\n",
- "\n",
- " return auc, acc, total_preds, total_targets\n",
- "\n",
- "\n",
- "\n",
- "def inference(args, test_data):\n",
- " \n",
- " model = load_model(args)\n",
- " model.eval()\n",
- " _, test_loader = get_loaders(args, None, test_data)\n",
- " \n",
- " \n",
- " total_preds = []\n",
- " \n",
- " for step, batch in enumerate(test_loader):\n",
- " input = process_batch(batch, args)\n",
- "\n",
- " preds = model(input)\n",
- " \n",
- "\n",
- " # predictions\n",
- " preds = preds[:,-1]\n",
- " \n",
- "\n",
- " if args.device == 'cuda':\n",
- " preds = preds.to('cpu').detach().numpy()\n",
- " else: # cpu\n",
- " preds = preds.detach().numpy()\n",
- " \n",
- " total_preds+=list(preds)\n",
- "\n",
- " write_path = os.path.join(args.output_dir, \"output.csv\")\n",
- " if not os.path.exists(args.output_dir):\n",
- " os.makedirs(args.output_dir) \n",
- " with open(write_path, 'w', encoding='utf8') as w:\n",
- " print(\"writing prediction : {}\".format(write_path))\n",
- " w.write(\"id,prediction\\n\")\n",
- " for id, p in enumerate(total_preds):\n",
- " w.write('{},{}\\n'.format(id,p))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "gPEE00qUnz1E"
- },
- "source": [
- "## 6.실행부분"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "id": "qZmwQenqnz1E"
- },
- "outputs": [],
- "source": [
- "data_dir = '/opt/ml/input/data/train_dataset'\n",
- "file_name = 'train_data.csv'\n",
- "test_file_name = 'test_data.csv'\n",
- "\n",
- "config = {}\n",
- "\n",
- "# 설정\n",
- "config['seed'] = 42\n",
- "config['device'] = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
- "config['data_dir'] = data_dir\n",
- "config['asset_dir'] = 'asset'\n",
- "config['model_dir'] = 'models'\n",
- "config['model_name'] = 'model.pt'\n",
- "config['output_dir'] = 'output'\n",
- "\n",
- "# 데이터\n",
- "config['max_seq_len'] = 30\n",
- "config['num_workers'] = 1\n",
- "\n",
- "# 모델\n",
- "config['hidden_dim'] = 64\n",
- "config['n_layers'] = 2\n",
- "config['dropout'] = 0.2\n",
- "\n",
- "# 훈련\n",
- "config['n_epochs'] = 100\n",
- "config['batch_size'] = 64\n",
- "config['lr'] = 5e-5\n",
- "config['clip_grad'] = 10\n",
- "config['log_steps'] = 50\n",
- "config['patience'] = 30\n",
- "\n",
- "\n",
- "### 중요 ###\n",
- "config['model'] = 'lstm'\n",
- "config['optimizer'] = 'adam'\n",
- "config['scheduler'] = 'cosine_warmup'\n",
- "\n",
- "args = easydict.EasyDict(config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "def setSeeds(seed = 42):\n",
- " # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.\n",
- " os.environ['PYTHONHASHSEED'] = str(seed)\n",
- " random.seed(seed)\n",
- " np.random.seed(seed)\n",
- " torch.manual_seed(seed) \n",
- " torch.cuda.manual_seed(seed)\n",
- " torch.backends.cudnn.deterministic = True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "setSeeds(42)\n",
- "\n",
- "preprocess = Preprocess(args)\n",
- "preprocess.load_train_data(file_name)\n",
- "\n",
- "train_data = preprocess.get_train_data()\n",
- "train_data, valid_data = preprocess.split_data(train_data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Calling wandb.login() after wandb.init() has no effect.\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "wandb.login()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Finishing last run (ID:2y20u2i4) before initializing another..."
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "
Waiting for W&B process to finish, PID 3232
Program ended successfully."
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Find user logs for this run at: /opt/ml/code/wandb/run-20210524_172457-2y20u2i4/logs/debug.log
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Find internal logs for this run at: /opt/ml/code/wandb/run-20210524_172457-2y20u2i4/logs/debug-internal.log
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Run summary:
\n",
- "epoch | 49 |
train_loss | 0.63742 |
train_auc | 0.74035 |
train_acc | 0.68082 |
valid_auc | 0.72291 |
valid_acc | 0.67164 |
_runtime | 86 |
_timestamp | 1621877183 |
_step | 49 |
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Run history:
\n",
- "epoch | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███ |
train_loss | ███████████████████▇▇▇▇▇▇▇▇▇▆▆▆▆▅▅▄▄▃▃▂▁ |
train_auc | ▁▁▁▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇████████████ |
train_acc | ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▄▅▆▆▇▇██████ |
valid_auc | ▁▁▁▁▂▂▂▂▃▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇████████████████ |
valid_acc | ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▄▄▅▆▇▇▇▇████ |
_runtime | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███ |
_timestamp | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███ |
_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███ |
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
Synced young-hill-35: https://wandb.ai/team-ikyo/P4-DKT/runs/2y20u2i4
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "...Successfully finished last run (ID:2y20u2i4). Initializing new run:
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- " Tracking run with wandb version 0.10.30
\n",
- " Syncing run super-shadow-36 to Weights & Biases (Documentation).
\n",
- " Project page: https://wandb.ai/team-ikyo/P4-DKT
\n",
- " Run page: https://wandb.ai/team-ikyo/P4-DKT/runs/gmpr78jx
\n",
- " Run data is saved locally in /opt/ml/code/wandb/run-20210524_173510-gmpr78jx
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Run(gmpr78jx)
"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "wandb.init(project='P4-DKT', config=config, entity=\"team-ikyo\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "id": "v9qV6aXonz1E",
- "outputId": "0d36ac2e-7ca2-4fc0-cf4c-ea296bc40ce4"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Start Training: Epoch 1\n",
- "Training steps: 0 Loss: 0.692417562007904\n",
- "Training steps: 50 Loss: 0.6840407848358154\n",
- "TRAIN AUC : 0.5706521217682164 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5666883771558069 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 2\n",
- "Training steps: 0 Loss: 0.6862214207649231\n",
- "Training steps: 50 Loss: 0.6924540996551514\n",
- "TRAIN AUC : 0.5707625153638439 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.566964962839375 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 3\n",
- "Training steps: 0 Loss: 0.7004860639572144\n",
- "Training steps: 50 Loss: 0.692416787147522\n",
- "TRAIN AUC : 0.5710979420582505 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5674422961965007 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 4\n",
- "Training steps: 0 Loss: 0.6815810799598694\n",
- "Training steps: 50 Loss: 0.6838415265083313\n",
- "TRAIN AUC : 0.571644671579053 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5681025329895343 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 5\n",
- "Training steps: 0 Loss: 0.6949415802955627\n",
- "Training steps: 50 Loss: 0.6862828135490417\n",
- "TRAIN AUC : 0.5723872642625272 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.568981361693775 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 6\n",
- "Training steps: 0 Loss: 0.6926754713058472\n",
- "Training steps: 50 Loss: 0.7078460454940796\n",
- "TRAIN AUC : 0.5733982865281766 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5700743212498104 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 7\n",
- "Training steps: 0 Loss: 0.6875584125518799\n",
- "Training steps: 50 Loss: 0.7003850340843201\n",
- "TRAIN AUC : 0.5746439916824325 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5711940471623201 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 8\n",
- "Training steps: 0 Loss: 0.6917406916618347\n",
- "Training steps: 50 Loss: 0.6930617094039917\n",
- "TRAIN AUC : 0.5760360427921309 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5725591313425112 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 9\n",
- "Training steps: 0 Loss: 0.7017080783843994\n",
- "Training steps: 50 Loss: 0.7089024782180786\n",
- "TRAIN AUC : 0.5776695592141432 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.574240950740982 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 10\n",
- "Training steps: 0 Loss: 0.6867542266845703\n",
- "Training steps: 50 Loss: 0.6907169818878174\n",
- "TRAIN AUC : 0.5795056109392223 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5763064212489182 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 11\n",
- "Training steps: 0 Loss: 0.6878455877304077\n",
- "Training steps: 50 Loss: 0.6867403388023376\n",
- "TRAIN AUC : 0.5815521383658551 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5782157546774208 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 12\n",
- "Training steps: 0 Loss: 0.7022501826286316\n",
- "Training steps: 50 Loss: 0.6819038391113281\n",
- "TRAIN AUC : 0.5837682815268284 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5805176613342136 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 13\n",
- "Training steps: 0 Loss: 0.6965742707252502\n",
- "Training steps: 50 Loss: 0.698516845703125\n",
- "TRAIN AUC : 0.5862256694334912 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5830916926151622 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 14\n",
- "Training steps: 0 Loss: 0.6859776973724365\n",
- "Training steps: 50 Loss: 0.6959233283996582\n",
- "TRAIN AUC : 0.5888533899160249 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5857237176684719 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 15\n",
- "Training steps: 0 Loss: 0.6952184438705444\n",
- "Training steps: 50 Loss: 0.6844170093536377\n",
- "TRAIN AUC : 0.5916244235629119 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.588440502850617 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 16\n",
- "Training steps: 0 Loss: 0.696398138999939\n",
- "Training steps: 50 Loss: 0.6886947751045227\n",
- "TRAIN AUC : 0.5946340000143369 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5916167771522381 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 17\n",
- "Training steps: 0 Loss: 0.6868777275085449\n",
- "Training steps: 50 Loss: 0.6872795224189758\n",
- "TRAIN AUC : 0.5977235870088464 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5945566153050027 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 18\n",
- "Training steps: 0 Loss: 0.682781994342804\n",
- "Training steps: 50 Loss: 0.6983034610748291\n",
- "TRAIN AUC : 0.6009908404194957 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.5975856746460149 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 19\n",
- "Training steps: 0 Loss: 0.6877943277359009\n",
- "Training steps: 50 Loss: 0.6941701173782349\n",
- "TRAIN AUC : 0.6044240702151793 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6010965284035652 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 20\n",
- "Training steps: 0 Loss: 0.6820396184921265\n",
- "Training steps: 50 Loss: 0.6988538503646851\n",
- "TRAIN AUC : 0.6079523642260786 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6043218743587228 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 21\n",
- "Training steps: 0 Loss: 0.6819553971290588\n",
- "Training steps: 50 Loss: 0.686287522315979\n",
- "TRAIN AUC : 0.6115711456947325 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6076185972644783 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 22\n",
- "Training steps: 0 Loss: 0.6871113181114197\n",
- "Training steps: 50 Loss: 0.6858992576599121\n",
- "TRAIN AUC : 0.6151929048128189 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.611397114586772 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 23\n",
- "Training steps: 0 Loss: 0.6912109851837158\n",
- "Training steps: 50 Loss: 0.6891967058181763\n",
- "TRAIN AUC : 0.6189247266765958 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6152782362755507 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 24\n",
- "Training steps: 0 Loss: 0.691832423210144\n",
- "Training steps: 50 Loss: 0.6918518543243408\n",
- "TRAIN AUC : 0.6227046320645422 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6191325916078549 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 25\n",
- "Training steps: 0 Loss: 0.7003605365753174\n",
- "Training steps: 50 Loss: 0.6976995468139648\n",
- "TRAIN AUC : 0.6264152243906985 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6231252397819435 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 26\n",
- "Training steps: 0 Loss: 0.681491494178772\n",
- "Training steps: 50 Loss: 0.7046642899513245\n",
- "TRAIN AUC : 0.6302583221166015 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6273186356295893 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 27\n",
- "Training steps: 0 Loss: 0.689978837966919\n",
- "Training steps: 50 Loss: 0.686962366104126\n",
- "TRAIN AUC : 0.6341556792321634 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6315744863090086 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 28\n",
- "Training steps: 0 Loss: 0.6937505006790161\n",
- "Training steps: 50 Loss: 0.6912407875061035\n",
- "TRAIN AUC : 0.6380334059181334 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6353886921066015 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 29\n",
- "Training steps: 0 Loss: 0.6831233501434326\n",
- "Training steps: 50 Loss: 0.6911698579788208\n",
- "TRAIN AUC : 0.641875676519194 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6393055022706793 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 30\n",
- "Training steps: 0 Loss: 0.6827176809310913\n",
- "Training steps: 50 Loss: 0.6942216157913208\n",
- "TRAIN AUC : 0.6456827116019698 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6427940507311676 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 31\n",
- "Training steps: 0 Loss: 0.6906517744064331\n",
- "Training steps: 50 Loss: 0.6942943334579468\n",
- "TRAIN AUC : 0.6495278495691507 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6470855898858863 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 32\n",
- "Training steps: 0 Loss: 0.6913177967071533\n",
- "Training steps: 50 Loss: 0.687474250793457\n",
- "TRAIN AUC : 0.6533110083148103 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6517518580312454 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 33\n",
- "Training steps: 0 Loss: 0.6927112936973572\n",
- "Training steps: 50 Loss: 0.672516942024231\n",
- "TRAIN AUC : 0.6570383085627823 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.655543758531776 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 34\n",
- "Training steps: 0 Loss: 0.6943120956420898\n",
- "Training steps: 50 Loss: 0.6772613525390625\n",
- "TRAIN AUC : 0.6607302630091574 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6591126060616875 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 35\n",
- "Training steps: 0 Loss: 0.6956484317779541\n",
- "Training steps: 50 Loss: 0.6883639097213745\n",
- "TRAIN AUC : 0.6642208401272449 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6629045065622184 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 36\n",
- "Training steps: 0 Loss: 0.6808600425720215\n",
- "Training steps: 50 Loss: 0.6896501183509827\n",
- "TRAIN AUC : 0.6676444201331009 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6659558712002926 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 37\n",
- "Training steps: 0 Loss: 0.6888445615768433\n",
- "Training steps: 50 Loss: 0.6821715235710144\n",
- "TRAIN AUC : 0.6709846259548468 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6697031611066996 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 38\n",
- "Training steps: 0 Loss: 0.6828429102897644\n",
- "Training steps: 50 Loss: 0.679345965385437\n",
- "TRAIN AUC : 0.6742262936370389 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6728526690518464 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 39\n",
- "Training steps: 0 Loss: 0.683903694152832\n",
- "Training steps: 50 Loss: 0.6989715099334717\n",
- "TRAIN AUC : 0.6773904321506735 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6759575664028693 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 40\n",
- "Training steps: 0 Loss: 0.6891867518424988\n",
- "Training steps: 50 Loss: 0.6754833459854126\n",
- "TRAIN AUC : 0.6803322946483368 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6789286319715206 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 41\n",
- "Training steps: 0 Loss: 0.6858559250831604\n",
- "Training steps: 50 Loss: 0.6812664270401001\n",
- "TRAIN AUC : 0.6832431675352396 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.681475896895995 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 42\n",
- "Training steps: 0 Loss: 0.6873148679733276\n",
- "Training steps: 50 Loss: 0.6852411031723022\n",
- "TRAIN AUC : 0.6861574592048242 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6849287568811842 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 43\n",
- "Training steps: 0 Loss: 0.685234546661377\n",
- "Training steps: 50 Loss: 0.6838175058364868\n",
- "TRAIN AUC : 0.68879505004381 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6873109626074 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 44\n",
- "Training steps: 0 Loss: 0.6828192472457886\n",
- "Training steps: 50 Loss: 0.6900765895843506\n",
- "TRAIN AUC : 0.6914436140723713 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6894478100659345 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 45\n",
- "Training steps: 0 Loss: 0.6888051629066467\n",
- "Training steps: 50 Loss: 0.6832900047302246\n",
- "TRAIN AUC : 0.6938668692938725 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6917318724850777 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 46\n",
- "Training steps: 0 Loss: 0.6885271668434143\n",
- "Training steps: 50 Loss: 0.6838119626045227\n",
- "TRAIN AUC : 0.6960900154451779 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6937661155771273 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 47\n",
- "Training steps: 0 Loss: 0.6950991153717041\n",
- "Training steps: 50 Loss: 0.6807537078857422\n",
- "TRAIN AUC : 0.6983338397175424 ACC : 0.5214001327140013\n",
- "VALID AUC : 0.6957289817185784 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 48\n",
- "Training steps: 0 Loss: 0.6780133247375488\n",
- "Training steps: 50 Loss: 0.6875277757644653\n",
- "TRAIN AUC : 0.7004263552854104 ACC : 0.5215660252156602\n",
- "VALID AUC : 0.6976650815035557 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 49\n",
- "Training steps: 0 Loss: 0.6855498552322388\n",
- "Training steps: 50 Loss: 0.6872391700744629\n",
- "TRAIN AUC : 0.7024724967204499 ACC : 0.5215660252156602\n",
- "VALID AUC : 0.6993781283179129 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 50\n",
- "Training steps: 0 Loss: 0.6818110942840576\n",
- "Training steps: 50 Loss: 0.6915051937103271\n",
- "TRAIN AUC : 0.7043800120319094 ACC : 0.5215660252156602\n",
- "VALID AUC : 0.7007164461416296 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 51\n",
- "Training steps: 0 Loss: 0.6899938583374023\n",
- "Training steps: 50 Loss: 0.6827948689460754\n",
- "TRAIN AUC : 0.7062040980176023 ACC : 0.5217319177173192\n",
- "VALID AUC : 0.7026079353324828 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 52\n",
- "Training steps: 0 Loss: 0.6951197385787964\n",
- "Training steps: 50 Loss: 0.6869640350341797\n",
- "TRAIN AUC : 0.7079948784429761 ACC : 0.5217319177173192\n",
- "VALID AUC : 0.7040488575226845 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 53\n",
- "Training steps: 0 Loss: 0.6864886283874512\n",
- "Training steps: 50 Loss: 0.6770392656326294\n",
- "TRAIN AUC : 0.7096173665337568 ACC : 0.5217319177173192\n",
- "VALID AUC : 0.7050704401281216 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 54\n",
- "Training steps: 0 Loss: 0.6844491958618164\n",
- "Training steps: 50 Loss: 0.6901024580001831\n",
- "TRAIN AUC : 0.7112048396728776 ACC : 0.5217319177173192\n",
- "VALID AUC : 0.7060072626047235 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 55\n",
- "Training steps: 0 Loss: 0.6776145100593567\n",
- "Training steps: 50 Loss: 0.6930866241455078\n",
- "TRAIN AUC : 0.7128946557258269 ACC : 0.5218978102189781\n",
- "VALID AUC : 0.7073232751313782 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 56\n",
- "Training steps: 0 Loss: 0.6839854717254639\n",
- "Training steps: 50 Loss: 0.6891499161720276\n",
- "TRAIN AUC : 0.7143063924067733 ACC : 0.5220637027206371\n",
- "VALID AUC : 0.7081842595979693 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 57\n",
- "Training steps: 0 Loss: 0.6766519546508789\n",
- "Training steps: 50 Loss: 0.6914658546447754\n",
- "TRAIN AUC : 0.7156377325530421 ACC : 0.5220637027206371\n",
- "VALID AUC : 0.7092638359757675 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 58\n",
- "Training steps: 0 Loss: 0.6948622465133667\n",
- "Training steps: 50 Loss: 0.6832144260406494\n",
- "TRAIN AUC : 0.716888951872914 ACC : 0.5223954877239548\n",
- "VALID AUC : 0.7100043718382242 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 59\n",
- "Training steps: 0 Loss: 0.6926015019416809\n",
- "Training steps: 50 Loss: 0.6960905194282532\n",
- "TRAIN AUC : 0.7181502621158626 ACC : 0.5230590577305906\n",
- "VALID AUC : 0.7110259544436612 ACC : 0.5194029850746269\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 60\n",
- "Training steps: 0 Loss: 0.6834248304367065\n",
- "Training steps: 50 Loss: 0.676106870174408\n",
- "TRAIN AUC : 0.7193734143327501 ACC : 0.5232249502322495\n",
- "VALID AUC : 0.7115880479296223 ACC : 0.5194029850746269\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 61\n",
- "Training steps: 0 Loss: 0.6779543161392212\n",
- "Training steps: 50 Loss: 0.6940694451332092\n",
- "TRAIN AUC : 0.720566238638751 ACC : 0.5247179827471798\n",
- "VALID AUC : 0.7120832255243975 ACC : 0.517910447761194\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 62\n",
- "Training steps: 0 Loss: 0.6803902387619019\n",
- "Training steps: 50 Loss: 0.6782917976379395\n",
- "TRAIN AUC : 0.7215273577055777 ACC : 0.5268745852687459\n",
- "VALID AUC : 0.7125560978221107 ACC : 0.5223880597014925\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 63\n",
- "Training steps: 0 Loss: 0.6868630647659302\n",
- "Training steps: 50 Loss: 0.6726241111755371\n",
- "TRAIN AUC : 0.7226479464420122 ACC : 0.5296947577969475\n",
- "VALID AUC : 0.7134839981798878 ACC : 0.5238805970149254\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 64\n",
- "Training steps: 0 Loss: 0.6790873408317566\n",
- "Training steps: 50 Loss: 0.6886022686958313\n",
- "TRAIN AUC : 0.723664703439902 ACC : 0.533012607830126\n",
- "VALID AUC : 0.7139033377646524 ACC : 0.5238805970149254\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 65\n",
- "Training steps: 0 Loss: 0.6879961490631104\n",
- "Training steps: 50 Loss: 0.6784022450447083\n",
- "TRAIN AUC : 0.7246031041443917 ACC : 0.5351692103516921\n",
- "VALID AUC : 0.7144163595970772 ACC : 0.5328358208955224\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 66\n",
- "Training steps: 0 Loss: 0.6939610242843628\n",
- "Training steps: 50 Loss: 0.674263596534729\n",
- "TRAIN AUC : 0.7254999280401387 ACC : 0.5403118779031187\n",
- "VALID AUC : 0.7148936929542028 ACC : 0.5388059701492537\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 67\n",
- "Training steps: 0 Loss: 0.6743142604827881\n",
- "Training steps: 50 Loss: 0.6903876066207886\n",
- "TRAIN AUC : 0.7262595043537095 ACC : 0.5497677504976775\n",
- "VALID AUC : 0.7155985403413603 ACC : 0.5447761194029851\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 68\n",
- "Training steps: 0 Loss: 0.6766376495361328\n",
- "Training steps: 50 Loss: 0.687745213508606\n",
- "TRAIN AUC : 0.7271661986059086 ACC : 0.5547445255474452\n",
- "VALID AUC : 0.7163123098473425 ACC : 0.5462686567164179\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 69\n",
- "Training steps: 0 Loss: 0.6879795789718628\n",
- "Training steps: 50 Loss: 0.6815367937088013\n",
- "TRAIN AUC : 0.727793048953108 ACC : 0.5613802256138023\n",
- "VALID AUC : 0.716731649432107 ACC : 0.5582089552238806\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 70\n",
- "Training steps: 0 Loss: 0.6805175542831421\n",
- "Training steps: 50 Loss: 0.6751073598861694\n",
- "TRAIN AUC : 0.7285517430001802 ACC : 0.5668546781685467\n",
- "VALID AUC : 0.7171955996109955 ACC : 0.564179104477612\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 71\n",
- "Training steps: 0 Loss: 0.6840928792953491\n",
- "Training steps: 50 Loss: 0.6745134592056274\n",
- "TRAIN AUC : 0.7293068528396023 ACC : 0.579628400796284\n",
- "VALID AUC : 0.7177220046216576 ACC : 0.5746268656716418\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 72\n",
- "Training steps: 0 Loss: 0.6894361972808838\n",
- "Training steps: 50 Loss: 0.680274486541748\n",
- "TRAIN AUC : 0.7300070967311475 ACC : 0.5925680159256802\n",
- "VALID AUC : 0.7180030513646382 ACC : 0.5835820895522388\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 73\n",
- "Training steps: 0 Loss: 0.6819484233856201\n",
- "Training steps: 50 Loss: 0.6783482432365417\n",
- "TRAIN AUC : 0.730717266120801 ACC : 0.5990378234903783\n",
- "VALID AUC : 0.7185249953158876 ACC : 0.582089552238806\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 74\n",
- "Training steps: 0 Loss: 0.6840368509292603\n",
- "Training steps: 50 Loss: 0.6785203814506531\n",
- "TRAIN AUC : 0.7314435368740526 ACC : 0.6068347710683477\n",
- "VALID AUC : 0.7190781666830239 ACC : 0.5850746268656717\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 75\n",
- "Training steps: 0 Loss: 0.6774231791496277\n",
- "Training steps: 50 Loss: 0.6709319353103638\n",
- "TRAIN AUC : 0.7320755153952747 ACC : 0.6157929661579297\n",
- "VALID AUC : 0.7193012196536434 ACC : 0.5880597014925373\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 76\n",
- "Training steps: 0 Loss: 0.6719912886619568\n",
- "Training steps: 50 Loss: 0.6652551889419556\n",
- "TRAIN AUC : 0.732552931854287 ACC : 0.6189449236894492\n",
- "VALID AUC : 0.7199793006843265 ACC : 0.6\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 77\n",
- "Training steps: 0 Loss: 0.6745172739028931\n",
- "Training steps: 50 Loss: 0.6809003353118896\n",
- "TRAIN AUC : 0.7329577818937961 ACC : 0.6279031187790312\n",
- "VALID AUC : 0.7203495686155548 ACC : 0.6059701492537314\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 78\n",
- "Training steps: 0 Loss: 0.6692463159561157\n",
- "Training steps: 50 Loss: 0.6750683784484863\n",
- "TRAIN AUC : 0.7332547748538608 ACC : 0.635036496350365\n",
- "VALID AUC : 0.7207733692597318 ACC : 0.6238805970149254\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 79\n",
- "Training steps: 0 Loss: 0.6872453689575195\n",
- "Training steps: 50 Loss: 0.6738052368164062\n",
- "TRAIN AUC : 0.7340794183216975 ACC : 0.6420039814200398\n",
- "VALID AUC : 0.7209428895174026 ACC : 0.6358208955223881\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 80\n",
- "Training steps: 0 Loss: 0.6732134222984314\n",
- "Training steps: 50 Loss: 0.676743745803833\n",
- "TRAIN AUC : 0.7344128047748262 ACC : 0.6473125414731254\n",
- "VALID AUC : 0.721290852151569 ACC : 0.6432835820895523\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 81\n",
- "Training steps: 0 Loss: 0.6713000535964966\n",
- "Training steps: 50 Loss: 0.6630432605743408\n",
- "TRAIN AUC : 0.7350863601047912 ACC : 0.6531187790311878\n",
- "VALID AUC : 0.7215674378351371 ACC : 0.6477611940298508\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 82\n",
- "Training steps: 0 Loss: 0.6677560210227966\n",
- "Training steps: 50 Loss: 0.668387234210968\n",
- "TRAIN AUC : 0.7352542664477911 ACC : 0.6566025215660252\n",
- "VALID AUC : 0.721817257162231 ACC : 0.6388059701492538\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 83\n",
- "Training steps: 0 Loss: 0.6648356914520264\n",
- "Training steps: 50 Loss: 0.6758044958114624\n",
- "TRAIN AUC : 0.7357784430312251 ACC : 0.6595885865958858\n",
- "VALID AUC : 0.7219600110634274 ACC : 0.6477611940298508\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 84\n",
- "Training steps: 0 Loss: 0.6734097599983215\n",
- "Training steps: 50 Loss: 0.661230742931366\n",
- "TRAIN AUC : 0.736174525547405 ACC : 0.6624087591240876\n",
- "VALID AUC : 0.7219867774199018 ACC : 0.655223880597015\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 85\n",
- "Training steps: 0 Loss: 0.6649569272994995\n",
- "Training steps: 50 Loss: 0.671761691570282\n",
- "TRAIN AUC : 0.7365350968370193 ACC : 0.6662242866622429\n",
- "VALID AUC : 0.722080459667562 ACC : 0.6447761194029851\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 86\n",
- "Training steps: 0 Loss: 0.6692578792572021\n",
- "Training steps: 50 Loss: 0.6655640602111816\n",
- "TRAIN AUC : 0.7370357279332742 ACC : 0.6713669542136695\n",
- "VALID AUC : 0.7222633631034698 ACC : 0.6522388059701493\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 87\n",
- "Training steps: 0 Loss: 0.6766065359115601\n",
- "Training steps: 50 Loss: 0.6757673025131226\n",
- "TRAIN AUC : 0.7372363884200317 ACC : 0.6746848042468481\n",
- "VALID AUC : 0.7224730328958522 ACC : 0.6567164179104478\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 88\n",
- "Training steps: 0 Loss: 0.6472039818763733\n",
- "Training steps: 50 Loss: 0.6517682671546936\n",
- "TRAIN AUC : 0.7374914185797604 ACC : 0.6743530192435302\n",
- "VALID AUC : 0.7224507275987903 ACC : 0.6597014925373135\n",
- "\n",
- "Start Training: Epoch 89\n",
- "Training steps: 0 Loss: 0.6545861959457397\n",
- "Training steps: 50 Loss: 0.6705600023269653\n",
- "TRAIN AUC : 0.7378045680450795 ACC : 0.6756801592568016\n",
- "VALID AUC : 0.7225131824305637 ACC : 0.6597014925373135\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 90\n",
- "Training steps: 0 Loss: 0.654556155204773\n",
- "Training steps: 50 Loss: 0.6534612774848938\n",
- "TRAIN AUC : 0.7380298217104831 ACC : 0.6775049767750497\n",
- "VALID AUC : 0.7225845593811618 ACC : 0.6656716417910448\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 91\n",
- "Training steps: 0 Loss: 0.6601536273956299\n",
- "Training steps: 50 Loss: 0.6620394587516785\n",
- "TRAIN AUC : 0.7385535020190117 ACC : 0.6803251493032515\n",
- "VALID AUC : 0.7225221045493884 ACC : 0.6626865671641791\n",
- "\n",
- "Start Training: Epoch 92\n",
- "Training steps: 0 Loss: 0.6564822196960449\n",
- "Training steps: 50 Loss: 0.6467278003692627\n",
- "TRAIN AUC : 0.7387894531657101 ACC : 0.679163901791639\n",
- "VALID AUC : 0.7226024036188115 ACC : 0.6671641791044776\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 93\n",
- "Training steps: 0 Loss: 0.6503366231918335\n",
- "Training steps: 50 Loss: 0.6717678904533386\n",
- "TRAIN AUC : 0.7389809601375454 ACC : 0.681320504313205\n",
- "VALID AUC : 0.7227719238764821 ACC : 0.6716417910447762\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 94\n",
- "Training steps: 0 Loss: 0.6541410684585571\n",
- "Training steps: 50 Loss: 0.6537812948226929\n",
- "TRAIN AUC : 0.7395030413380455 ACC : 0.681320504313205\n",
- "VALID AUC : 0.7229057556588538 ACC : 0.673134328358209\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 95\n",
- "Training steps: 0 Loss: 0.6492453813552856\n",
- "Training steps: 50 Loss: 0.6344279050827026\n",
- "TRAIN AUC : 0.7396138760669221 ACC : 0.6823158593231586\n",
- "VALID AUC : 0.7229503662529777 ACC : 0.6761194029850747\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 96\n",
- "Training steps: 0 Loss: 0.6675565242767334\n",
- "Training steps: 50 Loss: 0.6522634029388428\n",
- "TRAIN AUC : 0.7398092980963447 ACC : 0.6838088918380889\n",
- "VALID AUC : 0.722905755658854 ACC : 0.6761194029850747\n",
- "\n",
- "Start Training: Epoch 97\n",
- "Training steps: 0 Loss: 0.6629478931427002\n",
- "Training steps: 50 Loss: 0.6766805648803711\n",
- "TRAIN AUC : 0.7398948779467012 ACC : 0.6821499668214996\n",
- "VALID AUC : 0.7230128210847512 ACC : 0.6761194029850747\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 98\n",
- "Training steps: 0 Loss: 0.6685456037521362\n",
- "Training steps: 50 Loss: 0.6543511152267456\n",
- "TRAIN AUC : 0.739927356382178 ACC : 0.6821499668214996\n",
- "VALID AUC : 0.7230306653224008 ACC : 0.6746268656716418\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 99\n",
- "Training steps: 0 Loss: 0.6607905626296997\n",
- "Training steps: 50 Loss: 0.6353657841682434\n",
- "TRAIN AUC : 0.7402753002325324 ACC : 0.681486396814864\n",
- "VALID AUC : 0.7231377307482981 ACC : 0.673134328358209\n",
- "\n",
- "saving model ...\n",
- "Start Training: Epoch 100\n",
- "Training steps: 0 Loss: 0.648962676525116\n",
- "Training steps: 50 Loss: 0.6430833339691162\n",
- "TRAIN AUC : 0.7404201022216021 ACC : 0.681320504313205\n",
- "VALID AUC : 0.7230128210847511 ACC : 0.6761194029850747\n",
- "\n"
- ]
- }
- ],
- "source": [
- "run(args, train_data, valid_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "QFY0zXGFnz1F"
- },
- "source": [
- "## Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PcTCBhrZnz1G"
- },
- "outputs": [],
- "source": [
- "preprocess = Preprocess(args)\n",
- "preprocess.load_test_data(test_file_name)\n",
- "test_data = preprocess.get_test_data()\n",
- "inference(args, test_data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "name": "3강_lstm_baseline.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/code/baseline/dkt/criterion.py b/code/baseline/dkt/criterion.py
deleted file mode 100644
index 3d46a7f..0000000
--- a/code/baseline/dkt/criterion.py
+++ /dev/null
@@ -1,7 +0,0 @@
-
-import torch.nn as nn
-
-
-def get_criterion(pred, target):
- loss = nn.BCELoss(reduction="none")
- return loss(pred, target)
\ No newline at end of file
diff --git a/code/baseline/dkt/dataloader.py b/code/baseline/dkt/dataloader.py
deleted file mode 100644
index c542aa9..0000000
--- a/code/baseline/dkt/dataloader.py
+++ /dev/null
@@ -1,190 +0,0 @@
-import os
-from datetime import datetime
-import time
-import tqdm
-import pandas as pd
-import random
-from sklearn.preprocessing import LabelEncoder
-import numpy as np
-import torch
-
-class Preprocess:
- def __init__(self,args):
- self.args = args
- self.train_data = None
- self.test_data = None
-
-
- def get_train_data(self):
- return self.train_data
-
- def get_test_data(self):
- return self.test_data
-
- def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
- """
- split data into two parts with a given ratio.
- """
- if shuffle:
- random.seed(seed) # fix to default seed 0
- random.shuffle(data)
-
- size = int(len(data) * ratio)
- data_1 = data[:size]
- data_2 = data[size:]
-
- return data_1, data_2
-
- def __save_labels(self, encoder, name):
- le_path = os.path.join(self.args.data_dir, name + '_classes.npy')
- np.save(le_path, encoder.classes_)
-
- def __preprocessing(self, df, is_train = True):
- cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
-
- if not os.path.exists(self.args.asset_dir):
- os.makedirs(self.args.asset_dir)
-
- for col in cate_cols:
-
-
- le = LabelEncoder()
- if is_train:
- #For UNKNOWN class
- a = df[col].unique().tolist() + ['unknown']
- le.fit(a)
- self.__save_labels(le, col)
- else:
- label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')
- le.classes_ = np.load(label_path)
-
- df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')
-
- #모든 컬럼이 범주형이라고 가정
- df[col]= df[col].astype(str)
- test = le.transform(df[col])
- df[col] = test
-
-
- def convert_time(s):
- timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
- return int(timestamp)
-
- df['Timestamp'] = df['Timestamp'].apply(convert_time)
-
- return df
-
- def __feature_engineering(self, df):
- #TODO
- return df
-
- def load_data_from_file(self, file_name, is_train=True):
- csv_file_path = os.path.join(self.args.data_dir, file_name)
- df = pd.read_csv(csv_file_path)#, nrows=100000)
- df = self.__feature_engineering(df)
- df = self.__preprocessing(df, is_train)
-
- # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
-
-
- self.args.n_questions = len(np.load(os.path.join(self.args.data_dir,'assessmentItemID_classes.npy')))
- self.args.n_test = len(np.load(os.path.join(self.args.data_dir,'testId_classes.npy')))
- self.args.n_tag = len(np.load(os.path.join(self.args.data_dir,'KnowledgeTag_classes.npy')))
-
-
-
- df = df.sort_values(by=['userID','Timestamp'], axis=0)
- columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
- group = df[columns].groupby('userID').apply(
- lambda r: (
- r['testId'].values,
- r['assessmentItemID'].values,
- r['KnowledgeTag'].values,
- r['answerCode'].values
- )
- )
-
- return group.values
-
- def load_train_data(self, file_name):
- self.train_data = self.load_data_from_file(file_name)
-
- def load_test_data(self, file_name):
- self.test_data = self.load_data_from_file(file_name, is_train= False)
-
-
-class DKTDataset(torch.utils.data.Dataset):
- def __init__(self, data, args):
- self.data = data
- self.args = args
-
- def __getitem__(self, index):
- row = self.data[index]
-
- # 각 data의 sequence length
- seq_len = len(row[0])
-
- test, question, tag, correct = row[0], row[1], row[2], row[3]
-
-
- cate_cols = [test, question, tag, correct]
-
- # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다
- if seq_len > self.args.max_seq_len:
- for i, col in enumerate(cate_cols):
- cate_cols[i] = col[-self.args.max_seq_len:]
- mask = np.ones(self.args.max_seq_len, dtype=np.int16)
- else:
- mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
- mask[-seq_len:] = 1
-
- # mask도 columns 목록에 포함시킴
- cate_cols.append(mask)
-
- # np.array -> torch.tensor 형변환
- for i, col in enumerate(cate_cols):
- cate_cols[i] = torch.tensor(col)
-
- return cate_cols
-
- def __len__(self):
- return len(self.data)
-
-
-from torch.nn.utils.rnn import pad_sequence
-
-def collate(batch):
- col_n = len(batch[0])
- col_list = [[] for _ in range(col_n)]
- max_seq_len = len(batch[0][-1])
-
-
- # batch의 값들을 각 column끼리 그룹화
- for row in batch:
- for i, col in enumerate(row):
- pre_padded = torch.zeros(max_seq_len)
- pre_padded[-len(col):] = col
- col_list[i].append(pre_padded)
-
-
- for i, _ in enumerate(col_list):
- col_list[i] =torch.stack(col_list[i])
-
- return tuple(col_list)
-
-
-def get_loaders(args, train, valid):
-
- pin_memory = False
- train_loader, valid_loader = None, None
-
- if train is not None:
- trainset = DKTDataset(train, args)
- train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,
- batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)
- if valid is not None:
- valset = DKTDataset(valid, args)
- valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,
- batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)
-
- return train_loader, valid_loader
\ No newline at end of file
diff --git a/code/baseline/dkt/metric.py b/code/baseline/dkt/metric.py
deleted file mode 100644
index 9ffe2f2..0000000
--- a/code/baseline/dkt/metric.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from sklearn.metrics import roc_auc_score, accuracy_score
-import numpy as np
-
-def get_metric(targets, preds):
- auc = roc_auc_score(targets, preds)
- acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
-
- return auc, acc
\ No newline at end of file
diff --git a/code/baseline/dkt/model.py b/code/baseline/dkt/model.py
deleted file mode 100644
index 267256c..0000000
--- a/code/baseline/dkt/model.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-import copy
-import math
-
-try:
- from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
-except:
- from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel
-
-
-
-
-class LSTM(nn.Module):
-
- def __init__(self, args):
- super(LSTM, self).__init__()
- self.args = args
- self.device = args.device
-
- self.hidden_dim = self.args.hidden_dim
- self.n_layers = self.args.n_layers
-
- # Embedding
- # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)
- self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
- self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
- self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
- self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
-
- # embedding combination projection
- self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)
-
- self.lstm = nn.LSTM(self.hidden_dim,
- self.hidden_dim,
- self.n_layers,
- batch_first=True)
-
- # Fully connected layer
- self.fc = nn.Linear(self.hidden_dim, 1)
-
- self.activation = nn.Sigmoid()
-
- def init_hidden(self, batch_size):
- h = torch.zeros(
- self.n_layers,
- batch_size,
- self.hidden_dim)
- h = h.to(self.device)
-
- c = torch.zeros(
- self.n_layers,
- batch_size,
- self.hidden_dim)
- c = c.to(self.device)
-
- return (h, c)
-
- def forward(self, input):
-
- test, question, tag, _, mask, interaction, _ = input
-
- batch_size = interaction.size(0)
-
- # Embedding
-
- embed_interaction = self.embedding_interaction(interaction)
- embed_test = self.embedding_test(test)
- embed_question = self.embedding_question(question)
- embed_tag = self.embedding_tag(tag)
-
-
- embed = torch.cat([embed_interaction,
- embed_test,
- embed_question,
- embed_tag,], 2)
-
- X = self.comb_proj(embed)
-
- hidden = self.init_hidden(batch_size)
- out, hidden = self.lstm(X, hidden)
- out = out.contiguous().view(batch_size, -1, self.hidden_dim)
-
- out = self.fc(out)
- preds = self.activation(out).view(batch_size, -1)
-
- return preds
-
diff --git a/code/baseline/dkt/optimizer.py b/code/baseline/dkt/optimizer.py
deleted file mode 100644
index 1548373..0000000
--- a/code/baseline/dkt/optimizer.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from torch.optim import Adam, AdamW
-
-def get_optimizer(model, args):
- if args.optimizer == 'adam':
- optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
- if args.optimizer == 'adamW':
- optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
-
- # 모든 parameter들의 grad값을 0으로 초기화
- optimizer.zero_grad()
-
- return optimizer
\ No newline at end of file
diff --git a/code/baseline/dkt/scheduler.py b/code/baseline/dkt/scheduler.py
deleted file mode 100644
index 0823313..0000000
--- a/code/baseline/dkt/scheduler.py
+++ /dev/null
@@ -1,14 +0,0 @@
-
-from torch.optim.lr_scheduler import ReduceLROnPlateau
-
-from transformers import get_linear_schedule_with_warmup
-
-
-def get_scheduler(optimizer, args):
- if args.scheduler == 'plateau':
- scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)
- elif args.scheduler == 'linear_warmup':
- scheduler = get_linear_schedule_with_warmup(optimizer,
- num_warmup_steps=args.warmup_steps,
- num_training_steps=args.total_steps)
- return scheduler
\ No newline at end of file
diff --git a/code/baseline/dkt/trainer.py b/code/baseline/dkt/trainer.py
deleted file mode 100644
index 9b12650..0000000
--- a/code/baseline/dkt/trainer.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import os
-import torch
-import numpy as np
-
-
-from .dataloader import get_loaders
-from .optimizer import get_optimizer
-from .scheduler import get_scheduler
-from .criterion import get_criterion
-from .metric import get_metric
-from .model import LSTM
-
-import wandb
-
-def run(args, train_data, valid_data):
- train_loader, valid_loader = get_loaders(args, train_data, valid_data)
-
- # only when using warmup scheduler
- args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
- args.warmup_steps = args.total_steps // 10
-
- model = get_model(args)
- optimizer = get_optimizer(model, args)
- scheduler = get_scheduler(optimizer, args)
-
- best_auc = -1
- early_stopping_counter = 0
- for epoch in range(args.n_epochs):
-
- print(f"Start Training: Epoch {epoch + 1}")
-
- ### TRAIN
- train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)
-
- ### VALID
- auc, acc = validate(valid_loader, model, args)
-
- ### TODO: model save or early stopping
- wandb.log({"epoch": epoch, "train_loss": train_loss, "train_auc": train_auc, "train_acc":train_acc,
- "valid_auc":auc, "valid_acc":acc})
- if auc > best_auc:
- best_auc = auc
- # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
- model_to_save = model.module if hasattr(model, 'module') else model
- save_checkpoint({
- 'epoch': epoch + 1,
- 'state_dict': model_to_save.state_dict(),
- },
- args.model_dir, 'model.pt',
- )
- early_stopping_counter = 0
- else:
- early_stopping_counter += 1
- if early_stopping_counter >= args.patience:
- print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
- break
-
- # scheduler
- if args.scheduler == 'plateau':
- scheduler.step(best_auc)
- else:
- scheduler.step()
-
-
-def train(train_loader, model, optimizer, args):
- model.train()
-
- total_preds = []
- total_targets = []
- losses = []
- for step, batch in enumerate(train_loader):
- input = process_batch(batch, args)
- preds = model(input)
- targets = input[3] # correct
-
-
- loss = compute_loss(preds, targets)
- update_params(loss, model, optimizer, args)
-
- if step % args.log_steps == 0:
- print(f"Training steps: {step} Loss: {str(loss.item())}")
-
- # predictions
- preds = preds[:,-1]
- targets = targets[:,-1]
-
- if args.device == 'cuda':
- preds = preds.to('cpu').detach().numpy()
- targets = targets.to('cpu').detach().numpy()
- else: # cpu
- preds = preds.detach().numpy()
- targets = targets.detach().numpy()
-
- total_preds.append(preds)
- total_targets.append(targets)
- losses.append(loss)
-
-
- total_preds = np.concatenate(total_preds)
- total_targets = np.concatenate(total_targets)
-
- # Train AUC / ACC
- auc, acc = get_metric(total_targets, total_preds)
- loss_avg = sum(losses)/len(losses)
- print(f'TRAIN AUC : {auc} ACC : {acc}')
- return auc, acc, loss_avg
-
-
-def validate(valid_loader, model, args):
- model.eval()
-
- total_preds = []
- total_targets = []
- for step, batch in enumerate(valid_loader):
- input = process_batch(batch, args)
-
- preds = model(input)
- targets = input[3] # correct
-
-
- # predictions
- preds = preds[:,-1]
- targets = targets[:,-1]
-
- if args.device == 'cuda':
- preds = preds.to('cpu').detach().numpy()
- targets = targets.to('cpu').detach().numpy()
- else: # cpu
- preds = preds.detach().numpy()
- targets = targets.detach().numpy()
-
- total_preds.append(preds)
- total_targets.append(targets)
-
- total_preds = np.concatenate(total_preds)
- total_targets = np.concatenate(total_targets)
-
- # Train AUC / ACC
- auc, acc = get_metric(total_targets, total_preds)
-
- print(f'VALID AUC : {auc} ACC : {acc}\n')
-
- return auc, acc
-
-
-
-def inference(args, test_data):
-
- model = load_model(args)
- model.eval()
- _, test_loader = get_loaders(args, None, test_data)
-
-
- total_preds = []
-
- for step, batch in enumerate(test_loader):
- input = process_batch(batch, args)
-
- preds = model(input)
-
-
- # predictions
- preds = preds[:,-1]
-
-
- if args.device == 'cuda':
- preds = preds.to('cpu').detach().numpy()
- else: # cpu
- preds = preds.detach().numpy()
-
- total_preds+=list(preds)
-
- write_path = os.path.join(args.output_dir, "output.csv")
- if not os.path.exists(args.output_dir):
- os.makedirs(args.output_dir)
- with open(write_path, 'w', encoding='utf8') as w:
- w.write("id,prediction\n")
- for id, p in enumerate(total_preds):
- w.write('{},{}\n'.format(id,p))
-
-
-
-
-def get_model(args):
- """
- Load model and move tensors to a given devices.
- """
- if args.model == 'lstm': model = LSTM(args)
- if args.model == 'lstmattn': model = LSTMATTN(args)
- if args.model == 'bert': model = Bert(args)
-
-
- model.to(args.device)
-
- return model
-
-
-# 배치 전처리
-def process_batch(batch, args):
-
- test, question, tag, correct, mask = batch
-
-
- # change to float
- mask = mask.type(torch.FloatTensor)
- correct = correct.type(torch.FloatTensor)
-
- # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
- # saint의 경우 decoder에 들어가는 input이다
- interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.
- interaction = interaction.roll(shifts=1, dims=1)
- interaction[:, 0] = 0 # set padding index to the first sequence
- interaction = (interaction * mask).to(torch.int64)
- # print(interaction)
- # exit()
- # test_id, question_id, tag
- test = ((test + 1) * mask).to(torch.int64)
- question = ((question + 1) * mask).to(torch.int64)
- tag = ((tag + 1) * mask).to(torch.int64)
-
- # gather index
- # 마지막 sequence만 사용하기 위한 index
- gather_index = torch.tensor(np.count_nonzero(mask, axis=1))
- gather_index = gather_index.view(-1, 1) - 1
-
-
- # device memory로 이동
-
- test = test.to(args.device)
- question = question.to(args.device)
-
-
- tag = tag.to(args.device)
- correct = correct.to(args.device)
- mask = mask.to(args.device)
-
- interaction = interaction.to(args.device)
- gather_index = gather_index.to(args.device)
-
- return (test, question,
- tag, correct, mask,
- interaction, gather_index)
-
-
-# loss계산하고 parameter update!
-def compute_loss(preds, targets):
- """
- Args :
- preds : (batch_size, max_seq_len)
- targets : (batch_size, max_seq_len)
-
- """
- loss = get_criterion(preds, targets)
- #마지막 시퀀드에 대한 값만 loss 계산
- loss = loss[:,-1]
- loss = torch.mean(loss)
- return loss
-
-def update_params(loss, model, optimizer, args):
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
- optimizer.step()
- optimizer.zero_grad()
-
-
-
-def save_checkpoint(state, model_dir, model_filename):
- print('saving model ...')
- if not os.path.exists(model_dir):
- os.makedirs(model_dir)
- torch.save(state, os.path.join(model_dir, model_filename))
-
-
-
-def load_model(args):
-
-
- model_path = os.path.join(args.model_dir, args.model_name)
- print("Loading Model from:", model_path)
- load_state = torch.load(model_path)
- model = get_model(args)
-
- # 1. load model state
- model.load_state_dict(load_state['state_dict'], strict=True)
-
-
- print("Loading Model from:", model_path, "...Finished.")
- return model
\ No newline at end of file
diff --git a/code/baseline/dkt/utils.py b/code/baseline/dkt/utils.py
deleted file mode 100644
index ca8a411..0000000
--- a/code/baseline/dkt/utils.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import os, random, torch
-import numpy as np
-def setSeeds(seed = 42):
- # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
- os.environ['PYTHONHASHSEED'] = str(seed)
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- torch.cuda.manual_seed(seed)
- torch.backends.cudnn.deterministic = True
\ No newline at end of file
diff --git a/code/baseline/evaluation.py b/code/baseline/evaluation.py
deleted file mode 100644
index 1088905..0000000
--- a/code/baseline/evaluation.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pandas as pd
-from sklearn.metrics import roc_auc_score, accuracy_score
-import json
-import numpy as np
-
-def evaluation(gt_path, pred_path):
- """
- Args:
- gt_path (string) : root directory of ground truth file
- pred_path (string) : root directory of prediction file (output of inference.py)
- """
- #어떤 gt를 사용하느냐에 따라 달라짐,
- #제출 ID에서 gt에 있는 ID 값만 채점
-
- gt = pd.read_csv(gt_path, index_col='id')
- total_targets = gt['answerCode'].values
-
- pred = pd.read_csv(pred_path,index_col='id')
- #ground truth에 있는 id 값만 골라내기
- total_preds = pred.loc[list(gt.index),'prediction']
-
- # AUROC
- auroc = roc_auc_score(total_targets, total_preds)
- acc = accuracy_score(total_targets, np.where(total_preds >= 0.5, 1, 0))
- results={}
- results['accuracy'] = {
- 'value': f'{acc:.4f}',
- 'rank': False,
- 'decs': True,
- }
- results['auroc'] = {
- 'value': f'{auroc:.4f}',
- 'rank': True,
- 'decs': True,
- }
-
- return json.dumps(results)
diff --git a/code/baseline/feature_engineering.py b/code/baseline/feature_engineering.py
deleted file mode 100644
index cd79ae2..0000000
--- a/code/baseline/feature_engineering.py
+++ /dev/null
@@ -1,686 +0,0 @@
-import time
-import random
-from datetime import datetime
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
-tqdm.pandas()
-
-def IK_question_acc(df):
- assessmentItemID_groupby = df.groupby('assessmentItemID').agg({
- 'answercode': 'mean'
- })
-
- df["IK_question_acc"] = assessmentItemID_groupby["answercode"][df["assessmentItemID"]].values
- return df
-
-def IK_KnowledgeTag_acc(df):
- KnowledgeTag_groupby = df.groupby('KnowledgeTag').agg({
- 'answercode': 'mean'
- })
-
- df["IK_KnowledgeTag_acc"] = KnowledgeTag_groupby["answercode"][df["KnowledgeTag"]].values
-
- return df
-
-def solved_question(df):
- df["solved_question"] = df.groupby(["userID"]).cumcount()
-
- return df
-
-def user_question_class_solved(df):
- if "question_class" not in df.columns:
- df = question_class(df)
-
- df["user_question_class_solved"] = df.groupby(["userID", "question_class"]).cumcount()
- return df
-
-def userID_elapsed_cate(df, max_time=600):
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # sample별 elapsed time
- diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().shift(-1)
- elapsed = diff['Timestamp'].apply(lambda x: int(x.total_seconds() // 10 * 10) if max_time > x.total_seconds() else 1)
- df['userID_elapsed_cate'] = elapsed
-
- return df
-
-def userID_testid_experience(df):
- # userID별 시간 순으로 정렬
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
-
- # userID 별로 testid를 풀어본 적 있는지
- df["userID_testid_experience"] = df.groupby(["userID", "testId"])['testId'].cumcount()
- df['userID_testid_experience'] = df['userID_testid_experience'].apply(lambda x : 1 if x > 0 else 0)
- return df
-
-def userID_assessmentItemID_experience(df):
- # userID별 시간 순으로 정렬
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
-
- # userID 별로 assessmentItemID를 풀어본 적 있는지
- df["userID_assessmentItemID_experience"] = df.groupby(["userID", "assessmentItemID"])['assessmentItemID'].cumcount()
- df['userID_assessmentItemID_experience'] = df['userID_assessmentItemID_experience'].apply(lambda x : 1 if x > 0 else 0)
- return df
-
-def userID_time_diff_from_last(df):
-
- def convert_time(s):
- timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
- return int(timestamp)
-
- # 초 단위 시간
- df['sec'] = df['Timestamp'].apply(convert_time)
-
- # userID별 시간 순으로 정렬 + index column 생성
- df = df.sort_values(by=['userID', 'sec']).reset_index(drop=False)
-
- # userID별 마지막 index 값
- last_idx_group = df.groupby(['userID'])['index'].agg(["max"])
- last_idx_group = last_idx_group.reset_index()
- last_idx_group.columns = ['userID', 'last_index']
- df = pd.merge(df, last_idx_group, on=["userID"], how="left")
-
- def changed_time(x):
- last_time = df['sec'][x['last_index']]
- period = last_time-x['sec']
- return period
-
- # userID별 마지막 index의 시간과의 차이 계산
- df["userID_time_diff_from_last"] = df.apply(changed_time, axis=1)
-
- df.drop('sec', axis=1, inplace=True)
- df.drop('index', axis=1, inplace=True)
- return df
-
-def userID_KnowledgeTag_relative(df):
- # userID별 시간 순으로 정렬
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- # userID, KnowledgeTag 키값 생성(temp)
- df["tmp"] = df[["userID", "KnowledgeTag"]].apply(lambda data: str(data["userID"]) + "_" + str(data["KnowledgeTag"]), axis=1)
- # userID, KnowledgeTag별 누적 풀이 수, 정답 수, 정답률
- df["userID_KnowledgeTag_total_answer"] = df.groupby("tmp")["answercode"].cumcount()
- df["userID_KnowledgeTag_correct_answer"] = df.groupby("tmp")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df['userID_KnowledgeTag_correct_answer'].fillna(0, inplace=True)
- df["userID_KnowledgeTag_acc"] = df["userID_KnowledgeTag_correct_answer"] / df["userID_KnowledgeTag_total_answer"]
- df['userID_KnowledgeTag_acc'].fillna(0, inplace=True)
- df.drop('tmp', axis=1, inplace=True)
- return df
-
-def userID_question_num_relative(df):
- # question_num이 있어야 계산 가능
- if 'question_num' not in df.columns:
- df = question_num(df)
-
- # userID별 시간 순으로 정렬
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- # userID_question_class 키값 생성(temp)
- df["tmp"] = df[["userID", "question_num"]].apply(lambda data: str(data["userID"]) + "_" + data["question_num"], axis=1)
- # userID, question_num별 누적 풀이 수, 정답 수, 정답률
- df["userID_question_num_total_answer"] = df.groupby("tmp")["answercode"].cumcount()
- df["userID_question_num_correct_answer"] = df.groupby("tmp")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df['userID_question_num_correct_answer'].fillna(0, inplace=True)
- df["userID_question_num_acc"] = df["userID_question_num_correct_answer"] / df["userID_question_num_total_answer"]
- df['userID_question_num_acc'].fillna(0, inplace=True)
- df.drop('tmp', axis=1, inplace=True)
- return df
-
-def userID_elapsed_median(df, max_time=600):
- # 약 1m 50s 소요(Progress bar 2개 생김)
- # userID별 시간 순으로 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # sample별 elapsed time
- diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().shift(-1)
- elapsed = diff['Timestamp'].progress_apply(lambda x: x.total_seconds() if max_time > x.total_seconds() else None)
- df['userID_elapsed_median'] = elapsed
-
- # userID별 마지막 문제의 풀이 시간(데이터에서 알 수 없는)을
- # userID별 문제 풀이 시간의 "중앙값"으로 반환하기 위한 Aggregation
- user_median = df.groupby('userID')['userID_elapsed_median'].median()
- df = pd.merge(df, user_median, on=["userID"], how="left")
-
- # 결측치 중앙값 변환 및 임시 열 삭제
- df["userID_elapsed_median_x"] = df["userID_elapsed_median_x"].fillna('missing')
- def changed_elapsed(data):
- return data["userID_elapsed_median_x"] if data["userID_elapsed_median_x"] != 'missing' else data["userID_elapsed_median_y"]
- df['userID_elapsed_median'] = df.progress_apply(changed_elapsed, axis=1)
- df.drop('userID_elapsed_median_x', axis=1, inplace=True)
- df.drop('userID_elapsed_median_y', axis=1, inplace=True)
- return df
-
-
-def userID_elapsed_median_rolling(df, window=5):
- # userID_elapsed_median이 있어야 이동평균 계산 가능
- if 'userID_elapsed_median' not in df.columns:
- df = userID_elapsed_median(df)
- # userID별 시간 순으로 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # userID별 문제 풀이 시간의 이동평균
- df['userID_elapsed_median_rolling'] = df.groupby(['userID'])['userID_elapsed_median'].rolling(window).mean().values
- # 유저별 window-1만큼 N/A data가 생김(rolling의 특성상 앞데이터에 생김)
- # 유저별 userID_elapsed_median_rolling의 중앙값으로 대체
- def changed_mean_time(data):
- return data["userID_elapsed_median_rolling_x"] if data["userID_elapsed_median_rolling_x"] != 'missing' else data["userID_elapsed_median_rolling_y"]
- user_median = df.groupby('userID')['userID_elapsed_median_rolling'].median()
- df = pd.merge(df, user_median, on=["userID"], how="left")
-
- # 결측치 중앙값 변환 및 임시 열 삭제
- df['userID_elapsed_median_rolling_x'] = df['userID_elapsed_median_rolling_x'].fillna('missing')
- df['userID_elapsed_median_rolling'] = df.progress_apply(changed_mean_time, axis=1)
- df.drop('userID_elapsed_median_rolling_x', axis=1, inplace=True)
- df.drop('userID_elapsed_median_rolling_y', axis=1, inplace=True)
- return df
-
-
-def question_num(df):
- # 문제지 안 문제 번호
- df["question_num"] = df["assessmentItemID"].apply(lambda x: x[-3:])
- return df
-
-
-def question_class(df):
- # 문제지 안 문제 번호
- df["question_class"] = df["assessmentItemID"].apply(lambda x: x[2])
- return df
-
-
-def KnowledgeTag_relative(df):
- df.reset_index(drop=True, inplace=True)
- # KnowledgeTag별 누적 풀이 수, 정답 수, 정답률
- df_KnowledgeTag = df.sort_values(by=["KnowledgeTag", "Timestamp"])
- df['KnowledgeTag_total_answer'] = df_KnowledgeTag.groupby("KnowledgeTag")["answerCode"].cumcount()
- df["KnowledgeTag_correct_answer"] = df_KnowledgeTag.groupby("KnowledgeTag")["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- df["KnowledgeTag_acc"] = (df["KnowledgeTag_correct_answer"] / df["KnowledgeTag_total_answer"]).fillna(0)
- return df
-
-
-def assessmentItemID_relative(df):
- df.reset_index(drop=True, inplace=True)
- # assessmentItemID별 누적 풀이 수, 정답 수, 정답률
- df_assessmentItemID = df.sort_values(by=["assessmentItemID", "Timestamp"])
- df['assessmentItemID_total_answer'] = df_assessmentItemID.groupby("assessmentItemID")["answerCode"].cumcount()
- df["assessmentItemID_correct_answer"] = df_assessmentItemID.groupby("assessmentItemID")["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- df["assessmentItemID_acc"] = (df["assessmentItemID_correct_answer"] / df["assessmentItemID_total_answer"]).fillna(0)
- return df
-
-
-def question_class_relative(df):
- # userID_elapsed_median이 있어야 이동평균 계산 가능
- if 'question_class' not in df.columns:
- df = question_class(df)
- # Question Class 별 누적 풀이 수, 정답 수, 정답률
- df.sort_values(by=["question_class", "Timestamp"], inplace=True)
- df["question_class_correct_answer"] = df.groupby("question_class")["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- df["question_class_total_answer"] = df.groupby("question_class")["answerCode"].cumcount()
- df["question_class_acc"] = (df["question_class_correct_answer"] / df["question_class_total_answer"]).fillna(0)
- return df
-
-
-def userID_question_class_relative(df):
- # question_class 있어야 계산 가능
- if 'question_class' not in df.columns:
- df = question_class(df)
-
- # userID별 시간 순으로 정렬
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- # userID_question_class 키값 생성(temp)
- df["tmp"] = df[["userID", "question_class"]].apply(lambda data: str(data["userID"]) + "_" + data["question_class"], axis=1)
- # userID_question_class 별 누적 풀이 수, 정답 수, 정답률
- df["userID_question_class_total_answer"] = df.groupby("tmp")["answercode"].cumcount()
- df["userID_question_class_correct_answer"] = df.groupby("tmp")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df['userID_question_class_correct_answer'].fillna(0, inplace=True)
- df["userID_question_class_acc"] = df["userID_question_class_correct_answer"] / df["userID_question_class_total_answer"]
- df['userID_question_class_acc'].fillna(0, inplace=True)
- df.drop('tmp', axis=1, inplace=True)
- return df
-
-
-def userID_relative(df):
- # userID별 시간 순으로 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
- #user 별 누적 풀이 수, 정답 수, 정답률
- df["userID_correct_answer"] = df.groupby("userID")["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- df["userID_total_answer"] = df.groupby("userID")["answerCode"].cumcount()
- df["userID_acc"] = (df["userID_correct_answer"] / df["userID_total_answer"]).fillna(0)
- return df
-
-
-def userID_acc_rolling(df, window=5):
- # user_acc 있어야 이동평균 계산 가능
- if 'userID_acc' not in df.columns:
- df = userID_relative(df)
- # userID별 시간 순으로 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # userID별 정답률(user_acc)의 이동 평균
- df['userID_acc_rolling'] = df.groupby(['userID'])['userID_acc'].rolling(window).mean().values
- # userID별 window-1만큼 N/A data가 생김(rolling의 특성상 앞데이터에 생김)
- # userID별 user_acc_rolling의 중앙값으로 대체
- def changed_user_acc_rolling(data):
- return data["userID_acc_rolling_x"] if data["userID_acc_rolling_x"] != 'missing' else data["userID_acc_rolling_y"]
- user_median = df.groupby('userID')['userID_acc_rolling'].median()
- df = pd.merge(df, user_median, on=["userID"], how="left")
- # 결측치 중앙값 변환 및 임시 열 삭제
- df['userID_acc_rolling_x'] = df['userID_acc_rolling_x'].fillna('missing')
- df['userID_acc_rolling'] = df.progress_apply(changed_user_acc_rolling, axis=1)
- df.drop('userID_acc_rolling_x', axis=1, inplace=True)
- df.drop('userID_acc_rolling_y', axis=1, inplace=True)
- return df
-
-
-def userID_elapsed_median_rolling(df, window=5):
- # userID_elapsed_median이 있어야 이동평균 계산 가능
- if 'userID_elapsed_median' not in df.columns:
- df = userID_elapsed_median(df)
- # userID별 시간 순으로 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # userID별 문제 풀이 시간의 이동평균
- df['userID_elapsed_median_rolling'] = df.groupby(['userID'])['userID_elapsed_median'].rolling(window).mean().values
- # 유저별 window-1만큼 N/A data가 생김(rolling의 특성상 앞데이터에 생김)
- # 유저별 userID_elapsed_median_rolling의 중앙값으로 대체
- def changed_mean_time(data):
- return data["userID_elapsed_median_rolling_x"] if data["userID_elapsed_median_rolling_x"] != 'missing' else data["userID_elapsed_median_rolling_y"]
- user_median = df.groupby('userID')['userID_elapsed_median_rolling'].median()
- df = pd.merge(df, user_median, on=["userID"], how="left")
-
- # 결측치 중앙값 변환 및 임시 열 삭제
- df['userID_elapsed_median_rolling_x'] = df['userID_elapsed_median_rolling_x'].fillna('missing')
- df['userID_elapsed_median_rolling'] = df.progress_apply(changed_mean_time, axis=1)
- df.drop('userID_elapsed_median_rolling_x', axis=1, inplace=True)
- df.drop('userID_elapsed_median_rolling_y', axis=1, inplace=True)
- return df
-
-
-def assessmentItemID_time_relative(df):
- # 문제별 풀이 시간의 중앙값&평균값
- # userID_elapsed_median 있어야 assessmentItemID_time 계산 가능
- if 'userID_elapsed_median' not in df.columns:
- df = userID_elapsed_median(df)
- # assessmentItemID별 풀이 시간의 중앙값&평균값
- df_total_agg = df.copy()
- agg_df = df_total_agg.groupby('assessmentItemID')['userID_elapsed_median'].agg(['median', 'mean'])
- # mapping을 위해 pandas DataFrame을 dictionary형태로 변환
- agg_dict = agg_df.to_dict()
- # 구한 통계량을 각 사용자에게 mapping
- df['assessmentItemID_time_median'] = df_total_agg['assessmentItemID'].map(agg_dict['median'])
- df['assessmentItemID_time_mean'] = df_total_agg['assessmentItemID'].map(agg_dict['mean'])
- return df
-
-
-def userID_time_relative(df):
- # 유저별 풀이 시간의 중앙값&평균값
- # userID_elapsed_median 있어야 userID_time_relative 계산 가능
- if 'userID_elapsed_median' not in df.columns:
- df = userID_elapsed_median(df)
- # assessmentItemID별 풀이 시간의 중앙값&평균값
- df_total_agg = df.copy()
- agg_df = df_total_agg.groupby('userID')['userID_elapsed_median'].agg(['median', 'mean'])
- # mapping을 위해 pandas DataFrame을 dictionary형태로 변환
- agg_dict = agg_df.to_dict()
- # 구한 통계량을 각 사용자에게 mapping
- df['userID_time_median'] = df_total_agg['userID'].map(agg_dict['median'])
- df['userID_time_mean'] = df_total_agg['userID'].map(agg_dict['mean'])
- return df
-
-
-def userID_elapsed_normalize(df):
- # userID_elapsed_normalize 있어야 userID_elapsed_normalize 계산 가능
- if 'userID_elapsed_normalize' not in df.columns:
- df = userID_elapsed_median(df)
- df_total_norm = df.copy()
- df['userID_elapsed_normalize'] = df_total_norm.groupby('userID')['userID_elapsed_median'].transform(lambda x: (x - x.mean())/x.std())
- return df
-
-
-def lda_feature(df):
- df.reset_index(drop=True, inplace=True)
- if 'assessmentItemID_total_answer' not in df.columns:
- df = assessmentItemID_relative(df)
- if 'KnowledgeTag_total_answer' not in df.columns:
- df = KnowledgeTag_relative(df)
- if 'question_class_correct_answer' not in df.columns:
- df = question_class_relative(df)
- if 'userID_question_class_correct_answer' not in df.columns:
- df = userID_question_class_relative(df)
- # lda_latent_factor 변수
- lda = LDA(n_components=1)
- y = df['answerCode']
-
- #assessmentItemID_lda
- X = df[['assessmentItemID_total_answer', 'assessmentItemID_correct_answer','assessmentItemID_acc']]
- df['assessmentItemID_lda'] = lda.fit_transform(X, y)
- # KnowledgeTag_lda
- X = df[['KnowledgeTag_total_answer', 'KnowledgeTag_correct_answer','KnowledgeTag_acc']]
- df['KnowledgeTag_lda'] = lda.fit_transform(X, y)
- # question_class_lda
- X = df[['question_class_correct_answer', 'question_class_total_answer','question_class_acc']]
- df['question_class_lda'] = lda.fit_transform(X, y)
- # user_question_class_lda
- X = df[['userID_question_class_correct_answer', 'userID_question_class_total_answer','userID_question_class_acc']]
- df['userID_question_class_lda'] = lda.fit_transform(X, y)
- return df
-
-
-def find_time_difference(data):
- if data["userID"] == data["userID_shift"]:
- temp_time_difference = int(((data["Timestamp"] - data["next_timestamp"]) / pd.to_timedelta(1, unit='D')) * (60 * 60 * 24))
- if temp_time_difference > 600: # 10분 넘는 경우 # 변경 가능
- return 600
- elif temp_time_difference > 3600: # 1시간 넘는 경우 # 변경 가능:
- return 0
- return temp_time_difference
- else:
- return 0
-
-
-def feature_engineering_sun(df):
- # assessmentItemID, timestamp 기준 정렬
- df.sort_values(by=["KnowledgeTag", "Timestamp"], inplace=True)
-
- # KnowledgeTag 풀이 수, 정답 수, 정답률을 시간순으로 누적해서 계산
- df["KnowledgeTag_correct_answer"] = df.groupby("KnowledgeTag")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df["KnowledgeTag_total_answer"] = df.groupby("KnowledgeTag")["answercode"].cumcount()
- df["KnowledgeTag_acc"] = df["KnowledgeTag_correct_answer"] / df["KnowledgeTag_total_answer"]
-
- # assessmentItemID, timestamp 기준 정렬
- df.sort_values(by=["assessmentItemID", "Timestamp"], inplace=True)
-
- # assessmentItemID 풀이 수, 정답 수, 정답률을 시간순으로 누적해서 계산
- df["question_correct_answer"] = df.groupby("assessmentItemID")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df["question_total_answer"] = df.groupby("assessmentItemID")["answercode"].cumcount()
- df["question_acc"] = df["question_correct_answer"] / df["question_total_answer"]
-
- # question class
- df["question_class"] = df["assessmentItemID"].apply(lambda x: x[2])
- # user_question_class
- df["userID_question_class"] = df[["userID", "question_class"]].apply(lambda data: str(data["userID"]) + "_" + data["question_class"], axis=1)
-
- # question_class, timestamp 기준 정렬
- df.sort_values(by=["question_class", "Timestamp"], inplace=True)
-
- # question_class 정답 수, 풀이 수, 정답률을 시간순으로 누적해서 계산
- df["question_class_correct_answer"] = df.groupby("question_class")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df["question_class_total_answer"] = df.groupby("question_class")["answercode"].cumcount()
- df["question_class_acc"] = df["question_class_correct_answer"] / df["question_class_total_answer"]
-
- # assessmentItemID, timestamp 기준 정렬
- df.sort_values(by=["userID_question_class", "Timestamp"], inplace=True)
-
- # userID_question_class 정답 수, 풀이 수, 정답률을 시간순으로 누적해서 계산
- df["user_question_class_correct_answer"] = df.groupby("userID_question_class")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df["user_question_class_total_answer"] = df.groupby("userID_question_class")["answercode"].cumcount()
- df["user_question_class_acc"] = df["user_question_class_correct_answer"] / df["user_question_class_total_answer"]
-
- # user별 timestamp 기준 정렬
- df.sort_values(by=["userID", "Timestamp"], inplace=True)
-
- # user 문제 푼 시간 측정
- df["next_timestamp"] = df["Timestamp"].shift(-1)
- df["userID_shift"] = df["userID"].shift(-1)
- # 3min 25s 소요..
- df["time_difference"] = df[["userID", "userID_shift", "Timestamp", "next_timestamp"]].apply(find_time_difference, axis=1)
-
- # question class
- df["question_class"] = df["assessmentItemID"].apply(lambda x: x[2])
-
- # user의 문제 풀이 수, 정답 수, 정답률을 시간순으로 누적해서 계산
- df["user_correct_answer"] = df.groupby("userID")["answercode"].transform(lambda x: x.cumsum().shift(1))
- df["user_total_answer"] = df.groupby("userID")["answercode"].cumcount()
- df["user_acc"] = df["user_correct_answer"] / df["user_total_answer"]
-
- # testId 기준 mean, sumanswercode
- group_test = df.groupby(["testId"])["answercode"].agg(["mean", "sum"])
- group_test.columns = ["test_mean", "test_sum"]
- # knowledge_tag 기준 mean, sum
- group_tag = df.groupby(["KnowledgeTag"])["answercode"].agg(["mean", "sum"])
- group_tag.columns = ["tag_mean", "tag_sum"]
- # userID 기준 mean, sum
- group_user = df.groupby(["userID"])["answercode"].agg(["sum"])
- group_user.columns = ["user_count"]
- # question 기준 mean, sum
- group_question = df.groupby(["assessmentItemID"])["answercode"].agg(["mean", "sum"])
- group_question.columns = ["question_mean", "question_count"]
- # question class(assessmentItemID 두 번째 숫자) 기준 mean, sum
- group_question_class = df.groupby(["question_class"])["answercode"].agg(["mean", "sum"])
- group_question_class.columns = ["question_class_mean", "question_class_count"]
- # time_difference 기준 mean, median
- group_time_difference = df.groupby(["userID"])["time_difference"].agg(["mean", "median"])
- group_time_difference.columns = ["time_difference_mean", "time_difference_median"]
- # userID_question_class 기준 mean, sum
- group_user_question_class = df.groupby(["userID_question_class"])["answercode"].agg(["mean", "sum"])
- group_user_question_class.columns = ["user_question_class_mean", "user_question_class_count"]
-
- # merge
- df = pd.merge(df, group_test, on=["testId"], how="left")
- df = pd.merge(df, group_tag, on=["KnowledgeTag"], how="left")
- df = pd.merge(df, group_user, on=["userID"], how="left")
- df = pd.merge(df, group_question, on=["assessmentItemID"], how="left")
- df = pd.merge(df, group_question_class, on=["question_class"], how="left")
- df = pd.merge(df, group_time_difference, on=["userID"], how="left")
- df = pd.merge(df, group_user_question_class, on=["userID_question_class"], how="left")
-
- return df
-
-# ======================================================================================================= #
-
-# ACC 같은 민감한 정보를 Categorical로 바꾸는 함수
-def make_grade(data) :
- if data < 0.05 : return 0
- elif data < 0.10 : return 1
- elif data < 0.15 : return 2
- elif data < 0.20 : return 3
- elif data < 0.25 : return 4
- elif data < 0.30 : return 5
- elif data < 0.35 : return 6
- elif data < 0.40 : return 7
- elif data < 0.45 : return 8
- elif data < 0.50 : return 9
- elif data < 0.55 : return 10
- elif data < 0.60 : return 11
- elif data < 0.65 : return 12
- elif data < 0.70 : return 13
- elif data < 0.75 : return 14
- elif data < 0.80 : return 15
- elif data < 0.85 : return 16
- elif data < 0.90 : return 17
- elif data < 0.95 : return 18
- else : return 19
-
-
-# dataframe을 만들고 기본 세팅하는 함수 (1분 소요)
-def get_df() :
- dtype = {
- 'userID': 'int16',
- 'answerCode': 'int8',
- 'KnowledgeTag': 'int16'
- }
-
- DATA_PATH = '/opt/ml/input/data/train_dataset/train_data.csv'
- TEST_DATA_PATH = '/opt/ml/input/data/train_dataset/test_data.csv'
-
- train_df = pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
- train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- test_df = pd.read_csv(TEST_DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
- test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
-
- train_df['is_test'] = False
- test_df['is_test'] = True
-
- df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
- df['next_userID'] = df['userID'].shift(-1).fillna(9999)
-
- def answer_masking(df):
- if df['userID'] != df['next_userID']:
- return 1 if random.random() < 0.5 else 0
- else:
- return df['answerCode']
- df['masked_answer'] = df.apply(answer_masking, axis=1)
-
- return df
-
-
-# 문제의 난이도 Feature
-def get_question_grade(df):
- tmp_df = df.groupby('assessmentItemID')['masked_answer'].mean().reset_index()
- tmp_df.columns = ['assessmentItemID', 'question_grade']
- tmp_df['question_grade'] = tmp_df['question_grade'].apply(make_grade)
- df = pd.merge(left=df, right=tmp_df, on=['assessmentItemID'], how='left')
- return df
-
-
-# 문제의 번호 Feature
-def get_question_order(df) :
- df['question_order'] = df['assessmentItemID'].apply(lambda x : int(x[-3:]))
- return df
-
-
-# 문제의 대분류 Feature
-def get_question_large_cate(df) :
- df['question_large_cate'] = df.apply(lambda x : int(x['testId'][2]), axis=1)
- return df
-
-
-# User가 해당 대분류의 문제를 몇번 풀었는지 Feature
-def get_userID_cnt_item_in_largeCate(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- if 'question_large_cate' not in df.columns :
- df = get_question_large_cate(df)
- tmp_df = df.copy()
- tmp_df['tmp'] = tmp_df[["userID", "question_large_cate"]].apply(lambda data: str(data["userID"]) + "_" + str(data["question_large_cate"]), axis=1)
- df['userID_cnt_item_in_largeCate'] = tmp_df.groupby('tmp')['assessmentItemID'].cumcount()
- return df
-
-
-# User의 해당 대분류에 대한 정답률 Grade Feature
-def get_userID_answerRate_in_largeCate(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- if 'question_large_cate' not in df.columns :
- df = get_question_large_cate(df)
- tmp_df = df.copy()
- tmp_df['tmp'] = tmp_df.apply(lambda data: str(data["userID"]) + "_" + str(data["question_large_cate"]), axis=1)
- tmp_df['answers'] = tmp_df.groupby("tmp")["masked_answer"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- df['userID_answerRate_in_largeCate'] = (tmp_df['answers']/tmp_df['userID_cnt_item_in_largeCate']).fillna(1)
- df['userID_answerRate_in_largeCate'] = df['userID_answerRate_in_largeCate'].apply(make_grade)
- return df
-
-
-# User의 해당 Tag에 대한 정답률 Grade Feature
-def get_userID_answerRate_in_tag(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- tmp_df = df.copy()
- tmp_df['tmp'] = tmp_df.apply(lambda data: str(data["userID"]) + "_" + str(data["KnowledgeTag"]), axis=1)
- tmp_df['answers'] = tmp_df.groupby("tmp")["masked_answer"].transform(lambda x: x.cumsum().shift(1)).fillna(0)
- tmp_df['userID_cnt_item_in_tag'] = tmp_df.groupby('tmp')['assessmentItemID'].cumcount()
- df['userID_answerRate_in_tag'] = (tmp_df['answers']/tmp_df['userID_cnt_item_in_tag']).fillna(1)
- df['userID_answerRate_in_tag'] = df['userID_answerRate_in_tag'].apply(make_grade)
- return df
-
-
-# User가 해당 문제를 풀어본 경험 Feature
-def get_userID_question_experience(df):
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- df['userID_question_experience'] = df.groupby(["userID", "assessmentItemID"])['assessmentItemID'].cumcount()
- return df
-
-
-# User가 문제를 순서대로 접근하는지 Feature
-def get_question_solve_order(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- if 'question_order' not in df.columns :
- df = get_question_order(df)
- tmp_df = df.loc[:, ['userID', 'testId', 'question_order']].groupby(['userID', 'testId']).diff().fillna(1)
- df['question_solve_order'] = tmp_df['question_order'].apply(lambda x : 1 if x == 1 else 0)
- return df
-
-
-# 문제를 푸는데 걸리는 시간 Feature (UserID와 TestID 기준으로 구하고, Max 및 nan 값은 125로 사용)
-def get_userID_elapsed_by_test_125(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- diff = df.loc[:, ['userID', 'testId', 'Timestamp']].groupby(['userID','testId']).diff().shift(-1).fillna(pd.Timedelta(seconds=0))
- diff = diff.fillna(pd.Timedelta(seconds=0))
- diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
- df['elapsed_solving'] = diff
- df['elapsed_solving'] = df['elapsed_solving'].apply(lambda x : 125 if x == 0 or x > 125 else x)
- return df
-
-
-# User가 문제를 풀때 걸리는 시간의 중앙값 Feature
-def get_userID_elapsed_median(df) :
- if 'elapsed_solving' not in df.columns :
- df = get_userID_elapsed_by_test_125(df)
- tmp_df = df.groupby('userID')['elapsed_solving'].median()
- tmp_df.name = 'userID_elapsed_median'
- tmp_df = tmp_df.reset_index()
- df = pd.merge(left=df, right=tmp_df, on='userID', how='left')
- return df
-
-
-# 문제를 맞춘 사람과 못맞춘 사람이 걸리는 시간의 중앙값 Feature
-def get_question_elapsed_median(df) :
- tmp_df = df.groupby(['assessmentItemID','masked_answer']).agg({'elapsed_solving':'median'})
- tmp_df = tmp_df.reset_index()
- tmp_df_correct = tmp_df[tmp_df['masked_answer']==1]
- tmp_df_incorrect = tmp_df[tmp_df['masked_answer']==0]
- tmp_df_correct.columns = ['assessmentItemID', 'masked_answer', 'question_correct_elapsed_median']
- tmp_df_incorrect.columns = ['assessmentItemID', 'masked_answer', 'question_incorrect_elapsed_median']
- tmp_df_correct = tmp_df_correct.drop('masked_answer', axis=1)
- tmp_df_incorrect = tmp_df_incorrect.drop('masked_answer', axis=1)
- df = pd.merge(left=df, right=tmp_df_correct, on=['assessmentItemID'], how='left')
- df = pd.merge(left=df, right=tmp_df_incorrect, on=['assessmentItemID'], how='left')
- return df
-
-
-# 문제를 접근한 날짜 Feature (월단위)
-def get_question_solve_month(df) :
- df["question_solve_month"] = df["Timestamp"].apply(lambda x: x.month)
- return df
-
-
-# User가 이전에 몇 문제를 풀었는지 Feature
-def get_userID_cnt_items(df) :
- df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
- df['userID_cnt_items'] = df.groupby("userID")["assessmentItemID"].cumcount()
- return df
-
-
-# User가 이전에 몇개의 시험지를 풀었는지 Feature
-def get_userID_cnt_tests(df) :
- tmp_df = df[['userID','testId']]
- tmp_df = tmp_df.drop_duplicates()
- tmp_df['userID_cnt_tests'] = tmp_df.groupby('userID')['testId'].cumcount()
- df = pd.merge(left=df, right=tmp_df, on=['userID','testId'], how='left')
- return df
-
-
-# User가 이전에 몇개의 Tag를 풀었는지 Feature
-def get_userID_cnt_tags(df) :
- tmp_df = df[['userID','KnowledgeTag']]
- tmp_df = tmp_df.drop_duplicates()
- tmp_df['userID_cnt_tags'] = tmp_df.groupby('userID')['KnowledgeTag'].cumcount()
- df = pd.merge(left=df, right=tmp_df, on=['userID','KnowledgeTag'], how='left')
- return df
-
-
-# Dataframe부터, 전체 Feature Engineering까지 수행되는 Code
-def get_all() :
- df = get_df()
- df = get_question_grade(df)
- df = get_question_order(df)
- df = get_question_large_cate(df)
- df = get_userID_cnt_item_in_largeCate(df)
- df = get_userID_answerRate_in_largeCate(df)
- df = get_userID_answerRate_in_tag(df)
- df = get_userID_question_experience(df)
- df = get_question_solve_order(df)
- df = get_userID_elapsed_by_test_125(df)
- df = get_userID_elapsed_median(df)
- df = get_question_elapsed_median(df)
- df = get_question_solve_month(df)
- df = get_userID_cnt_items(df)
- df = get_userID_cnt_tests(df)
- df = get_userID_cnt_tags(df)
- return df
diff --git a/code/baseline/inference.py b/code/baseline/inference.py
deleted file mode 100644
index 246f906..0000000
--- a/code/baseline/inference.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-from args import parse_args
-from dkt.dataloader import Preprocess
-from dkt import trainer
-import torch
-def main(args):
- device = "cuda" if torch.cuda.is_available() else "cpu"
- args.device = device
-
- args.data_dir = os.environ.get('SM_CHANNEL_EVAL', args.data_dir)
- args.model_dir = os.environ.get('SM_CHANNEL_MODEL', args.model_dir)
- args.output_dir = os.environ.get('SM_OUTPUT_DATA_DIR ', args.output_dir)
- preprocess = Preprocess(args)
- preprocess.load_test_data(args.test_file_name)
- test_data = preprocess.get_test_data()
-
-
- trainer.inference(args, test_data)
-
-
-if __name__ == "__main__":
- args = parse_args(mode='train')
- os.makedirs(args.model_dir, exist_ok=True)
- main(args)
\ No newline at end of file
diff --git a/code/baseline/lgbm_baseline.ipynb b/code/baseline/lgbm_baseline.ipynb
deleted file mode 100644
index 514f51d..0000000
--- a/code/baseline/lgbm_baseline.ipynb
+++ /dev/null
@@ -1,582 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## LGBM Baseline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-05-24T09:49:29.375544Z",
- "start_time": "2021-05-24T09:49:28.999092Z"
- }
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import os\n",
- "import random\n",
- "import warnings\n",
- "import lightgbm as lgb\n",
- "from wandb.lightgbm import wandb_callback\n",
- "from sklearn.metrics import roc_auc_score\n",
- "from sklearn.metrics import accuracy_score\n",
- "import numpy as np\n",
- "import random\n",
- "from matplotlib import pylab as plt\n",
- "from lgbm_function import inference, set_params, custom_train_test_split\n",
- "from feature_engineering import feature_engineering\n",
- "from datetime import datetime\n",
- "import wandb\n",
- "\n",
- "%matplotlib inline\n",
- "warnings.filterwarnings('ignore')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. 데이터 로딩"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-05-24T09:49:29.678737Z",
- "start_time": "2021-05-24T09:49:29.376581Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(2266586, 6)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " userID | \n",
- " assessmentItemID | \n",
- " testId | \n",
- " answerCode | \n",
- " Timestamp | \n",
- " KnowledgeTag | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " A060001001 | \n",
- " A060000001 | \n",
- " 1 | \n",
- " 2020-03-24 00:17:11 | \n",
- " 7224 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0 | \n",
- " A060001002 | \n",
- " A060000001 | \n",
- " 1 | \n",
- " 2020-03-24 00:17:14 | \n",
- " 7225 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0 | \n",
- " A060001003 | \n",
- " A060000001 | \n",
- " 1 | \n",
- " 2020-03-24 00:17:22 | \n",
- " 7225 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0 | \n",
- " A060001004 | \n",
- " A060000001 | \n",
- " 1 | \n",
- " 2020-03-24 00:17:29 | \n",
- " 7225 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0 | \n",
- " A060001005 | \n",
- " A060000001 | \n",
- " 1 | \n",
- " 2020-03-24 00:17:36 | \n",
- " 7225 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " userID assessmentItemID testId answerCode Timestamp \\\n",
- "0 0 A060001001 A060000001 1 2020-03-24 00:17:11 \n",
- "1 0 A060001002 A060000001 1 2020-03-24 00:17:14 \n",
- "2 0 A060001003 A060000001 1 2020-03-24 00:17:22 \n",
- "3 0 A060001004 A060000001 1 2020-03-24 00:17:29 \n",
- "4 0 A060001005 A060000001 1 2020-03-24 00:17:36 \n",
- "\n",
- " KnowledgeTag \n",
- "0 7224 \n",
- "1 7225 \n",
- "2 7225 \n",
- "3 7225 \n",
- "4 7225 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_dir = '/opt/ml/input/data/train_dataset'\n",
- "csv_file_path = os.path.join(data_dir, 'train_data.csv')\n",
- "df = pd.read_csv(csv_file_path, parse_dates=['Timestamp'])\n",
- "print(df.shape)\n",
- "df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Feature Engineering"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-05-24T09:49:29.683739Z",
- "start_time": "2021-05-24T09:49:28.981Z"
- }
- },
- "outputs": [],
- "source": [
- "%%time\n",
- "df = feature_engineering(df)\n",
- "df.head(2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Cross Validation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 유저별 분리\n",
- "train_lst, test_lst = custom_train_test_split(df)\n",
- "\n",
- "# 사용할 Feature 설정\n",
- "FEATS = [\"user_acc\", \"user_mean\", \"user_count\", \"user_correct_answer\", \"question_mean\", \"question_class_mean\"]\n",
- "\n",
- "# set parameters\n",
- "params = set_params()\n",
- "\n",
- "# \"test_sum\", \"question_class_count\", \"tag_sum\", \"question_count\", \"tag_mean\", \"test_mean\",\n",
- "\n",
- "for fold_num, (train, test) in enumerate(zip(train_lst, test_lst)):\n",
- " print(\"@\"*50)\n",
- " print(fold_num, \"번째 fold\")\n",
- " print(\"@\"*50)\n",
- " \n",
- " # X, y 값 분리\n",
- " y_train = train[\"answerCode\"]\n",
- " train = train.drop([\"answerCode\"], axis=1)\n",
- "\n",
- " y_test = test[\"answerCode\"]\n",
- " test = test.drop([\"answerCode\"], axis=1)\n",
- " \n",
- " print(\"=\"*30)\n",
- " print(\"train, test shape\")\n",
- " print(train.shape, test.shape)\n",
- " print(\"=\"*30)\n",
- " print()\n",
- " \n",
- " lgb_train = lgb.Dataset(train[FEATS], y_train)\n",
- " lgb_test = lgb.Dataset(test[FEATS], y_test)\n",
- " \n",
- " now = datetime.now()\n",
- " wandb.init(project='P4-DKT', config=params, entity=\"team-ikyo\")\n",
- " wandb.run.name = \"sun-lgbm-fold\" + str(fold_num) + \" time: \" + \" \".join(map(str, [now.month, now.day, now.hour, now.minute]))\n",
- " \n",
- " # train\n",
- " model = lgb.train(params,\n",
- " lgb_train,\n",
- " valid_sets = [lgb_train, lgb_test],\n",
- " verbose_eval = 100,\n",
- " callbacks=[wandb_callback()])\n",
- "\n",
- " preds = model.predict(test[FEATS])\n",
- " acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))\n",
- " auc = roc_auc_score(y_test, preds)\n",
- "\n",
- " print(f'VALID AUC : {auc} ACC : {acc}\\n')\n",
- " \n",
- " # show feature importance\n",
- " fig, ax = plt.subplots(figsize=(6,12))\n",
- " lgb.plot_importance(model, max_num_features=100, height=0.8, ax=ax)\n",
- " plt.show()\n",
- " \n",
- " # inference\n",
- " inference(FEATS, model, auc, acc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Result to csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Exception in thread Thread-7:\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/conda/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n",
- " self.run()\n",
- " File \"/opt/conda/lib/python3.7/threading.py\", line 870, in run\n",
- " self._target(*self._args, **self._kwargs)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_run.py\", line 180, in check_network_status\n",
- " status_response = self._interface.communicate_network_status()\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 747, in communicate_network_status\n",
- " resp = self._communicate(req, timeout=timeout, local=True)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 537, in _communicate\n",
- " return self._communicate_async(rec, local=local).get(timeout=timeout)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 542, in _communicate_async\n",
- " raise Exception(\"The wandb backend process has shutdown\")\n",
- "Exception: The wandb backend process has shutdown\n",
- "\n",
- "Exception in thread Thread-6:\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/conda/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n",
- " self.run()\n",
- " File \"/opt/conda/lib/python3.7/threading.py\", line 870, in run\n",
- " self._target(*self._args, **self._kwargs)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_run.py\", line 198, in check_status\n",
- " status_response = self._interface.communicate_stop_status()\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 735, in communicate_stop_status\n",
- " resp = self._communicate(req, timeout=timeout, local=True)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 537, in _communicate\n",
- " return self._communicate_async(rec, local=local).get(timeout=timeout)\n",
- " File \"/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py\", line 542, in _communicate_async\n",
- " raise Exception(\"The wandb backend process has shutdown\")\n",
- "Exception: The wandb backend process has shutdown\n",
- "\n"
- ]
- }
- ],
- "source": [
- "from glob import glob\n",
- "import pandas as pd\n",
- "\n",
- "output_path = \"/opt/ml/code/output/cross_validation/output.csv\"\n",
- "csv_file_path_list = glob(\"/opt/ml/code/output/*.csv\")\n",
- "print(csv_file_path_list)\n",
- "\n",
- "# concat result dataframe\n",
- "result = pd.read_csv(csv_file_path_list[0])[\"prediction\"]\n",
- "for csv_file_path in csv_file_path_list[1:]:\n",
- " result = pd.concat([result, pd.read_csv(csv_file_path)[\"prediction\"]], axis=1)\n",
- "\n",
- "# mean result dataframe\n",
- "result = pd.DataFrame(result.mean(axis=1)).reset_index().rename(columns = {0:\"prediction\", \"index\":\"id\"})\n",
- "result.to_csv(output_path, index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Grid Search"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 485,
- "metadata": {},
- "outputs": [],
- "source": [
- "FEATS = [\"user_correct_answer\", \"time_difference\",\n",
- " \"user_acc\", \"test_mean\", \"test_sum\", \n",
- " \"tag_mean\", \"tag_sum\", \"user_mean\", \"user_count\",\n",
- " \"question_mean\", \"question_count\", \"question_class_mean\", \"question_class_count\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 488,
- "metadata": {},
- "outputs": [],
- "source": [
- "grid_FEATS = [[\"user_correct_answer\", \"time_difference\",\n",
- " \"user_acc\", \"test_mean\", \"test_sum\", \n",
- " \"tag_mean\", \"tag_sum\", \"user_mean\", \"user_count\",\n",
- " \"question_mean\", \"question_count\", \"question_class_mean\", \"question_class_count\"]]\n",
- "\n",
- "for comb_num in range(6, 13, 2):\n",
- " for features in list(combinations(FEATS, comb_num)):\n",
- " grid_FEATS.append(list(features))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 367,
- "metadata": {},
- "outputs": [],
- "source": [
- "# for FEATS in grid_FEATS:\n",
- "# # 유저별 분리\n",
- "# train, test = custom_train_test_split(df)\n",
- "\n",
- "# # X, y 값 분리\n",
- "# y_train = train['answerCode']\n",
- "# train = train.drop(['answerCode'], axis=1)\n",
- "\n",
- "# y_test = test['answerCode']\n",
- "# test = test.drop(['answerCode'], axis=1)\n",
- " \n",
- "# params = {}\n",
- "# params[\"boosting_type\"] = \"gbdt\" # gbdt, dart, goss\n",
- "# params[\"learning_rate\"] = 1e-1 # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3\n",
- "# params[\"objective\"] = \"binary\"\n",
- "# params[\"metric\"] = \"auc\" # binary_logloss, rmse, huber, auc\n",
- "# params[\"num_iterations\"] = 1000 # 100\n",
- "# params[\"max_depth\"] = 5 # -1\n",
- "# params[\"num_leaves\"] = 10 # 31 이상적으로 num_leaves값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다.\n",
- "# params[\"min_data_in_leaf\"] = 10000 # 20 100 ~ 1000 수백 또는 수천 개로 정하는 것\n",
- "# params[\"max_bin\"] = 16 # 256\n",
- "# params[\"min_split_gain\"] = 1e-2 # ?\n",
- "# params[\"scale_pos_weight\"] = 1.1 # 1.1~1.5\n",
- "# params[\"tree_learner\"] = \"serial\" # serial, feature, data, voting\n",
- "# params[\"early_stopping_rounds\"] = 50\n",
- "# params[\"bagging_fraction\"] = 0.8 # 1.0\n",
- "# params[\"lambda_l1\"] = 1e-1 # 0.0\n",
- "# params[\"lambda_l2\"] = 1e-1 # 0.0\n",
- "\n",
- "# print(\"=\"*30)\n",
- "# print(\"=\"*30)\n",
- "# print(FEATS)\n",
- "# print(\"|\"*30)\n",
- "# print(params)\n",
- "# print(\"|\"*30)\n",
- "# lgb_train = lgb.Dataset(train[FEATS], y_train)\n",
- "# lgb_test = lgb.Dataset(test[FEATS], y_test)\n",
- "\n",
- "# model = lgb.train(params,\n",
- "# lgb_train,\n",
- "# valid_sets = [lgb_train, lgb_test],\n",
- "# verbose_eval = 500)\n",
- "\n",
- "# preds = model.predict(test[FEATS])\n",
- "# acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))\n",
- "# auc = roc_auc_score(y_test, preds)\n",
- "\n",
- "# print(f'VALID AUC : {auc} ACC : {acc}\\n')\n",
- "\n",
- "# # LOAD TESTDATA\n",
- "# test_csv_file_path = os.path.join(data_dir, 'test_data.csv')\n",
- "# test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])\n",
- "\n",
- "# # FEATURE ENGINEERING\n",
- "# test_df = feature_engineering(test_df)\n",
- "\n",
- "# # LEAVE LAST INTERACTION ONLY\n",
- "# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]\n",
- "\n",
- "# # DROP ANSWERCODE\n",
- "# test_df = test_df.drop(['answerCode'], axis=1)\n",
- "\n",
- "# # MAKE PREDICTION\n",
- "# total_preds = model.predict(test_df[FEATS])\n",
- "\n",
- "# # SAVE OUTPUT\n",
- "# output_dir = 'output/'\n",
- "# write_path = os.path.join(output_dir, f\"lgbm/output_VALID_AUC_{round(auc, 4)}_ACC_{round(acc, 4)}.csv\")\n",
- "# if not os.path.exists(output_dir):\n",
- "# os.makedirs(output_dir) \n",
- "# with open(write_path, 'w', encoding='utf8') as w:\n",
- "# print(\"writing prediction : {}\".format(write_path))\n",
- "# w.write(\"id,prediction\\n\")\n",
- "# for id, p in enumerate(total_preds):\n",
- "# w.write('{},{}\\n'.format(id,p))\n",
- "# print(\"=\"*30)\n",
- "# print(\"=\"*30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.7"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": false,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {},
- "toc_section_display": true,
- "toc_window_display": true
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/code/baseline/new_model.py b/code/baseline/new_model.py
deleted file mode 100644
index 70f201a..0000000
--- a/code/baseline/new_model.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import torch
-import torch.nn as nn
-
-try:
- from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
-except:
- from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel
-
-
-class LSTMATTN(nn.Module):
-
- def __init__(self, args):
- super(LSTMATTN, self).__init__()
- self.args = args
- self.device = args.device
-
- self.hidden_dim = self.args.hidden_dim
- self.n_layers = self.args.n_layers
- self.n_heads = self.args.n_heads
- self.drop_out = self.args.drop_out
-
- # Embedding
- # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)
- self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
- self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
- self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
- self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
-
- # embedding combination projection
- self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)
-
- self.lstm = nn.LSTM(self.hidden_dim,
- self.hidden_dim,
- self.n_layers,
- batch_first=True)
-
- self.config = BertConfig(
- 3, # not used
- hidden_size=self.hidden_dim,
- num_hidden_layers=1,
- num_attention_heads=self.n_heads,
- intermediate_size=self.hidden_dim,
- hidden_dropout_prob=self.drop_out,
- attention_probs_dropout_prob=self.drop_out,
- )
- self.attn = BertEncoder(self.config)
-
- # Fully connected layer
- self.fc = nn.Linear(self.hidden_dim, 1)
-
- self.activation = nn.Sigmoid()
-
- def init_hidden(self, batch_size):
- h = torch.zeros(
- self.n_layers,
- batch_size,
- self.hidden_dim)
- h = h.to(self.device)
-
- c = torch.zeros(
- self.n_layers,
- batch_size,
- self.hidden_dim)
- c = c.to(self.device)
-
- return (h, c)
-
- def forward(self, input):
-
- test, question, tag, _, mask, interaction, _ = input
-
- batch_size = interaction.size(0)
-
- # Embedding
-
- embed_interaction = self.embedding_interaction(interaction)
- embed_test = self.embedding_test(test)
- embed_question = self.embedding_question(question)
- embed_tag = self.embedding_tag(tag)
-
-
- embed = torch.cat([embed_interaction,
- embed_test,
- embed_question,
- embed_tag,], 2)
-
- X = self.comb_proj(embed)
-
- hidden = self.init_hidden(batch_size)
- out, hidden = self.lstm(X, hidden)
- out = out.contiguous().view(batch_size, -1, self.hidden_dim)
-
- extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)
- extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
- extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
- head_mask = [None] * self.n_layers
-
- encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask)
- sequence_output = encoded_layers[-1]
-
- out = self.fc(sequence_output)
-
- preds = self.activation(out).view(batch_size, -1)
-
- return preds
-
-
-class Bert(nn.Module):
-
- def __init__(self, args):
- super(Bert, self).__init__()
- self.args = args
- self.device = args.device
-
- # Defining some parameters
- self.hidden_dim = self.args.hidden_dim
- self.n_layers = self.args.n_layers
-
- # Embedding
- # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)
- self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
- self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
- self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
- self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
-
- # embedding combination projection
- self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)
-
- # Bert config
- self.config = BertConfig(
- 3, # not used
- hidden_size=self.hidden_dim,
- num_hidden_layers=self.args.n_layers,
- num_attention_heads=self.args.n_heads,
- max_position_embeddings=self.args.max_seq_len
- )
-
- # Defining the layers
- # Bert Layer
- self.encoder = BertModel(self.config)
-
- # Fully connected layer
- self.fc = nn.Linear(self.args.hidden_dim, 1)
-
- self.activation = nn.Sigmoid()
-
-
- def forward(self, input):
- test, question, tag, _, mask, interaction, _ = input
- batch_size = interaction.size(0)
-
- # 신나는 embedding
-
- embed_interaction = self.embedding_interaction(interaction)
- embed_test = self.embedding_test(test)
- embed_question = self.embedding_question(question)
- embed_tag = self.embedding_tag(tag)
-
- embed = torch.cat([embed_interaction,
-
- embed_test,
- embed_question,
-
- embed_tag,], 2)
-
- X = self.comb_proj(embed)
-
- # Bert
- encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask)
- out = encoded_layers[0]
- out = out.contiguous().view(batch_size, -1, self.hidden_dim)
- out = self.fc(out)
- preds = self.activation(out).view(batch_size, -1)
-
- return preds
\ No newline at end of file
diff --git a/code/baseline/requirements.txt b/code/baseline/requirements.txt
deleted file mode 100644
index c2bd6f9..0000000
--- a/code/baseline/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-torch
-pandas
-sklearn
-tqdm
-wandb
diff --git a/code/baseline/train.py b/code/baseline/train.py
deleted file mode 100644
index 1cc3bfc..0000000
--- a/code/baseline/train.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-from args import parse_args
-from dkt.dataloader import Preprocess
-from dkt import trainer
-import torch
-from dkt.utils import setSeeds
-import wandb
-def main(args):
- wandb.login()
-
- setSeeds(args.seed)
- device = "cuda" if torch.cuda.is_available() else "cpu"
- args.device = device
-
- args.data_dir = os.environ.get('SM_CHANNEL_TRAIN', args.data_dir)
- args.model_dir = os.environ.get('SM_MODEL_DIR', args.model_dir)
-
-
- preprocess = Preprocess(args)
- preprocess.load_train_data(args.file_name)
- train_data = preprocess.get_train_data()
-
- train_data, valid_data = preprocess.split_data(train_data)
-
- wandb.init(project='P4-DKT', entity='team-ikyo', name=args.run_name, config=vars(args))
- trainer.run(args, train_data, valid_data)
-
-
-if __name__ == "__main__":
- args = parse_args(mode='train')
- os.makedirs(args.model_dir, exist_ok=True)
- main(args)