diff --git a/01-intro/DataHandling_HW.ipynb b/01-intro/DataHandling_HW.ipynb index 34539c5..96fe35e 100644 --- a/01-intro/DataHandling_HW.ipynb +++ b/01-intro/DataHandling_HW.ipynb @@ -52,29 +52,29 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4CWHxPY62I-D", - "outputId": "ee47fd61-089c-4236-8093-b9c2ce24fe57" + "outputId": "32b5c8e3-3935-4e84-bb3e-b6a704cd2800" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "--2023-01-03 06:42:07-- https://raw.githubusercontent.com/HSE-LAMBDA/MLDM-2022/main/01-intro/train.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...\n", + "--2023-01-21 09:42:52-- https://raw.githubusercontent.com/HSE-LAMBDA/MLDM-2022/main/01-intro/train.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 60302 (59K) [text/plain]\n", "Saving to: ‘train.csv’\n", "\n", - "\rtrain.csv 0%[ ] 0 --.-KB/s \rtrain.csv 100%[===================>] 58.89K --.-KB/s in 0.01s \n", + "train.csv 100%[===================>] 58.89K --.-KB/s in 0.01s \n", "\n", - "2023-01-03 06:42:07 (5.13 MB/s) - ‘train.csv’ saved [60302/60302]\n", + "2023-01-21 09:42:53 (4.31 MB/s) - ‘train.csv’ saved [60302/60302]\n", "\n" ] } @@ -85,11 +85,249 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { - "id": "G36oVo3RJXVc" + "id": "G36oVo3RJXVc", + "outputId": "37cc3aab-4011-4757-d863-f904e2104996", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 281 + } }, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Survived Pclass \\\n", + "PassengerId \n", + "1 0 3 \n", + "2 1 1 \n", + "3 1 3 \n", + "4 1 1 \n", + "5 0 3 \n", + "\n", + " Name Sex Age \\\n", + "PassengerId \n", + "1 Braund, Mr. Owen Harris male 22.0 \n", + "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", + "3 Heikkinen, Miss. Laina female 26.0 \n", + "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", + "5 Allen, Mr. William Henry male 35.0 \n", + "\n", + " SibSp Parch Ticket Fare Cabin Embarked \n", + "PassengerId \n", + "1 1 0 A/5 21171 7.2500 NaN S \n", + "2 1 0 PC 17599 71.2833 C85 C \n", + "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", + "4 1 0 113803 53.1000 C123 S \n", + "5 0 0 373450 8.0500 NaN S " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -129,13 +367,13 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p7O68NpyJXWw", - "outputId": "6067c134-1a98-4c75-81a6-1649fd364dea" + "outputId": "98105ceb-059a-4846-af17-777492945987" }, "outputs": [ { @@ -186,7 +424,7 @@ "id": "3LD_acbAqRYP", "outputId": "2c330d3e-5ba1-450c-dd67-1320eeb4ea89" }, - "execution_count": 92, + "execution_count": null, "outputs": [ { "output_type": "execute_result", @@ -202,14 +440,14 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 297 + "height": 296 }, "id": "FMGlVI6SuKRh", - "outputId": "ac412eee-5c00-43ec-a5cf-f9c962f4301c" + "outputId": "243587a5-2b86-4dd9-f756-d4bc2a430f18" }, "outputs": [ { @@ -220,7 +458,7 @@ ] }, "metadata": {}, - "execution_count": 101 + "execution_count": 5 }, { "output_type": "display_data", @@ -261,11 +499,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": { - "id": "JhbbBk93JXXV" + "id": "JhbbBk93JXXV", + "outputId": "6d102f9b-78d2-4eca-d2c9-faa5df7cec1b", + "colab": { + "base_uri": "https://localhost:8080/" + } }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test accuracy: 0.79\n" + ] + } + ], "source": [ "# Build a model with KNeighborsClassifier to get the accuracy of\n", "# at least 0.75 on the validation part of the dataset\n", @@ -274,18 +524,17 @@ "from sklearn.metrics import accuracy_score\n", "\n", "def feature_selection_and_preprocessing(dataset):\n", - " # \n", - " # E.g.:\n", - " features = dataset[[\"Fare\", \"Parch\"]].copy()\n", - " features[\"Fare\"] /= features[\"Fare\"].mean()\n", - "\n", - " features['Fare'] *= 1\n", - " return features\n", + " # \n", + " x = dataset.copy()\n", + " x = x[[\"Pclass\",\"Sex\",\"Age\",\"SibSp\",\"Parch\"]]\n", + " x[\"Age\"] = x[\"Age\"].fillna(x[\"Age\"].mean())\n", + " x[\"Sex\"].replace({\"female\": 0,\"male\": 1}, inplace=True)\n", + " return x\n", "\n", "model = KNeighborsClassifier(\n", " # \n", " # E.g.\n", - " n_neighbors=1\n", + " n_neighbors=3\n", ")\n", "\n", "\n", @@ -327,22 +576,49 @@ "id": "jZjuqx3K3KiD" }, "source": [ - "Check how your model from the previous task performs on randomized splits to train / test (with test set of size 100). Plot the histogram of the test error distribution.\n", + ":Check how your model from the previous task performs on randomized splits to train / test (with test set of size 100). Plot the histogram of the test error distribution.\n", "\n", "*Hint: check sklearn's `sklearn.model_selection.train_test_split` function.*" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { - "id": "WB6SCY2I3Dgb" + "id": "WB6SCY2I3Dgb", + "outputId": "bbc75686-4373-439d-8c3f-f41df99a0e60", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 293 + } }, - "outputs": [], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZcAAAEUCAYAAADnQnt7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAANzElEQVR4nO3dbaxkd10H8O+PXSgaVFp6g4UStk0QbDRpyQaJJCIVoYBpMaIuCVq0poIPwajREt6oiRF8IcZogg0iKKYFi8QKElJoG0JCi1ssD20t3QJGsLILCNoYKw9/X8xZGJd7987a37n3zvbzSW7uzDlnznzvf0/nO+fMmdMaYwQAOj1stwMAcPpRLgC0Uy4AtFMuALRTLgC02z/HSs8+++xx4MCBOVYNwB5x2223fW6MsbHZvFnK5cCBAzl8+PAcqwZgj6iqf95qnsNiALRTLgC0Uy4AtFMuALRTLgC0Uy4AtFMuALRTLgC0Uy4AtFMuALSb5fIv8FBz4Kp37naEr/vUq1+w2xHAngsA/ZQLAO2UCwDtlAsA7ZQLAO2UCwDtlAsA7ZQLAO2UCwDtlAsA7ZQLAO2UCwDtlAsA7ZQLAO2UCwDtVi6XqtpXVf9YVe+YMxAA6+9U9lxekeSuuYIAcPpYqVyq6twkL0jy+nnjAHA6WHXP5Q+T/EaSr221QFVdWVWHq+rwsWPHWsIBsJ62LZeq+pEkR8cYt51suTHG1WOMg2OMgxsbG20BAVg/q+y5PCPJpVX1qSTXJrm4qt48ayoA1tq25TLGeOUY49wxxoEkh5LcOMZ4yezJAFhbvucCQLv9p7LwGOPmJDfPkgSA04Y9FwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANopFwDaKRcA2ikXANptWy5V9ciq+mBVfbiq7qiq396JYACsr/0rLPNAkovHGPdX1cOTvL+q3jXGuGXmbACsqW3LZYwxktw/3X349DPmDAXAelvpM5eq2ldVtyc5muSGMcatmyxzZVUdrqrDx44d684JwBpZqVzGGF8dY1yY5NwkT6uq79lkmavHGAfHGAc3Nja6cwKwRk7pbLExxheT3JTkknniAHA6WOVssY2qevR0+1uS/HCSf5o7GADra5Wzxc5J8qaq2pdFGb11jPGOeWMBsM5WOVvsI0ku2oEsAJwmfEMfgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB225ZLVT2hqm6qqjur6o6qesVOBANgfe1fYZmvJPm1McaHqurbktxWVTeMMe6cORsAa2rbPZcxxn1jjA9Nt/8zyV1JHj93MADW1yl95lJVB5JclOTWOcIAcHpYuVyq6lFJ3pbkV8YY/7HJ/Cur6nBVHT527FhnRgDWzErlUlUPz6JY/mqM8TebLTPGuHqMcXCMcXBjY6MzIwBrZpWzxSrJnyW5a4zxB/NHAmDdrbLn8owkP5Xk4qq6ffp5/sy5AFhj256KPMZ4f5LagSwAnCZ8Qx+AdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB2ygWAdsoFgHbKBYB225ZLVb2hqo5W1cd2IhAA62+VPZc3Jrlk5hwAnEa2LZcxxvuSfGEHsgBwmmj7zKWqrqyqw1V1+NixY12rBWANtZXLGOPqMcbBMcbBjY2NrtUCsIacLQZAO+UCQLtVTkW+JskHkjy5qj5dVVfMHwuAdbZ/uwXGGC/eiSAAnD4cFgOgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoJ1yAaCdcgGgnXIBoN3+3Q4A8FBx4Kp37naEr/vUq18w6/rtuQDQTrkA0E65ANBOuQDQTrkA0E65ANBOuQDQTrkA0E65ANBuz35Df698k3Xub7ECnI7suQDQTrkA0E65ANBOuQDQTrkA0E65ANBOuQDQTrkA0G6lcqmqS6rq7qo6UlVXzR0KgPW2bblU1b4kf5LkeUkuSPLiqrpg7mAArK9V9lyeluTIGOMTY4z/SXJtksvmjQXAOlvl2mKPT/IvS/c/neT7Tlyoqq5McuV09/6quvtBZjs7yece5DoetHrNSovtiawrknUeeyarbXZXrU3Wek1L1iduNaPtwpVjjKuTXN21vqo6PMY42LW+Ock6D1nns055ZZ3H3FlXOSz2mSRPWLp/7jQNADa1Srn8Q5InVdV5VfWIJIeSXD9vLADW2baHxcYYX6mqX0ry7iT7krxhjHHH7MkaD7HtAFnnIet81imvrPOYNWuNMeZcPwAPQb6hD0A75QJAu10tl6r68aq6o6q+VlVbnhK31eVnppMMbp2mv2U64WCurGdV1Q1Vdc/0+8xNlnlWVd2+9PPfVfXCad4bq+qTS/Mu3M2s03JfXcpz/dL0vTauF1bVB6Zt5SNV9ZNL82Yf1+0uf1RVZ0zjdGQatwNL8145Tb+7qp7bne3/kfVXq+rOaRzfW1VPXJq36fawi1lfWlXHljL93NK8y6dt5p6qunwPZH3tUs6PV9UXl+bt9Li+oaqOVtXHtphfVfVH09/ykap66tK8vnEdY+zaT5LvTvLkJDcnObjFMvuS3Jvk/CSPSPLhJBdM896a5NB0+3VJXj5j1t9PctV0+6okr9lm+bOSfCHJt07335jkRTs0ritlTXL/FtP31Lgm+a4kT5puPy7JfUkevRPjerLtb2mZX0jyuun2oSRvmW5fMC1/RpLzpvXs2+Wsz1raJl9+POvJtoddzPrSJH+8yWPPSvKJ6feZ0+0zdzPrCcv/chYnPu34uE7P9wNJnprkY1vMf36SdyWpJE9Pcusc47qrey5jjLvGGNt9k3/Ty89UVSW5OMl103JvSvLC+dLmsuk5Vn2uFyV51xjjv2bMtJVTzfp1e3FcxxgfH2PcM93+1yRHk2zMmGnZKpc/Wv4brkvyQ9M4Xpbk2jHGA2OMTyY5Mq1v17KOMW5a2iZvyeJ7a7vhwVxW6rlJbhhjfGGM8e9JbkhyyUw5k1PP+uIk18yY56TGGO/L4o3tVi5L8hdj4ZYkj66qc9I8ruvwmctml595fJLHJPniGOMrJ0yfy2PHGPdNt/8tyWO3Wf5QvnkD+91pN/S1VXVGe8JvWDXrI6vqcFXdcvzwXfb4uFbV07J493jv0uQ5x3Wr7W/TZaZx+1IW47jKYzud6vNdkcU72OM22x7msmrWH5v+ba+rquNf5t6z4zodZjwvyY1Lk3dyXFex1d/TOq5tl3/ZSlW9J8l3bjLrVWOMv537+U/FybIu3xljjKra8hzu6V3A92bx3aDjXpnFi+cjsji//DeT/M4uZ33iGOMzVXV+khur6qNZvDC2ah7Xv0xy+Rjja9Pk1nF9qKiqlyQ5mOSZS5O/aXsYY9y7+Rp2xN8luWaM8UBV/XwWe4cX72KeVRxKct0Y46tL0/bauO6I2ctljPHsB7mKrS4/8/ksduf2T+8WH/RlaU6Wtao+W1XnjDHum17kjp5kVT+R5O1jjC8vrfv4u/MHqurPk/z6bmcdY3xm+v2Jqro5yUVJ3pY9OK5V9e1J3pnFm5JbltbdOq6bWOXyR8eX+XRV7U/yHVlsnzt96aSVnq+qnp1FsT9zjPHA8elbbA9zvQhum3WM8fmlu6/P4vO544/9wRMee3N7wm84lX/HQ0l+cXnCDo/rKrb6e1rHdR0Oi216+Zmx+ATqpiw+20iSy5PMuSd0/fQcqzzXNx1znV44j3+m8cIkm57J0WTbrFV15vFDSFV1dpJnJLlzL47r9O/+9iyOE193wry5x3WVyx8t/w0vSnLjNI7XJzlUi7PJzkvypCQfbM53Slmr6qIkf5rk0jHG0aXpm24Pu5z1nKW7lya5a7r97iTPmTKfmeQ5+b9HCXY865T3KVl8EP6BpWk7Pa6ruD7JT09njT09yZemN2m94zrH2Qqr/iT50SyO6z2Q5LNJ3j1Nf1ySv19a7vlJPp5F279qafr5WfzHeiTJXyc5Y8asj0ny3iT3JHlPkrOm6QeTvH5puQNZvAN42AmPvzHJR7N48XtzkkftZtYk3z/l+fD0+4q9Oq5JXpLky0luX/q5cKfGdbPtL4tDb5dOtx85jdORadzOX3rsq6bH3Z3keXON4ylkfc/039rxcbx+u+1hF7P+XpI7pkw3JXnK0mN/dhrvI0l+ZrezTvd/K8mrT3jcbozrNVmcUfnlLF5fr0jysiQvm+ZXFv8DyHunTAeXHts2ri7/AkC7dTgsBsCaUS4AtFMuALRTLgC0Uy4AtFMuALRTLgC0+1/MuoDlW2jyrgAAAABJRU5ErkJggg==\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], "source": [ "from sklearn.model_selection import train_test_split\n", + "x1 = data.copy()\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " feature_selection_and_preprocessing(x1.drop('Survived', axis=1)), x1['Survived'], test_size=100, random_state=2)\n", + "\n", + "model = KNeighborsClassifier(n_neighbors=3)\n", + "model.fit(X_train,y_train)\n", + "predictions = model.predict(X_train)\n", "\n", - "# " + "f,ax = plt.subplots()\n", + "plt.tight_layout()\n", + "ax.hist(y_train - predictions, bins=10, label=\"test error\", density=True);" ] } ],