diff --git a/01-intro/DataHandling_HW.ipynb b/01-intro/DataHandling_HW.ipynb
index 34539c5..96fe35e 100644
--- a/01-intro/DataHandling_HW.ipynb
+++ b/01-intro/DataHandling_HW.ipynb
@@ -52,29 +52,29 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4CWHxPY62I-D",
- "outputId": "ee47fd61-089c-4236-8093-b9c2ce24fe57"
+ "outputId": "32b5c8e3-3935-4e84-bb3e-b6a704cd2800"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
- "--2023-01-03 06:42:07-- https://raw.githubusercontent.com/HSE-LAMBDA/MLDM-2022/main/01-intro/train.csv\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...\n",
+ "--2023-01-21 09:42:52-- https://raw.githubusercontent.com/HSE-LAMBDA/MLDM-2022/main/01-intro/train.csv\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 60302 (59K) [text/plain]\n",
"Saving to: ‘train.csv’\n",
"\n",
- "\rtrain.csv 0%[ ] 0 --.-KB/s \rtrain.csv 100%[===================>] 58.89K --.-KB/s in 0.01s \n",
+ "train.csv 100%[===================>] 58.89K --.-KB/s in 0.01s \n",
"\n",
- "2023-01-03 06:42:07 (5.13 MB/s) - ‘train.csv’ saved [60302/60302]\n",
+ "2023-01-21 09:42:53 (4.31 MB/s) - ‘train.csv’ saved [60302/60302]\n",
"\n"
]
}
@@ -85,11 +85,249 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
- "id": "G36oVo3RJXVc"
+ "id": "G36oVo3RJXVc",
+ "outputId": "37cc3aab-4011-4757-d863-f904e2104996",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 281
+ }
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Survived Pclass \\\n",
+ "PassengerId \n",
+ "1 0 3 \n",
+ "2 1 1 \n",
+ "3 1 3 \n",
+ "4 1 1 \n",
+ "5 0 3 \n",
+ "\n",
+ " Name Sex Age \\\n",
+ "PassengerId \n",
+ "1 Braund, Mr. Owen Harris male 22.0 \n",
+ "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
+ "3 Heikkinen, Miss. Laina female 26.0 \n",
+ "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
+ "5 Allen, Mr. William Henry male 35.0 \n",
+ "\n",
+ " SibSp Parch Ticket Fare Cabin Embarked \n",
+ "PassengerId \n",
+ "1 1 0 A/5 21171 7.2500 NaN S \n",
+ "2 1 0 PC 17599 71.2833 C85 C \n",
+ "3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
+ "4 1 0 113803 53.1000 C123 S \n",
+ "5 0 0 373450 8.0500 NaN S "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ "
\n",
+ " \n",
+ " PassengerId | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
@@ -129,13 +367,13 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p7O68NpyJXWw",
- "outputId": "6067c134-1a98-4c75-81a6-1649fd364dea"
+ "outputId": "98105ceb-059a-4846-af17-777492945987"
},
"outputs": [
{
@@ -186,7 +424,7 @@
"id": "3LD_acbAqRYP",
"outputId": "2c330d3e-5ba1-450c-dd67-1320eeb4ea89"
},
- "execution_count": 92,
+ "execution_count": null,
"outputs": [
{
"output_type": "execute_result",
@@ -202,14 +440,14 @@
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 297
+ "height": 296
},
"id": "FMGlVI6SuKRh",
- "outputId": "ac412eee-5c00-43ec-a5cf-f9c962f4301c"
+ "outputId": "243587a5-2b86-4dd9-f756-d4bc2a430f18"
},
"outputs": [
{
@@ -220,7 +458,7 @@
]
},
"metadata": {},
- "execution_count": 101
+ "execution_count": 5
},
{
"output_type": "display_data",
@@ -261,11 +499,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {
- "id": "JhbbBk93JXXV"
+ "id": "JhbbBk93JXXV",
+ "outputId": "6d102f9b-78d2-4eca-d2c9-faa5df7cec1b",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test accuracy: 0.79\n"
+ ]
+ }
+ ],
"source": [
"# Build a model with KNeighborsClassifier to get the accuracy of\n",
"# at least 0.75 on the validation part of the dataset\n",
@@ -274,18 +524,17 @@
"from sklearn.metrics import accuracy_score\n",
"\n",
"def feature_selection_and_preprocessing(dataset):\n",
- " # \n",
- " # E.g.:\n",
- " features = dataset[[\"Fare\", \"Parch\"]].copy()\n",
- " features[\"Fare\"] /= features[\"Fare\"].mean()\n",
- "\n",
- " features['Fare'] *= 1\n",
- " return features\n",
+ " # Use Pclass, Sex, Age, SibSp and Parch; fill missing Age with the column mean and encode Sex as 0/1\n",
+ " x = dataset[[\"Pclass\",\"Sex\",\"Age\",\"SibSp\",\"Parch\"]].copy()\n",
+ " x[\"Age\"] = x[\"Age\"].fillna(x[\"Age\"].mean())\n",
+ " # Assign the result back: inplace=True on x[\"Sex\"] may only modify a temporary Series\n",
+ " x[\"Sex\"] = x[\"Sex\"].replace({\"female\": 0,\"male\": 1})\n",
+ " return x\n",
"\n",
"model = KNeighborsClassifier(\n",
" # \n",
" # E.g.\n",
- " n_neighbors=1\n",
+ " n_neighbors=3\n",
")\n",
"\n",
"\n",
@@ -327,22 +576,49 @@
"id": "jZjuqx3K3KiD"
},
"source": [
- "Check how your model from the previous task performs on randomized splits to train / test (with test set of size 100). Plot the histogram of the test error distribution.\n",
+ "Check how your model from the previous task performs on randomized splits to train / test (with test set of size 100). Plot the histogram of the test error distribution.\n",
"\n",
"*Hint: check sklearn's `sklearn.model_selection.train_test_split` function.*"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {
- "id": "WB6SCY2I3Dgb"
+ "id": "WB6SCY2I3Dgb",
+ "outputId": "bbc75686-4373-439d-8c3f-f41df99a0e60",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 293
+ }
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "