diff --git a/exam_project.ipynb b/exam_project.ipynb new file mode 100644 index 0000000..3e33732 --- /dev/null +++ b/exam_project.ipynb @@ -0,0 +1,1390 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FzFJ1FXO8ojj" + }, + "outputs": [], + "source": [ + "import numpy as np # utilities for numeric operations\n", + "import pandas as pd # tabular data processing\n", + "from sklearn.model_selection import train_test_split # splitting dataset into random train and test subsets.\n", + "from scipy.stats import pearsonr # Pearson correlation coefficient\n", + "import lightgbm as lgb # Light GBM" + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import files # Function for uploading files\n", + "files.upload() # Upload kaggle.json file" + ], + "metadata": { + "id": "2qh6JW3i_epj", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "outputId": "9dab6ab0-4adc-4361-b678-ded237df4496" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. 
Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving kaggle.json to kaggle.json\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'kaggle.json': b'{\"username\":\"egorbevz\",\"key\":\"<REDACTED>\"}'}" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ] + }, + { + "cell_type": "code", + "source": [ + "! pip install -q kaggle # Install kaggle API\n", + "! cp kaggle.json ~/.kaggle/ # Put the kaggle.json into the ~/.kaggle directory\n", + "! chmod 600 ~/.kaggle/kaggle.json # Restrict kaggle.json to owner read/write only, so the API key is not readable by other users\n", + "! kaggle datasets download robikscube/ubiquant-parquet -f train_low_mem.parquet --path . # Download train_low_mem.parquet.zip\n", + "! unzip train_low_mem.parquet.zip # Unzip train_low_mem.parquet.zip" + ], + "metadata": { + "id": "eNdpyMOeCzG4", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "29d01d5b-76e1-4b8f-cc4b-e264a70c3041" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading train_low_mem.parquet.zip to .\n", + " 99% 3.05G/3.07G [00:27<00:00, 67.9MB/s]\n", + "100% 3.07G/3.07G [00:27<00:00, 119MB/s] \n", + "Archive: train_low_mem.parquet.zip\n", + " inflating: train_low_mem.parquet \n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Prepare Data" + ], + "metadata": { + "id": "aFjceRC0HZg8" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Load" + ], + "metadata": { + "id": "WGU6rYQHHjYv" + } + }, + { + "cell_type": "code", + "source": [ + "# train_low_mem.parquet is a huge file, so we are going to use:\n", + "# * N features\n", + "# * M data points for model training\n", + "# * the last M points for model evaluation\n", + "\n", + "N = 100\n", + "M = 1000000\n", + "features = [f'f_{i}' for i in range(N)] # Use columns f_0, f_1, ..., f_N-1 as features\n", + "\n", + 
"X_train_reduced = pd.read_parquet(\n", + " 'train_low_mem.parquet',\n", + " columns=features+['target']\n", + " )[-2*M:-M] # Load train data (M lines before the last M lines), load only those columns listed in features and the `target`\n", + "y_train_reduced = X_train_reduced['target'] # Set the column `target` as a target array for model fitting\n", + "X_train_reduced = X_train_reduced.drop(columns=['target']) # Drop the column `target` from X_train_reduced, so X_train_reduced contains only features for model fitting\n", + "display(X_train_reduced, y_train_reduced) # Display the features table (X_train_reduced) and the target array (y_train_reduced)\n" + ], + "metadata": { + "id": "zlxP6vKnHoxS", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 707 + }, + "outputId": "dac900f9-9820-4c13-b319-c66b380a74b1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " f_0 f_1 f_2 f_3 f_4 f_5 f_6 \\\n", + "1141410 1.093324 1.301436 -1.242253 5.828377 -0.384234 1.447711 1.519053 \n", + "1141411 0.104047 -0.113154 0.719374 -0.210036 -0.433466 0.141140 0.893335 \n", + "1141412 1.436316 -0.517323 0.953603 -0.214181 -0.541468 0.125799 0.958002 \n", + "1141413 -0.678037 0.291014 0.467504 -0.241889 -0.366275 0.193799 -0.179444 \n", + "1141414 0.142369 1.301436 -1.456780 1.780014 -0.151572 0.541044 -0.859400 \n", + "... ... ... ... ... ... ... ... \n", + "2141405 1.327377 -0.552583 -0.886323 0.014043 0.125454 1.677687 -1.381808 \n", + "2141406 0.236709 -2.310760 -0.465879 0.433303 -0.253564 -1.375251 0.839487 \n", + "2141407 0.464893 0.201072 -2.056188 -0.535923 0.207322 -0.212354 1.799059 \n", + "2141408 0.290392 0.501166 2.708688 -0.410148 -0.400013 -0.949561 -1.068481 \n", + "2141409 -3.436619 0.968599 1.884645 -0.562264 -0.318674 -0.054221 0.863330 \n", + "\n", + " f_7 f_8 f_9 ... f_90 f_91 f_92 \\\n", + "1141410 0.998088 0.328072 0.781790 ... 
-0.487156 2.013887 2.145116 \n", + "1141411 0.340964 0.328072 1.189662 ... -0.735104 -0.309837 -0.278825 \n", + "1141412 0.950252 0.328072 -0.748747 ... -0.666868 0.059887 1.580489 \n", + "1141413 -0.197057 0.328072 -1.003611 ... -0.060639 -1.202124 -0.339710 \n", + "1141414 -0.197057 0.328072 -0.360194 ... 1.091521 0.828786 1.394190 \n", + "... ... ... ... ... ... ... ... \n", + "2141405 -1.099508 0.526042 -1.925724 ... -0.479163 -0.026399 -0.805029 \n", + "2141406 1.075578 0.189406 0.142188 ... 1.705065 1.222087 -0.243805 \n", + "2141407 -0.312653 0.817783 -0.697212 ... 0.298559 0.316116 -0.647126 \n", + "2141408 1.539986 0.206195 -0.769957 ... -0.497587 -0.264609 -0.256282 \n", + "2141409 0.470893 0.880417 2.047566 ... 2.593259 -0.177891 -1.076097 \n", + "\n", + " f_93 f_94 f_95 f_96 f_97 f_98 f_99 \n", + "1141410 -0.565078 0.954476 -0.104428 -0.493579 0.562440 1.109892 -0.343760 \n", + "1141411 0.700176 -0.608110 0.484129 -0.824128 -0.789265 0.328594 -0.343760 \n", + "1141412 0.716956 1.041286 1.185396 -1.433365 -0.992975 -0.025335 1.855881 \n", + "1141413 0.441695 0.334402 0.209693 -0.258891 -0.676576 -0.028874 -0.343760 \n", + "1141414 0.454494 1.983799 -0.447326 1.509864 -0.145400 0.996635 -0.343760 \n", + "... ... ... ... ... ... ... ... \n", + "2141405 0.138502 2.100753 -0.970065 -2.638478 0.682677 0.194035 -0.382265 \n", + "2141406 -0.624759 -0.029950 0.294779 2.685435 -0.146490 0.276808 0.479031 \n", + "2141407 -1.071017 0.671536 -0.514084 0.871859 0.663646 2.068613 -0.389321 \n", + "2141408 -1.236324 -2.198807 2.692805 -1.020720 -1.041804 -1.319790 1.017152 \n", + "2141409 0.266917 -0.101650 1.762497 1.121769 -0.839632 -0.830745 -0.371426 \n", + "\n", + "[1000000 rows x 100 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
f_0f_1f_2f_3f_4f_5f_6f_7f_8f_9...f_90f_91f_92f_93f_94f_95f_96f_97f_98f_99
11414101.0933241.301436-1.2422535.828377-0.3842341.4477111.5190530.9980880.3280720.781790...-0.4871562.0138872.145116-0.5650780.954476-0.104428-0.4935790.5624401.109892-0.343760
11414110.104047-0.1131540.719374-0.210036-0.4334660.1411400.8933350.3409640.3280721.189662...-0.735104-0.309837-0.2788250.700176-0.6081100.484129-0.824128-0.7892650.328594-0.343760
11414121.436316-0.5173230.953603-0.214181-0.5414680.1257990.9580020.9502520.328072-0.748747...-0.6668680.0598871.5804890.7169561.0412861.185396-1.433365-0.992975-0.0253351.855881
1141413-0.6780370.2910140.467504-0.241889-0.3662750.193799-0.179444-0.1970570.328072-1.003611...-0.060639-1.202124-0.3397100.4416950.3344020.209693-0.258891-0.676576-0.028874-0.343760
11414140.1423691.301436-1.4567801.780014-0.1515720.541044-0.859400-0.1970570.328072-0.360194...1.0915210.8287861.3941900.4544941.983799-0.4473261.509864-0.1454000.996635-0.343760
..................................................................
21414051.327377-0.552583-0.8863230.0140430.1254541.677687-1.381808-1.0995080.526042-1.925724...-0.479163-0.026399-0.8050290.1385022.100753-0.970065-2.6384780.6826770.194035-0.382265
21414060.236709-2.310760-0.4658790.433303-0.253564-1.3752510.8394871.0755780.1894060.142188...1.7050651.222087-0.243805-0.624759-0.0299500.2947792.685435-0.1464900.2768080.479031
21414070.4648930.201072-2.056188-0.5359230.207322-0.2123541.799059-0.3126530.817783-0.697212...0.2985590.316116-0.647126-1.0710170.671536-0.5140840.8718590.6636462.068613-0.389321
21414080.2903920.5011662.708688-0.410148-0.400013-0.949561-1.0684811.5399860.206195-0.769957...-0.497587-0.264609-0.256282-1.236324-2.1988072.692805-1.020720-1.041804-1.3197901.017152
2141409-3.4366190.9685991.884645-0.562264-0.318674-0.0542210.8633300.4708930.8804172.047566...2.593259-0.177891-1.0760970.266917-0.1016501.7624971.121769-0.839632-0.830745-0.371426
\n", + "

1000000 rows × 100 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "1141410 -1.705758\n", + "1141411 -0.260084\n", + "1141412 0.013137\n", + "1141413 0.841180\n", + "1141414 -0.086323\n", + " ... \n", + "2141405 1.304217\n", + "2141406 -0.376348\n", + "2141407 -0.854828\n", + "2141408 -0.336876\n", + "2141409 0.775711\n", + "Name: target, Length: 1000000, dtype: float32" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Create Model - LGBM" + ], + "metadata": { + "id": "bQ6oUJ7FIz-D" + } + }, + { + "cell_type": "code", + "source": [ + "model_lgb = lgb.LGBMRegressor() # Create an instance of Light GBM regressor\n", + "model_lgb.fit(X_train_reduced, y_train_reduced) # Fit the Light GBM regressor\n", + "\n", + "# Light GBM - Gradient Boosting on Decision Trees Algorithm -> State-of-the-Art for tabular data" + ], + "metadata": { + "id": "MvvrjNNEHu7t", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "74be1b88-d0d2-4e11-ef54-d268ea4ff084" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LGBMRegressor()" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Inference" + ], + "metadata": { + "id": "cydkjtxyDOXu" + } + }, + { + "cell_type": "code", + "source": [ + "X_test_reduced = pd.read_parquet(\n", + " 'train_low_mem.parquet',\n", + " columns=features+['target']\n", + " )[-M:] # Load test data (the last M lines)\n", + "y_test_reduced = X_test_reduced['target'] # Set the column `target` as a target array for assessing model performance\n", + "X_test_reduced = X_test_reduced.drop(columns=['target']) # Drop the column `target` from X_test_reduced\n", + "display(X_test_reduced, y_test_reduced) # Display the features table (X_test_reduced) and the target array (y_test_reduced)\n" + ], + "metadata": { + "id": "idk7suLnDCd1", + "colab": { + 
"base_uri": "https://localhost:8080/", + "height": 707 + }, + "outputId": "49bf640f-e02c-47d7-a4db-f5bdd383cdfd" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " f_0 f_1 f_2 f_3 f_4 f_5 f_6 \\\n", + "2141410 -0.318628 1.878033 -0.254857 -0.034804 -0.134778 -0.646280 -0.099820 \n", + "2141411 -0.793941 -0.635127 0.737949 -0.655404 -0.208031 -0.811264 0.411129 \n", + "2141412 1.056986 -1.240384 1.003570 -0.624688 -0.300291 0.699789 0.107452 \n", + "2141413 -1.622636 1.302770 -0.694586 -0.403037 -0.176304 1.265605 1.415097 \n", + "2141414 0.040462 -0.075609 -0.606644 0.167259 -0.098879 -0.085549 1.016559 \n", + "... ... ... ... ... ... ... ... \n", + "3141405 0.093530 -0.720275 -0.345497 -0.438781 -0.166972 -0.437182 1.475746 \n", + "3141406 -1.344935 -0.199987 -0.107702 -0.454677 -0.221914 -0.141174 -1.498235 \n", + "3141407 0.979489 -1.110491 1.006980 -0.467307 -0.159549 1.355671 0.150812 \n", + "3141408 -2.565332 0.320301 0.076600 1.380182 -0.155366 -0.689000 0.381069 \n", + "3141409 -0.089557 0.190229 -0.548256 0.151205 0.079773 0.447962 1.014983 \n", + "\n", + " f_7 f_8 f_9 ... f_90 f_91 f_92 \\\n", + "2141410 -0.338968 0.117094 -0.117497 ... -0.889317 0.050175 -1.007735 \n", + "2141411 -0.607927 0.131232 1.100606 ... 1.706120 0.055868 0.038690 \n", + "2141412 0.865332 0.098041 2.373290 ... 0.504580 -0.473205 -0.768706 \n", + "2141413 0.083107 0.071317 0.955035 ... 2.575595 -0.995329 0.874402 \n", + "2141414 1.935746 0.083219 0.213701 ... 1.303252 -0.164346 -0.504350 \n", + "... ... ... ... ... ... ... ... \n", + "3141405 1.284423 0.056425 -1.433681 ... -0.265811 -1.018476 0.453945 \n", + "3141406 1.373834 0.056425 -1.211572 ... -0.707159 0.036467 0.002782 \n", + "3141407 -0.088923 0.056425 0.996380 ... -0.265811 0.483454 -0.537039 \n", + "3141408 -1.324759 0.056425 -1.111730 ... -0.265811 0.050828 0.104241 \n", + "3141409 -1.324759 0.056425 -1.952123 ... 
0.661801 -0.020749 0.626640 \n", + "\n", + " f_93 f_94 f_95 f_96 f_97 f_98 f_99 \n", + "2141410 1.266060 0.307827 0.431842 -0.566316 -0.373920 -1.237720 0.987137 \n", + "2141411 -1.380178 -2.326279 0.699331 0.191068 -0.685449 0.104734 0.101148 \n", + "2141412 -0.169144 -0.899174 0.772994 0.636828 -0.654049 -1.486701 -0.595339 \n", + "2141413 -0.722395 -1.938771 -0.636336 0.207978 -0.181923 -0.589655 -0.368053 \n", + "2141414 -1.991226 0.707544 -0.278499 0.905299 -0.133844 -0.181467 -0.223240 \n", + "... ... ... ... ... ... ... ... \n", + "3141405 0.395551 0.318854 0.725995 -0.596743 -0.591497 -0.830558 -0.217642 \n", + "3141406 0.424902 0.605383 1.704695 -0.625599 -1.110052 -0.946176 -0.217642 \n", + "3141407 0.363976 0.318854 0.132587 -1.064116 -0.383061 1.458104 -0.217642 \n", + "3141408 0.448094 0.605383 0.269845 -0.303289 -0.229851 0.026150 -0.217642 \n", + "3141409 -0.310978 0.891912 -1.304431 0.448782 0.953866 1.328271 -0.217642 \n", + "\n", + "[1000000 rows x 100 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
f_0f_1f_2f_3f_4f_5f_6f_7f_8f_9...f_90f_91f_92f_93f_94f_95f_96f_97f_98f_99
2141410-0.3186281.878033-0.254857-0.034804-0.134778-0.646280-0.099820-0.3389680.117094-0.117497...-0.8893170.050175-1.0077351.2660600.3078270.431842-0.566316-0.373920-1.2377200.987137
2141411-0.793941-0.6351270.737949-0.655404-0.208031-0.8112640.411129-0.6079270.1312321.100606...1.7061200.0558680.038690-1.380178-2.3262790.6993310.191068-0.6854490.1047340.101148
21414121.056986-1.2403841.003570-0.624688-0.3002910.6997890.1074520.8653320.0980412.373290...0.504580-0.473205-0.768706-0.169144-0.8991740.7729940.636828-0.654049-1.486701-0.595339
2141413-1.6226361.302770-0.694586-0.403037-0.1763041.2656051.4150970.0831070.0713170.955035...2.575595-0.9953290.874402-0.722395-1.938771-0.6363360.207978-0.181923-0.589655-0.368053
21414140.040462-0.075609-0.6066440.167259-0.098879-0.0855491.0165591.9357460.0832190.213701...1.303252-0.164346-0.504350-1.9912260.707544-0.2784990.905299-0.133844-0.181467-0.223240
..................................................................
31414050.093530-0.720275-0.345497-0.438781-0.166972-0.4371821.4757461.2844230.056425-1.433681...-0.265811-1.0184760.4539450.3955510.3188540.725995-0.596743-0.591497-0.830558-0.217642
3141406-1.344935-0.199987-0.107702-0.454677-0.221914-0.141174-1.4982351.3738340.056425-1.211572...-0.7071590.0364670.0027820.4249020.6053831.704695-0.625599-1.110052-0.946176-0.217642
31414070.979489-1.1104911.006980-0.467307-0.1595491.3556710.150812-0.0889230.0564250.996380...-0.2658110.483454-0.5370390.3639760.3188540.132587-1.064116-0.3830611.458104-0.217642
3141408-2.5653320.3203010.0766001.380182-0.155366-0.6890000.381069-1.3247590.056425-1.111730...-0.2658110.0508280.1042410.4480940.6053830.269845-0.303289-0.2298510.026150-0.217642
3141409-0.0895570.190229-0.5482560.1512050.0797730.4479621.014983-1.3247590.056425-1.952123...0.661801-0.0207490.626640-0.3109780.891912-1.3044310.4487820.9538661.328271-0.217642
\n", + "

1000000 rows × 100 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "2141410 0.678174\n", + "2141411 0.059027\n", + "2141412 0.274764\n", + "2141413 0.159570\n", + "2141414 -1.265211\n", + " ... \n", + "3141405 0.033600\n", + "3141406 -0.223264\n", + "3141407 -0.559415\n", + "3141408 0.009599\n", + "3141409 1.212112\n", + "Name: target, Length: 1000000, dtype: float32" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "prediction_train = model_lgb.predict(X_train_reduced) # Predict values for train set using the fitted model\n", + "prediction_test = model_lgb.predict(X_test_reduced) # Predict values for test set using the fitted model " + ], + "metadata": { + "id": "NVyjPI4eCUlL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Evaluation" + ], + "metadata": { + "id": "RqPwbwntDywY" + } + }, + { + "cell_type": "code", + "source": [ + "print(\n", + " 'Train Pearson r:', pearsonr(y_train_reduced, prediction_train)[0], '\\n',\n", + " 'Test Pearson r:', pearsonr(y_test_reduced, prediction_test)[0],\n", + ") # Display Pearson correlation coefficient for train and test sets" + ], + "metadata": { + "id": "XXr9P8KsI37I", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "559c49f8-bccd-4e15-9122-064f2bfe92e9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Train Pearson r: 0.2368148605891157 \n", + " Test Pearson r: 0.0914568869131924\n" + ] + } + ] + } + ] +} \ No newline at end of file