cleanup cleanup notebook

ryanmark1867 · Aug 16, 2020 · 7ff9423 · 7ff9423
1 parent 69084ab
commit 7ff9423
Show file tree

Hide file tree

Showing 4 changed files with 214 additions and 182 deletions.
diff --git a/data/AB_NYC_2019_output_aug16_2020.pkl b/data/AB_NYC_2019_output_aug16_2020.pkl
diff --git a/notebooks/.ipynb_checkpoints/data_cleanup-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/data_cleanup-checkpoint.ipynb
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 98,
    "metadata": {},
    "outputs": [
     {
@@ -39,7 +39,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "You are using pip version 19.0.3, however version 20.2.1 is available.\n",
+      "You are using pip version 19.0.3, however version 20.2.2 is available.\n",
       "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
      ]
     },
@@ -48,8 +48,8 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
       "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (1.25.3)\n",
+      "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
       "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2019.6.16)\n",
       "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (3.0.4)\n",
       "Requirement already satisfied: xlrd in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (1.2.0)\n"
@@ -59,7 +59,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "You are using pip version 19.0.3, however version 20.2.1 is available.\n",
+      "You are using pip version 19.0.3, however version 20.2.2 is available.\n",
       "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
      ]
     }
@@ -71,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 99,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -112,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 100,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 101,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 102,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -178,34 +178,6 @@
     "    return path"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def fill_missing(dataset,columns,defaults):\n",
-    "    ''' replace missing values with placeholders by column type\n",
-    "    \n",
-    "    Args:\n",
-    "        dataset: dataframe in which missing values being processed\n",
-    "        columns: dictionary of columns with keys that are column types and values that are the column names of that type\n",
-    "        defaults: dictionary of replacement values for missing values by column type\n",
-    "\n",
-    "    Returns:\n",
-    "        dataset: dataframe with missing values replaced with default values\n",
-    "\n",
-    "    '''\n",
-    "    logging.debug(\"before mv\")\n",
-    "    for column_category in columns:\n",
-    "        print(\"column_category is \"+str(column_category))\n",
-    "        for col in columns[column_category]:\n",
-    "            print(\"col is \"+str(col))\n",
-    "            dataset[col].fillna(value=defaults[column_category])\n",
-    "     \n",
-    "    return (dataset)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -217,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 103,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -247,15 +219,41 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# General cleanup\n",
-    "- correct types for Route and Vehicle\n",
-    "- fill missing values\n",
-    "- create report-date-time index"
+    "# General cleanup\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fill_missing(dataset,columns,defaults):\n",
+    "    ''' replace missing values with the placeholder value for each column's category\n",
+    "    \n",
+    "    Args:\n",
+    "        dataset: dataframe whose missing values are being replaced; its columns are updated in place\n",
+    "        columns: dictionary whose keys are column categories and whose values are lists of the column names in that category\n",
+    "        defaults: dictionary of replacement values for missing values, keyed by column category\n",
+    "\n",
+    "    Returns:\n",
+    "        dataset: the same dataframe object, with missing values replaced by the category defaults\n",
+    "\n",
+    "    '''\n",
+    "    logging.debug(\"before mv\")\n",
+    "    for column_category in columns:\n",
+    "        print(\"column_category is \"+str(column_category))\n",
+    "        for col in columns[column_category]:\n",
+    "            print(\"filling mising values in col \"+str(col)+\" with default \"+str(defaults[column_category]))\n",
+    "            # assign the filled column back rather than calling fillna(inplace=True) on dataset[col]:\n",
+    "            # an in-place call on a selected column is chained assignment and is not guaranteed to update dataset\n",
+    "            dataset[col] = dataset[col].fillna(defaults[column_category])\n",
+    "            print(\"in mv Missing values in \",col,\" \",str(dataset[col].isna().sum()))\n",
+    "     \n",
+    "    return(dataset)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 105,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -278,7 +276,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 106,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -299,7 +297,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 107,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -330,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -353,7 +351,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -378,7 +376,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -397,7 +395,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 111,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -413,12 +411,12 @@
    "metadata": {},
    "source": [
     "# Master cell\n",
-    "This cell contains calls to the other functions in this notebook to complete the data preparation"
+    "This cell contains calls to the other functions in this notebook to complete the data cleanup"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 112,
    "metadata": {},
    "outputs": [
     {
@@ -443,15 +441,50 @@
       "config value  bounding_box   {'max_long': -73.70018092, 'max_lat': 40.91617849, 'min_long': -74.25909008, 'min_lat': 40.47739894}\n",
       "config value  newark_bounding_box   {'max_long': -74.11278706, 'max_lat': 40.67325015, 'min_long': -74.25132408, 'min_lat': 40.78813864}\n",
       "config value  geo_columns   ['latitude', 'longitude']\n",
-      "config value  file_names   {'input_csv': 'AB_NYC_2019.csv', 'pickled_input_dataframe': 'AB_NYC_2019_input_aug2_2020.pkl', 'pickled_output_dataframe': 'AB_NYC_2019_output_aug2_2020.pkl'}\n",
+      "config value  file_names   {'input_csv': 'AB_NYC_2019.csv', 'pickled_input_dataframe': 'AB_NYC_2019_input_aug16_2020.pkl', 'pickled_output_dataframe': 'AB_NYC_2019_output_aug16_2020.pkl'}\n",
+      "columns is {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}\n",
+      "category_defaults is {'categorical': 'missing', 'continuous': 0.0, 'text': 'missing', 'date': datetime.date(2019, 1, 1), 'excluded': 'missing'}\n",
+      "column_category is categorical\n",
+      "filling mising values in col neighbourhood_group with default missing\n",
+      "in mv Missing values in  neighbourhood_group   0\n",
+      "filling mising values in col neighbourhood with default missing\n",
+      "in mv Missing values in  neighbourhood   0\n",
+      "filling mising values in col room_type with default missing\n",
+      "in mv Missing values in  room_type   0\n",
+      "column_category is continuous\n",
+      "filling mising values in col minimum_nights with default 0.0\n",
+      "in mv Missing values in  minimum_nights   0\n",
+      "filling mising values in col number_of_reviews with default 0.0\n",
+      "in mv Missing values in  number_of_reviews   0\n",
+      "filling mising values in col reviews_per_month with default 0.0\n",
+      "in mv Missing values in  reviews_per_month   0\n",
+      "filling mising values in col calculated_host_listings_count with default 0.0\n",
+      "in mv Missing values in  calculated_host_listings_count   0\n",
+      "filling mising values in col latitude with default 0.0\n",
+      "in mv Missing values in  latitude   0\n",
+      "filling mising values in col longitude with default 0.0\n",
+      "in mv Missing values in  longitude   0\n",
+      "column_category is date\n",
+      "filling mising values in col last_review with default 2019-01-01\n",
+      "in mv Missing values in  last_review   0\n",
+      "column_category is text\n",
+      "filling mising values in col name with default missing\n",
+      "in mv Missing values in  name   0\n",
+      "filling mising values in col host_name with default missing\n",
+      "in mv Missing values in  host_name   0\n",
+      "column_category is excluded\n",
+      "filling mising values in col price with default missing\n",
+      "in mv Missing values in  price   0\n",
+      "filling mising values in col id with default missing\n",
+      "in mv Missing values in  id   0\n",
       "Missing values in  id   0\n",
       "Distinct values in  id   48895\n",
-      "Missing values in  name   16\n",
-      "Distinct values in  name   47905\n",
+      "Missing values in  name   0\n",
+      "Distinct values in  name   47906\n",
       "Missing values in  host_id   0\n",
       "Distinct values in  host_id   37457\n",
-      "Missing values in  host_name   21\n",
-      "Distinct values in  host_name   11452\n",
+      "Missing values in  host_name   0\n",
+      "Distinct values in  host_name   11453\n",
       "Missing values in  neighbourhood_group   0\n",
       "Distinct values in  neighbourhood_group   5\n",
       "Missing values in  neighbourhood   0\n",
@@ -468,10 +501,10 @@
       "Distinct values in  minimum_nights   109\n",
       "Missing values in  number_of_reviews   0\n",
       "Distinct values in  number_of_reviews   394\n",
-      "Missing values in  last_review   10052\n",
-      "Distinct values in  last_review   1764\n",
-      "Missing values in  reviews_per_month   10052\n",
-      "Distinct values in  reviews_per_month   937\n",
+      "Missing values in  last_review   0\n",
+      "Distinct values in  last_review   1765\n",
+      "Missing values in  reviews_per_month   0\n",
+      "Distinct values in  reviews_per_month   938\n",
       "Missing values in  calculated_host_listings_count   0\n",
       "Distinct values in  calculated_host_listings_count   47\n",
       "Missing values in  availability_365   0\n",
@@ -482,37 +515,17 @@
       "negative values in colum  minimum_nights   0\n",
       "non-numeric values in continuous col  number_of_reviews   0\n",
       "negative values in colum  number_of_reviews   0\n",
-      "non-numeric values in continuous col  reviews_per_month   10052\n",
+      "non-numeric values in continuous col  reviews_per_month   0\n",
+      "negative values in colum  reviews_per_month   0\n",
       "non-numeric values in continuous col  calculated_host_listings_count   0\n",
       "negative values in colum  calculated_host_listings_count   0\n",
       "non-numeric values in continuous col  latitude   0\n",
       "non-numeric values in continuous col  longitude   0\n",
       "latitude out of bounds count  0\n",
       "longitude out of bounds count  0\n",
       "location out of bounds count  0\n",
-      "columns is {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}\n",
-      "category_defaults is {'categorical': 'missing', 'continuous': 0.0, 'text': 'missing', 'date': datetime.date(2019, 1, 1), 'excluded': 'missing'}\n",
-      "column_category is categorical\n",
-      "col is neighbourhood_group\n",
-      "col is neighbourhood\n",
-      "col is room_type\n",
-      "column_category is continuous\n",
-      "col is minimum_nights\n",
-      "col is number_of_reviews\n",
-      "col is reviews_per_month\n",
-      "col is calculated_host_listings_count\n",
-      "col is latitude\n",
-      "col is longitude\n",
-      "column_category is date\n",
-      "col is last_review\n",
-      "column_category is text\n",
-      "col is name\n",
-      "col is host_name\n",
-      "column_category is excluded\n",
-      "col is price\n",
-      "col is id\n",
       "path is  C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\n",
-      "file_name is  C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\\AB_NYC_2019_output_aug2_2020.pkl\n"
+      "file_name is  C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\\AB_NYC_2019_output_aug16_2020.pkl\n"
      ]
     },
     {
@@ -607,8 +620,8 @@
        "      <td>150</td>\n",
        "      <td>3</td>\n",
        "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>2019-01-01</td>\n",
+       "      <td>0.00</td>\n",
        "      <td>1</td>\n",
        "      <td>365</td>\n",
        "    </tr>\n",
@@ -672,19 +685,19 @@
        "         room_type  price  minimum_nights  number_of_reviews last_review  \\\n",
        "0     Private room    149               1                  9  2018-10-19   \n",
        "1  Entire home/apt    225               1                 45  2019-05-21   \n",
-       "2     Private room    150               3                  0         NaN   \n",
+       "2     Private room    150               3                  0  2019-01-01   \n",
        "3  Entire home/apt     89               1                270  2019-07-05   \n",
        "4  Entire home/apt     80              10                  9  2018-11-19   \n",
        "\n",
        "   reviews_per_month  calculated_host_listings_count  availability_365  \n",
        "0               0.21                               6               365  \n",
        "1               0.38                               2               355  \n",
-       "2                NaN                               1               365  \n",
+       "2               0.00                               1               365  \n",
        "3               4.64                               1               194  \n",
        "4               0.10                               1                 0  "
       ]
      },
-     "execution_count": 128,
+     "execution_count": 112,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -701,12 +714,15 @@
     "print_config_values(config)\n",
     "# load dataframe\n",
     "df = ingest_data(path,config['file_names']['input_csv'],config['file_names']['pickled_input_dataframe'],config['general']['save_raw_dataframe'],config['general']['load_from_scratch'])\n",
-    "# get basic assessment information for the dataframe\n",
-    "basic_assessment(df,config['columns'],config['valid_values'],config['non_negative_continuous'])\n",
-    "geo_assessment(df,config['bounding_box'])\n",
+    "\n",
+    "\n",
     "print(\"columns is \"+str(config['columns']))\n",
     "print(\"category_defaults is \"+str(config['category_defaults']))\n",
+    "# fill missing values according to the defaults per column\n",
     "df = fill_missing(df,config['columns'],config['category_defaults'])\n",
+    "# get assessment results after filling missing values\n",
+    "basic_assessment(df,config['columns'],config['valid_values'],config['non_negative_continuous'])\n",
+    "geo_assessment(df,config['bounding_box'])\n",
     "if config['general']['save_transformed_dataframe']:\n",
     "    print(\"path is \",path)\n",
     "    file_name = os.path.join(path,config['file_names']['pickled_output_dataframe'])\n",