Skip to content

Commit

Permalink
cleanup cleanup notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanmark1867 committed Aug 16, 2020
1 parent 69084ab commit 7ff9423
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 182 deletions.
Binary file added data/AB_NYC_2019_output_aug16_2020.pkl
Binary file not shown.
196 changes: 106 additions & 90 deletions notebooks/.ipynb_checkpoints/data_cleanup-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 114,
"execution_count": 98,
"metadata": {},
"outputs": [
{
Expand All @@ -39,7 +39,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 19.0.3, however version 20.2.1 is available.\n",
"You are using pip version 19.0.3, however version 20.2.2 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
Expand All @@ -48,8 +48,8 @@
"output_type": "stream",
"text": [
"\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (1.25.3)\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2019.6.16)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (3.0.4)\n",
"Requirement already satisfied: xlrd in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (1.2.0)\n"
Expand All @@ -59,7 +59,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 19.0.3, however version 20.2.1 is available.\n",
"You are using pip version 19.0.3, however version 20.2.2 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
}
Expand All @@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 115,
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -112,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -141,7 +141,7 @@
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -160,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -178,34 +178,6 @@
" return path"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def fill_missing(dataset,columns,defaults):\n",
" ''' replace missing values with placeholders by column type\n",
" \n",
" Args:\n",
" dataset: dataframe in which missing values being processed\n",
" columns: dictionary of columns with keys that are column types and values that are the column names of that type\n",
" defaults: dictionary of replacement values for missing values by column type\n",
"\n",
" Returns:\n",
" dataset: dataframe with missing values replaced with default values\n",
"\n",
" '''\n",
" logging.debug(\"before mv\")\n",
" for column_category in columns:\n",
" print(\"column_category is \"+str(column_category))\n",
" for col in columns[column_category]:\n",
" print(\"col is \"+str(col))\n",
" dataset[col].fillna(value=defaults[column_category])\n",
" \n",
" return (dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -217,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 120,
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -247,15 +219,41 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# General cleanup\n",
"- correct types for Route and Vehicle\n",
"- fill missing values\n",
"- create report-date-time index"
"# General cleanup\n"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"def fill_missing(dataset,columns,defaults):\n",
"    ''' replace missing values with placeholders by column type\n",
"    \n",
"    Args:\n",
"        dataset: dataframe in which missing values are replaced; it is modified in place\n",
"        columns: dictionary whose keys are column types and whose values are lists of the column names of that type\n",
"        defaults: dictionary of replacement values for missing values, keyed by column type\n",
"\n",
"    Returns:\n",
"        dataset: the same dataframe object, with missing values replaced with default values\n",
"\n",
"    Raises:\n",
"        KeyError: if a listed column is not in dataset, or its column type has no entry in defaults\n",
"    '''\n",
"    logging.debug(\"before mv\")\n",
"    for column_category in columns:\n",
"        print(\"column_category is \"+str(column_category))\n",
"        for col in columns[column_category]:\n",
"            print(\"filling mising values in col \"+str(col)+\" with default \"+str(defaults[column_category]))\n",
"            # assign the filled column back rather than calling fillna(..., inplace=True) on dataset[col]:\n",
"            # the chained in-place form acts on an intermediate Series and does not update dataset under pandas copy-on-write\n",
"            dataset[col] = dataset[col].fillna(defaults[column_category])\n",
"            print(\"in mv Missing values in \",col,\" \",str(dataset[col].isna().sum()))\n",
"    \n",
"    return(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -278,7 +276,7 @@
},
{
"cell_type": "code",
"execution_count": 122,
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -299,7 +297,7 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -330,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -353,7 +351,7 @@
},
{
"cell_type": "code",
"execution_count": 125,
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -378,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 126,
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -397,7 +395,7 @@
},
{
"cell_type": "code",
"execution_count": 127,
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -413,12 +411,12 @@
"metadata": {},
"source": [
"# Master cell\n",
"This cell contains calls to the other functions in this notebook to complete the data preparation"
"This cell contains calls to the other functions in this notebook to complete the data cleanup"
]
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": 112,
"metadata": {},
"outputs": [
{
Expand All @@ -443,15 +441,50 @@
"config value bounding_box {'max_long': -73.70018092, 'max_lat': 40.91617849, 'min_long': -74.25909008, 'min_lat': 40.47739894}\n",
"config value newark_bounding_box {'max_long': -74.11278706, 'max_lat': 40.67325015, 'min_long': -74.25132408, 'min_lat': 40.78813864}\n",
"config value geo_columns ['latitude', 'longitude']\n",
"config value file_names {'input_csv': 'AB_NYC_2019.csv', 'pickled_input_dataframe': 'AB_NYC_2019_input_aug2_2020.pkl', 'pickled_output_dataframe': 'AB_NYC_2019_output_aug2_2020.pkl'}\n",
"config value file_names {'input_csv': 'AB_NYC_2019.csv', 'pickled_input_dataframe': 'AB_NYC_2019_input_aug16_2020.pkl', 'pickled_output_dataframe': 'AB_NYC_2019_output_aug16_2020.pkl'}\n",
"columns is {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}\n",
"category_defaults is {'categorical': 'missing', 'continuous': 0.0, 'text': 'missing', 'date': datetime.date(2019, 1, 1), 'excluded': 'missing'}\n",
"column_category is categorical\n",
"filling mising values in col neighbourhood_group with default missing\n",
"in mv Missing values in neighbourhood_group 0\n",
"filling mising values in col neighbourhood with default missing\n",
"in mv Missing values in neighbourhood 0\n",
"filling mising values in col room_type with default missing\n",
"in mv Missing values in room_type 0\n",
"column_category is continuous\n",
"filling mising values in col minimum_nights with default 0.0\n",
"in mv Missing values in minimum_nights 0\n",
"filling mising values in col number_of_reviews with default 0.0\n",
"in mv Missing values in number_of_reviews 0\n",
"filling mising values in col reviews_per_month with default 0.0\n",
"in mv Missing values in reviews_per_month 0\n",
"filling mising values in col calculated_host_listings_count with default 0.0\n",
"in mv Missing values in calculated_host_listings_count 0\n",
"filling mising values in col latitude with default 0.0\n",
"in mv Missing values in latitude 0\n",
"filling mising values in col longitude with default 0.0\n",
"in mv Missing values in longitude 0\n",
"column_category is date\n",
"filling mising values in col last_review with default 2019-01-01\n",
"in mv Missing values in last_review 0\n",
"column_category is text\n",
"filling mising values in col name with default missing\n",
"in mv Missing values in name 0\n",
"filling mising values in col host_name with default missing\n",
"in mv Missing values in host_name 0\n",
"column_category is excluded\n",
"filling mising values in col price with default missing\n",
"in mv Missing values in price 0\n",
"filling mising values in col id with default missing\n",
"in mv Missing values in id 0\n",
"Missing values in id 0\n",
"Distinct values in id 48895\n",
"Missing values in name 16\n",
"Distinct values in name 47905\n",
"Missing values in name 0\n",
"Distinct values in name 47906\n",
"Missing values in host_id 0\n",
"Distinct values in host_id 37457\n",
"Missing values in host_name 21\n",
"Distinct values in host_name 11452\n",
"Missing values in host_name 0\n",
"Distinct values in host_name 11453\n",
"Missing values in neighbourhood_group 0\n",
"Distinct values in neighbourhood_group 5\n",
"Missing values in neighbourhood 0\n",
Expand All @@ -468,10 +501,10 @@
"Distinct values in minimum_nights 109\n",
"Missing values in number_of_reviews 0\n",
"Distinct values in number_of_reviews 394\n",
"Missing values in last_review 10052\n",
"Distinct values in last_review 1764\n",
"Missing values in reviews_per_month 10052\n",
"Distinct values in reviews_per_month 937\n",
"Missing values in last_review 0\n",
"Distinct values in last_review 1765\n",
"Missing values in reviews_per_month 0\n",
"Distinct values in reviews_per_month 938\n",
"Missing values in calculated_host_listings_count 0\n",
"Distinct values in calculated_host_listings_count 47\n",
"Missing values in availability_365 0\n",
Expand All @@ -482,37 +515,17 @@
"negative values in colum minimum_nights 0\n",
"non-numeric values in continuous col number_of_reviews 0\n",
"negative values in colum number_of_reviews 0\n",
"non-numeric values in continuous col reviews_per_month 10052\n",
"non-numeric values in continuous col reviews_per_month 0\n",
"negative values in colum reviews_per_month 0\n",
"non-numeric values in continuous col calculated_host_listings_count 0\n",
"negative values in colum calculated_host_listings_count 0\n",
"non-numeric values in continuous col latitude 0\n",
"non-numeric values in continuous col longitude 0\n",
"latitude out of bounds count 0\n",
"longitude out of bounds count 0\n",
"location out of bounds count 0\n",
"columns is {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}\n",
"category_defaults is {'categorical': 'missing', 'continuous': 0.0, 'text': 'missing', 'date': datetime.date(2019, 1, 1), 'excluded': 'missing'}\n",
"column_category is categorical\n",
"col is neighbourhood_group\n",
"col is neighbourhood\n",
"col is room_type\n",
"column_category is continuous\n",
"col is minimum_nights\n",
"col is number_of_reviews\n",
"col is reviews_per_month\n",
"col is calculated_host_listings_count\n",
"col is latitude\n",
"col is longitude\n",
"column_category is date\n",
"col is last_review\n",
"column_category is text\n",
"col is name\n",
"col is host_name\n",
"column_category is excluded\n",
"col is price\n",
"col is id\n",
"path is C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\n",
"file_name is C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\\AB_NYC_2019_output_aug2_2020.pkl\n"
"file_name is C:\\personal\\manning_liveproject\\end_to_end_deep_learning_live_project\\data\\AB_NYC_2019_output_aug16_2020.pkl\n"
]
},
{
Expand Down Expand Up @@ -607,8 +620,8 @@
" <td>150</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2019-01-01</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>365</td>\n",
" </tr>\n",
Expand Down Expand Up @@ -672,19 +685,19 @@
" room_type price minimum_nights number_of_reviews last_review \\\n",
"0 Private room 149 1 9 2018-10-19 \n",
"1 Entire home/apt 225 1 45 2019-05-21 \n",
"2 Private room 150 3 0 NaN \n",
"2 Private room 150 3 0 2019-01-01 \n",
"3 Entire home/apt 89 1 270 2019-07-05 \n",
"4 Entire home/apt 80 10 9 2018-11-19 \n",
"\n",
" reviews_per_month calculated_host_listings_count availability_365 \n",
"0 0.21 6 365 \n",
"1 0.38 2 355 \n",
"2 NaN 1 365 \n",
"2 0.00 1 365 \n",
"3 4.64 1 194 \n",
"4 0.10 1 0 "
]
},
"execution_count": 128,
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -701,12 +714,15 @@
"print_config_values(config)\n",
"# load dataframe\n",
"df = ingest_data(path,config['file_names']['input_csv'],config['file_names']['pickled_input_dataframe'],config['general']['save_raw_dataframe'],config['general']['load_from_scratch'])\n",
"# get basic assessment information for the dataframe\n",
"basic_assessment(df,config['columns'],config['valid_values'],config['non_negative_continuous'])\n",
"geo_assessment(df,config['bounding_box'])\n",
"\n",
"\n",
"print(\"columns is \"+str(config['columns']))\n",
"print(\"category_defaults is \"+str(config['category_defaults']))\n",
"# fill missing values according to the defaults per column\n",
"df = fill_missing(df,config['columns'],config['category_defaults'])\n",
"# get assessment results after filling missing values\n",
"basic_assessment(df,config['columns'],config['valid_values'],config['non_negative_continuous'])\n",
"geo_assessment(df,config['bounding_box'])\n",
"if config['general']['save_transformed_dataframe']:\n",
" print(\"path is \",path)\n",
" file_name = os.path.join(path,config['file_names']['pickled_output_dataframe'])\n",
Expand Down
Loading

0 comments on commit 7ff9423

Please sign in to comment.