train_eval.py
"""Trains and evaluates a random forest model for the Kaggle Titanic competition."""
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pandas as pd
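# The script expects the Kaggle Titanic files train.csv and test.csv in the
# working directory and writes its predictions to model_output.csv.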
def extract_clean_data(data_file_name):
""" Extracts and does some preliminary feature reduction on the dataset
data_file_name: string of the filename of the .csv file with our training
data
returns: a cleaned up pandas dataframe
"""
data_frame = pd.read_csv(data_file_name)
    # Clean up the data frame, dropping unnecessary columns
data_frame = data_frame.drop(
[
"PassengerId",
"Name",
"Ticket",
"Cabin", # TODO strip and factorize
],
axis = 1
)
factor_cols = ["Embarked", "Sex"]
# Convert categorical non-numerical values into enumerated integers
data_frame[factor_cols] = factor_column(data_frame, factor_cols)
    # Some Age and Fare values are missing; replace them with the median of
    # each column. The imputer is applied only to the numerical columns that
    # need imputation.
    imp = SimpleImputer(strategy = "median")
    impute_cols = [
        "Age",
        "Fare",
    ]
    data_frame[impute_cols] = imp.fit_transform(data_frame[impute_cols])
    return data_frame


def train_model_rf(train_data, tr_col_name):
    """Trains a random forest classifier on a set of training data.

    train_data: a pandas dataframe with the data we want to use for training
    tr_col_name: the name of the column in the training dataframe that will
        be used as the target when fitting the random forest classifier
    returns: the fitted classifier
    """
# Create a random forest classifier
# -1 sets num jobs to num cores
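    # The remaining hyperparameters (n_estimators, max_depth, ...) are left at
    # their scikit-learn defaults; tuning them may improve the score.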
rf_classifier = RandomForestClassifier(n_jobs = -1)
# Extract the features to be used for training
features = train_data.drop(tr_col_name, axis = 1).columns
rf_classifier.fit(train_data[features], train_data[tr_col_name])
    return rf_classifier


def factor_column(df, col_names):
    """Helper function that factorizes the specified column(s) of a dataframe.

    Returns a dataframe containing only those columns, with each category
    encoded as an integer.
    """
    return df[col_names].apply(lambda x: pd.factorize(x)[0])


def eval_test_data(model, test_df, features):
    """Generates predictions from the trained model on the provided test data.

    model: the trained model to evaluate
    test_df: the dataframe containing the test data
    features: the feature columns to feed to the model
    returns: an array of predictions, one per row of test_df
    """
    return model.predict(test_df[features])


def load_test_data(filename, factor_col_names):
    """Loads, factorizes, and cleans the test dataframe.

    filename: the file name of the test data set to load
    factor_col_names: the names of the columns to factorize
    """
    # Load the pandas dataframe
    test = pd.read_csv(filename)
    # Age and Fare have missing values in the test set; fill them with the
    # column medians
    imp = SimpleImputer(strategy = "median")
    test[["Age", "Fare"]] = imp.fit_transform(test[["Age", "Fare"]])
    # Factorize non-numerical columns
    test[factor_col_names] = factor_column(test, factor_col_names)
    # Cabin has NaN values, which the random forest cannot handle, so drop it
    test = test.drop("Cabin", axis = 1)
    return test


def write_pred(pred_df, test_df, filename, id_col, data_col):
    """Writes the predictions obtained by the estimator to a CSV file with the
    specified filename.

    pred_df: a dataframe containing the predictions created by the estimator
    test_df: the testing dataframe (used for the passenger ID column)
    filename: the desired filename for writing the predictions to disk
    id_col: the name of the ID column to copy from test_df
    data_col: the name of the prediction column to write
    """
# Append id column from test dataset to predictions for submission
pred_df[id_col] = test_df[id_col]
    # Specify the column order/format and write to disk
    pred_df.to_csv(
        path_or_buf = filename,
        index = False,
        columns = [id_col, data_col],
    )


def main():
    """Runs the full pipeline: load and clean the data, train the random
    forest, predict on the test set, and write a submission CSV.
    """
# Constants
y_label = "Survived"
factor_cols = ["Embarked", "Sex"]
# Load and clean training data
print("Loading, cleaning training data...")
training_data = extract_clean_data("train.csv")
print("Done")
print()
# These are the features we will feed back into the estimator to
# yield predictions
features = training_data.drop(y_label, axis = 1).columns
# train the model
print("Training random forest classifier with test data...")
estimator = train_model_rf(training_data, y_label)
print("Done")
print()
# test the model on the testing data
print("Loading test data...")
test_df = load_test_data("test.csv", factor_cols)
print("Done")
print()
# Create predictions from the model using the testing data
print("Making predictions from RF model...")
predictions = eval_test_data(estimator, test_df, features)
predict_df = pd.DataFrame(
data = predictions,
columns = ["Survived"],
)
print("Done")
print()
# Write predictions to a CSV file based on the ID
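    # The output should match the Kaggle submission format: one PassengerId
    # column and one Survived column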
print("Writing prediction to csv...")
write_pred(predict_df, test_df, "model_output.csv", "PassengerId", "Survived")
print("Done")
print()
# Script entry point
if __name__ == "__main__":
main()