# Report held-out test accuracy alongside the out-of-bag (OOB) estimate.
print('Test set accuracy: {:.3f}'.format(test_accuracy))
print('OOB accuracy: {:.3f}'.format(oob_accuracy))  # fixed: closing paren was missing
# The difference between test and OOB accuracy will be minimal, which shows
# the OOB estimate can stand in for cross-validation when checking accuracy.
# RANDOM FOREST REGRESSOR

# Bring in the regressor, the train/test splitting utility, and the metric.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Fix the random seed so the split and the forest are reproducible.
SEED = 1

# Hold out 30% of the data for testing; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# A 400-tree forest; min_samples_leaf=0.12 means each leaf must hold at
# least 12% of the training samples, which regularizes the trees.
rf = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)

# Train on the training split, then predict the held-out targets.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Root-mean-squared error on the test split (square root of the MSE).
rmse_test = MSE(y_test, y_pred) ** 0.5
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
# FEATURE IMPORTANCE in sklearn

import pandas as pd
import matplotlib.pyplot as plt

# Wrap the fitted forest's feature importances in a Series keyed by the
# feature names, then draw them (ascending) as a horizontal bar chart.
importances_rf = pd.Series(rf.feature_importances_, index=X.columns)
importances_rf.sort_values().plot(kind='barh', color='lightgreen')
plt.show()
0 commit comments