
Commit 16bc0af

Merge branch 'master' into image-hosting
2 parents 360db10 + 2f21084 commit 16bc0af


46 files changed: 407 additions, 454 deletions


learntools/data_cleaning/ex4.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 import pandas as pd
 import numpy as np
-import chardet
+import charset_normalizer
 import os
 np.random.seed(0)

learntools/data_cleaning/ex5.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 import numpy as np
 import fuzzywuzzy
 from fuzzywuzzy import process
-import chardet
+import charset_normalizer
 
 #-----

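Both data_cleaning exercises swap chardet for charset_normalizer. A minimal sketch of why this is close to a drop-in change, assuming charset_normalizer's chardet-compatible detect() helper (the file path here is hypothetical):

    import charset_normalizer

    # detect() mirrors chardet.detect(): it returns a dict with
    # 'encoding', 'confidence', and 'language' keys.
    with open("some_file.csv", "rb") as raw:          # hypothetical path
        result = charset_normalizer.detect(raw.read(10000))
    print(result["encoding"], result["confidence"])
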
learntools/intro_to_programming/ex3.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class CustomEngravings(FunctionProblem):
         (("Adrian", True), 160),
         (("Ana", False), 71),
     ]
-    _hint = ("There are two options - either the project uses solid gold or does not. With this in mind, you can structure your solution like this: `cost = solid_gold * ____ + (not_solid_gold) * ____`. You need to figure out how to fill in the blanks. Also, remember that:\n"
+    _hint = ("There are two options - either the project uses solid gold or does not. With this in mind, you can structure your solution like this: `cost = solid_gold * ____ + (not solid_gold) * ____`. You need to figure out how to fill in the blanks. Also, remember that:\n"
             "- If `solid_gold = True`, then `(not solid_gold) = False`, and if `solid_gold = False`, then `(not solid_gold) = True`.\n"
             "- Multiplying an integer by `True` is equivalent to multiplying it by 1, and multiplying an integer by `False` is equivalent to multiplying it by 0.")
     _solution = CS(

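The corrected hint relies on Python treating booleans as integers in arithmetic. A tiny hedged illustration with made-up prices:

    solid_gold = True
    cost = solid_gold * 100 + (not solid_gold) * 50   # True behaves as 1, False as 0
    assert cost == 100
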
learntools/ml_explainability/ex3.py

Lines changed: 54 additions & 29 deletions
@@ -5,15 +5,14 @@
 
 from learntools.core import *
 
+# 1
 class WhyThatUShape(ThoughtExperiment):
     _solution = \
     """
     The code is
 
     for feat_name in base_features:
-        pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X,
-                                   model_features=base_features, feature=feat_name)
-        pdp.pdp_plot(pdp_dist, feat_name)
+        PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])
     plt.show()
 
 
@@ -25,6 +24,7 @@ class WhyThatUShape(ThoughtExperiment):
 For the same reason, we see the general U-shape in all our partial dependence plots.
 """
 
+# 2
 class PonderPDPContour(ThoughtExperiment):
     _solution = \
     """
@@ -40,46 +40,66 @@ class PonderPDPContour(ThoughtExperiment):
 
 The code you need to create the desired plot is:
 
-    fnames = ['pickup_longitude', 'dropoff_longitude']
-    longitudes_partial_plot = pdp.pdp_interact(model=first_model, dataset=val_X,
-                                               model_features=base_features, features=fnames)
-    pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot,
-                          feature_names=fnames, plot_type='contour')
+    fig, ax = plt.subplots(figsize=(8, 6))
+    fnames = [('pickup_longitude', 'dropoff_longitude')]
+    disp = PartialDependenceDisplay.from_estimator(first_model, val_X, fnames, ax=ax)
     plt.show()
 """
 
+# 3
 class ReadPDPContour(CodingProblem):
     _var = 'savings_from_shorter_trip'
-    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the white contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
-    _solution = 'About \$15. The price decreases from slightly more than \$24 to slightly more than \$9.'
+    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
+    _solution = 'About 6. The price decreases from slightly less than 15 to slightly less than 9.'
     def check(self, savings):
         if type(savings) == str:
             savings = Decimal(dollars.strip('$'))
-        assert ((savings > 13) and (savings < 17)), "Your answer should be about 15. Not {}".format(savings)
+        assert ((savings > 4) and (savings < 8)), "Your answer should be about 6. Not {}".format(savings)
 
+# 4
 class MakePDPWithAbsFeatures(CodingProblem):
-    _var = 'pdp_dist'
-    _hint = 'use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
+    _var = 'disp'
+    _hint = 'Use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
     _solution = \
     """
-    The biggest difference is that the partial dependence plot became much smaller. The the lowest vertical value is about $15 below the highest vertical value in the top chart, whereas this difference is only about $3 in the chart you just created. In other words, once you control for absolute distance traveled, the pickup_longitude has only a very small impact on predictions.
+    The difference is that the partial dependence plot became smaller. Both plots have a lowest vertical value of 8.5. But, the highest vertical value in the top chart is around 10.7, and the highest vertical value in the bottom chart is below 9.1. In other words, once you control for absolute distance traveled, the pickup_longitude has a smaller impact on predictions.
 
     # create new features
     data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
     data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)
     """
-
-    def check(self, pdp_result):
-        correct = np.array([9.92212681, 8.97384862, 8.80044327, 8.71024292, 8.71564739,
-                            8.73523192, 8.76626448, 8.87855912, 9.00098688, 10.99584622])
-        submitted = pdp_result.pdp
+
+    def check(self, disp):
+        correct = np.array([8.730515  , 8.73239078, 8.71804165, 8.72179009, 8.93013488,
+                            8.68796391, 8.6773792 , 8.6816932 , 8.67547295, 8.64980733,
+                            8.64402745, 8.65616918, 8.63485345, 8.60505726, 8.59167824,
+                            8.57101857, 8.55601734, 8.55780041, 8.53660205, 8.53548254,
+                            8.50739547, 8.50599988, 8.50685068, 8.51981394, 8.52555708,
+                            8.50483315, 8.53151955, 8.49615781, 8.49384454, 8.49156773,
+                            8.5123399 , 8.47138576, 8.47491902, 8.50240045, 8.50495725,
+                            8.50433279, 8.4941558 , 8.50175984, 8.50394946, 8.50890372,
+                            8.50606589, 8.48335522, 8.48281078, 8.4730394 , 8.47720942,
+                            8.47699659, 8.52118039, 8.50234077, 8.59717268, 8.51092865,
+                            8.51177667, 8.51159374, 8.51159432, 8.54379423, 8.50500559,
+                            8.50631149, 8.52264825, 8.51989952, 8.52841122, 8.52757692,
+                            8.54425047, 8.56425312, 8.56874055, 8.58372296, 8.5589557 ,
+                            8.57709991, 8.57441775, 8.59449221, 8.60063777, 8.62185164,
+                            8.6155473 , 8.6118143 , 8.61590988, 8.60758597, 8.62013413,
+                            8.6334263 , 8.64035478, 8.65324115, 8.66043255, 8.67502176,
+                            8.68940416, 8.6840402 , 8.67197893, 8.65512484, 8.66810839,
+                            8.6614093 , 8.65865671, 8.66485738, 8.67966737, 8.82833712,
+                            9.04135448, 9.03734449, 8.69506545, 8.70261503, 8.70673595,
+                            8.69045255, 8.69679997, 8.70716659, 8.71006281, 8.71739009])
+        submitted = disp.pd_results[0]['average'][0]
         assert np.allclose(submitted, correct, rtol=0.1)
 
+# 5
 class DoesSteepnessImplyImportance(ThoughtExperiment):
     _solution = "No. This doesn't guarantee `feat_a` is more important. For example, `feat_a` could have a big effect in the cases where it varies, but could have a single value 99\% of the time. In that case, permuting `feat_a` wouldn't matter much, since most values would be unchanged."
 
+# 6
 class DesignDatasetUShapedPdp(CodingProblem):
-    _var = 'pdp_dist'
+    _var = 'disp'
     _hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`"
     _solution = CS(
     """
@@ -89,22 +109,26 @@ class DesignDatasetUShapedPdp(CodingProblem):
    # You don't need any more changes
    """)
 
-    def check(self, pdp_result):
-        segment_1_end = np.argmin(pdp_result.feature_grids<-1)
-        segment_3_start = np.argmax(pdp_result.feature_grids>1)
+    def check(self, disp):
+        pdp_result = disp.pd_results[0]
+        x_values = pdp_result['values'][0]
+        y_values = pdp_result['average'][0]
+
+        segment_1_end = np.argmin(x_values<-1)
+        segment_3_start = np.argmax(x_values>1)
         segment_2_start = segment_1_end + 1
         segment_2_end = segment_3_start - 1
 
-        segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end]
-        segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end]
-        segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1]
+        segment_1_slopes_down = y_values[0] > y_values[segment_1_end]
+        segment_2_slopes_up = y_values[segment_2_start] < y_values[segment_2_end]
+        segment_3_slopes_down = y_values[segment_3_start] > y_values[-1]
 
         assert segment_1_slopes_down, ("The partial dependence plot does not slope down for values below -1.")
         assert segment_2_slopes_up, ("The partial dependence plot does not slope up for values between -1 and 1.")
         assert segment_3_slopes_down, ("The partial dependence plot does not slope down for values above 1.")
 
 class DesignFlatPDPWithHighImportance(CodingProblem):
-    _vars = ['perm', 'pdp_dist']
+    _vars = ['perm', 'disp']
     _hint = "You need for X1 to affect the prediction in order to have it affect permutation importance. But the average effect needs to be 0 to satisfy the PDP requirement. Achieve this by creating an interaction, so the effect of X1 depends on the value of X2 and vice-versa."
     _solution = CS(
     """
@@ -117,9 +141,10 @@ class DesignFlatPDPWithHighImportance(CodingProblem):
    # Aside from these lines, use the code provided
    """)
 
-    def check(self, importance, pdpResult):
+    def check(self, importance, disp):
         X1_imp = importance.feature_importances_[0]
-        pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
+        pdpResult = disp.pd_results[0]['average'][0]
+        pdpRange = max(pdpResult) - min(pdpResult)
         assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. "
                                 "Actual importance was {}").format(X1_imp)
         assert (pdpRange < 0.5), ("Tested that the highest point on the Partial "
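The reworked checks in this file read raw partial-dependence values off the scikit-learn display object instead of a pdpbox result. A self-contained sketch of that access pattern on stand-in data (note that newer scikit-learn releases rename the 'values' key to 'grid_values'):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    # Stand-in for the exercise's first_model / val_X.
    rng = np.random.RandomState(0)
    X = rng.rand(200, 2)
    model = RandomForestRegressor(random_state=0).fit(X, X[:, 0] ** 2)

    disp = PartialDependenceDisplay.from_estimator(model, X, [0])
    pdp_result = disp.pd_results[0]          # one Bunch per requested feature (or pair)
    x_values = pdp_result['values'][0]       # grid points ('grid_values' in scikit-learn >= 1.3)
    y_values = pdp_result['average'][0]      # predictions averaged over the grid
    print(x_values.shape, y_values.shape)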

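For DesignFlatPDPWithHighImportance, the interaction idea in the hint can be illustrated without the course's provided code. In the hedged sketch below (variable names and data are invented), the target is X1 * X2 with X2 centered on zero, so the average prediction at any fixed X1 is roughly zero (a flat partial dependence), yet shuffling X1 would destroy the fit (high permutation importance):

    import numpy as np

    rng = np.random.RandomState(0)
    X1 = rng.uniform(-2, 2, 100_000)
    X2 = rng.uniform(-2, 2, 100_000)
    y = X1 * X2                               # pure interaction term

    # Mean of y within each X1 bucket is ~0, which is what a flat PDP reflects.
    for lo, hi in [(-2, -1), (-1, 0), (0, 1), (1, 2)]:
        bucket = (X1 >= lo) & (X1 < hi)
        print(lo, hi, round(float(y[bucket].mean()), 3))
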
learntools/ml_explainability/ex4.py

Lines changed: 9 additions & 12 deletions
@@ -5,6 +5,7 @@
 
 from learntools.core import *
 
+# (1)
 class SummarizeModel(ThoughtExperiment):
     _solution = CS(
     """
@@ -19,41 +20,36 @@ class SummarizeModel(ThoughtExperiment):
 """
 )
 
+# (2)
 class EffectNumInpatient(ThoughtExperiment):
     _solution = CS(
     """
     # PDP for number_inpatient feature
 
     from matplotlib import pyplot as plt
-    from pdpbox import pdp, get_dataset, info_plots
+    from sklearn.inspection import PartialDependenceDisplay
 
     feature_name = 'number_inpatient'
-    # Create the data that we will plot
-    my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
-
-    # plot it
-    pdp.pdp_plot(my_pdp, feature_name)
+    PartialDependenceDisplay.from_estimator(my_model, val_X, [feature_name])
     plt.show()
     """
 )
 
+# (3)
 class EffectTimeInHospital(ThoughtExperiment):
     _solution = \
     """
     The results are very different. Specifically time in hospital has a much smaller effect. Code below:
 
     from matplotlib import pyplot as plt
-    from pdpbox import pdp, get_dataset, info_plots
+    from sklearn.inspection import PartialDependenceDisplay
 
     feature_name = 'time_in_hospital'
-    # Create the data that we will plot
-    my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
-
-    # plot it
-    pdp.pdp_plot(my_pdp, feature_name)
+    PartialDependenceDisplay.from_estimator(my_model, val_X, [feature_name])
     plt.show()
     """
 
+# (4)
 class RawActualsInsteadOfPDP(ThoughtExperiment):
     _hint = "This requires a groupby (from pandas) on the raw data, rather than using a model"
     _solution = CS(
@@ -68,6 +64,7 @@ class RawActualsInsteadOfPDP(ThoughtExperiment):
     """
 )
 
+# (5)
 class UseShap(ThoughtExperiment):
     _hint = "Here's the time to use SHAP values"
     _solution = CS(

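The updated ex4.py solutions collapse the old pdp_isolate/pdp_plot pair into one PartialDependenceDisplay.from_estimator call. A self-contained sketch of the same pattern on invented data (the exercise's my_model and val_X are faked here):

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import PartialDependenceDisplay

    # Stand-in for the exercise's my_model / val_X.
    rng = np.random.RandomState(0)
    val_X = pd.DataFrame({'number_inpatient': rng.poisson(1, 300),
                          'time_in_hospital': rng.randint(1, 15, 300)})
    my_model = RandomForestClassifier(random_state=0).fit(val_X, rng.randint(0, 2, 300))

    # One call builds the grid and draws the plot.
    PartialDependenceDisplay.from_estimator(my_model, val_X, ['number_inpatient'])
    plt.show()
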
learntools/ml_intermediate/ex3.py

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ class OneHot(CodingProblem):
 OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
 OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
 
+# Ensure all columns have string type
+OH_X_train.columns = OH_X_train.columns.astype(str)
+OH_X_valid.columns = OH_X_valid.columns.astype(str)
 """)
 
     def check(self, OH_X_train, OH_X_valid):

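The three added lines guard against a column-name pitfall: pandas gives the one-hot columns integer names, and recent scikit-learn versions refuse (or warn about) DataFrames whose column names mix integers and strings. A small hedged sketch of the failure mode and the fix, using invented data:

    import pandas as pd

    num_X_train = pd.DataFrame({'LotArea': [8450, 9600]})       # string column name
    OH_cols_train = pd.DataFrame([[1.0, 0.0], [0.0, 1.0]])      # integer column names
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)

    print(OH_X_train.columns.tolist())                  # ['LotArea', 0, 1] -> mixed types
    OH_X_train.columns = OH_X_train.columns.astype(str)          # the added fix
    print(OH_X_train.columns.tolist())                  # ['LotArea', '0', '1']
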
learntools/ml_intermediate/ex6.py

Lines changed: 4 additions & 2 deletions
@@ -53,8 +53,10 @@ def check(self, my_model_1):
             ("Please instantiate the XGBoost model with default parameters, and set the random seed "
              "to 0 (e.g., `my_model_1 = XGBRegressor(random_state=0)`).")
 
-        assert my_model_1._Booster is not None, \
-            "Please fit the model to the training data."
+        try:
+            my_model_1.get_booster()
+        except:
+            assert 0==1, "Please fit the model to the training data."
 
 class Model1B(CodingProblem):
     _var = 'predictions_1'

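The fitted-model check now goes through the public get_booster() accessor, which raises on an unfitted model, instead of poking at the private _Booster attribute. A minimal sketch, assuming xgboost's scikit-learn wrapper:

    from xgboost import XGBRegressor

    my_model_1 = XGBRegressor(random_state=0)
    try:
        my_model_1.get_booster()        # raises before fit() / load_model()
        print("model is fitted")
    except Exception:
        print("Please fit the model to the training data.")
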
learntools/sql/ex5.py

Lines changed: 8 additions & 9 deletions
@@ -21,7 +21,7 @@
         SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month,
                COUNT(1) AS num_trips
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
+        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
         GROUP BY month
         ORDER BY month
         """
@@ -36,8 +36,8 @@
                trip_miles,
                trip_seconds
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE trip_start_timestamp > '2017-01-01' AND
-              trip_start_timestamp < '2017-07-01' AND
+        WHERE trip_start_timestamp > '2016-01-01' AND
+              trip_start_timestamp < '2016-04-01' AND
               trip_seconds > 0 AND
               trip_miles > 0
         )
@@ -82,8 +82,7 @@ class WhatsWrongWithData(ThoughtExperiment):
 client.list_rows(table, max_results=5).to_dataframe()
 ```
 
-Some trips in the top few rows have `trip_seconds` or `trip_miles` values of 0.
-Other location fields have values of `None`. That is a problem if we want to use those fields.
+Some location fields have values of `None` or `NaN`. That is a problem if we want to use those fields.
 """
 
 # (3)
@@ -97,7 +96,7 @@ def check(self, results):
         # check 2: length of dataframe
         assert (len(results) == len(rides_per_year_answer)), ("The results don't look right. Try again.")
         # check 3: one value in particular
-        year_to_check = list(rides_per_year_answer["year"])[0]
+        year_to_check = list(rides_per_year_answer["year"])[-1]
         correct_number = int(rides_per_year_answer.loc[rides_per_year_answer["year"]==year_to_check]["num_trips"].values)
         submitted_number = int(results.loc[results["year"]==year_to_check]["num_trips"].values)
         assert (correct_number == submitted_number), ("The results don't look right. Try again.")
@@ -145,7 +144,7 @@ def check(self, results):
         SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month,
                COUNT(1) AS num_trips
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
+        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
         GROUP BY month
         ORDER BY month
         \"""
@@ -191,8 +190,8 @@ def check(self, results):
                trip_miles,
                trip_seconds
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE trip_start_timestamp > '2017-01-01' AND
-              trip_start_timestamp < '2017-07-01' AND
+        WHERE trip_start_timestamp > '2016-01-01' AND
+              trip_start_timestamp < '2016-04-01' AND
               trip_seconds > 0 AND
               trip_miles > 0
         )
