
Commit 16bc0af

Merge branch 'master' into image-hosting
2 parents 360db10 + 2f21084 commit 16bc0af


46 files changed: 407 additions, 454 deletions


learntools/data_cleaning/ex4.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 import pandas as pd
 import numpy as np
-import chardet
+import charset_normalizer
 import os
 np.random.seed(0)

learntools/data_cleaning/ex5.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 import numpy as np
 import fuzzywuzzy
 from fuzzywuzzy import process
-import chardet
+import charset_normalizer
 
 #-----

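Both data_cleaning exercises swap chardet for charset_normalizer. A minimal sketch of why this is close to a drop-in change, assuming charset_normalizer's chardet-compatible detect() helper (the file path here is hypothetical):

    import charset_normalizer

    # detect() mirrors chardet.detect(): it returns a dict with
    # 'encoding', 'confidence', and 'language' keys.
    with open("some_file.csv", "rb") as raw:          # hypothetical path
        result = charset_normalizer.detect(raw.read(10000))
    print(result["encoding"], result["confidence"])
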
learntools/intro_to_programming/ex3.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class CustomEngravings(FunctionProblem):
         (("Adrian", True), 160),
         (("Ana", False), 71),
     ]
-    _hint = ("There are two options - either the project uses solid gold or does not. With this in mind, you can structure your solution like this: `cost = solid_gold * ____ + (not_solid_gold) * ____`. You need to figure out how to fill in the blanks. Also, remember that:\n"
+    _hint = ("There are two options - either the project uses solid gold or does not. With this in mind, you can structure your solution like this: `cost = solid_gold * ____ + (not solid_gold) * ____`. You need to figure out how to fill in the blanks. Also, remember that:\n"
             "- If `solid_gold = True`, then `(not solid_gold) = False`, and if `solid_gold = False`, then `(not solid_gold) = True`.\n"
             "- Multiplying an integer by `True` is equivalent to multiplying it by 1, and multiplying an integer by `False` is equivalent to multiplying it by 0.")
     _solution = CS(

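The corrected hint relies on Python treating booleans as integers in arithmetic. A tiny hedged illustration with made-up prices:

    solid_gold = True
    cost = solid_gold * 100 + (not solid_gold) * 50   # True behaves as 1, False as 0
    assert cost == 100
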
learntools/ml_explainability/ex3.py

Lines changed: 54 additions & 29 deletions
@@ -5,15 +5,14 @@
 
 from learntools.core import *
 
+# 1
 class WhyThatUShape(ThoughtExperiment):
     _solution = \
     """
     The code is
 
     for feat_name in base_features:
-        pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X,
-                                   model_features=base_features, feature=feat_name)
-        pdp.pdp_plot(pdp_dist, feat_name)
+        PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])
     plt.show()
 
 
@@ -25,6 +24,7 @@ class WhyThatUShape(ThoughtExperiment):
 For the same reason, we see the general U-shape in all our partial dependence plots.
 """
 
+# 2
 class PonderPDPContour(ThoughtExperiment):
     _solution = \
     """
@@ -40,46 +40,66 @@ class PonderPDPContour(ThoughtExperiment):
 
 The code you need to create the desired plot is:
 
-    fnames = ['pickup_longitude', 'dropoff_longitude']
-    longitudes_partial_plot = pdp.pdp_interact(model=first_model, dataset=val_X,
-                                               model_features=base_features, features=fnames)
-    pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot,
-                          feature_names=fnames, plot_type='contour')
+    fig, ax = plt.subplots(figsize=(8, 6))
+    fnames = [('pickup_longitude', 'dropoff_longitude')]
+    disp = PartialDependenceDisplay.from_estimator(first_model, val_X, fnames, ax=ax)
     plt.show()
 """
 
+# 3
 class ReadPDPContour(CodingProblem):
     _var = 'savings_from_shorter_trip'
-    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the white contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
-    _solution = 'About \$15. The price decreases from slightly more than \$24 to slightly more than \$9.'
+    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
+    _solution = 'About 6. The price decreases from slightly less than 15 to slightly less than 9.'
     def check(self, savings):
         if type(savings) == str:
             savings = Decimal(dollars.strip('$'))
-        assert ((savings > 13) and (savings < 17)), "Your answer should be about 15. Not {}".format(savings)
+        assert ((savings > 4) and (savings < 8)), "Your answer should be about 6. Not {}".format(savings)
 
+# 4
 class MakePDPWithAbsFeatures(CodingProblem):
-    _var = 'pdp_dist'
-    _hint = 'use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
+    _var = 'disp'
+    _hint = 'Use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
     _solution = \
     """
-    The biggest difference is that the partial dependence plot became much smaller. The the lowest vertical value is about $15 below the highest vertical value in the top chart, whereas this difference is only about $3 in the chart you just created. In other words, once you control for absolute distance traveled, the pickup_longitude has only a very small impact on predictions.
+    The difference is that the partial dependence plot became smaller. Both plots have a lowest vertical value of 8.5. But, the highest vertical value in the top chart is around 10.7, and the highest vertical value in the bottom chart is below 9.1. In other words, once you control for absolute distance traveled, the pickup_longitude has a smaller impact on predictions.
 
     # create new features
     data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
     data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)
     """
-
-    def check(self, pdp_result):
-        correct = np.array([9.92212681, 8.97384862, 8.80044327, 8.71024292, 8.71564739,
-                            8.73523192, 8.76626448, 8.87855912, 9.00098688, 10.99584622])
-        submitted = pdp_result.pdp
+
+    def check(self, disp):
+        correct = np.array([8.730515  , 8.73239078, 8.71804165, 8.72179009, 8.93013488,
+                            8.68796391, 8.6773792 , 8.6816932 , 8.67547295, 8.64980733,
+                            8.64402745, 8.65616918, 8.63485345, 8.60505726, 8.59167824,
+                            8.57101857, 8.55601734, 8.55780041, 8.53660205, 8.53548254,
+                            8.50739547, 8.50599988, 8.50685068, 8.51981394, 8.52555708,
+                            8.50483315, 8.53151955, 8.49615781, 8.49384454, 8.49156773,
+                            8.5123399 , 8.47138576, 8.47491902, 8.50240045, 8.50495725,
+                            8.50433279, 8.4941558 , 8.50175984, 8.50394946, 8.50890372,
+                            8.50606589, 8.48335522, 8.48281078, 8.4730394 , 8.47720942,
+                            8.47699659, 8.52118039, 8.50234077, 8.59717268, 8.51092865,
+                            8.51177667, 8.51159374, 8.51159432, 8.54379423, 8.50500559,
+                            8.50631149, 8.52264825, 8.51989952, 8.52841122, 8.52757692,
+                            8.54425047, 8.56425312, 8.56874055, 8.58372296, 8.5589557 ,
+                            8.57709991, 8.57441775, 8.59449221, 8.60063777, 8.62185164,
+                            8.6155473 , 8.6118143 , 8.61590988, 8.60758597, 8.62013413,
+                            8.6334263 , 8.64035478, 8.65324115, 8.66043255, 8.67502176,
+                            8.68940416, 8.6840402 , 8.67197893, 8.65512484, 8.66810839,
+                            8.6614093 , 8.65865671, 8.66485738, 8.67966737, 8.82833712,
+                            9.04135448, 9.03734449, 8.69506545, 8.70261503, 8.70673595,
+                            8.69045255, 8.69679997, 8.70716659, 8.71006281, 8.71739009])
+        submitted = disp.pd_results[0]['average'][0]
         assert np.allclose(submitted, correct, rtol=0.1)
 
+# 5
 class DoesSteepnessImplyImportance(ThoughtExperiment):
     _solution = "No. This doesn't guarantee `feat_a` is more important. For example, `feat_a` could have a big effect in the cases where it varies, but could have a single value 99\% of the time. In that case, permuting `feat_a` wouldn't matter much, since most values would be unchanged."
 
+# 6
 class DesignDatasetUShapedPdp(CodingProblem):
-    _var = 'pdp_dist'
+    _var = 'disp'
     _hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`"
     _solution = CS(
     """
@@ -89,22 +109,26 @@ class DesignDatasetUShapedPdp(CodingProblem):
    # You don't need any more changes
    """)
 
-    def check(self, pdp_result):
-        segment_1_end = np.argmin(pdp_result.feature_grids<-1)
-        segment_3_start = np.argmax(pdp_result.feature_grids>1)
+    def check(self, disp):
+        pdp_result = disp.pd_results[0]
+        x_values = pdp_result['values'][0]
+        y_values = pdp_result['average'][0]
+
+        segment_1_end = np.argmin(x_values<-1)
+        segment_3_start = np.argmax(x_values>1)
         segment_2_start = segment_1_end + 1
         segment_2_end = segment_3_start - 1
 
-        segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end]
-        segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end]
-        segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1]
+        segment_1_slopes_down = y_values[0] > y_values[segment_1_end]
+        segment_2_slopes_up = y_values[segment_2_start] < y_values[segment_2_end]
+        segment_3_slopes_down = y_values[segment_3_start] > y_values[-1]
 
         assert segment_1_slopes_down, ("The partial dependence plot does not slope down for values below -1.")
         assert segment_2_slopes_up, ("The partial dependence plot does not slope up for values between -1 and 1.")
         assert segment_3_slopes_down, ("The partial dependence plot does not slope down for values above 1.")
 
 class DesignFlatPDPWithHighImportance(CodingProblem):
-    _vars = ['perm', 'pdp_dist']
+    _vars = ['perm', 'disp']
     _hint = "You need for X1 to affect the prediction in order to have it affect permutation importance. But the average effect needs to be 0 to satisfy the PDP requirement. Achieve this by creating an interaction, so the effect of X1 depends on the value of X2 and vice-versa."
     _solution = CS(
     """
@@ -117,9 +141,10 @@ class DesignFlatPDPWithHighImportance(CodingProblem):
    # Aside from these lines, use the code provided
    """)
 
-    def check(self, importance, pdpResult):
+    def check(self, importance, disp):
         X1_imp = importance.feature_importances_[0]
-        pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
+        pdpResult = disp.pd_results[0]['average'][0]
+        pdpRange = max(pdpResult) - min(pdpResult)
         assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. "
                                 "Actual importance was {}").format(X1_imp)
         assert (pdpRange < 0.5), ("Tested that the highest point on the Partial "
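The reworked checks in this file read raw partial-dependence values off the scikit-learn display object instead of a pdpbox result. A self-contained sketch of that access pattern on stand-in data (note that newer scikit-learn releases rename the 'values' key to 'grid_values'):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    # Stand-in for the exercise's first_model / val_X.
    rng = np.random.RandomState(0)
    X = rng.rand(200, 2)
    model = RandomForestRegressor(random_state=0).fit(X, X[:, 0] ** 2)

    disp = PartialDependenceDisplay.from_estimator(model, X, [0])
    pdp_result = disp.pd_results[0]          # one Bunch per requested feature (or pair)
    x_values = pdp_result['values'][0]       # grid points ('grid_values' in scikit-learn >= 1.3)
    y_values = pdp_result['average'][0]      # predictions averaged over the grid
    print(x_values.shape, y_values.shape)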

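For DesignFlatPDPWithHighImportance, the interaction idea in the hint can be illustrated without the course's provided code. In the hedged sketch below (variable names and data are invented), the target is X1 * X2 with X2 centered on zero, so the average prediction at any fixed X1 is roughly zero (a flat partial dependence), yet shuffling X1 would destroy the fit (high permutation importance):

    import numpy as np

    rng = np.random.RandomState(0)
    X1 = rng.uniform(-2, 2, 100_000)
    X2 = rng.uniform(-2, 2, 100_000)
    y = X1 * X2                               # pure interaction term

    # Mean of y within each X1 bucket is ~0, which is what a flat PDP reflects.
    for lo, hi in [(-2, -1), (-1, 0), (0, 1), (1, 2)]:
        bucket = (X1 >= lo) & (X1 < hi)
        print(lo, hi, round(float(y[bucket].mean()), 3))
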
learntools/ml_explainability/ex4.py

Lines changed: 9 additions & 12 deletions
@@ -5,6 +5,7 @@
 
 from learntools.core import *
 
+# (1)
 class SummarizeModel(ThoughtExperiment):
     _solution = CS(
     """
@@ -19,41 +20,36 @@ class SummarizeModel(ThoughtExperiment):
 """
 )
 
+# (2)
 class EffectNumInpatient(ThoughtExperiment):
     _solution = CS(
     """
     # PDP for number_inpatient feature
 
     from matplotlib import pyplot as plt
-    from pdpbox import pdp, get_dataset, info_plots
+    from sklearn.inspection import PartialDependenceDisplay
 
     feature_name = 'number_inpatient'
-    # Create the data that we will plot
-    my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
-
-    # plot it
-    pdp.pdp_plot(my_pdp, feature_name)
+    PartialDependenceDisplay.from_estimator(my_model, val_X, [feature_name])
     plt.show()
     """
 )
 
+# (3)
 class EffectTimeInHospital(ThoughtExperiment):
     _solution = \
     """
     The results are very different. Specifically time in hospital has a much smaller effect. Code below:
 
     from matplotlib import pyplot as plt
-    from pdpbox import pdp, get_dataset, info_plots
+    from sklearn.inspection import PartialDependenceDisplay
 
     feature_name = 'time_in_hospital'
-    # Create the data that we will plot
-    my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
-
-    # plot it
-    pdp.pdp_plot(my_pdp, feature_name)
+    PartialDependenceDisplay.from_estimator(my_model, val_X, [feature_name])
     plt.show()
     """
 
+# (4)
 class RawActualsInsteadOfPDP(ThoughtExperiment):
     _hint = "This requires a groupby (from pandas) on the raw data, rather than using a model"
     _solution = CS(
@@ -68,6 +64,7 @@ class RawActualsInsteadOfPDP(ThoughtExperiment):
     """
 )
 
+# (5)
 class UseShap(ThoughtExperiment):
     _hint = "Here's the time to use SHAP values"
     _solution = CS(

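The updated ex4.py solutions collapse the old pdp_isolate/pdp_plot pair into one PartialDependenceDisplay.from_estimator call. A self-contained sketch of the same pattern on invented data (the exercise's my_model and val_X are faked here):

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import PartialDependenceDisplay

    # Stand-in for the exercise's my_model / val_X.
    rng = np.random.RandomState(0)
    val_X = pd.DataFrame({'number_inpatient': rng.poisson(1, 300),
                          'time_in_hospital': rng.randint(1, 15, 300)})
    my_model = RandomForestClassifier(random_state=0).fit(val_X, rng.randint(0, 2, 300))

    # One call builds the grid and draws the plot.
    PartialDependenceDisplay.from_estimator(my_model, val_X, ['number_inpatient'])
    plt.show()
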
learntools/ml_intermediate/ex3.py

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ class OneHot(CodingProblem):
 OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
 OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
 
+# Ensure all columns have string type
+OH_X_train.columns = OH_X_train.columns.astype(str)
+OH_X_valid.columns = OH_X_valid.columns.astype(str)
 """)
 
     def check(self, OH_X_train, OH_X_valid):

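The three added lines guard against a column-name pitfall: pandas gives the one-hot columns integer names, and recent scikit-learn versions refuse (or warn about) DataFrames whose column names mix integers and strings. A small hedged sketch of the failure mode and the fix, using invented data:

    import pandas as pd

    num_X_train = pd.DataFrame({'LotArea': [8450, 9600]})       # string column name
    OH_cols_train = pd.DataFrame([[1.0, 0.0], [0.0, 1.0]])      # integer column names
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)

    print(OH_X_train.columns.tolist())                  # ['LotArea', 0, 1] -> mixed types
    OH_X_train.columns = OH_X_train.columns.astype(str)          # the added fix
    print(OH_X_train.columns.tolist())                  # ['LotArea', '0', '1']
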
learntools/ml_intermediate/ex6.py

Lines changed: 4 additions & 2 deletions
@@ -53,8 +53,10 @@ def check(self, my_model_1):
             ("Please instantiate the XGBoost model with default parameters, and set the random seed "
              "to 0 (e.g., `my_model_1 = XGBRegressor(random_state=0)`).")
 
-        assert my_model_1._Booster is not None, \
-            "Please fit the model to the training data."
+        try:
+            my_model_1.get_booster()
+        except:
+            assert 0==1, "Please fit the model to the training data."
 
 class Model1B(CodingProblem):
     _var = 'predictions_1'

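The fitted-model check now goes through the public get_booster() accessor, which raises on an unfitted model, instead of poking at the private _Booster attribute. A minimal sketch, assuming xgboost's scikit-learn wrapper:

    from xgboost import XGBRegressor

    my_model_1 = XGBRegressor(random_state=0)
    try:
        my_model_1.get_booster()        # raises before fit() / load_model()
        print("model is fitted")
    except Exception:
        print("Please fit the model to the training data.")
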
learntools/sql/ex5.py

Lines changed: 8 additions & 9 deletions
@@ -21,7 +21,7 @@
         SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month,
                COUNT(1) AS num_trips
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
+        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
         GROUP BY month
         ORDER BY month
         """
@@ -36,8 +36,8 @@
                trip_miles,
                trip_seconds
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE trip_start_timestamp > '2017-01-01' AND
-              trip_start_timestamp < '2017-07-01' AND
+        WHERE trip_start_timestamp > '2016-01-01' AND
+              trip_start_timestamp < '2016-04-01' AND
               trip_seconds > 0 AND
               trip_miles > 0
         )
@@ -82,8 +82,7 @@ class WhatsWrongWithData(ThoughtExperiment):
 client.list_rows(table, max_results=5).to_dataframe()
 ```
 
-Some trips in the top few rows have `trip_seconds` or `trip_miles` values of 0.
-Other location fields have values of `None`. That is a problem if we want to use those fields.
+Some location fields have values of `None` or `NaN`. That is a problem if we want to use those fields.
 """
 
 # (3)
@@ -97,7 +96,7 @@ def check(self, results):
         # check 2: length of dataframe
         assert (len(results) == len(rides_per_year_answer)), ("The results don't look right. Try again.")
         # check 3: one value in particular
-        year_to_check = list(rides_per_year_answer["year"])[0]
+        year_to_check = list(rides_per_year_answer["year"])[-1]
         correct_number = int(rides_per_year_answer.loc[rides_per_year_answer["year"]==year_to_check]["num_trips"].values)
         submitted_number = int(results.loc[results["year"]==year_to_check]["num_trips"].values)
         assert (correct_number == submitted_number), ("The results don't look right. Try again.")
@@ -145,7 +144,7 @@ def check(self, results):
         SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month,
                COUNT(1) AS num_trips
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
+        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
         GROUP BY month
         ORDER BY month
         \"""
@@ -191,8 +190,8 @@ def check(self, results):
                trip_miles,
                trip_seconds
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-        WHERE trip_start_timestamp > '2017-01-01' AND
-              trip_start_timestamp < '2017-07-01' AND
+        WHERE trip_start_timestamp > '2016-01-01' AND
+              trip_start_timestamp < '2016-04-01' AND
               trip_seconds > 0 AND
               trip_miles > 0
         )
