Skip to content

Commit

Permalink
Modified code according to the project handout
Browse files Browse the repository at this point in the history
  • Loading branch information
bobcchen committed Feb 27, 2022
1 parent 5bb5634 commit a7ee9b1
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 30 deletions.
72 changes: 61 additions & 11 deletions week2/conf/ltr_featureset.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,12 @@
}
},
{
"name": "name_hyphens_min_df",
"params": [
"keywords"
],
"name": "click_prior",
"template_language": "mustache",
"template": {
"match_explorer": {
"type": "min_raw_df",
"query": {
"match": {
"name.hyphens": "{{keywords}}"
}
}
"query_string": {
"query": "{{click_prior_query}}",
"fields": ["_id"]
}
}
},
Expand Down Expand Up @@ -79,6 +72,63 @@
}
}
}
},
{
"name": "salesRankShortTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankShortTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
},
{
"name": "salesRankMediumTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankMediumTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
},
{
"name": "salesRankLongTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankLongTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
}
]
}
Expand Down
13 changes: 10 additions & 3 deletions week2/utilities/click_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ def binary_func(x):
return 0

def step(x):
    """Map a click-through rate ``x`` to a relevance grade.

    Grades are assigned by fixed CTR thresholds: <=0.05 -> 0,
    <=0.1 -> 0.5, <=0.3 -> 0.75, otherwise 1.
    """
    # Ordered (upper_bound, grade) buckets; the first bound that
    # contains x wins.
    for upper_bound, grade in ((0.05, 0), (0.1, 0.5), (0.3, 0.75)):
        if x <= upper_bound:
            return grade
    return 1


rng = np.random.default_rng(123456)
Expand All @@ -28,7 +34,8 @@ def apply_click_model(data_frame, click_model_type="binary", downsample=True):
data_frame = down_sample_continuous(data_frame)
elif click_model_type == "heuristic":
data_frame["grade"] = (data_frame["clicks"]/data_frame["num_impressions"]).fillna(0).apply(lambda x: step(x))
print("IMPLEMENT ME: apply_click_model(): downsampling")
if downsample:
data_frame = down_sample_buckets(data_frame)
return data_frame

# https://stackoverflow.com/questions/55119651/downsampling-for-more-than-2-classes
Expand Down
25 changes: 15 additions & 10 deletions week2/utilities/data_prepper.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,22 +233,27 @@ def __log_ltr_query_features(self, query_id, key, query_doc_ids, click_prior_que
self.ltr_store_name,
size=len(query_doc_ids), terms_field=terms_field)
# IMPLEMENT_START --
print("IMPLEMENT ME: __log_ltr_query_features: Extract log features out of the LTR:EXT response and place in a data frame")
# Loop over the hits structure returned by running `log_query` and then extract out the features from the response per query_id and doc id. Also capture and return all query/doc pairs that didn't return features
# Your structure should look like the data frame below
response = self.opensearch.search(body=log_query, index=self.index_name)

feature_results = {}
feature_results["doc_id"] = [] # capture the doc id so we can join later
feature_results["query_id"] = [] # ^^^
feature_results["sku"] = []
feature_results["salePrice"] = []
feature_results["name_match"] = []
rng = np.random.default_rng(12345)
for doc_id in query_doc_ids:
feature_results["doc_id"].append(doc_id) # capture the doc id so we can join later
feature_results["query_id"].append(query_id)
feature_results["sku"].append(doc_id) # ^^^
feature_results["salePrice"].append(rng.random())
feature_results["name_match"].append(rng.random())

if response and len(response['hits']) > 0 and len(response['hits']['hits']) == len(query_doc_ids):
for hit in response['hits']['hits']:
feature_results["doc_id"].append(hit['_id']) # capture the doc id so we can join later
feature_results["query_id"].append(query_id.iloc[0])
feature_results["sku"].append(hit['_id']) # ^^^
features = hit['fields']['_ltrlog'][0]['log_entry']
for feature in features:
if feature['name'] not in feature_results:
feature_results[feature['name']] = [feature.get('value', 0)]
else:
feature_results[feature['name']].append(feature.get('value', 0))

frame = pd.DataFrame(feature_results)
return frame.astype({'doc_id': 'int64', 'query_id': 'int64', 'sku': 'int64'})
# IMPLEMENT_END
Expand Down
58 changes: 55 additions & 3 deletions week2/utilities/ltr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,26 @@ def create_rescore_ltr_query(user_query, query_obj, click_prior_query, ltr_model
active_features=None, rescore_size=500, main_query_weight=1, rescore_query_weight=2):
# Create the base query, use a much bigger window
#add on the rescore
print("IMPLEMENT ME: create_rescore_ltr_query")
query_obj["rescore"] = {
"window_size": rescore_size,
"query": {
"rescore_query": {
"sltr": {
"params": {
"keywords": user_query,
"click_prior_query": click_prior_query
},
"model": ltr_model_name,
"store": ltr_store_name,
}
},
"score_mode": "total",
"query_weight": str(main_query_weight),
"rescore_query_weight": str(rescore_query_weight)
}
}
if active_features is not None and len(active_features) > 0:
query_obj["rescore"]["query"]["rescore_query"]["sltr"]["active_features"] = active_features
return query_obj

# take an existing query and add in an SLTR so we can use it for explains to see how much SLTR contributes
Expand Down Expand Up @@ -50,8 +69,41 @@ def create_sltr_hand_tuned_query(user_query, query_obj, click_prior_query, ltr_m
return query_obj, len(query_obj["query"]["function_score"]["query"]["bool"]["should"])

def create_feature_log_query(query, doc_ids, click_prior_query, featureset_name, ltr_store_name, size=200, terms_field="_id"):
    """Build an OpenSearch query body that logs LTR feature values.

    The bool ``filter`` restricts hits to ``doc_ids`` without scoring them,
    while the ``sltr`` clause brings in ``featureset_name`` from
    ``ltr_store_name`` so the LTR plugin can evaluate each feature using
    ``keywords``/``click_prior_query`` params. The ``ext.ltr_log`` section
    asks the plugin to attach the computed values to every hit under the
    ``log_entry`` key.
    """
    # Filter-only terms clause: match the requested docs but skip scoring.
    doc_filter = {
        "terms": {
            terms_field: doc_ids
        }
    }
    # Named sltr clause referenced by the log spec below.
    sltr_clause = {
        "sltr": {
            "_name": "logged_featureset",
            "featureset": featureset_name,
            "store": ltr_store_name,
            "params": {
                "keywords": query,
                "click_prior_query": click_prior_query
            }
        }
    }
    # Feature-logging extension: emit weights for the named sltr query.
    logging_ext = {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
    return {
        "size": size,
        "query": {
            "bool": {
                "filter": [doc_filter, sltr_clause]
            }
        },
        "ext": logging_ext
    }


# Item is a Pandas namedtuple
Expand Down
4 changes: 2 additions & 2 deletions week2/utilities/query_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def create_prior_queries_from_group(click_group): # total impressions isn't curr
if click_group is not None:
for item in click_group.itertuples():
try:
click_prior_query += "%s^%.3f " % (item.doc_id, item.clicks)
click_prior_query += "%s^%.3f " % (item.doc_id, item.clicks/item.num_impressions)

except KeyError as ke:
pass # nothing to do in this case, it just means we can't find priors for this doc
Expand All @@ -33,7 +33,7 @@ def create_prior_queries(doc_ids, doc_id_weights, query_times_seen): # total imp
for idx, doc in enumerate(doc_ids):
try:
wgt = doc_id_weights[doc] # This should be the number of clicks or whatever
click_prior_query += "%s^%.3f " % (doc, wgt)
click_prior_query += "%s^%.3f " % (doc, wgt/query_times_seen)
except KeyError as ke:
pass # nothing to do in this case, it just means we can't find priors for this doc
return click_prior_query
Expand Down
3 changes: 2 additions & 1 deletion week2/utilities/xgb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def train(xgb_train_data, num_rounds=5, xgb_conf=None ):
if xgb_conf is not None:
with open(xgb_conf) as json_file:
xgb_params = json.load(json_file)
dtrain = xgb.DMatrix(xgb_train_data)
print("Training XG Boost on %s for %s rounds with params: %s" % (xgb_train_data, num_rounds, xgb_params))
print("IMPLEMENT ME: train()")
bst = xgb.train(xgb_params, dtrain, num_rounds)
return bst, xgb_params

0 comments on commit a7ee9b1

Please sign in to comment.