Skip to content

Commit

Permalink
Modified code according to the project handout
Browse files Browse the repository at this point in the history
  • Loading branch information
bobcchen committed Feb 27, 2022
1 parent 5bb5634 commit a7ee9b1
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 30 deletions.
72 changes: 61 additions & 11 deletions week2/conf/ltr_featureset.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,12 @@
}
},
{
"name": "name_hyphens_min_df",
"params": [
"keywords"
],
"name": "click_prior",
"template_language": "mustache",
"template": {
"match_explorer": {
"type": "min_raw_df",
"query": {
"match": {
"name.hyphens": "{{keywords}}"
}
}
"query_string": {
"query": "{{click_prior_query}}",
"fields": ["_id"]
}
}
},
Expand Down Expand Up @@ -79,6 +72,63 @@
}
}
}
},
{
"name": "salesRankShortTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankShortTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
},
{
"name": "salesRankMediumTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankMediumTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
},
{
"name": "salesRankLongTerm",
"template_language": "mustache",
"template": {
"function_score": {
"functions": [
{
"field_value_factor": {
"field": "salesRankLongTerm",
"missing": 1000000
}
}
],
"query": {
"match_all": {}
}
}
}
}
]
}
Expand Down
13 changes: 10 additions & 3 deletions week2/utilities/click_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ def binary_func(x):
return 0

def step(x):
    """Map a click-through rate ``x`` to a relevance grade.

    Grades are assigned by fixed CTR thresholds: <=0.05 -> 0,
    <=0.1 -> 0.5, <=0.3 -> 0.75, otherwise 1.
    """
    # Ordered (upper_bound, grade) buckets; the first bound that
    # contains x wins.
    for upper_bound, grade in ((0.05, 0), (0.1, 0.5), (0.3, 0.75)):
        if x <= upper_bound:
            return grade
    return 1


rng = np.random.default_rng(123456)
Expand All @@ -28,7 +34,8 @@ def apply_click_model(data_frame, click_model_type="binary", downsample=True):
data_frame = down_sample_continuous(data_frame)
elif click_model_type == "heuristic":
data_frame["grade"] = (data_frame["clicks"]/data_frame["num_impressions"]).fillna(0).apply(lambda x: step(x))
print("IMPLEMENT ME: apply_click_model(): downsampling")
if downsample:
data_frame = down_sample_buckets(data_frame)
return data_frame

# https://stackoverflow.com/questions/55119651/downsampling-for-more-than-2-classes
Expand Down
25 changes: 15 additions & 10 deletions week2/utilities/data_prepper.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,22 +233,27 @@ def __log_ltr_query_features(self, query_id, key, query_doc_ids, click_prior_que
self.ltr_store_name,
size=len(query_doc_ids), terms_field=terms_field)
# IMPLEMENT_START --
print("IMPLEMENT ME: __log_ltr_query_features: Extract log features out of the LTR:EXT response and place in a data frame")
# Loop over the hits structure returned by running `log_query` and then extract out the features from the response per query_id and doc id. Also capture and return all query/doc pairs that didn't return features
# Your structure should look like the data frame below
response = self.opensearch.search(body=log_query, index=self.index_name)

feature_results = {}
feature_results["doc_id"] = [] # capture the doc id so we can join later
feature_results["query_id"] = [] # ^^^
feature_results["sku"] = []
feature_results["salePrice"] = []
feature_results["name_match"] = []
rng = np.random.default_rng(12345)
for doc_id in query_doc_ids:
feature_results["doc_id"].append(doc_id) # capture the doc id so we can join later
feature_results["query_id"].append(query_id)
feature_results["sku"].append(doc_id) # ^^^
feature_results["salePrice"].append(rng.random())
feature_results["name_match"].append(rng.random())

if response and len(response['hits']) > 0 and len(response['hits']['hits']) == len(query_doc_ids):
for hit in response['hits']['hits']:
feature_results["doc_id"].append(hit['_id']) # capture the doc id so we can join later
feature_results["query_id"].append(query_id.iloc[0])
feature_results["sku"].append(hit['_id']) # ^^^
features = hit['fields']['_ltrlog'][0]['log_entry']
for feature in features:
if feature['name'] not in feature_results:
feature_results[feature['name']] = [feature.get('value', 0)]
else:
feature_results[feature['name']].append(feature.get('value', 0))

frame = pd.DataFrame(feature_results)
return frame.astype({'doc_id': 'int64', 'query_id': 'int64', 'sku': 'int64'})
# IMPLEMENT_END
Expand Down
58 changes: 55 additions & 3 deletions week2/utilities/ltr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,26 @@ def create_rescore_ltr_query(user_query, query_obj, click_prior_query, ltr_model
active_features=None, rescore_size=500, main_query_weight=1, rescore_query_weight=2):
# Create the base query, use a much bigger window
#add on the rescore
print("IMPLEMENT ME: create_rescore_ltr_query")
query_obj["rescore"] = {
"window_size": rescore_size,
"query": {
"rescore_query": {
"sltr": {
"params": {
"keywords": user_query,
"click_prior_query": click_prior_query
},
"model": ltr_model_name,
"store": ltr_store_name,
}
},
"score_mode": "total",
"query_weight": str(main_query_weight),
"rescore_query_weight": str(rescore_query_weight)
}
}
if active_features is not None and len(active_features) > 0:
query_obj["rescore"]["query"]["rescore_query"]["sltr"]["active_features"] = active_features
return query_obj

# take an existing query and add in an SLTR so we can use it for explains to see how much SLTR contributes
Expand Down Expand Up @@ -50,8 +69,41 @@ def create_sltr_hand_tuned_query(user_query, query_obj, click_prior_query, ltr_m
return query_obj, len(query_obj["query"]["function_score"]["query"]["bool"]["should"])

def create_feature_log_query(query, doc_ids, click_prior_query, featureset_name, ltr_store_name, size=200, terms_field="_id"):
    """Build an OpenSearch query body that logs LTR feature values.

    The bool ``filter`` restricts hits to ``doc_ids`` without scoring them,
    while the ``sltr`` clause brings in ``featureset_name`` from
    ``ltr_store_name`` so the LTR plugin can evaluate each feature using
    ``keywords``/``click_prior_query`` params. The ``ext.ltr_log`` section
    asks the plugin to attach the computed values to every hit under the
    ``log_entry`` key.
    """
    # Filter-only terms clause: match the requested docs but skip scoring.
    doc_filter = {
        "terms": {
            terms_field: doc_ids
        }
    }
    # Named sltr clause referenced by the log spec below.
    sltr_clause = {
        "sltr": {
            "_name": "logged_featureset",
            "featureset": featureset_name,
            "store": ltr_store_name,
            "params": {
                "keywords": query,
                "click_prior_query": click_prior_query
            }
        }
    }
    # Feature-logging extension: emit weights for the named sltr query.
    logging_ext = {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
    return {
        "size": size,
        "query": {
            "bool": {
                "filter": [doc_filter, sltr_clause]
            }
        },
        "ext": logging_ext
    }


# Item is a Pandas namedtuple
Expand Down
4 changes: 2 additions & 2 deletions week2/utilities/query_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def create_prior_queries_from_group(click_group): # total impressions isn't curr
if click_group is not None:
for item in click_group.itertuples():
try:
click_prior_query += "%s^%.3f " % (item.doc_id, item.clicks)
click_prior_query += "%s^%.3f " % (item.doc_id, item.clicks/item.num_impressions)

except KeyError as ke:
pass # nothing to do in this case, it just means we can't find priors for this doc
Expand All @@ -33,7 +33,7 @@ def create_prior_queries(doc_ids, doc_id_weights, query_times_seen): # total imp
for idx, doc in enumerate(doc_ids):
try:
wgt = doc_id_weights[doc] # This should be the number of clicks or whatever
click_prior_query += "%s^%.3f " % (doc, wgt)
click_prior_query += "%s^%.3f " % (doc, wgt/query_times_seen)
except KeyError as ke:
pass # nothing to do in this case, it just means we can't find priors for this doc
return click_prior_query
Expand Down
3 changes: 2 additions & 1 deletion week2/utilities/xgb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def train(xgb_train_data, num_rounds=5, xgb_conf=None ):
if xgb_conf is not None:
with open(xgb_conf) as json_file:
xgb_params = json.load(json_file)
dtrain = xgb.DMatrix(xgb_train_data)
print("Training XG Boost on %s for %s rounds with params: %s" % (xgb_train_data, num_rounds, xgb_params))
print("IMPLEMENT ME: train()")
bst = xgb.train(xgb_params, dtrain, num_rounds)
return bst, xgb_params

0 comments on commit a7ee9b1

Please sign in to comment.