
Commit

Merge branch 'main' into FE
sunnight9507 authored Jun 3, 2021
2 parents ea5c2c5 + cd5c450 commit 7212561
Showing 3 changed files with 76 additions and 33 deletions.
78 changes: 62 additions & 16 deletions feature_engineering.py
@@ -4,16 +4,54 @@

def find_time_difference(data):
    if data["userID"] == data["userID_shift"]:
        temp_time_difference = int(((data["next_timestamp"] - data["Timestamp"]) / pd.to_timedelta(1, unit='D')) * (60 * 60 * 24))
        if temp_time_difference > 3600:  # over 1 hour  # tunable
            return 3600
        else:
            return temp_time_difference
        temp_time_difference = int(((data["Timestamp"] - data["next_timestamp"]) / pd.to_timedelta(1, unit='D')) * (60 * 60 * 24))
        if temp_time_difference > 3600:  # over 1 hour  # tunable
            return 0
        elif temp_time_difference > 600:  # over 10 minutes  # tunable
            return 600
        return temp_time_difference
    else:
        return 0
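# For context: userID_shift and next_timestamp are built outside this hunk.
# The new subtraction order above only yields positive gaps if these columns
# hold the *previous* row's values, so shift(1) is assumed in this sketch
# (hypothetical, not part of this commit):
#
#   df = df.sort_values(["userID", "Timestamp"])
#   df["userID_shift"] = df["userID"].shift(1)
#   df["next_timestamp"] = df["Timestamp"].shift(1)
#   df["time_difference"] = df.apply(find_time_difference, axis=1)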


def feature_engineering_sun(df):
    # sort by KnowledgeTag, Timestamp
    df.sort_values(by=["KnowledgeTag", "Timestamp"], inplace=True)

    # cumulative solve count, correct count, and accuracy per KnowledgeTag in chronological order
    df["KnowledgeTag_correct_answer"] = df.groupby("KnowledgeTag")["answercode"].transform(lambda x: x.cumsum().shift(1))
    df["KnowledgeTag_total_answer"] = df.groupby("KnowledgeTag")["answercode"].cumcount()
    df["KnowledgeTag_acc"] = df["KnowledgeTag_correct_answer"] / df["KnowledgeTag_total_answer"]

    # sort by assessmentItemID, Timestamp
    df.sort_values(by=["assessmentItemID", "Timestamp"], inplace=True)

    # cumulative solve count, correct count, and accuracy per assessmentItemID in chronological order
    df["question_correct_answer"] = df.groupby("assessmentItemID")["answercode"].transform(lambda x: x.cumsum().shift(1))
    df["question_total_answer"] = df.groupby("assessmentItemID")["answercode"].cumcount()
    df["question_acc"] = df["question_correct_answer"] / df["question_total_answer"]

    # question class (second digit of assessmentItemID)
    df["question_class"] = df["assessmentItemID"].apply(lambda x: x[2])
    # user_question_class: userID and question_class combined
    df["userID_question_class"] = df[["userID", "question_class"]].apply(lambda data: str(data["userID"]) + "_" + data["question_class"], axis=1)

    # sort by question_class, Timestamp
    df.sort_values(by=["question_class", "Timestamp"], inplace=True)

    # cumulative correct count, solve count, and accuracy per question_class in chronological order
    df["question_class_correct_answer"] = df.groupby("question_class")["answercode"].transform(lambda x: x.cumsum().shift(1))
    df["question_class_total_answer"] = df.groupby("question_class")["answercode"].cumcount()
    df["question_class_acc"] = df["question_class_correct_answer"] / df["question_class_total_answer"]

    # sort by userID_question_class, Timestamp
    df.sort_values(by=["userID_question_class", "Timestamp"], inplace=True)

    # cumulative correct count, solve count, and accuracy per userID_question_class in chronological order
    df["user_question_class_correct_answer"] = df.groupby("userID_question_class")["answercode"].transform(lambda x: x.cumsum().shift(1))
    df["user_question_class_total_answer"] = df.groupby("userID_question_class")["answercode"].cumcount()
    df["user_question_class_acc"] = df["user_question_class_correct_answer"] / df["user_question_class_total_answer"]

    # sort by userID, Timestamp
    df.sort_values(by=["userID", "Timestamp"], inplace=True)

@@ -27,31 +65,39 @@ def feature_engineering_sun(df):
df["question_class"] = df["assessmentItemID"].apply(lambda x: x[2])

# user의 문제 풀이 수, 정답 수, 정답률을 시간순으로 누적해서 계산
df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

# testId 기준 mean, sum
group_test = df.groupby(["testId"])["answerCode"].agg(["mean", "sum"])
df["user_correct_answer"] = df.groupby("userID")["answercode"].transform(lambda x: x.cumsum().shift(1))
df["user_total_answer"] = df.groupby("userID")["answercode"].cumcount()
df["user_acc"] = df["user_correct_answer"] / df["user_total_answer"]
# testId 기준 mean, sumanswercode
group_test = df.groupby(["testId"])["answercode"].agg(["mean", "sum"])
group_test.columns = ["test_mean", "test_sum"]
# knowledge_tag 기준 mean, sum
group_tag = df.groupby(["KnowledgeTag"])["answerCode"].agg(["mean", "sum"])
group_tag = df.groupby(["KnowledgeTag"])["answercode"].agg(["mean", "sum"])
group_tag.columns = ["tag_mean", "tag_sum"]
# userID 기준 mean, sum
group_user = df.groupby(["userID"])["answerCode"].agg(["mean", "sum"])
group_user.columns = ["user_mean", "user_count"]
group_user = df.groupby(["userID"])["answercode"].agg(["sum"])
group_user.columns = ["user_count"]
# question 기준 mean, sum
group_question = df.groupby(['assessmentItemID'])["answerCode"].agg(["mean", "sum"])
group_question = df.groupby(["assessmentItemID"])["answercode"].agg(["mean", "sum"])
group_question.columns = ["question_mean", "question_count"]
# question class(assessmentItemID 두 번째 숫자) 기준 mean, sum
group_question_class = df.groupby(["question_class"])["answerCode"].agg(["mean", "sum"])
group_question_class = df.groupby(["question_class"])["answercode"].agg(["mean", "sum"])
group_question_class.columns = ["question_class_mean", "question_class_count"]
# time_difference 기준 mean, median
group_time_difference = df.groupby(["userID"])["time_difference"].agg(["mean", "median"])
group_time_difference.columns = ["time_difference_mean", "time_difference_median"]
# userID_question_class 기준 mean, sum
group_user_question_class = df.groupby(["userID_question_class"])["answercode"].agg(["mean", "sum"])
group_user_question_class.columns = ["user_question_class_mean", "user_question_class_count"]

# merge
df = pd.merge(df, group_test, on=["testId"], how="left")
df = pd.merge(df, group_tag, on=["KnowledgeTag"], how="left")
df = pd.merge(df, group_user, on=["userID"], how="left")
df = pd.merge(df, group_question, on=["assessmentItemID"], how="left")
df = pd.merge(df, group_question_class, on=["question_class"], how="left")
df = pd.merge(df, group_time_difference, on=["userID"], how="left")
df = pd.merge(df, group_user_question_class, on=["userID_question_class"], how="left")

return df
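Each groupby().agg() above yields one row per key with renamed statistic columns, and the how="left" merges broadcast those statistics back onto every interaction row (pandas matches the group key through the aggregated frame's index level name). A toy sketch of the pattern with hypothetical data:

import pandas as pd

toy = pd.DataFrame({"testId": ["A", "A", "B"], "answercode": [1, 0, 1]})
group_test = toy.groupby(["testId"])["answercode"].agg(["mean", "sum"])
group_test.columns = ["test_mean", "test_sum"]
toy = pd.merge(toy, group_test, on=["testId"], how="left")
#   testId  answercode  test_mean  test_sum
# 0      A           1        0.5         1
# 1      A           0        0.5         1
# 2      B           1        1.0         1

Unlike the shifted cumulative features, these are whole-dataset statistics and do include each row's own answer.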
25 changes: 11 additions & 14 deletions lgbm_function.py
@@ -5,6 +5,7 @@
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from feature_engineering import feature_engineering
import numpy as np
import random
from matplotlib import pylab as plt
@@ -20,18 +21,20 @@ def set_params():
params["learning_rate"] = 1e-1 # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3
params["objective"] = "binary"
params["metric"] = "auc" # binary_logloss, rmse, huber, auc
params["num_iterations"] = 1500 # 100
params["num_iterations"] = 300 # 100
params["max_depth"] = 6 # -1
params["num_leaves"] = 40 # 31 이상적으로 num_leaves값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다.
params["min_data_in_leaf"] = 1000 # 20 100 ~ 1000 수백 또는 수천 개로 정하는 것
params["max_bin"] = 128 # 256
params["num_leaves"] = 30 # 31 이상적으로 num_leaves값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다.
params["min_data_in_leaf"] = 5000 # 20 100 ~ 1000 수백 또는 수천 개로 정하는 것
params["max_bin"] = 32 # 256
params["scale_pos_weight"] = 1.1 # 1.1~1.5 data 불균형
params["tree_learner"] = "serial" # serial, feature, data, voting
params["early_stopping_rounds"] = 100
params["bagging_fraction"] = 0.9 # 1.0
params["bagging_fraction"] = 0.8 # 1.0
params["feature_fraction"] = 0.5 # 1.0
params["lambda_l1"] = 1e-1 # 0.0
params["lambda_l2"] = 1e-1 # 0.0


print("="*30)
print(params)
print("="*30)
@@ -74,7 +77,7 @@ def custom_train_test_split(df, ratio=0.2):
    return train_lst, test_lst


def inference(FEATS, model, auc, acc):
def inference(FEATS, model, auc, acc, time):
    print("="*30)
    print("Start inference")
    print("="*30)
@@ -94,7 +97,6 @@ def inference(FEATS, model, auc, acc):
print("="*30)
print()


test_df = pd.concat([df, test_df])

not_test_df = test_df[test_df["answerCode"] != -1]
@@ -113,21 +115,17 @@ def inference(FEATS, model, auc, acc):

    test_df = pd.merge(test_df, not_test_df[["userID", "user_mean"]].drop_duplicates(), on=["userID"], how="inner")
    test_df = pd.merge(test_df, not_test_df[["question_class", "question_class_mean"]].drop_duplicates(), on=["question_class"], how="inner")
    # print(test_df.shape)

    def random_answering(data):
        return 1 if random.random() + random.random() < data["user_mean"] + data["question_class_mean"] else 0
        return 1 if random.random() < data["user_mean"] * data["question_class_mean"] else 0

    test_df["answerCode"] = test_df[["user_mean", "question_class_mean"]].apply(random_answering, axis=1)
    test_df.drop(["question_class", "user_mean", "question_class_mean"], axis=1, inplace=True)
    # print(test_df.shape)
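    # The rewritten random_answering treats user accuracy and question-class
    # accuracy as independent success probabilities: a correct answer is
    # sampled with probability user_mean * question_class_mean. A vectorized
    # equivalent (sketch) using numpy:
    #   p = test_df["user_mean"] * test_df["question_class_mean"]
    #   test_df["answerCode"] = (np.random.random(len(test_df)) < p).astype(int)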

    data = pd.concat([not_test_df, test_df], join="inner")
    # print(data.shape)

    # FEATURE ENGINEERING
    data = feature_engineering(data)
    # print(data.shape)

    # TEST DATA
    test_df = data[data["is_test"]]
@@ -137,7 +135,6 @@ def random_answering(data):
    print(test_df.shape)
    print("="*30)
    print()
    # print(test_df.shape)  # (744, 21)

    # DROP ANSWERCODE
    test_df = test_df.drop(["answerCode"], axis=1)
@@ -147,7 +144,7 @@ def random_answering(data):

    # SAVE OUTPUT
    output_dir = 'output/'
    write_path = os.path.join(output_dir, f"output_VALID_AUC_{round(auc, 4)}_ACC_{round(acc, 4)}.csv")
    write_path = os.path.join(output_dir, f"output_{time}_AUC_{round(auc, 4)}_ACC_{round(acc, 4)}.csv")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(write_path, 'w', encoding='utf8') as w:
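The new time argument in the output filename presumably carries a run timestamp supplied by the caller; one plausible way to produce it (hypothetical, not shown in this diff):

from datetime import datetime

time = datetime.now().strftime("%Y%m%d_%H%M%S")
# e.g. output/output_20210603_142510_AUC_0.8123_ACC_0.7412.csv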
6 changes: 3 additions & 3 deletions train.py
@@ -8,7 +8,7 @@
def main(args):
    wandb.login()

    setSeeds(42)
    setSeeds(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    args.device = device
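setSeeds now honors the command-line seed instead of a hard-coded 42. Its definition lives elsewhere in the repo; a typical implementation, offered only as a sketch, fixes every RNG the run touches:

import os
import random

import numpy as np
import torch

def setSeeds(seed: int = 42):
    os.environ["PYTHONHASHSEED"] = str(seed)  # hash randomization
    random.seed(seed)                         # python RNG
    np.random.seed(seed)                      # numpy RNG
    torch.manual_seed(seed)                   # torch CPU RNG
    torch.cuda.manual_seed_all(seed)          # torch GPU RNGs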

@@ -22,11 +22,11 @@ def main(args):

    train_data, valid_data = preprocess.split_data(train_data)

    wandb.init(project='dkt', config=vars(args))
    wandb.init(project='P4-DKT', entity='team-ikyo', name=args.run_name, config=vars(args))
    trainer.run(args, train_data, valid_data)


if __name__ == "__main__":
    args = parse_args(mode='train')
    os.makedirs(args.model_dir, exist_ok=True)
    main(args)
