Skip to content

Commit

Permalink
💩 dataset convert script
Browse files Browse the repository at this point in the history
dataset convert script
  • Loading branch information
ekzm8523 committed Oct 1, 2022
1 parent 91ce465 commit 773c597
Show file tree
Hide file tree
Showing 6 changed files with 1,234 additions and 10 deletions.
2 changes: 1 addition & 1 deletion app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dotenv import load_dotenv

load_dotenv()
session = boto3.Session()
session = boto3.Session(profile_name="cs-broker")
secret_manager = session.client(service_name="secretsmanager", region_name="ap-northeast-2")
log = logging.getLogger("__main__")
log.setLevel(logging.INFO)
Expand Down
864 changes: 864 additions & 0 deletions app/static/changed_user_answer.csv

Large diffs are not rendered by default.

239 changes: 239 additions & 0 deletions app/static/problem_info.csv

Large diffs are not rendered by default.

29 changes: 20 additions & 9 deletions app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@ def content_model() -> PromptForClassification:


@pytest.fixture(scope="session")
def user_answer_df(path: str = "app/static/user_answer.csv") -> pd.DataFrame:
    """Load the shared user-answer CSV once per test session."""
    # Diff residue fix: the pasted hunk showed both the old (`problem_df`) and
    # new (`user_answer_df`) signatures; this is the post-rename definition.
    return pd.read_csv(path)


@pytest.fixture(scope="session")
def problem_dict(keyword_model: SentenceTransformer, problem_df: pd.DataFrame) -> dict:
def problem_dict(keyword_model: SentenceTransformer, user_answer_df) -> dict:
problem_dict = {}
keyword_id = 0
# criterion parsing
for _, data in problem_df.iterrows():
for _, data in user_answer_df.iterrows():
problem_id = data["problem_id"]
if problem_id not in problem_dict:
keyword_standards = []
Expand All @@ -49,9 +49,9 @@ def problem_dict(keyword_model: SentenceTransformer, problem_df: pd.DataFrame) -


@pytest.fixture(scope="function")
def random_keyword_data(problem_dict: dict, problem_df: pd.DataFrame) -> KeywordGradingRequest:
random_idx = random.randint(0, len(problem_df) - 1)
random_data = problem_df.iloc[random_idx]
def random_keyword_data(problem_dict: dict, user_answer_df) -> KeywordGradingRequest:
random_idx = random.randint(0, len(user_answer_df) - 1)
random_data = user_answer_df.iloc[random_idx]
problem_id = random_data["problem_id"]
keyword_standards = problem_dict[problem_id].keyword_standards
return KeywordGradingRequest(
Expand All @@ -60,9 +60,9 @@ def random_keyword_data(problem_dict: dict, problem_df: pd.DataFrame) -> Keyword


@pytest.fixture(scope="function")
def random_content_data(problem_df: pd.DataFrame) -> ContentGradingRequest:
random_idx = random.randint(0, len(problem_df) - 1)
random_series = problem_df.iloc[random_idx]
def random_content_data(user_answer_df) -> ContentGradingRequest:
random_idx = random.randint(0, len(user_answer_df) - 1)
random_series = user_answer_df.iloc[random_idx]
content_standards = []
offset = random.randint(0, 10000)
for i, criterion in enumerate(eval(random_series["scoring_criterion"])):
Expand All @@ -72,3 +72,14 @@ def random_content_data(problem_df: pd.DataFrame) -> ContentGradingRequest:
return ContentGradingRequest(
problem_id=random_idx, user_answer=random_series.user_answer, content_standards=content_standards
)


# from collections import defaultdict
# @pytest.fixture(scope="session")
# def get_problem_df(path: str = "/Users/minjaewon/workspace/AI-server/app/static/problem_info.csv"):
# df = pd.read_csv(path)
# problem_info = {}
# for row in df.iterrows():
#
# if row.problem_id not in problem_info:
# problem_info[row.problem_id] = {"keyword_standards": }
7 changes: 7 additions & 0 deletions app/tests/test_keyword.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,10 @@ def test_keyword_predict_runnable_2(problem_dict: dict, random_keyword_data: Key
problem_keyword_set = set(keyword.id for keyword in runnable.problem_dict[test_problem_id].keyword_standards)
for correct_keyword in result.correct_keywords:
assert correct_keyword.id in problem_keyword_set, "problem_dict에 맞지 않는 키워드를 이용해 예측하였습니다."


def test_keyword_predict_runnable_3(problem_dict: dict, random_keyword_data: KeywordGradingRequest) -> None:
    """Measure similarity while considering comma-separated keyword criteria together.

    NOTE(review): placeholder — the body only calls ``print()`` and asserts
    nothing, so this test always passes. Implement it or mark it as skipped
    so it does not report a false green.
    """
    print()
103 changes: 103 additions & 0 deletions app/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os

import numpy as np
import pandas as pd


def update_problem_info(problem_info_path: str, user_answer_path: str) -> None:
    """Interactively re-align a user-answer CSV against a new problem-info CSV.

    Reads ``problem_info_path`` (columns: ``problem_id``, ``type``,
    ``content``) and groups each problem's criteria into keyword/content
    lists, then walks every row of ``user_answer_path`` and — guided by
    operator input on stdin — reorders (and optionally drops) criteria so the
    old labels reference the new criterion ordering. The result is written to
    ``changed_user_answer.csv`` next to ``user_answer_path``.

    NOTE(review): this is an interactive one-off migration script; a human is
    expected to type 1-based orderings (and deletion indices) at each prompt.

    :param problem_info_path: path to the new problem-info CSV
    :param user_answer_path: path to the old user-answer CSV
    """
    problem_info_df = pd.read_csv(problem_info_path)

    # problem_id -> {"keyword_standards": [...], "content_standards": [...]}
    problem_info = {}
    for i in range(len(problem_info_df)):
        row = problem_info_df.iloc[i]
        if row.problem_id not in problem_info:
            problem_info[row.problem_id] = {"keyword_standards": [], "content_standards": []}
        # BUG FIX: the original `row.content == np.nan` is always False
        # (NaN != NaN), so the "NULL" substitution never fired; pd.isna()
        # actually detects missing cells.
        content = "NULL" if pd.isna(row.content) else row.content
        if row.type == "KEYWORD":
            problem_info[row.problem_id]["keyword_standards"].append(content)
        elif row.type == "PROMPT":
            problem_info[row.problem_id]["content_standards"].append(content)

    user_answer_df = pd.read_csv(user_answer_path)

    # Maps old problem ids (user-answer CSV) to new problem ids (problem-info CSV).
    change_dict = {4: 449, 6: 453, 1: 454, 5: 450, 9: 447, 3: 451, 8: 448, 2: 446, 0: 452, 7: 443}
    visit = set()  # new problem ids whose criteria were already reordered
    keyword_remove_dict = {}  # old problem id -> 1-based keyword indices to drop
    content_remove_dict = {}  # old problem id -> 1-based content indices to drop
    for i in range(len(user_answer_df)):
        old_problem_id = user_answer_df.iloc[i].problem_id
        problem_id = change_dict[old_problem_id]
        converted_problem = problem_info[problem_id]
        if problem_id not in visit:
            # First encounter of this problem: ask the operator for the
            # 1-based permutation mapping new keyword criteria onto the old order.
            print(user_answer_df.keyword_criterion[i])
            print(converted_problem["keyword_standards"])
            order = list(map(int, input().split()))
            # NOTE(review): eval() on CSV cells executes arbitrary code —
            # only run this script on trusted, locally produced files.
            if len(eval(user_answer_df.keyword_criterion[i])) != len(converted_problem["keyword_standards"]):
                print("삭제할 인덱스를 적어주세요")
                keyword_remove_dict[old_problem_id] = list(map(int, input().split()))

            converted_problem["keyword_standards"] = [converted_problem["keyword_standards"][j - 1] for j in order]
            print(user_answer_df.keyword_criterion[i])
            print(converted_problem["keyword_standards"])
            print("*" * 50)
            # Same interactive reordering for the content (scoring) criteria.
            print(user_answer_df.scoring_criterion[i])
            print(converted_problem["content_standards"])
            order = list(map(int, input().split()))
            if len(eval(user_answer_df.scoring_criterion[i])) != len(converted_problem["content_standards"]):
                print("삭제할 인덱스를 적어주세요")
                content_remove_dict[old_problem_id] = list(map(int, input().split()))

            converted_problem["content_standards"] = [converted_problem["content_standards"][j - 1] for j in order]
            print(user_answer_df.scoring_criterion[i])
            print(converted_problem["content_standards"])
            print("*" * 50)
            visit.add(problem_id)
        # Translate this row's correct keyword criteria into positional ids.
        labeled_keyword_ids = []
        correct_keyword_criterion = eval(user_answer_df.correct_keyword_criterion[i])
        keyword_criterion = eval(user_answer_df.keyword_criterion[i])
        if old_problem_id in keyword_remove_dict:
            for idx in keyword_remove_dict[old_problem_id]:
                keyword_criterion[idx - 1] = None
            keyword_criterion = [value for value in keyword_criterion if value is not None]
        # BUG FIX: the original `value != np.nan` is always True, so missing
        # entries were never replaced with "NULL"; use pd.isna().
        keyword_criterion = [value if not pd.isna(value) else "NULL" for value in keyword_criterion]
        print(correct_keyword_criterion)
        print(keyword_criterion)

        for criterion in correct_keyword_criterion:
            if criterion in keyword_criterion:
                idx = keyword_criterion.index(criterion)
                labeled_keyword_ids.append(idx)
        print(labeled_keyword_ids)
        print(converted_problem["keyword_standards"])
        # Translate this row's correct scoring criteria into positional ids.
        labeled_content_ids = []
        correct_scoring_criterion = eval(user_answer_df.correct_scoring_criterion[i])
        scoring_criterion = eval(user_answer_df.scoring_criterion[i])

        if old_problem_id in content_remove_dict:
            for idx in content_remove_dict[old_problem_id]:
                scoring_criterion[idx - 1] = None
            scoring_criterion = [value for value in scoring_criterion if value is not None]

        print(scoring_criterion)
        print(correct_scoring_criterion)
        for criterion in correct_scoring_criterion:
            if criterion in scoring_criterion:
                idx = scoring_criterion.index(criterion)
                labeled_content_ids.append(idx)
        print(labeled_content_ids)
        print(converted_problem["content_standards"])
        print("*" * 50)
        # BUG FIX: chained assignment (`df.col[i] = ...`) writes through a
        # possibly-copied Series and may silently fail to update the frame
        # (and is a hard error under pandas copy-on-write); .at assigns the
        # cell directly.
        user_answer_df.at[i, "correct_keyword_criterion"] = labeled_keyword_ids
        user_answer_df.at[i, "correct_scoring_criterion"] = labeled_content_ids
        user_answer_df.at[i, "keyword_criterion"] = converted_problem["keyword_standards"]
        user_answer_df.at[i, "scoring_criterion"] = converted_problem["content_standards"]

    user_answer_df.to_csv(os.path.join(os.path.dirname(user_answer_path), "changed_user_answer.csv"))


if __name__ == "__main__":
    # One-off entry point for the dataset conversion.
    # NOTE(review): these absolute paths are machine-specific (one developer's
    # home directory) — replace with repo-relative paths or CLI arguments
    # before anyone else runs this.
    update_problem_info(
        problem_info_path="/Users/minjaewon/workspace/AI-server/app/static/problem_info.csv",
        user_answer_path="/Users/minjaewon/workspace/AI-server/app/static/user_answer.csv",
    )

0 comments on commit 773c597

Please sign in to comment.