2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
 data
 model
 output
+test
+
 src/__pycache__
 src/wandb
 models
Binary file modified requirements.txt
Binary file not shown.
4 changes: 1 addition & 3 deletions run.py
@@ -5,7 +5,7 @@
 from datetime import datetime, timedelta
 
 # Get current time (UTC + 9 hours)
-current_time = datetime.utcnow() + timedelta(hours=9)
+current_time = datetime.now() + timedelta(hours=9)
 current_time_str = current_time.strftime('%Y%m%d_%H%M%S')
 
 # Root directory (adjust this if necessary)
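
Side note on this change: datetime.now() returns the host's local time, so adding 9 hours only yields KST when the machine's clock is set to UTC, whereas the old datetime.utcnow() + timedelta(hours=9) did not depend on the host clock. A minimal timezone-aware sketch (stdlib only, assuming KST is the intended zone):

from datetime import datetime, timedelta, timezone

# fixed UTC+9 offset for KST; independent of the host's local timezone
KST = timezone(timedelta(hours=9))
current_time = datetime.now(KST)
current_time_str = current_time.strftime('%Y%m%d_%H%M%S')
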
@@ -23,8 +23,6 @@
     raise FileNotFoundError(f"The source directory {src_dir} does not exist. Please adjust the path accordingly.")
 os.chdir(src_dir)
 
-
-
 run_name = input('Please Enter Your Run Name : ')
 
 while run_name == '':
25 changes: 24 additions & 1 deletion src/data_analysis.py
@@ -23,4 +23,27 @@ def func2():
     new_data_path = os.path.join(parent_dir, 'data', 'test_short.csv')
     ten_data.to_csv(new_data_path, index=False, encoding='utf-8')
 
-func2()
+def func3():
+    random_sample_200 = 200
+
+    # randomly sample 200 data rows
+    random_sample = train_data.sample(random_sample_200)
+    random_sample.to_csv(os.path.join(parent_dir, 'data', 'train_sample_200.csv'), index=False)
+
+def func4():
+    # this function reads the train csv text and shows how many Korean characters each text contains
+    import re
+    korean_char = re.compile('[가-힣]')
+    train_data['korean_char_count'] = train_data['text'].apply(lambda x: len(korean_char.findall(x)))
+
+    # plot a histogram of the counts with pandas
+    train_data['korean_char_count'].plot(kind='hist', bins=50, title='Korean Character Count in Text')
+
+    import matplotlib.pyplot as plt
+    plt.show()
+
+    print(train_data['korean_char_count'].describe())
+
+#func2()
+#func3()
+func4()
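
For reference, the [가-힣] class used in func4() matches only precomposed Hangul syllables (U+AC00–U+D7A3); standalone jamo such as 'ㅋ' fall outside it. A quick illustration:

import re

korean_char = re.compile('[가-힣]')
# '안녕하세요' is five syllables; the jamo in 'ㅋㅋ' are outside the class
print(len(korean_char.findall('hello 안녕하세요 ㅋㅋ')))  # -> 5
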
69 changes: 69 additions & 0 deletions src/eda.py
@@ -0,0 +1,69 @@
+import os
+import transformers
+import datasets
+import torch
+import pandas as pd
+import re
+from nltk.corpus import words
+
+import nltk
+nltk.download('words')
+
+english_words = set(words.words())
+def remove_gibberish(text):
+    english_words_in_text = re.findall(r'[A-Za-z]+', text)
+    gibberish_words = [word for word in english_words_in_text if word.lower() not in english_words]
+    for word in gibberish_words:
+        text = text.replace(word, '')
+    return text
+
+from transformers import PreTrainedTokenizer, PreTrainedModel, AutoTokenizer, AutoModelForSeq2SeqLM
+from tqdm import tqdm
+import random
+import numpy as np
+
+# SEED = 2024
+# set_seed(SEED)
+parent_dir = os.path.dirname(os.getcwd())
+data_path = os.path.join(parent_dir, 'data', 'train.csv')
+
+train_data = pd.read_csv(data_path)
+
+# remove everything except word characters, whitespace, and '.' (r"[^\w\s.]"), plus circled numbers
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r"[^\w\s.]", "", x))
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'[①-⑩]', "", x))
+
+# remove a single Latin letter sandwiched between Hangul characters
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'(?<=[가-힣])[a-zA-Z](?=[가-힣])', "", x))
+
+# remove a single letter (plus trailing space) right after a Hangul character
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'([가-힣])[a-zA-Z] ', r'\1', x))
+
+# remove a single letter right before a Hangul character
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r' [a-zA-Z]([가-힣])', r' \1', x))
+
+# remove number-english-number sequences
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'\d+[a-zA-Z]+\d+', '', x))
+
+# remove all gibberish English words using the nltk words corpus
+train_data['text'] = train_data['text'].apply(lambda x: remove_gibberish(x))
+
+# remove all standalone single letters (except 'a'/'A')
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'(?<![a-zA-Z])[b-zB-Z](?![a-zA-Z])', '', x))
+
+# remove words where lowercase letters are followed by an uppercase letter
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'(?<![a-zA-Z])[a-z]+[A-Z][a-zA-Z]*(?![a-zA-Z])', '', x))
+
+# remove English words ending in the same letter repeated two or more times
+train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'\b\w([a-zA-Z])\1+\b', '', x))
+
+# drop all rows that have 7 or fewer Korean characters
+korean_char = re.compile('[가-힣]')
+train_data['korean_char_count'] = train_data['text'].apply(lambda x: len(korean_char.findall(x)))
+train_data = train_data[train_data['korean_char_count'] > 7]
+
+# save csv as utf-8
+time_now = pd.Timestamp.now().strftime("%d_%H%M%S")
+new_data_path = os.path.join(parent_dir, 'data', f'train_eda_{time_now}.csv')
+
+train_data.to_csv(new_data_path, index=False)
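
One caveat on remove_gibberish(): str.replace() deletes each flagged word as a raw substring, so it can also strip that letter sequence out of the middle of a longer, valid word. A whole-word variant (a sketch under the same nltk-corpus assumption; remove_gibberish_bounded is a hypothetical name):

import re
from nltk.corpus import words

english_words = set(words.words())

def remove_gibberish_bounded(text):
    # substitute each maximal letter run; keep it only if the corpus knows it
    def keep_or_drop(match):
        word = match.group(0)
        return word if word.lower() in english_words else ''
    return re.sub(r'[A-Za-z]+', keep_or_drop, text)
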
16 changes: 9 additions & 7 deletions src/main.py
@@ -35,7 +35,7 @@ def set_seed(seed: int = 456):
     torch.backends.cudnn.benchmark = False
     torch.use_deterministic_algorithms(True)
 
-SEED = 456
+SEED = 2024
 set_seed(SEED)
 DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 
@@ -66,8 +66,10 @@ def main(run_name):
     if train_args.do_train:
         model_name = model_args.model_name_or_path #'klue/bert-base'
         tokenizer = AutoTokenizer.from_pretrained(model_name)
+
     if not train_args.do_train:
-        model_name = os.path.join(model_args.model_name_or_path, 'checkpoint-124')
+        latest_ckpt = sorted(os.listdir(model_args.model_name_or_path))[-1]
+        model_name = os.path.join(model_args.model_name_or_path, latest_ckpt)
         tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
 
     model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
@@ -81,15 +83,15 @@ def main(run_name):
     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
     if train_args.do_train:
-        model = train(model, data_train, data_valid, data_collator, run_name, train_args, data_args)
+        model = train(model, data_train, data_valid, data_collator, train_args)
     if train_args.do_predict:
         predict(model, tokenizer, train_args, data_args)
 
-def train(model, data_train, data_valid, data_collator, run_name : str, train_args: TrainingArguments, data_args: DataTrainingArguments):
+def train(model, data_train, data_valid, data_collator, train_args: TrainingArguments):
     # output_path = os.path.join(MODEL_DIR, run_name)
 
     training_args = TrainingArguments(
-        output_dir=train_args.output_dir, #output_path,
+        output_dir=train_args.output_dir,
         overwrite_output_dir=True,
         do_train=train_args.do_train,
         do_eval=train_args.do_eval,
@@ -139,8 +141,8 @@ def compute_metrics(eval_pred):
 
     return model
 
-def predict(model, tokenizer, train_args: TrainingArguments, data_args: DataTrainingArguments = None, run_name : str = None):
-    dataset_test = pd.read_csv(data_args.test_dataset_name) #pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
+def predict(model, tokenizer, train_args: TrainingArguments, data_args: DataTrainingArguments = None):
+    dataset_test = pd.read_csv(data_args.test_dataset_name)
     model.eval()
     preds = []
 
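
A note on the new checkpoint lookup: sorted(os.listdir(...))[-1] sorts lexicographically, so 'checkpoint-99' would outrank 'checkpoint-124'. A numeric-suffix sketch (latest_checkpoint is a hypothetical helper; transformers also ships get_last_checkpoint in transformers.trainer_utils for exactly this purpose):

import os
import re

def latest_checkpoint(model_dir):
    # keep only checkpoint-N directories, then compare by the integer N
    ckpts = [d for d in os.listdir(model_dir) if re.fullmatch(r'checkpoint-\d+', d)]
    return os.path.join(model_dir, max(ckpts, key=lambda d: int(d.rsplit('-', 1)[1])))
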