From cde833b9534c72854885c12e2d70ccfea5a7b60a Mon Sep 17 00:00:00 2001
From: nmd-2000
Date: Mon, 14 Nov 2022 22:32:59 +0700
Subject: [PATCH] Postprocessing

---
 README.md            |  17 ++++++
 requirements.txt     |   5 +-
 src/postprocesing.py | 127 ++++++++++++++++++++++++++++---------------
 src/processing.py    |   4 +-
 src/utils/logger.py  |  59 ++++++++++++++++++++
 5 files changed, 167 insertions(+), 45 deletions(-)
 create mode 100644 src/utils/logger.py

diff --git a/README.md b/README.md
index 6f8d879..f05f9ca 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ The dataset we used to extract was collected by codeparrot. They host the raw da
 *You can create your own dataset using Google Bigquery and the [query here](https://huggingface.co/datasets/codeparrot/github-code/blob/main/query.sql)*
 
 ## Getting started
+### Process custom dataset
 For start preprocessing data, define a .yaml file to declare raw data format. (More detail: `/data/format/README.md`)
 
 ```bash
@@ -129,3 +130,19 @@ python -m codetext.processing
 
 *NOTES:* dir must contains raw data store in `.jsonl` extension if you pass argument `--load_from_file` or contains huggingface dataset's
 
+### Analyze and split dataset
+The processing step saves cleaned samples in batches; you can merge them with `codetext.postprocessing`. We also provide an analysis tool that reports the total number of samples and the blank(\*), comment(\*) and code(\*) line counts. You can also split your dataset into `train`, `valid` and `test` sets.
+
+```bash
+python -m codetext.postprocessing
+--data_path # path to dir that contains /extracted, /filtered, /raw
+--save_path # path to save final output
+
+--n_core 10 # number of cores for the multiprocessing analyzer
+--analyze # enable the cloc analysis
+--split # enable the train/valid/test split
+--ratio 0.05 # test and valid ratio (both sets use the same ratio)
+--max_sample 20000 # max size of the test set and valid set
+```
+
+*NOTES:* (\*) We run `cloc` under the hood to count blank, comment and code lines. See [github.com/AlDanial/cloc](https://github.com/AlDanial/cloc) for details.
\ No newline at end of file
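The merged and split files produced by `codetext.postprocessing` are JSON Lines, one sample per line with at least `code`, `repo` and `path` fields (these keys are asserted in `merge_embled_file` in `src/postprocesing.py` below). A minimal sketch for loading one of the final splits; the directory and file names are illustrative only:

```python
import json

# Hypothetical path following the layout written by the script:
# <save_path>/<name>_merge.jsonl and <save_path>/final/<name>_{train,valid,test}.jsonl
split_path = "save_dir/final/extract_function_train.jsonl"

with open(split_path) as f:
    samples = [json.loads(line) for line in f]

for sample in samples[:3]:
    print(sample["repo"], sample["path"], len(sample["code"]))
```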
diff --git a/requirements.txt b/requirements.txt
index dc0880c..614aefa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,7 @@ datasets
 tree-sitter
 docstring-parser
 bs4
-Levenshtein
\ No newline at end of file
+Levenshtein
+
+# for post-processing
+cloc
\ No newline at end of file
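`cloc` is the AlDanial command-line line counter; the postprocessing script shells out to it (see `merge_embled_file` below) rather than calling a Python API. A rough sketch of the equivalent call, assuming the `cloc` executable is available on `PATH` and using an illustrative archive name:

```python
import subprocess

zip_path = "save_dir/extract_function_code.zip"  # archive written by merge_embled_file
n_core = 10

# cloc can count code inside compressed archives and reports blank, comment
# and code line totals per language; --processes enables parallel counting.
subprocess.run(f"cloc {zip_path} --processes={n_core}", shell=True, check=True)
```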
diff --git a/src/postprocesing.py b/src/postprocesing.py
index 5cc302e..f3ec353 100644
--- a/src/postprocesing.py
+++ b/src/postprocesing.py
@@ -9,23 +9,23 @@
 import pandas as pd
 from tqdm import tqdm
 
+from src.utils import create_logger
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger('Post-processing')
 
 ROOT_PATH = str(Path(__file__).parents[1])
 
 
-def seperate_filname(list_filename, parent_path):
+def seperate_filename(list_filename, parent_path):
     fn_list, cls_list, line_list = [], [], []
     for filename in list_filename:
         if 'function' in filename:
             fn_list.append(os.path.join(parent_path, filename))
-        if 'class' in filename:
+        elif 'class' in filename:
             cls_list.append(os.path.join(parent_path, filename))
-        else:
+        elif 'line' in filename:
             line_list.append(os.path.join(parent_path, filename))
+
+    if not line_list:
+        return fn_list, cls_list
     return fn_list, cls_list, line_list
 
@@ -45,7 +45,8 @@ def summary_total(list_file):
     return list_file
 
 
-def merge_file(file_list, opt, s: str='RAW'):
+
+def merge_embled_file(file_list, opt, name: str='raw_function', split: bool=False):
     """
     Count number of repo, number of sample
     Merge all .jsonl in file_list
@@ -58,32 +59,32 @@
     n_sample = 0
     n_repos = set()
 
-    # For analyser
+    # For analyzer
     repos = []
     n_samples = []
     sets = []
-    zip_output = zipfile.ZipFile(f'{s}_code.zip', "w", zipfile.ZIP_DEFLATED)
+    zip_output = zipfile.ZipFile(os.path.join(opt.save_path, f'{name}_code.zip'), "w", zipfile.ZIP_DEFLATED)
 
-    with open(opt.save_path, 'a') as output_file:
-        for idx, file in enumerate(file_list):
+    with open(os.path.join(opt.save_path, f'{name}_merge.jsonl'), 'a') as output_file:
+        for file in file_list:
             with open(file, 'r') as json_file:
                 dataset = list(json_file)
 
-            for data in dataset:
+            for idx, data in enumerate(dataset):
                 try:
                     data = json.loads(data)
                 except Exception:
                     fail_sample += 1
                     continue
 
-                assert 'code' in data.keys
-                assert 'repo' in data.keys
-                assert 'path' in data.keys
+                assert 'code' in data.keys()
+                assert 'repo' in data.keys()
+                assert 'path' in data.keys()
 
                 code = data['code']
                 repo = data['repo']
                 path = data['path']
-                unique_path = idx + path[-50:]
+                unique_idx = str(idx) + code[-10:] + repo + path[-50:]
 
                 n_repos.add(repo)
                 if repo not in repos:
@@ -95,23 +96,28 @@
                     index = repos.index(repo)
                     n_samples[index] += 1
 
-                if opt.analyse:
-                    zip_output.writestr(unique_path, code)
+                if opt.analyze:
+                    zip_output.writestr(unique_idx, code)
 
                 json.dump(data, output_file)
                 output_file.write('\n')
                 n_sample += 1
+
+    assert os.path.exists(os.path.join(opt.save_path, f'{name}_merge.jsonl'))
+    assert os.path.exists(os.path.join(opt.save_path, f'{name}_code.zip'))
+    logger.info('Merged in %s' % (os.path.join(opt.save_path, f'{name}_merge.jsonl')))
 
-    if opt.split:
+    if opt.split and split:
         valid_ratio = test_ratio = opt.ratio
         valid_len = min(opt.max_sample, int(valid_ratio*n_sample))
         test_len = min(opt.max_sample, int(test_ratio*n_sample))
         train_len = n_sample - valid_len - test_len
+        logger.info(f"Split data into: Train size: {train_len} ({(100*train_len/n_sample):.2f}%) | Valid size: {valid_len} ({(100*valid_len/n_sample):.2f}%) | Test size: {test_len} ({(100*test_len/n_sample):.2f}%)")
 
         metadata_dict = {'repo': repos, 'n_sample': n_samples, 'set': sets}
         df = pd.DataFrame(metadata_dict, columns = ['repo', 'n_sample', 'set'])
 
-        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
+        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Splitting data'):
             if df.at[index, 'set'] is None:
                 if valid_len - row['n_sample'] > 0:
                     valid_len -= row['n_sample']
@@ -124,13 +130,18 @@
                 else:
                     df.at[index, 'set'] = 'train'
 
-        df.to_csv(os.path.join(opt.save_path, 'split_info'), index=False)
+        if not os.path.exists(os.path.join(opt.save_path, 'final')):
+            os.mkdir(os.path.join(opt.save_path, 'final'))
 
-        trainfile = open(os.path.join(opt.save_path, f'train.jsonl'), "a")
-        validfile = open(os.path.join(opt.save_path, f'valid.jsonl'), "a")
-        testfile = open(os.path.join(opt.save_path, f'test.jsonl'), "a")
+        df.to_csv(os.path.join(opt.save_path, 'final', 'split_info.csv'), index=False)
+
+        trainfile = open(os.path.join(opt.save_path, 'final', f'{name}_train.jsonl'), "a")
+        validfile = open(os.path.join(opt.save_path, 'final', f'{name}_valid.jsonl'), "a")
+        testfile = open(os.path.join(opt.save_path, 'final', f'{name}_test.jsonl'), "a")
 
-        for ids in tqdm(range(len(dataset))):
+        with open(os.path.join(opt.save_path, f'{name}_merge.jsonl'), 'r') as data_reader:
+            dataset = list(data_reader)
+        for ids in tqdm(range(len(dataset)), desc='Writing split dataset'):
             data = json.loads(dataset[ids])
 
             repo = data['repo']
@@ -150,15 +161,26 @@
                 json.dump(data, validfile, ensure_ascii=False)
                 validfile.write('\n')
 
-        logger.info(f"\n Split data into: Train size: {train_len} ({(train_len/n_sample):.2f})% | Valid size: {valid_len} ({(valid_len/n_sample):.2f})% | Test ratio: {test_len} ({(test_len/n_sample):.2f})%")
 
     # Analyze
     zip_output.close()
-    if opt.analyse:
-        command = f""
+    if opt.analyze:
+        command = f"cloc {os.path.join(opt.save_path, f'{name}_code.zip')} --processes={opt.n_core}"
         subprocess.Popen(command ,shell=True).wait()
 
-    logger.info('\n\n=============%s Total %i samples in %i repos =============%' % (name, n_sample, len(n_repos)))
+    logger.info(f'============= SUMMARY: {name} | Total {n_sample} samples in {len(n_repos)} repos =============\n')
+
+
+def merge_file(file_list, opt, name: str='raw', split: bool=False):
+    if len(file_list) >= 2:  # function & class
+        function_list, class_list = file_list[:2]
+
+        merge_embled_file(function_list, opt, f'{name}_function', split)
+        merge_embled_file(class_list, opt, f'{name}_class', split)
+
+    if len(file_list) == 3:  # inline
+        line_list = file_list[-1]
+        merge_embled_file(line_list, opt, f'{name}_line', split=True)
 
 def main(opt):
@@ -170,20 +192,24 @@ def main(opt):
     filter_list_file = os.listdir(os.path.join(opt.data_path, 'filtered'))
     extract_list_file = os.listdir(os.path.join(opt.data_path, 'extracted'))
 
-    raw_list = seperate_filname(raw_list_file, os.path.join(opt.data_path, 'raw'))
-    filter_list = seperate_filname(filter_list_file, os.path.join(opt.data_path, 'filtered'))
-    extract_list = seperate_filname(extract_list_file, os.path.join(opt.data_path, 'extracted'))
+    raw_list = seperate_filename(raw_list_file, os.path.join(opt.data_path, 'raw'))
+    filter_list = seperate_filename(filter_list_file, os.path.join(opt.data_path, 'filtered'))
+    extract_list = seperate_filename(extract_list_file, os.path.join(opt.data_path, 'extracted'))
+
     raw_list = summary_total(raw_list)
     filter_list = summary_total(filter_list)
     extract_list = summary_total(extract_list)
 
-    s = f"\nRAW | #function file {len(raw_list[0])} | #class file {len(raw_list[1])} | #inline file {len(raw_list[2])}" + \
-        f"\nFILTERED | #function file {len(filter_list[0])} | #class file {len(filter_list[1])}" + \
-        f"\nEXTRACTED | #function file {len(extract_list[0])} | #class file {len(extract_list[1])}"
+    s = f"RAW: #function file {len(raw_list[0])} | #class file {len(raw_list[1])} | #inline file {len(raw_list[2]) if len(raw_list) > 2 else 0}" + \
+        f"\nFILTERED: #function file {len(filter_list[0])} | #class file {len(filter_list[1])}" + \
+        f"\nEXTRACTED: #function file {len(extract_list[0])} | #class file {len(extract_list[1])}"
     logger.info(s)
-
-    # TODO Merge file
+
+    merge_file(raw_list, opt, 'raw')
+    merge_file(filter_list, opt, 'filter')
+    merge_file(extract_list, opt, 'extract', split=True)
+    logger.info('============= Done =============')
 
 
 if __name__ == '__main__':
@@ -198,6 +224,12 @@ def main(opt):
         default='path/to/final',
         help='Save path'
     )
+    parser.add_argument(
+        '--n_core',
+        type=int,
+        default=0,
+        help='Number of cores for the cloc analyzer (passed to --processes)'
+    )
 
     # Analyze
     parser.add_argument(
@@ -213,19 +245,28 @@
         help='Split data into train/set/valid or not'
     )
     parser.add_argument(
-        '--n_test',
+        '--ratio',
        type=float,
        default=0.05,
-        help='test ratio'
+        help='test and valid ratio'
    )
    parser.add_argument(
-        '--n_valid',
+        '--max_sample',
        type=float,
-        default=0.05,
-        help='valid ratio'
+        default=20000,
+        help='max size of the test and valid sets'
    )
 
    opt = parser.parse_args()
+    create_logger(filepath=os.path.join(opt.save_path, 'log.txt'), rank=0)
+    # logging.basicConfig(filename=,
+    #                     filemode='a',
+    #                     format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+    #                     datefmt = '%m/%d/%Y %H:%M:%S',
+    #                     level = logging.DEBUG)
+
+    logger = logging.getLogger()
+    logger.info("")
    logger.info(f'Execute Arguments: {opt}')
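The split in `merge_embled_file` is done at repository granularity: whole repositories are assigned to the valid and test sets until their quotas (`ratio` capped by `max_sample`) are exhausted, and every remaining repository goes to train, so samples from one repository never appear in two splits. A condensed sketch of that assignment logic with hypothetical numbers:

```python
import pandas as pd

# Hypothetical per-repository sample counts, mirroring the metadata the script collects.
df = pd.DataFrame({
    "repo": ["org/a", "org/b", "org/c", "org/d"],
    "n_sample": [40, 25, 10, 125],
    "set": [None, None, None, None],
})

valid_len = test_len = 30  # min(max_sample, int(ratio * n_sample)) in the real script

for index, row in df.iterrows():
    if df.at[index, "set"] is None:
        if valid_len - row["n_sample"] > 0:      # repo still fits in valid
            valid_len -= row["n_sample"]
            df.at[index, "set"] = "valid"
        elif test_len - row["n_sample"] > 0:     # otherwise try test
            test_len -= row["n_sample"]
            df.at[index, "set"] = "test"
        else:                                    # everything else goes to train
            df.at[index, "set"] = "train"

print(df)
```

Because assignment is greedy over whole repositories, the realized valid and test sizes can end up somewhat below the requested quota.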
diff --git a/src/processing.py b/src/processing.py
index 784b0f2..8466769 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -184,6 +184,7 @@ def _processing(dataset, indexs, ast, lang_parser, thread_idx, opt): # is_file=N
 
         # Extract function
         raw_fn = list(process_raw_node(tree, raw_code, lang_parser))
+        raw_fn = [{**item, **metadata_data} for item in raw_fn]
         filtered_fn_list = list(get_node_definitions(raw_fn, raw_code))
         extracted_function_list = list(extract_node(filtered_fn_list, language))
 
@@ -194,11 +195,13 @@ def _processing(dataset, indexs, ast, lang_parser, thread_idx, opt): # is_file=N
 
         # # Extract line
         # raw_line = list(get_line_definitions(tree, raw_code, lang_parser))
+        # raw_line = [{**item, **metadata_data} for item in raw_line]
         # raw_line_set.extend(raw_line)
 
         # # Extract class
         # if not (language == 'GO' or language == 'C'):
         #     raw_class = list(process_raw_node(tree, raw_code, lang_parser, is_class=True))
+        #     raw_class = [{**item, **metadata_data} for item in raw_class]
         #     filtered_class_list = list(get_node_definitions(raw_class, raw_code))
         #     extracted_class_list = list(extract_node(filtered_class_list, language))
 
@@ -239,7 +242,6 @@ def _processing(dataset, indexs, ast, lang_parser, thread_idx, opt): # is_file=N
             f'Total extractable function {res[5]} | Total extractable class {res[6]} \n'
         )
-
     return res
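In `_processing`, each node returned by `process_raw_node` is enriched with file-level metadata before filtering. A tiny illustration of the merge pattern used above, with made-up keys; `dict.update` mutates in place and returns `None`, which is why the merge builds new dictionaries instead:

```python
# Hypothetical node and metadata; the real keys depend on process_raw_node's output.
node = {"code": "def add(a, b):\n    return a + b", "docstring": "Add two numbers."}
metadata_data = {"repo": "org/example", "path": "src/math_utils.py", "language": "Python"}

raw_fn = [{**item, **metadata_data} for item in [node]]
print(raw_fn[0]["repo"])  # -> org/example
```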
diff --git a/src/utils/logger.py b/src/utils/logger.py
new file mode 100644
index 0000000..d647dad
--- /dev/null
+++ b/src/utils/logger.py
@@ -0,0 +1,59 @@
+import logging
+import time
+from datetime import timedelta
+
+
+class LogFormatter:
+    def __init__(self):
+        self.start_time = time.time()
+
+    def format(self, record):
+        elapsed_seconds = round(record.created - self.start_time)
+
+        prefix = "%s - %s - %s" % (
+            record.levelname,
+            time.strftime("%x %X"),
+            timedelta(seconds=elapsed_seconds),
+        )
+        message = record.getMessage()
+        message = message.replace("\n", "\n" + " " * (len(prefix) + 3))
+        return "%s - %s" % (prefix, message) if message else ""
+
+
+def create_logger(filepath, rank):
+    """
+    Create a logger.
+    Use a different log file for each process.
+    """
+    # create log formatter
+    log_formatter = LogFormatter()
+
+    # create file handler and set level to debug
+    if filepath is not None:
+        if rank > 0:
+            filepath = "%s-%i" % (filepath, rank)
+        file_handler = logging.FileHandler(filepath, "a")
+        file_handler.setLevel(logging.DEBUG)
+        file_handler.setFormatter(log_formatter)
+
+    # create console handler and set level to info
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+    console_handler.setFormatter(log_formatter)
+
+    # create logger and set level to debug
+    logger = logging.getLogger()
+    logger.handlers = []
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = False
+    if filepath is not None:
+        logger.addHandler(file_handler)
+    logger.addHandler(console_handler)
+
+    # reset logger elapsed time
+    def reset_time():
+        log_formatter.start_time = time.time()
+
+    logger.reset_time = reset_time
+
+    return logger
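For reference, a minimal usage sketch of the new logging helper, mirroring how `postprocesing.py` wires it up in its `__main__` block. The save directory is illustrative, and this imports from the new `src.utils.logger` module directly (the script itself uses `from src.utils import create_logger`, which assumes a re-export in `src/utils/__init__.py`):

```python
import logging
import os

from src.utils.logger import create_logger

save_path = "save_dir"  # hypothetical output directory; must exist before logging to it
os.makedirs(save_path, exist_ok=True)

create_logger(filepath=os.path.join(save_path, "log.txt"), rank=0)
logger = logging.getLogger()

logger.info("Execute Arguments: ...")    # INFO and above go to both console and log.txt
logger.debug("Written to log.txt only")  # DEBUG is captured by the file handler only
logger.reset_time()                      # restart the elapsed-time column in the prefix
```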