Skip to content

code refactoring #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 25, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 46 additions & 45 deletions json_check.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,78 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# 修复github的meta爬虫因早期代码bug导致的数据问题
# author:water,esbatmop
# 用法:python json_check.py meta数据目录
import json
import os
import logging
import argparse
import jsonlines
import traceback

# tqdm is an optional dependency: use the real progress bar when it is
# installed, otherwise fall back to a no-op with a compatible call signature.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterator, *args, **kwargs):
        """Return *iterator* unchanged when tqdm is not installed.

        Extra positional and keyword arguments are accepted and ignored so
        that call sites written for the real ``tqdm`` keep working, just
        without a progress bar.
        """
        return iterator

# Send log records of level INFO and above to json_check.log; the relative
# file name is resolved against the current working directory.
logging.basicConfig(filename='json_check.log', level=logging.INFO)
# Find the target jsonl files, rename each to <name>.jsonl.raw as a backup,
# and run the cleaning function on it.
def recursive_parse_jsonl_files(input_path):
    """Clean every ``<id_start>-<id_end>.jsonl`` file found under *input_path*.

    The directory tree is walked recursively. For each file whose name ends
    in ``.jsonl`` the id range is read from the file name, the file is renamed
    to ``<name>.jsonl.raw`` (kept as a backup), and ``parse_jsonl_file`` is
    called to write the cleaned records back to the original path and the
    rejected lines to ``<name>.jsonl.err``.

    Files whose name does not have the form ``<int>-<int>.jsonl`` are logged
    and left untouched.

    Args:
        input_path: Root directory to search.
    """
    for root, _dirs, files in os.walk(input_path):
        for file_name in files:
            if not file_name.endswith('.jsonl'):
                continue
            input_file = os.path.join(root, file_name)

            # Derive the id range from the file name BEFORE renaming anything,
            # so a non-conforming name never leaves a half-processed file
            # behind and never aborts the rest of the walk.
            file_without_extension = os.path.splitext(file_name)[0]
            try:
                id_start, id_end = (
                    int(part) for part in file_without_extension.split('-')
                )
            except ValueError:
                logging.warning(
                    f"Skipping {input_file}: file name is not of the form "
                    f"<id_start>-<id_end>.jsonl"
                )
                continue

            raw_file = os.path.join(root, f"{file_name}.raw")
            checked_file = input_file
            err_file = os.path.join(root, f"{file_name}.err")

            logging.info(f"Parsing jsonl files: {input_file}")
            # Keep the original content as a .raw backup; the cleaned output
            # takes over the original file name.
            # NOTE: on POSIX os.rename silently replaces an existing .raw file,
            # so running this twice over the same tree overwrites the first
            # backup with the already-cleaned file.
            os.rename(input_file, raw_file)
            parse_jsonl_file(id_start, id_end, raw_file, checked_file, err_file)


# Clean a single jsonl file.
def parse_jsonl_file(id_start, id_end, input_file, checked_file, err_file):
    """Filter one jsonl file by id range, dropping duplicates and bad lines.

    Each non-blank line of *input_file* is parsed as JSON. A line that decodes
    to a string is decoded a second time, which handles records that were
    JSON-encoded twice. Records are then routed as follows:

    * first occurrence of an id with ``id_start <= id < id_end``: written to
      *checked_file* through ``jsonlines``;
    * repeated id: silently skipped;
    * id outside the range: written to *err_file* prefixed with
      ``out range:``;
    * anything that raises while being processed (invalid JSON, missing
      ``id`` key, unexpected type): logged with its traceback and written to
      *err_file* verbatim.

    Args:
        id_start: Inclusive lower bound of the accepted id range.
        id_end: Exclusive upper bound of the accepted id range.
        input_file: Path of the raw jsonl file to read.
        checked_file: Path the accepted records are written to.
        err_file: Path the rejected lines are written to.
    """
    checked_set = set()
    with open(input_file, 'r', encoding='utf-8') as f, \
            jsonlines.open(checked_file, 'w') as c, \
            open(err_file, 'w', encoding='utf-8') as ef:
        # Iterate the file object directly so the input is streamed instead
        # of being loaded into memory in one go.
        for line in tqdm(f):
            # Strip once up front: the trailing newline must not be echoed
            # into the error file, where a "\n" is appended explicitly.
            line = line.strip()
            if not line:
                # Blank lines carry no record; do not report them as errors.
                continue
            try:
                # A failure here means the line is not valid JSON; the
                # handler below sends it to the error file.
                json_data = json.loads(line)
                if isinstance(json_data, str):
                    json_data = json.loads(json_data)

                # A record without an id raises and ends up in the error file.
                record_id = json_data['id']
                # Skip ids that were already seen.
                if record_id in checked_set:
                    continue
                checked_set.add(record_id)

                if id_start <= record_id < id_end:
                    c.write(json_data)
                else:
                    ef.write("out range:" + line + "\n")
            except Exception as e:
                logging.error(f"Error occurred while parsing {input_file} at line: {line}. Exception: {e}")
                logging.error(traceback.format_exc())
                ef.write(line + "\n")


if __name__ == '__main__':
    # Usage: python json_check.py <meta data directory>
    parser = argparse.ArgumentParser(description='Parse jsonl files')
    parser.add_argument('input_path', type=str, help='input path to recurse')
    args = parser.parse_args()
    # Walk the tree exactly once; the cleaner takes only the root path.
    recursive_parse_jsonl_files(args.input_path)
    logging.info("Finished parsing all jsonl files in the input path")