-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
110 lines (85 loc) · 3.71 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from pathlib import Path
import argparse
import pandas as pd
def is_sentence(text):
    """
    Tells whether the given text looks like a full sentence rather than a word or a short phrase.

    A text counts as a sentence when all of the following hold:
    - it is longer than 30 characters
    - its first character is uppercase
    - its last character is '.'

    Empty strings and non-string values (e.g. NaN coming from an empty cell) are never sentences.

    :param text: value of a single "text" or "translation" cell
    :return: True if the text looks like a sentence, False otherwise
    """
    # Check type and length first so that indexing the first character can never fail.
    if not isinstance(text, str) or len(text) <= 30:
        return False
    return text[0].isupper() and text.endswith(".")
def preprocess(text):
    """
    Cleans up a single text value by removing leading and trailing whitespace.

    :param text: string to clean
    :return: the string without surrounding whitespace
    """
    cleaned = text.strip()
    return cleaned
def filter_languages(df: pd.DataFrame, source_lang: str, target_lang: str) -> pd.DataFrame:
    """
    Keeps only the rows translated between the two given languages, in either direction.

    Rows translated from target_lang to source_lang are kept as well; their "text" and "translation"
    columns are swapped so that "text" always holds the source_lang side. The "from_lang" and "to_lang"
    values of those rows are left as they were.

    :param df: dataframe with "from_lang", "to_lang", "text" and "translation" columns
    :param source_lang: e.g. "Finnish"
    :param target_lang: e.g. "English"
    :return: matching rows in the requested direction followed by the swapped rows of the opposite direction
    """
    forward_mask = (df.from_lang == source_lang) & (df.to_lang == target_lang)
    backward_mask = (df.from_lang == target_lang) & (df.to_lang == source_lang)

    forward_rows = df[forward_mask]
    # Swap the two text columns so reversed pairs line up with the forward ones.
    swapped_columns = {"text": "translation", "translation": "text"}
    backward_rows = df[backward_mask].rename(columns=swapped_columns)

    return pd.concat([forward_rows, backward_rows])
def filter_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows whose text or translation is a full sentence. Sentences are defined in is_sentence as follows:
    - First character is uppercase
    - Last character is '.'
    - Length of the text is greater than 30

    :param df: dataframe with "text" and "translation" columns
    :return: dataframe without the rows where either column holds a sentence
    """
    # Force a boolean mask: applying a function to an empty Series does not produce a boolean
    # Series, and df[...] would then interpret the key as column labels instead of a row mask.
    text_is_sentence = df.text.apply(is_sentence).astype(bool)
    df = df[~text_is_sentence]
    translation_is_sentence = df.translation.apply(is_sentence).astype(bool)
    return df[~translation_is_sentence]
def preprocess_text(df):
    """
    Applies preprocess to every value of the "text" and "translation" columns.

    The given dataframe is left untouched; a new dataframe with the cleaned columns is returned. This
    avoids writing into a filtered slice of another dataframe.

    :param df: dataframe with "text" and "translation" columns
    :return: new dataframe with both columns preprocessed
    """
    return df.assign(
        text=df.text.apply(preprocess),
        translation=df.translation.apply(preprocess),
    )
def filter_data(df, source_lang, target_lang):
    """
    Runs the whole cleaning pipeline and reports the number of remaining rows after every step.

    Steps, in order: keep only the given language pair, drop rows that contain sentences,
    preprocess the text columns.

    :param df: dataframe with exported translations
    :param source_lang: e.g. "Finnish"
    :param target_lang: e.g. "English"
    :return: the filtered and preprocessed dataframe
    """
    by_language = filter_languages(df, source_lang, target_lang)
    print("Data size after filtering languages", len(by_language.index))

    without_sentences = filter_sentences(by_language)
    print("Data size without sentences", len(without_sentences.index))

    cleaned = preprocess_text(without_sentences)
    print("Data size after preprocessing", len(cleaned.index))

    return cleaned
def remove_old_data(df: pd.DataFrame, old_df: pd.DataFrame, source_lang: str, target_lang: str) -> pd.DataFrame:
    """
    Drops the rows of df whose text already appears in the old data.

    The old data is first passed through filter_data with the same languages, so the comparison is made
    against old texts that went through the same filtering and preprocessing.

    :param df: already filtered new data
    :param old_df: raw, previously exported data
    :param source_lang: e.g. "Finnish"
    :param target_lang: e.g. "English"
    :return: rows of df whose text is not present in the filtered old data
    """
    known = filter_data(old_df, source_lang, target_lang)
    already_seen = df.text.isin(known.text)
    return df[~already_seen]
def main(args):
    """
    Reads exported translations, filters them and writes the "text" and "translation" columns to an .xlsx file.

    Exits with status 1 when the input file is missing, has an unsupported extension, or when --rm_old is
    given without --old_data.

    :param args: parsed command line arguments with file, output, source, target, rm_old and old_data attributes
    """
    # check that input file exists and of the right type
    if not Path(args.file).is_file():
        print(f"File '{args.file}' not found")
        raise SystemExit(1)
    if not args.file.endswith((".csv", ".xlsx")):
        print(f"File '{args.file}' is not a .csv or .xlsx file")
        raise SystemExit(1)
    # fail early instead of crashing in read_csv after all the filtering has been done
    if args.rm_old and not args.old_data:
        print("Option --rm_old requires --old_data")
        raise SystemExit(1)

    # modify output extension if it is not .xlsx
    output = args.output if args.output.endswith(".xlsx") else args.output + ".xlsx"
    source_lang, target_lang = args.source, args.target

    if args.file.endswith(".csv"):
        df = pd.read_csv(args.file)
    else:
        # sheet_name=None returns a dict of dataframes, one per sheet; merge them into a single dataframe
        sheets = pd.read_excel(args.file, sheet_name=None, engine="openpyxl")
        df = pd.concat(list(sheets.values()), ignore_index=True)

    df = filter_data(df, source_lang, target_lang)

    if args.rm_old:
        # the old data is only needed, and only read, when it has to be removed
        old_df = pd.read_csv(args.old_data)
        df = remove_old_data(df, old_df, source_lang, target_lang)
        print("Data size after filtering old data", df.shape[0])

    df[["text", "translation"]].to_excel(output, index=False, engine="openpyxl")
if __name__ == '__main__':
    # Command line entry point: parse the options and hand them over to main.
    parser = argparse.ArgumentParser()
    # --file and --output are used unconditionally by main, so let argparse report them when missing.
    parser.add_argument(
        "-f", "--file", help="Exported translations from Google Translate in .csv or .xlsx.", required=True
    )
    parser.add_argument("-o", "--output", help="File to output", required=True)
    parser.add_argument("-s", "--source", help="Source language", default="Finnish")
    parser.add_argument("-t", "--target", help="Target language", default="English")
    parser.add_argument(
        "--rm_old", help="If true, previously added data is removed from the new.", default=False, action="store_true"
    )
    parser.add_argument("--old_data", help="File with old exported translations, in .csv format.")
    args = parser.parse_args()
    main(args)