-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgender-assumer.py
executable file
·80 lines (62 loc) · 2.26 KB
/
gender-assumer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
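# Usage: gender-assumer.py <input_file> <name_column>
# Reads a CSV or XLSX file and adds a 'gen' column holding a gender guess per row:
#   F, M, or X (undetermined: the given names disagree or a name is in both lists).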
import time, sys, os
import pandas as pd
from tqdm import tqdm
from utils.strings import replaceMultiple
# Reference CSV files with boys' / girls' first names. The paths are relative,
# so they are resolved against the current working directory.
# NOTE(review): the file names suggest these hold the exceptions to the
# "ends in 'a' => female" fallback rule - confirm against the data files.
boynamez = 'names/A_males.csv'
girlnamez = 'names/non-A_females.csv'
# Sub-folder, created next to the input file, that receives the result.
output_folder_name = 'gender-guessed'

# Fail with a readable usage line instead of a bare IndexError.
if len(sys.argv) < 3:
    sys.exit("usage: " + os.path.basename(sys.argv[0])
             + " <input.csv|input.xlsx> <name_column>")
input_file = sys.argv[1]
name_col = sys.argv[2]
start_time = time.time()

# os.path.join drops an empty directory part, so a bare file name ends up in
# ./gender-guessed rather than in /gender-guessed at the filesystem root.
folder = os.path.dirname(input_file)
output_dir = os.path.join(folder, output_folder_name)

# create output folder if it doesn't exist (exist_ok avoids the check/create race)
os.makedirs(output_dir, exist_ok=True)

# Determine the output file name: <input stem>+gen.xlsx inside the output folder
output_file = os.path.join(
    output_dir,
    os.path.splitext(os.path.basename(input_file))[0] + '+gen.xlsx',
)

# Read the input file based on its extension (compared case-insensitively,
# so ".CSV" / ".XLSX" are accepted as well)
lowered_input_name = input_file.lower()
if lowered_input_name.endswith('.csv'):
    input_data = pd.read_csv(input_file)
elif lowered_input_name.endswith('.xlsx'):
    input_data = pd.read_excel(input_file)
else:
    raise ValueError("Unsupported file format. Please use a CSV or XLSX file.")

# Read the boys and girls names files
boys = pd.read_csv(boynamez)
girls = pd.read_csv(girlnamez)
def _guess_gender(firstnames, boy_names, girl_names):
    """Return 'F', 'M' or 'X' for one cell of given names.

    The cell is split on whitespace after hyphens/en-dashes are turned into
    spaces. Each token is looked up (lower-cased) in ``boy_names`` and
    ``girl_names``; a token found in neither list is guessed from its last
    letter ('a' => female, anything else => male).

    Returns 'X' when the cell is not a string, holds no tokens, contains a
    token present in both lists, or its tokens point to different genders.
    """
    # Empty spreadsheet cells are typically read as NaN (a float), not str.
    if not isinstance(firstnames, str):
        return 'X'

    genders = set()
    for name in replaceMultiple(firstnames, ['-', '–'], ' ').split():
        name = name.lower()
        iz_boy = name in boy_names
        iz_girl = name in girl_names
        if not (iz_boy or iz_girl):
            # Unknown name: fall back to the suffix heuristic.
            iz_girl = name.endswith('a')
            iz_boy = not iz_girl
        if iz_boy and iz_girl:
            # Listed as both a boy's and a girl's name: undecidable.
            return 'X'
        genders.add('F' if iz_girl else 'M')

    # Exactly one distinct guess is a verdict; none or several is not.
    return genders.pop() if len(genders) == 1 else 'X'


# Lower-case the reference lists once instead of once per token per row;
# set membership then replaces a full column scan.
boy_names = set(boys['prenume'].str.lower().dropna())
girl_names = set(girls['prenume'].str.lower().dropna())

# Process each row in the input data
guesses = []
with tqdm(total=len(input_data)) as pbar:  # closes the bar when done
    for firstnames in input_data[name_col]:
        guesses.append(_guess_gender(firstnames, boy_names, girl_names))
        pbar.update(1)

# One positional column assignment instead of a .loc write per row.
input_data['gen'] = guesses
# Write the augmented table to the Excel output file, without the index column.
input_data.to_excel(output_file, index=False)

# Report the destination and the total run time (whole seconds, save included).
elapsed = round(time.time() - start_time)
for message in (f"output -> {output_file}", f"elapsed time: {elapsed} seconds"):
    tqdm.write(message)