-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathconsensus_catalog.py
175 lines (146 loc) · 6.96 KB
/
consensus_catalog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Takes a directory with various semesters of "raw" catalog information, and
synthesizes them into a "consensus" catalog containing the most recent version
of each course. The related file is copied directly from the most recent semester that has one.
"""
import os
import sys
import csv
import re
import pandas as pd
import numpy as np
from utils.catalog_constants import *
from utils.parse_evaluations import *
# Columns written to the condensed output files: every condensed attribute
# except the subject ID, followed by the source-semester and historical flags.
KEYS_TO_WRITE = [
    attribute
    for attribute in CONDENSED_ATTRIBUTES
    if attribute != CourseAttribute.subjectID
]
KEYS_TO_WRITE += [CourseAttribute.sourceSemester, CourseAttribute.isHistorical]
def semester_sort_key(x):
    """Return an integer that orders semester names chronologically.

    The name is split on '-'; the third component is parsed as an integer
    and multiplied by ten, and 5 is added when the second component is
    exactly "fall". For example (assuming names shaped like
    "sem-fall-2019"), "sem-fall-2019" maps to 20195 and "sem-spring-2020"
    maps to 20200.
    """
    parts = x.split('-')
    base = int(parts[2]) * 10
    # Any second component other than "fall" sorts before "fall" for the
    # same third component.
    fall_offset = 5 if parts[1] == "fall" else 0
    return base + fall_offset
def make_corrections(corrections, consensus):
    """Apply correction records to the consensus dataframe in place.

    Args:
        corrections: iterable of dict-like records. Each record must have a
            "Subject Id" key; every other key names a column. Empty (falsy)
            values are ignored when updating existing subjects.
        consensus: DataFrame indexed by subject ID. It is modified in place.

    For each record:
      * If the subject ID contains '*', each '*' stands for exactly one
        character and the resulting pattern is matched against the *start*
        of every index label (``str.match`` semantics); all matching rows
        are updated.
      * Otherwise, if the subject ID is already in the index, that row is
        updated.
      * Otherwise a new row is appended, filled from the record for every
        existing column (missing columns get None).

    Columns named by a correction but absent from the dataframe are created
    (filled with "") before an existing row is updated.
    """
    for correction in corrections:
        subject_id = correction["Subject Id"]

        if '*' in subject_id:
            # Escape everything, then turn the escaped wildcard back into a
            # single-character regex wildcard.
            pattern = re.escape(subject_id).replace(r'\*', '.')
            targets = list(consensus.index[consensus.index.str.match(pattern)])
        elif subject_id in consensus.index:
            targets = [subject_id]
        else:
            print("Correction: adding subject {}".format(subject_id))
            # A plain list aligned with the columns is accepted by .loc
            # enlargement across pandas versions.
            consensus.loc[subject_id] = [
                correction.get(col, None) for col in consensus.columns
            ]
            continue

        # Only non-empty fields count as corrections.
        updates = [
            (col, correction[col])
            for col in correction
            if col != "Subject Id" and correction[col]
        ]
        for target in targets:
            for col, value in updates:
                if col not in consensus.columns:
                    consensus[col] = ""
                print("Correction for {}: {} ==> {}".format(target, col, value))
                # Single .loc call: chained indexing (row first, then column)
                # may write to a temporary copy and silently lose the update.
                consensus.loc[target, col] = value
def build_consensus(base_path, out_path, corrections=None,
                    evaluations_path=None):
    """Merge per-semester raw catalogs into one consensus catalog on disk.

    Args:
        base_path: directory whose entries containing 'sem-' each hold a
            'courses.txt' CSV (and optionally a 'related.txt').
        out_path: output directory; created if it does not exist.
        corrections: optional iterable of correction records passed to
            make_corrections after the consensus is built.
        evaluations_path: optional path handed to load_evaluation_data; the
            result is passed to parse_evaluations together with the
            consensus.

    Semesters are processed newest first. For every subject ID the row from
    the newest semester containing it is kept. Rows whose subject ID is
    listed as an old ID by a newer semester are dropped, and old IDs found
    in older semesters are copied onto the matching newer rows.

    Output: one '<dept>.txt' per department prefix (text before the first
    '.'), a full 'courses.txt', the condensed files, and a copy of the
    newest available 'related.txt'. Returns None; prints and returns early
    if no semester directories are found.
    """
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    if evaluations_path is not None:
        eval_data = load_evaluation_data(evaluations_path)
    else:
        eval_data = None

    semester_data = {}
    for semester in os.listdir(base_path):
        if 'sem-' not in semester:
            continue
        courses_path = os.path.join(base_path, semester, 'courses.txt')
        # Read everything as strings and normalize missing cells to ''.
        semester_data[semester] = pd.read_csv(
            courses_path, dtype=str).replace(np.nan, '', regex=True)

    # Sort in reverse chronological order
    semester_data = sorted(semester_data.items(),
                           key=lambda item: semester_sort_key(item[0]),
                           reverse=True)
    if not semester_data:
        print("No raw semester data found.")
        return

    subject_col = CourseAttribute.subjectID
    old_id_col = CourseAttribute.oldID

    # Build consensus by iterating from new to old
    consensus = None
    last_size = 0
    # Old subject IDs declared by semesters newer than the one being
    # processed; accumulated once instead of being recomputed per semester.
    renumbered_ids = set()
    for i, (semester, data) in enumerate(semester_data):
        data[CourseAttribute.sourceSemester] = semester[semester.find("-") + 1:]
        data[CourseAttribute.isHistorical] = "Y" if i != 0 else ""

        # Record this semester's own old IDs (from the unfiltered data) so
        # they apply to the older semesters that follow.
        if old_id_col in data.columns:
            declared_old_ids = set(
                data[old_id_col].replace("", np.nan).dropna())
        else:
            declared_old_ids = set()

        # Remove subjects that were renumbered in a newer semester
        data = data.loc[~data[subject_col].isin(renumbered_ids)]
        renumbered_ids |= declared_old_ids

        if consensus is None:
            consensus = data
        else:
            if old_id_col in data.columns:
                # Propagate old ID field to newer semesters
                if old_id_col not in consensus.columns:
                    # Newer semesters may lack the column entirely.
                    consensus[old_id_col] = ""
                renamed = (
                    data.replace("", np.nan)
                    .dropna(subset=[old_id_col])
                    .loc[:, [subject_col, old_id_col]]
                )
                for _, subject_id, old_id in renamed.itertuples():
                    # Positional boolean mask: the consensus index can hold
                    # duplicate labels after concatenation. A single .loc
                    # assignment guarantees the frame itself is updated.
                    matches = (consensus[subject_col] == subject_id).values
                    consensus.loc[matches, old_id_col] = old_id
            consensus = pd.concat([consensus, data], sort=False)

        # Newest rows come first, so keep='first' retains the latest version
        consensus = consensus.drop_duplicates(subset=[subject_col], keep='first')
        print("Added {} courses with {}.".format(len(consensus) - last_size, semester))
        last_size = len(consensus)

    consensus = consensus.set_index(subject_col)
    if corrections is not None:
        make_corrections(corrections, consensus)
    if eval_data is not None:
        parse_evaluations(eval_data, consensus)

    print("Writing courses...")
    seen_departments = set()
    for subject_id in consensus.index:
        dept, dot, _ = subject_id.partition(".")
        if not dot or dept in seen_departments:
            continue
        dept_courses = consensus[consensus.index.str.startswith(dept + ".")]
        write_df(dept_courses, os.path.join(out_path, dept + ".txt"))
        seen_departments.add(dept)

    write_df(consensus, os.path.join(out_path, "courses.txt"))
    write_condensed_files(consensus, out_path)

    # Copy the first available related file (semesters are newest first)
    for semester, _ in semester_data:
        related_path = os.path.join(base_path, semester, "related.txt")
        if os.path.exists(related_path):
            with open(related_path, 'r') as file, \
                    open(os.path.join(out_path, "related.txt"), 'w') as outfile:
                outfile.writelines(file)
            break
def write_condensed_files(consensus, out_path, split_count=4):
    """Write the condensed columns of the consensus in consecutive chunks.

    The rows are divided into `split_count` contiguous, non-overlapping
    slices of near-equal size that together cover every row; slice i is
    written to 'condensed_<i>.txt' inside `out_path`. Only the columns in
    KEYS_TO_WRITE are included.

    Args:
        consensus: the consensus DataFrame.
        out_path: directory receiving the condensed files.
        split_count: number of files to produce (default 4).
    """
    total = len(consensus)
    condensed = consensus[KEYS_TO_WRITE]
    for i in range(split_count):
        # Integer arithmetic on split_count (previously hard-coded to 4, so
        # other values produced wrong slices). The last upper bound is
        # exactly `total`, so no clamping is needed.
        lower_bound = i * total // split_count
        upper_bound = (i + 1) * total // split_count
        write_df(condensed.iloc[lower_bound:upper_bound],
                 os.path.join(out_path, "condensed_{}.txt".format(i)))
def write_df(df, path):
    """Write `df` to `path` as CSV with a hand-built header row.

    The header is the subject ID attribute name followed by the dataframe's
    column names, unquoted. The body is produced by ``to_csv`` with
    non-numeric quoting and no header, after which every pair of adjacent
    double quotes is removed from the text.
    """
    with open(path, 'w') as file:
        header_fields = [CourseAttribute.subjectID]
        header_fields.extend(df.columns)
        file.write(','.join(header_fields) + '\n')

        body = df.to_csv(header=False, quoting=csv.QUOTE_NONNUMERIC)
        # NOTE(review): this strips the '""' emitted for empty strings, but
        # it also removes escaped quotes inside field values — confirm that
        # downstream readers rely on this.
        file.write(body.replace('""', ''))
if __name__ == "__main__":
    # raw-dir and out-dir are both required, so at least three argv entries
    # (script name included) must be present.
    if len(sys.argv) < 3:
        print("Usage: python consensus_catalog.py raw-dir out-dir [evaluations-file]")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]
    # The evaluations file is optional; only read it when a fourth entry
    # was actually supplied.
    eval_path = sys.argv[3] if len(sys.argv) > 3 else None

    # Refuse to write into an existing directory.
    if os.path.exists(out_path):
        print("Fatal: the directory {} already exists. Please delete it or choose a different location.".format(out_path))
        sys.exit(1)

    build_consensus(in_path, out_path, evaluations_path=eval_path)