Skip to content

Commit d19584c

Browse files
committed
Simplified reddit_parse control flow
1 parent 1296361 commit d19584c

File tree

1 file changed: 26 additions (+26), 17 deletions (−17)

reddit-parse/reddit_parse.py

Lines changed: 26 additions & 17 deletions
(columns: original file line number | diff line number | diff line change)
@@ -20,7 +20,7 @@ def main():
2020
parser.add_argument('--comment_cache_size', type=int, default=1e7,
2121
help='max number of comments to cache in memory before flushing')
2222
parser.add_argument('--output_file_size', type=int, default=2e8,
23-
help='max number of comments to cache in memory before flushing')
23+
help='max size of each output file (give or take one conversation)')
2424
parser.add_argument('--print_every', type=int, default=1000,
2525
help='print an update to the screen this often')
2626
parser.add_argument('--min_conversation_length', type=int, default=5,
@@ -75,9 +75,25 @@ def parse_main(args):
7575
os.makedirs(args.logdir)
7676
subreddit_dict = {}
7777
comment_dict = {}
78-
cache_count = 0
7978
raw_data = raw_data_generator(args.input_file)
8079
output_handler = OutputHandler(os.path.join(args.logdir, OUTPUT_FILE), args.output_file_size)
80+
done = False
81+
total_read = 0
82+
while not done:
83+
done, i = read_comments_into_cache(raw_data, comment_dict, args.print_every, args.print_subreddit,
84+
args.comment_cache_size, subreddit_dict, substring_blacklist, subreddit_whitelist, substring_blacklist)
85+
total_read += i
86+
process_comment_cache(comment_dict, args.print_every)
87+
write_comment_cache(comment_dict, output_handler, args.print_every,
88+
args.print_subreddit, args.min_conversation_length)
89+
write_report(os.path.join(args.logdir, REPORT_FILE), subreddit_dict)
90+
comment_dict.clear()
91+
print("\nRead all {:,d} lines from {}.".format(total_read, args.input_file))
92+
93+
def read_comments_into_cache(raw_data, comment_dict, print_every, print_subreddit, comment_cache_size,
94+
subreddit_dict, subreddit_blacklist, subreddit_whitelist, substring_blacklist):
95+
done = False
96+
cache_count = 0
8197
for i, line in enumerate(raw_data):
8298
# Ignore certain kinds of malformed JSON
8399
if len(line) > 1 and (line[-1] == '}' or line[-2] == '}'):
@@ -88,23 +104,16 @@ def parse_main(args):
88104
if sub in subreddit_dict:
89105
subreddit_dict[sub] += 1
90106
else: subreddit_dict[sub] = 1
91-
comment_dict[comment['id']] = RedditComment(comment, args.print_subreddit)
107+
comment_dict[comment['id']] = RedditComment(comment, print_subreddit)
92108
cache_count += 1
93-
if cache_count % args.print_every == 0:
109+
if cache_count % print_every == 0:
94110
print("\rCached {:,d} comments".format(cache_count), end='')
95111
sys.stdout.flush()
96-
if cache_count > args.comment_cache_size:
97-
print()
98-
process_comment_cache(comment_dict, args.print_every)
99-
write_comment_cache(comment_dict, output_handler, args.print_every,
100-
args.print_subreddit, args.min_conversation_length)
101-
write_report(os.path.join(args.logdir, REPORT_FILE), subreddit_dict)
102-
comment_dict.clear()
103-
cache_count = 0
104-
print("\nRead all {:,d} lines from {}.".format(i, args.input_file))
105-
process_comment_cache(comment_dict, args.print_every)
106-
write_comment_cache(comment_dict, output_handler, args.print_every, args.print_subreddit)
107-
write_report(os.path.join(args.logdir, REPORT_FILE), subreddit_dict)
112+
if cache_count > comment_cache_size: break
113+
else: # raw_data has been exhausted.
114+
done = True
115+
print()
116+
return done, i
108117

109118
def raw_data_generator(path):
110119
if os.path.isdir(path):
@@ -164,7 +173,7 @@ def post_qualifies(json_object, subreddit_blacklist,
164173
for substring in substring_blacklist:
165174
if body.find(substring) >= 0: return False
166175
# Preprocess the comment text.
167-
body = re.sub('[ \t\n]+', ' ', body) # Replace runs of whitespace with a single space.
176+
body = re.sub('[ \t\n\r]+', ' ', body) # Replace runs of whitespace with a single space.
168177
body = re.sub('\^', '', body) # Strip out carets.
169178
body = re.sub('\\\\', '', body) # Strip out backslashes.
170179
body = re.sub('&lt;', '<', body) # Replace '&lt;' with '<'

Commit comments (0)