@@ -20,7 +20,7 @@ def main():
2020 parser .add_argument ('--comment_cache_size' , type = int , default = 1e7 ,
2121 help = 'max number of comments to cache in memory before flushing' )
2222 parser .add_argument ('--output_file_size' , type = int , default = 2e8 ,
23- help = 'max number of comments to cache in memory before flushing ' )
23+ help = 'max size of each output file (give or take one conversation) ' )
2424 parser .add_argument ('--print_every' , type = int , default = 1000 ,
2525 help = 'print an update to the screen this often' )
2626 parser .add_argument ('--min_conversation_length' , type = int , default = 5 ,
@@ -75,9 +75,25 @@ def parse_main(args):
7575 os .makedirs (args .logdir )
7676 subreddit_dict = {}
7777 comment_dict = {}
78- cache_count = 0
7978 raw_data = raw_data_generator (args .input_file )
8079 output_handler = OutputHandler (os .path .join (args .logdir , OUTPUT_FILE ), args .output_file_size )
80+ done = False
81+ total_read = 0
82+ while not done :
83+ done , i = read_comments_into_cache (raw_data , comment_dict , args .print_every , args .print_subreddit ,
84+ args .comment_cache_size , subreddit_dict , substring_blacklist , subreddit_whitelist , substring_blacklist )
85+ total_read += i
86+ process_comment_cache (comment_dict , args .print_every )
87+ write_comment_cache (comment_dict , output_handler , args .print_every ,
88+ args .print_subreddit , args .min_conversation_length )
89+ write_report (os .path .join (args .logdir , REPORT_FILE ), subreddit_dict )
90+ comment_dict .clear ()
91+ print ("\n Read all {:,d} lines from {}." .format (total_read , args .input_file ))
92+
93+ def read_comments_into_cache (raw_data , comment_dict , print_every , print_subreddit , comment_cache_size ,
94+ subreddit_dict , subreddit_blacklist , subreddit_whitelist , substring_blacklist ):
95+ done = False
96+ cache_count = 0
8197 for i , line in enumerate (raw_data ):
8298 # Ignore certain kinds of malformed JSON
8399 if len (line ) > 1 and (line [- 1 ] == '}' or line [- 2 ] == '}' ):
@@ -88,23 +104,16 @@ def parse_main(args):
88104 if sub in subreddit_dict :
89105 subreddit_dict [sub ] += 1
90106 else : subreddit_dict [sub ] = 1
91- comment_dict [comment ['id' ]] = RedditComment (comment , args . print_subreddit )
107+ comment_dict [comment ['id' ]] = RedditComment (comment , print_subreddit )
92108 cache_count += 1
93- if cache_count % args . print_every == 0 :
109+ if cache_count % print_every == 0 :
94110 print ("\r Cached {:,d} comments" .format (cache_count ), end = '' )
95111 sys .stdout .flush ()
96- if cache_count > args .comment_cache_size :
97- print ()
98- process_comment_cache (comment_dict , args .print_every )
99- write_comment_cache (comment_dict , output_handler , args .print_every ,
100- args .print_subreddit , args .min_conversation_length )
101- write_report (os .path .join (args .logdir , REPORT_FILE ), subreddit_dict )
102- comment_dict .clear ()
103- cache_count = 0
104- print ("\n Read all {:,d} lines from {}." .format (i , args .input_file ))
105- process_comment_cache (comment_dict , args .print_every )
106- write_comment_cache (comment_dict , output_handler , args .print_every , args .print_subreddit )
107- write_report (os .path .join (args .logdir , REPORT_FILE ), subreddit_dict )
112+ if cache_count > comment_cache_size : break
113+ else : # raw_data has been exhausted.
114+ done = True
115+ print ()
116+ return done , i
108117
109118def raw_data_generator (path ):
110119 if os .path .isdir (path ):
@@ -164,7 +173,7 @@ def post_qualifies(json_object, subreddit_blacklist,
164173 for substring in substring_blacklist :
165174 if body .find (substring ) >= 0 : return False
166175 # Preprocess the comment text.
167- body = re .sub ('[ \t \n ]+' , ' ' , body ) # Replace runs of whitespace with a single space.
176+ body = re .sub ('[ \t \n \r ]+' , ' ' , body ) # Replace runs of whitespace with a single space.
168177 body = re .sub ('\^' , '' , body ) # Strip out carets.
169178 body = re .sub ('\\ \\ ' , '' , body ) # Strip out backslashes.
170179 body = re .sub ('&lt;' , '<' , body ) # Replace '&lt;' with '<'
0 commit comments