Skip to content

Commit

Permalink
handles end-of-month for Apache lists
Browse files: browse the repository at this point in the history
  • Loading branch information
ceteri committed Dec 28, 2014
1 parent 6bf4e95 commit 0645185
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
2 changes: 1 addition & 1 deletion textrank/defaults.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
iterations: 2000
nap_time: 2
base_url: http://mail-archives.apache.org
- start_url: /mod_mbox/spark-user/201411.mbox/%3c2014110717230889451320@dewmobile.net%3e
+ start_url: /mod_mbox/spark-user/201412.mbox/%3CCALEj8ePZPoq2a40hjuJS1QGc%2Bc07Fon5TqiQyTQM%3DGC678jbOw%40mail.gmail.com%3E
14 changes: 9 additions & 5 deletions textrank/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import time
import urllib

DEBUG = False # True

PAT_ID = re.compile("^.*\%3c(.*)\@.*$")

Expand Down Expand Up @@ -97,10 +98,13 @@ def pretty_print (obj, indent=False):

with open(sys.argv[1], 'w') as f:
for i in xrange(0, iterations):
- meta = parse_email(scrape_url(url), base_url)
+ if len(url) < 1:
+ break
+ else:
+ meta = parse_email(scrape_url(url), base_url)

- f.write(pretty_print(meta))
- f.write('\n')
+ f.write(pretty_print(meta))
+ f.write('\n')

- url = meta["next_url"]
- time.sleep(nap_time)
+ url = meta["next_url"]
+ time.sleep(nap_time)

0 comments on commit 0645185

Please sign in to comment.