Skip to content

Commit 94e2e80

Browse files
committed
Added boilerpipe-stdin-urls-to-mongo.py
1 parent 5ffd120 commit 94e2e80

File tree

2 files changed

+61
-0
lines changed

2 files changed

+61
-0
lines changed

README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ README.txt - This file
66
all-xml-to-json.sh - For every XML file in the command-line,
77
convert it to JSON.
88

9+
boilerpipe-stdin-urls-to-mongo.py
10+
- Run every URL read from sys.stdin through Boilerpipe
11+
(or diffbot), and store the result in MongoDB.
12+
913
citeseer-get.pl - Fetch PDFs from citeseer.
1014

1115
cumulative.py - Output a cumulative sum for each line in

boilerpipe-stdin-urls-to-mongo.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/python
2+
"""
3+
Run every URL read from sys.stdin through Boilerpipe and store the extracted text in MongoDB (a diffbot variant exists but is currently disabled).
4+
"""
5+
6+
import common.mongodb
7+
import random
8+
9+
import common.html2text
10+
import common.str
11+
from common.stats import stats
12+
13+
import sys
14+
import string
15+
16+
from optparse import OptionParser
17+
18+
def _build_parser():
    """Return an OptionParser that accepts the MongoDB target options."""
    cli = OptionParser()
    cli.add_option(
        "-c", "--collection",
        dest="collection",
        help="collection name",
    )
    cli.add_option(
        "-d", "--database",
        dest="database",
        help="database name",
    )
    # NOTE(review): --port and --hostname are accepted, but nothing in the
    # visible part of this file reads options.port or options.hostname.
    cli.add_option(
        "-p", "--port",
        dest="port",
        help="port number for mongodb",
        type="int",
    )
    cli.add_option(
        "--hostname",
        dest="hostname",
        help="hostname for mongodb",
    )
    return cli


# The command line is parsed at module level; boilerpipe_all() reads the
# resulting module-global `options`.
parser = _build_parser()
options, args = parser.parse_args()
24+
25+
def _log_stderr(message):
    """Write *message* followed by a newline to sys.stderr."""
    sys.stderr.write("%s\n" % (message,))


def boilerpipe_all():
    """Run every URL on stdin through Boilerpipe and save the text to MongoDB.

    Reads one URL per line from ``sys.stdin``. For each non-blank line a
    document is built with the URL as ``_id`` and the return value of
    ``common.html2text.boilerpipe_url2text`` under ``BoilerpipePageText``,
    then saved to the collection selected by the module-level ``options``
    (``options.database`` / ``options.collection``). Progress messages and
    the value of ``stats()`` are written to ``sys.stderr``.

    An exception raised while fetching or saving one URL is reported on
    stderr together with its traceback, and processing continues with the
    next URL. ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
    """
    import traceback

    collection = common.mongodb.collection(options.database, name=options.collection)

    for line in sys.stdin:
        url = line.strip()
        if not url:
            # A blank input line would otherwise be saved as a document
            # whose _id is the empty string.
            continue

        # TODO: Find doc if it is already in collection
        doc = {"_id": url}

        # The URL is logged as read. Calling .encode("utf-8") on a byte
        # string (what Python 2 yields from stdin) forces an implicit ASCII
        # decode first and raises UnicodeDecodeError on non-ASCII URLs.
        _log_stderr("Getting text for url %s ..." % url)
        _log_stderr(stats())
        try:
            doc["BoilerpipePageText"] = common.html2text.boilerpipe_url2text(url)
            # NOTE: a diffbot variant (common.html2text.diffbot_url2text,
            # stored under doc["diffbot"]) was left disabled here; it needs a
            # DIFFBOT_TOKEN that the visible part of this file does not define.
            collection.save(doc)
        except Exception:
            # Best effort per URL: report the failure and its cause, then
            # move on to the next URL.
            _log_stderr("ERROR for url %s" % url)
            traceback.print_exc()
        else:
            _log_stderr("...done getting text for url %s" % url)
        _log_stderr(stats())
52+
53+
if __name__ == "__main__":
    # Script entry point: turn on DEBUG-level logging for the root logger,
    # then process every URL supplied on stdin.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    boilerpipe_all()

0 commit comments

Comments
 (0)