|
| 1 | +#!/usr/bin/python |
| 2 | +""" |
| 3 | +Run every URL read from sys.stdin through Boilerpipe (diffbot support is currently commented out) and store the extracted text in a MongoDB collection. |
| 4 | +""" |
| 5 | + |
| 6 | +import common.mongodb |
| 7 | +import random |
| 8 | + |
| 9 | +import common.html2text |
| 10 | +import common.str |
| 11 | +from common.stats import stats |
| 12 | + |
| 13 | +import sys |
| 14 | +import string |
| 15 | + |
| 16 | +from optparse import OptionParser |
| 17 | + |
| 18 | +parser = OptionParser() |
| 19 | +parser.add_option("-c", "--collection", dest="collection", help="collection name") |
| 20 | +parser.add_option("-d", "--database", dest="database", help="database name") |
| 21 | +parser.add_option("-p", "--port", dest="port", help="port number for mongodb", type="int") |
| 22 | +parser.add_option("--hostname", dest="hostname", help="hostname for mongodb") |
| 23 | +(options, args) = parser.parse_args() |
| 24 | + |
def boilerpipe_all():
    """
    Extract the text of every URL on stdin with Boilerpipe and save it to MongoDB.

    Reads one URL per line from ``sys.stdin``; surrounding whitespace is
    stripped and blank lines are skipped. For each URL, a document with the
    URL as ``_id`` and the extracted text under ``BoilerpipePageText`` is
    saved to the collection selected by the ``--database`` and
    ``--collection`` command-line options.

    Progress messages, ``stats()`` output and per-URL errors are written to
    stderr. A failure while fetching or saving one URL is reported and the
    loop continues with the next URL; ``KeyboardInterrupt`` and
    ``SystemExit`` are not intercepted.

    NOTE(review): ``--hostname`` and ``--port`` are parsed but never passed
    to ``common.mongodb.collection``; confirm whether that helper should
    receive them.
    """
    collection = common.mongodb.collection(options.database, name=options.collection)

    for line in sys.stdin:
        url = line.strip()
        if not url:
            # A blank line would otherwise be stored as a document with _id "".
            continue

        # TODO: Find doc if it is already in collection
        doc = {"_id": url}

        # The URL is written as-is: re-encoding a byte string here could raise
        # a Unicode error for non-ASCII URLs before the try block is reached.
        sys.stderr.write("Getting text for url %s ...\n" % url)
        sys.stderr.write("%s\n" % (stats(),))
        try:
            doc["BoilerpipePageText"] = common.html2text.boilerpipe_url2text(url)
            # NOTE: diffbot extraction (common.html2text.diffbot_url2text) was
            # sketched here previously but is disabled.
            collection.save(doc)
        except Exception as e:
            # Best-effort: report the failure (with its cause) and keep going.
            sys.stderr.write("ERROR for url %s: %r\n" % (url, e))
        else:
            sys.stderr.write("...done getting text for url %s\n" % url)
        sys.stderr.write("%s\n" % (stats(),))
| 52 | + |
if __name__ == "__main__":
    # Script entry point: enable DEBUG-level logging, then process every URL
    # supplied on stdin.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    boilerpipe_all()
0 commit comments