diff --git a/INSTALL b/INSTALL
index 3d8cff1a3..8504b218f 100644
--- a/INSTALL
+++ b/INSTALL
@@ -133,6 +133,9 @@ Contents
    - (optional) Beautiful Soup, for HTML washing:
 
+   - (optional) Python Twitter (and its dependencies) if you want
+     to use the Twitter Fetcher bibtasklet:
+
    Note: MySQLdb version 1.2.1_p2 or higher is recommended.  If you are
          using an older version of MySQLdb, you may get
diff --git a/modules/bibsched/lib/tasklets/Makefile.am b/modules/bibsched/lib/tasklets/Makefile.am
index b2fb4ec17..3d2bb5c6b 100644
--- a/modules/bibsched/lib/tasklets/Makefile.am
+++ b/modules/bibsched/lib/tasklets/Makefile.am
@@ -17,7 +17,7 @@
 pylibdir=$(libdir)/python/invenio/bibsched_tasklets
 
-pylib_DATA = __init__.py bst_fibonacci.py bst_send_email.py
+pylib_DATA = __init__.py bst_fibonacci.py bst_send_email.py bst_twitter_fetcher.py
 
 EXTRA_DIST = $(pylib_DATA)
diff --git a/modules/bibsched/lib/tasklets/bst_twitter_fetcher.py b/modules/bibsched/lib/tasklets/bst_twitter_fetcher.py
new file mode 100644
index 000000000..5ba42162f
--- /dev/null
+++ b/modules/bibsched/lib/tasklets/bst_twitter_fetcher.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+##
+## This file is part of Invenio.
+## Copyright (C) 2011 CERN.
+##
+## Invenio is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+##
+## Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""
+Twitter fetcher
+
+In order to schedule fetching tweets you can type at the command line:
+
+$ sudo -u www-data /opt/invenio/bin/bibtasklet -T bst_twitter_fetcher -uadmin -s5m -a "query=YOURQUERY"
+
+"""
+
+## Here we import the Twitter APIs
+import twitter
+import re
+import os
+import sys
+import tempfile
+import time
+
+## Here are some good Invenio APIs
+
+from invenio.config import CFG_TMPDIR
+
+## BibRecord -> to create MARCXML records
+from invenio.bibrecord import record_add_field, record_xml_output
+
+## BibTask -> to manipulate Bibliographic Tasks
+from invenio.bibtask import task_low_level_submission, write_message, task_update_progress
+
+## BibDocFile -> to manipulate documents
+from invenio.bibdocfile import check_valid_url
+
+## WebSearch -> to search for previous tweets
+from invenio.search_engine import perform_request_search, get_fieldvalues
+
+_TWITTER_API = twitter.Api()
+
+def get_tweets(query):
+    """
+    This is how simple it is to fetch tweets :-)
+    """
+    ## We shall skip tweets that are already in the system.
+    previous_tweets = perform_request_search(p='980__a:"TWEET" 980__b:"%s"' % query, sf='970__a', so='a')
+    if previous_tweets:
+        ## A bit of an algorithm to retrieve the last Tweet ID that was stored
+        ## in our records
+        since_id = int(get_fieldvalues(previous_tweets[0], '970__a')[0])
+    else:
+        since_id = 0
+    final_results = []
+    results = list(_TWITTER_API.Search(query, rpp=100, since_id=since_id).results)
+    final_results.extend(results)
+    page = 1
+    while len(results) == 100: ## We stop if there are fewer than 100 results per page
+        page += 1
+        results = list(_TWITTER_API.Search(query, rpp=100, since_id=since_id, page=page).results)
+        final_results.extend(results)
+    return final_results
+
+_RE_GET_HTTP = re.compile(r"(https?://.+?)(\s|$)")
+_RE_TAGS = re.compile(r"([#@]\w+)")
+def tweet_to_record(tweet, query):
+    """
+    Transform a tweet into a record.
+    @note: you may want to highly customize this.
+    """
+    rec = {}
+    ## Let's normalize the body of the tweet.
+    text = tweet.text.encode('UTF-8')
+    text = text.replace('&gt;', '>')
+    text = text.replace('&lt;', '<')
+    text = text.replace('&quot;', "'")
+    text = text.replace('&amp;', '&')
+
+    ## Let's add the creation date
+    try:
+        creation_date = time.strptime(tweet.created_at, '%a, %d %b %Y %H:%M:%S +0000')
+    except ValueError:
+        creation_date = time.strptime(tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y')
+    record_add_field(rec, '260', subfields=[('c', time.strftime('%Y-%m-%dT%H:%M:%SZ', creation_date))])
+
+    ## Let's add the Tweet ID
+    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])
+
+    ## Let's add the body of the tweet as an abstract
+    record_add_field(rec, '520', subfields=[('a', text)])
+
+    ## Let's re-add the body of the tweet as a title.
+    record_add_field(rec, '245', subfields=[('a', text)])
+
+    ## Let's fetch information about the user
+    try:
+        user = _TWITTER_API.GetUser(tweet.from_user)
+
+        ## Let's add the user name as author of the tweet
+        record_add_field(rec, '100', subfields=[('a', str(user.name.encode('UTF-8')))])
+
+        ## Let's fetch the icon of the user profile, and let's upload it as
+        ## an image (and an icon of itself)
+        record_add_field(rec, 'FFT', subfields=[('a', user.profile_image_url.encode('UTF-8')), ('x', user.profile_image_url.encode('UTF-8'))])
+    except Exception, err:
+        write_message("WARNING: issue when fetching the user: %s" % err, stream=sys.stderr)
+
+    if hasattr(tweet, 'iso_language_code'):
+        ## Let's add the language of the Tweet if available (this also depends
+        ## on the kind of Twitter API call we used)
+        record_add_field(rec, '045', subfields=[('a', tweet.iso_language_code.encode('UTF-8'))])
+
+    ## Let's tag this record as a TWEET so that later we can build a collection
+    ## out of these records.
+    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])
+
+    ## Some smart manipulations: let's parse out URLs and tags from the body
+    ## of the Tweet.
+    for url in _RE_GET_HTTP.findall(text):
+        url = url[0]
+        record_add_field(rec, '856', '4', subfields=[('u', url)])
+
+    for tag in _RE_TAGS.findall(text):
+        ## And here we add the keywords.
+        record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')])
+
+    ## Finally we shall serialize everything to MARCXML
+    return record_xml_output(rec)
+
+def bst_twitter_fetcher(query):
+    """
+    Fetch the tweets matching the given query and upload them into Invenio.
+    @param query: the Twitter search query
+    """
+    ## We prepare a temporary MARCXML file to upload.
+    fd, name = tempfile.mkstemp(suffix='.xml', prefix='tweets', dir=CFG_TMPDIR)
+    tweets = get_tweets(query)
+    if tweets:
+        os.write(fd, """<collection>\n""")
+        for i, tweet in enumerate(tweets):
+            ## For every tweet we transform it to MARCXML and we dump it in the file.
+            task_update_progress('DONE: tweet %s out of %s' % (i, len(tweets)))
+            os.write(fd, tweet_to_record(tweet, query))
+
+        os.write(fd, """</collection>""")
+        os.close(fd)
+
+        ## Invenio magic: we schedule an upload of the created MARCXML to be inserted
+        ## ASAP in the system.
+        task_low_level_submission('bibupload', 'admin', '-i', '-r', name, '-P5')
+        write_message("Uploaded file %s with %s new tweets about %s" % (name, len(tweets), query))
+    else:
+        write_message("No new tweets about %s" % query)
+
+if __name__ == '__main__':
+    if len(sys.argv) == 2:
+        bst_twitter_fetcher(sys.argv[1])
+    else:
+        print "USAGE: %s TWITTER_QUERY" % sys.argv[0]
+        sys.exit(1)
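
A quick way to sanity-check the MARCXML conversion outside of bibsched is to feed tweet_to_record() a stubbed tweet object. The sketch below is only illustrative: the FakeTweet class and its attribute values are hypothetical, the import path assumes the tasklet is installed under invenio.bibsched_tasklets as the Makefile.am change suggests, and the GetUser() lookup inside tweet_to_record() may fail without Twitter access, in which case the tasklet just logs a warning and still returns a record.

## Minimal sketch with a hypothetical FakeTweet stub; assumes a configured
## Invenio installation where the tasklet is importable.
from invenio.bibsched_tasklets.bst_twitter_fetcher import tweet_to_record

class FakeTweet(object):
    """Hypothetical stand-in for a python-twitter search result."""
    def __init__(self):
        self.id = 123456789
        self.text = u"Testing #invenio with http://invenio-software.org"
        self.created_at = 'Mon, 03 Jan 2011 10:00:00 +0000'
        self.from_user = 'someuser'

## Print the MARCXML snippet that would be written to the temporary file.
print tweet_to_record(FakeTweet(), 'invenio')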
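
Once bibupload has processed the generated file, the ingested records can be inspected with the same WebSearch calls the tasklet relies on. Again only a sketch: 'invenio' is a placeholder query, and a configured Invenio installation is assumed.

## Small verification sketch, assuming some TWEET records have already been
## uploaded for the placeholder query 'invenio'.
from invenio.search_engine import perform_request_search, get_fieldvalues

recids = perform_request_search(p='980__a:"TWEET" 980__b:"invenio"')
print "%s tweet records found" % len(recids)
for recid in recids[:5]:
    ## 970__a holds the original Tweet ID, 245__a the tweet text used as title.
    print recid, get_fieldvalues(recid, '970__a'), get_fieldvalues(recid, '245__a')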