BibSched: new bst_twitter_fetcher bibtasklet

* New example of useful bibsched tasklet to automatically fetch tweets from Twitter to populate an Invenio instance.
adams164 · Jul 10, 2011 · ec9569e · ec9569e
1 parent 6dfab99
commit ec9569e
Show file tree

Hide file tree

Showing 3 changed files with 178 additions and 1 deletion.
diff --git a/INSTALL b/INSTALL
@@ -133,6 +133,9 @@ Contents
 	     <http://utidylib.berlios.de/>
           - (optional) Beautiful Soup, for HTML washing:
 	     <http://www.crummy.com/software/BeautifulSoup/>
+          - (optional) Python Twitter (and its dependencies) if you want
+             to use the Twitter Fetcher bibtasklet:
+             <http://code.google.com/p/python-twitter/>
 
         Note: MySQLdb version 1.2.1_p2 or higher is recommended.  If
               you are using an older version of MySQLdb, you may get

diff --git a/modules/bibsched/lib/tasklets/Makefile.am b/modules/bibsched/lib/tasklets/Makefile.am
@@ -17,7 +17,7 @@
 
 pylibdir=$(libdir)/python/invenio/bibsched_tasklets
 
-pylib_DATA = __init__.py bst_fibonacci.py bst_send_email.py
+pylib_DATA = __init__.py bst_fibonacci.py bst_send_email.py bst_twitter_fetcher.py
 
 EXTRA_DIST = $(pylib_DATA)
 

diff --git a/modules/bibsched/lib/tasklets/bst_twitter_fetcher.py b/modules/bibsched/lib/tasklets/bst_twitter_fetcher.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+##
+## This file is part of Invenio.
+## Copyright (C) 2011 CERN.
+##
+## Invenio is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+##
+## Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""
+Twitter fetcher
+
+In order to schedule fetching tweets you can type at the command line:
+
+$ sudo -u www-data /opt/invenio/bin/bibtasklet -T bst_twitter_fetcher -uadmin -s5m -a "query=YOURQUERY"
+
+"""
+
+## Here we import the Twitter APIs
+import twitter
+import re
+import os
+import sys
+import tempfile
+import time
+import sys
+
+## Here are some good Invenio APIs
+
+from invenio.config import CFG_TMPDIR
+
+## BibRecord -> to create MARCXML records
+from invenio.bibrecord import record_add_field, record_xml_output
+
+## BibTask -> to manipulate Bibliographic Tasks
+from invenio.bibtask import task_low_level_submission, write_message, task_update_progress
+
+## BibDocFile to manipulate documents
+from invenio.bibdocfile import check_valid_url
+
+## WebSearch to search for previous tweets
+from invenio.search_engine import perform_request_search, get_fieldvalues
+
+## Shared python-twitter client, created once when the module is imported
+## and reused by the functions below.  No credentials are passed to the
+## constructor.
+_TWITTER_API = twitter.Api()
+
+def get_tweets(query):
+    """
+    Return the list of tweets matching the query that are not yet stored
+    in the system.
+
+    @param query: the Twitter search query
+    @return: the search results of all the fetched pages, as one list
+    """
+    ## Records of previously stored tweets for this very query, sorted on
+    ## the field holding the Tweet ID.
+    ## NOTE(review): the first hit is taken as the last stored tweet; please
+    ## confirm this matches the ordering produced by so='a'.
+    known_recids = perform_request_search(p='980__a:"TWEET" 980__b:"%s"' % query, sf='970__a', so='a')
+    since_id = 0
+    if known_recids:
+        since_id = int(get_fieldvalues(known_recids[0], '970__a')[0])
+
+    ## First page of results: we ask for 100 results per page.
+    batch = list(_TWITTER_API.Search(query, rpp=100, since_id=since_id).results)
+    collected = []
+    collected += batch
+
+    ## A full page means there may be more: keep asking for the next page
+    ## until one comes back with a number of results other than 100.
+    page = 1
+    while len(batch) == 100:
+        page += 1
+        batch = list(_TWITTER_API.Search(query, rpp=100, since_id=since_id, page=page).results)
+        collected += batch
+    return collected
+
+## Matches an http(s) URL running up to the next whitespace or the end of
+## the text: group 1 is the URL, group 2 is what terminated it.
+_RE_GET_HTTP = re.compile(r"(https?://.+?)(\s|$)")
+## Matches a '#' or '@' immediately followed by word characters.
+_RE_TAGS = re.compile(r"([#@]\w+)")
+def tweet_to_record(tweet, query):
+    """
+    Transform a tweet into a MARCXML record.
+
+    @param tweet: the tweet to convert; its text, created_at, id and
+        from_user attributes are read, plus iso_language_code when present
+    @param query: the Twitter search query that returned the tweet; it is
+        stored in 980__b next to the 'TWEET' marker
+    @return: the record as serialized by record_xml_output()
+    @raise ValueError: if created_at matches neither of the two date
+        layouts that are understood
+    @note: you may want to highly customize this.
+    """
+    rec = {}
+    ## Let's normalize the body of the tweet by decoding the HTML entities
+    ## it may contain.  '&amp;' comes last so that e.g. '&amp;lt;' is not
+    ## decoded twice.
+    text = tweet.text.encode('UTF-8')
+    for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&quot;', '"'), ('&amp;', '&')):
+        text = text.replace(entity, char)
+
+    ## Let's add the creation date.  Two layouts of created_at are understood.
+    try:
+        creation_date = time.strptime(tweet.created_at, '%a, %d %b %Y %H:%M:%S +0000')
+    except ValueError:
+        creation_date = time.strptime(tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y')
+    ## The date is stored in subfield $c of field 260, passed via subfields
+    ## like every other field below (the third positional argument of
+    ## record_add_field is the first indicator, see the 856 and 653 calls).
+    ## ISO 8601 layout: 'T' separates date and time, 'Z' marks the +0000
+    ## offset that both accepted layouts require.
+    record_add_field(rec, '260', subfields=[('c', time.strftime('%Y-%m-%dT%H:%M:%SZ', creation_date))])
+
+    ## Let's add the Tweet ID
+    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])
+
+    ## Let's add the body of the tweet as an abstract
+    record_add_field(rec, '520', subfields=[('a', text)])
+
+    ## Let's re-add the body of the tweet as a title.
+    record_add_field(rec, '245', subfields=[('a', text)])
+
+    ## Let's fetch information about the user.  This is best effort: any
+    ## failure is only reported as a warning and the record is still built
+    ## with whatever was added before the failure.
+    try:
+        user = _TWITTER_API.GetUser(tweet.from_user)
+
+        ## Let's add the user name as author of the tweet
+        record_add_field(rec, '100', subfields=[('a', user.name.encode('UTF-8'))])
+
+        ## Let's fetch the icon of the user profile, and let's upload it as
+        ## an image (and an icon of itself)
+        ## NOTE(review): please confirm that the user object really exposes
+        ## profile.image_url; if not, this line always ends in the warning.
+        icon_url = user.profile.image_url.encode('UTF-8')
+        record_add_field(rec, 'FFT', subfields=[('a', icon_url), ('x', icon_url)])
+    except Exception, err:
+        write_message("WARNING: issue when fetching the user: %s" % err, stream=sys.stderr)
+
+    if hasattr(tweet, 'iso_language_code'):
+        ## Let's add the language of the Tweet if available (this depends
+        ## on the kind of Twitter API call we used)
+        record_add_field(rec, '045', subfields=[('a', tweet.iso_language_code.encode('UTF-8'))])
+
+    ## Let's tag this record as a TWEET so that later we can build a collection
+    ## out of these records.
+    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])
+
+    ## Some smart manipulations: let's parse out URLs and tags from the body
+    ## of the Tweet.  Each URL match is a (url, terminator) pair.
+    for url, dummy_terminator in _RE_GET_HTTP.findall(text):
+        record_add_field(rec, '856', '4', subfields=[('u', url)])
+
+    for tag in _RE_TAGS.findall(text):
+        ## And here we add the keywords.
+        record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')])
+
+    ## Finally we shall serialize everything to MARCXML
+    return record_xml_output(rec)
+
+def bst_twitter_fetcher(query):
+    """
+    Fetch the new tweets matching the query and upload them into Invenio.
+
+    The tweets are dumped as a MARCXML collection into a temporary file
+    under CFG_TMPDIR, and a bibupload task is then submitted for that file.
+    Nothing is written or submitted when there are no new tweets.
+
+    @param query: the Twitter search query
+    """
+    tweets = get_tweets(query)
+    if not tweets:
+        write_message("No new tweets about %s" % query)
+        return
+
+    ## We prepare a temporary MARCXML file to upload.  It is created only
+    ## now that we know there is something to put in it, so that no empty
+    ## file (and no open file descriptor) is left behind otherwise.
+    fd, name = tempfile.mkstemp(suffix='.xml', prefix='tweets', dir=CFG_TMPDIR)
+    try:
+        os.write(fd, """<collection>\n""")
+        total = len(tweets)
+        for i, tweet in enumerate(tweets):
+            ## For every tweet we transform it to MARCXML and we dump it in the file.
+            os.write(fd, tweet_to_record(tweet, query))
+            task_update_progress('DONE: tweet %s out %s' % (i + 1, total))
+        os.write(fd, """</collection>\n""")
+    finally:
+        ## The descriptor is released even if a tweet fails to be converted.
+        os.close(fd)
+
+    ## Invenio magic: we schedule an upload of the created MARCXML to be inserted
+    ## ASAP in the system.
+    task_low_level_submission('bibupload', 'admin', '-i', '-r', name, '-P5')
+    write_message("Uploaded file %s with %s new tweets about %s" % (name, len(tweets), query))
+
+if __name__ == '__main__':
+    ## Command line usage: exactly one argument, the Twitter query.
+    if len(sys.argv) != 2:
+        print "USAGE: %s TWITTER_QUERY" % sys.argv[0]
+        sys.exit(1)
+    bst_twitter_fetcher(sys.argv[1])