forked from jrbl/invenio
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BibSched: new bst_twitter_fetcher bibtasklet
* New example of useful bibsched tasklet to automatically fetch tweets from Twitter to populate an Invenio instance.
- Loading branch information
1 parent
6dfab99
commit ec9569e
Showing
3 changed files
with
178 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
## | ||
## This file is part of Invenio. | ||
## Copyright (C) 2011 CERN. | ||
## | ||
## Invenio is free software; you can redistribute it and/or | ||
## modify it under the terms of the GNU General Public License as | ||
## published by the Free Software Foundation; either version 2 of the | ||
## License, or (at your option) any later version. | ||
## | ||
## Invenio is distributed in the hope that it will be useful, but | ||
## WITHOUT ANY WARRANTY; without even the implied warranty of | ||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
## General Public License for more details. | ||
## | ||
## You should have received a copy of the GNU General Public License | ||
## along with Invenio; if not, write to the Free Software Foundation, Inc., | ||
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | ||
|
||
""" | ||
Twitter fetcher | ||
In order to schedule fetching tweets you can type at the command line: | ||
$ sudo -u www-data /opt/invenio/bin/bibtasklet -T bst_twitter_fetcher -uadmin -s5m -a "query=YOURQUERY" | ||
""" | ||
|
||
## Here we import the Twitter APIs | ||
import twitter | ||
import re | ||
import os | ||
import sys | ||
import tempfile | ||
import time | ||
import sys | ||
|
||
## Here are some good Invenio APIs | ||
|
||
from invenio.config import CFG_TMPDIR | ||
|
||
## BibRecord -> to create MARCXML records | ||
from invenio.bibrecord import record_add_field, record_xml_output | ||
|
||
## BibTask -> to manipulate Bibliographic Tasks | ||
from invenio.bibtask import task_low_level_submission, write_message, task_update_progress | ||
|
||
## BibDocFile to manipulate documents | ||
from invenio.bibdocfile import check_valid_url | ||
|
||
## WebSearch to search for previous tweets | ||
from invenio.search_engine import perform_request_search, get_fieldvalues | ||
|
||
## Shared Twitter API client used by every function in this module.
_TWITTER_API = twitter.Api()


def get_tweets(query):
    """
    Return the tweets matching the given query that come after the tweet
    ID found on the first already-stored record for that query.

    The stored records are looked up among those tagged 980__a:"TWEET"
    and 980__b:<query>, sorted on 970__a (the tweet ID). Twitter is then
    searched 100 results at a time, page after page, until a page comes
    back with a number of results other than 100.

    @param query: the Twitter search query
    @return: list of the result objects returned by the Twitter search
    """
    stored_recids = perform_request_search(p='980__a:"TWEET" 980__b:"%s"' % query, sf='970__a', so='a')

    ## When nothing is stored yet we ask Twitter for everything (since_id=0).
    ## NOTE(review): this takes the tweet ID of the *first* hit as the
    ## newest one -- confirm against the ordering perform_request_search
    ## actually returns for sf/so.
    since_id = 0
    if stored_recids:
        since_id = int(get_fieldvalues(stored_recids[0], '970__a')[0])

    ## The first request carries no explicit page number; the following
    ## ones do.
    search_args = {'rpp': 100, 'since_id': since_id}
    collected = []
    page = 1
    while True:
        batch = list(_TWITTER_API.Search(query, **search_args).results)
        collected.extend(batch)
        if len(batch) != 100:
            ## A page that is not full means there is nothing more to fetch.
            break
        page += 1
        search_args['page'] = page
    return collected
|
||
_RE_GET_HTTP = re.compile("(https?://.+?)(\s|$)") | ||
_RE_TAGS = re.compile("([#@]\w+)") | ||
def tweet_to_record(tweet, query): | ||
""" | ||
Transform a tweet into a record. | ||
@note: you may want to highly customize this. | ||
""" | ||
rec = {} | ||
## Let's normalize the body of the tweet. | ||
text = tweet.text.encode('UTF-8') | ||
text = text.replace('>', '>') | ||
text = text.replace('<', '<') | ||
text = text.replace('"', "'") | ||
text = text.replace('&', '&') | ||
|
||
## Let's add the creation date | ||
try: | ||
creation_date = time.strptime(tweet.created_at, '%a, %d %b %Y %H:%M:%S +0000') | ||
except ValueError: | ||
creation_date = time.strptime(tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y') | ||
record_add_field(rec, '260__c', time.strftime('%Y-%m-%dZ%H:%M:%ST', creation_date)) | ||
|
||
## Let's add the Tweet ID | ||
record_add_field(rec, '970', subfields=[('a', str(tweet.id))]) | ||
|
||
## Let's add the body of the tweet as an abstract | ||
record_add_field(rec, '520', subfields=[('a', text)]) | ||
|
||
## Let's re-add the body of the tweet as a title. | ||
record_add_field(rec, '245', subfields=[('a', text)]) | ||
|
||
## Let's fetch information about the user | ||
try: | ||
user = _TWITTER_API.GetUser(tweet.from_user) | ||
|
||
## Let's add the user name as author of the tweet | ||
record_add_field(rec, '100', subfields=[('a', str(user.name.encode('UTF-8')))]) | ||
|
||
## Let's fetch the icon of the user profile, and let's upload it as | ||
## an image (and an icon of itself) | ||
record_add_field(rec, 'FFT', subfields=[('a', user.profile.image_url.encode('UTF-8')), ('x', user.profile.image_url.encode('UTF-8'))]) | ||
except Exception, err: | ||
write_message("WARNING: issue when fetching the user: %s" % err, stream=sys.stderr) | ||
if hasattr(tweet, 'iso_language_code'): | ||
## Let's add the language of the Tweet if available (also this depends) | ||
## on the kind of Twitter API call we used | ||
record_add_field(rec, '045', subfields=[('a', tweet.iso_language_code.encode('UTF-8'))]) | ||
|
||
## Let's tag this record as a TWEET so that later we can build a collection | ||
## out of these records. | ||
record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)]) | ||
|
||
## Some smart manipulations: let's parse out URLs and tags from the body | ||
## of the Tweet. | ||
for url in _RE_GET_HTTP.findall(text): | ||
url = url[0] | ||
record_add_field(rec, '856', '4', subfields=[('u', url)]) | ||
|
||
for tag in _RE_TAGS.findall(text): | ||
## And here we add the keywords. | ||
record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')]) | ||
|
||
## Finally we shall serialize everything to MARCXML | ||
return record_xml_output(rec) | ||
|
||
def bst_twitter_fetcher(query):
    """
    Fetch the new tweets matching the given query and schedule their
    upload into Invenio.

    The tweets are converted to MARCXML, dumped into a temporary file
    under CFG_TMPDIR and handed over to a bibupload task. When there are
    no new tweets, nothing is written and no task is scheduled.

    @param query: the Twitter search query
    """
    tweets = get_tweets(query)
    if not tweets:
        write_message("No new tweets about %s" % query)
        return

    ## We prepare a temporary MARCXML file to upload. It is created only
    ## once we know there is something to write, so that a run without new
    ## tweets leaves neither an empty file nor an open descriptor behind.
    fd, name = tempfile.mkstemp(suffix='.xml', prefix='tweets', dir=CFG_TMPDIR)
    try:
        os.write(fd, "<collection>\n")
        for i, tweet in enumerate(tweets):
            ## For every tweet we transform it to MARCXML and we dump it in the file.
            task_update_progress('DONE: tweet %s out %s' % (i, len(tweets)))
            os.write(fd, tweet_to_record(tweet, query))
        os.write(fd, "</collection>\n")
    finally:
        ## Release the descriptor even when a conversion or a write fails.
        os.close(fd)

    ## Invenio magic: we schedule an upload of the created MARCXML to be inserted
    ## ASAP in the system.
    task_low_level_submission('bibupload', 'admin', '-i', '-r', name, '-P5')
    write_message("Uploaded file %s with %s new tweets about %s" % (name, len(tweets), query))
|
||
if __name__ == '__main__': | ||
if len(sys.argv) == 2: | ||
bst_twitter_fetcher(sys.argv[1]) | ||
else: | ||
print "USAGE: %s TWITTER_QUERY" % sys.argv[0] | ||
sys.exit(1) |