Skip to content

Commit

Permalink
BibRank: new index term count ranking method
Browse files Browse the repository at this point in the history
* New index term count rank_method available, including template.
  Beware, re-indexes and re-balances ranking weights upon every
  invocation.
* Can be used, for example, in institutions or similar
  authority files by appropriate configuration.
* Can be improved:
  * rank method added in bibrank_tag_based_indexer because
    it shares many methods, but this should be
    refactored to allow a more pluggable architecture for more
    methods
  * always ranks all institutions on every run; no selection
    based on modification dates or recid is available yet
  * plug into demo site and write regression test cases
  • Loading branch information
Travis Brooks authored and tiborsimko committed Jul 5, 2011
1 parent 2549463 commit 7fb89bb
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 6 deletions.
10 changes: 7 additions & 3 deletions modules/bibrank/etc/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
Expand All @@ -36,7 +36,9 @@ etc_DATA = bibrankgkb.cfg \
citation.cfg \
citerank_citation_t.cfg \
citerank_pagerank_c.cfg \
citerank_pagerank_t.cfg
citerank_pagerank_t.cfg \
demo_itc_collection.cfg \
template_index_term_count.cfg

EXTRA_DIST = bibrankgkb.cfg.in \
bibrankgkb_jif_conv.kb \
Expand All @@ -57,6 +59,8 @@ EXTRA_DIST = bibrankgkb.cfg.in \
citation.cfg \
citerank_citation_t.cfg \
citerank_pagerank_c.cfg \
citerank_pagerank_t.cfg
citerank_pagerank_t.cfg \
demo_itc_collection.cfg \
template_index_term_count.cfg

CLEANFILES = *~ *.tmp
28 changes: 28 additions & 0 deletions modules/bibrank/etc/demo_itc_collection.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

# Beware, the `index_term_count' ranking method re-indexes and
# re-balances ranking weights upon every invocation.

[rank_method]
function = index_term_count

[index_term_count]
index_table_name = idxPHRASE02F
index_term_value_from_tag = 980__a
relevance_number_output_prologue = (
relevance_number_output_epilogue = )
28 changes: 28 additions & 0 deletions modules/bibrank/etc/template_index_term_count.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

# Beware, the `index_term_count' ranking method re-indexes and
# re-balances ranking weights upon every invocation.

[rank_method]
function = index_term_count

[index_term_count]
index_table_name = idxPHRASE10F
index_term_value_from_tag = 909C0y
relevance_number_output_prologue = (
relevance_number_output_epilogue = )
3 changes: 2 additions & 1 deletion modules/bibrank/lib/bibrank.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@
citation, \
download_weight_filtering_user, \
download_weight_total, \
file_similarity_by_times_downloaded
file_similarity_by_times_downloaded, \
index_term_count
from invenio.bibrank_word_indexer import word_similarity
from invenio.bibrank_citerank_indexer import citerank
# pylint: enable=W0611
Expand Down
67 changes: 65 additions & 2 deletions modules/bibrank/lib/bibrank_tag_based_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

__revision__ = "$Id$"


import os
import sys
Expand All @@ -32,10 +32,12 @@
from invenio.search_engine import perform_request_search, HitSet
from invenio.bibrank_citation_indexer import get_citation_weight, print_missing, get_cit_dict, insert_into_cit_db
from invenio.bibrank_downloads_indexer import *
from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal
from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal, \
wash_table_column_name, get_table_update_time
from invenio.errorlib import register_exception
from invenio.bibtask import task_get_option, write_message, task_sleep_now_if_required
from invenio.bibindex_engine import create_range_list
from invenio.intbitset import intbitset

options = {}

Expand Down Expand Up @@ -481,3 +483,64 @@ def showtime(timeused):

def citation(run):
    """Entry point of the `citation' rank method.

    Hands the given run over to bibrank_engine() and returns whatever
    the engine returns.
    """
    outcome = bibrank_engine(run)
    return outcome


# Hack: index based sorting is put here because it is very similar to the
# tag based method and should re-use a lot of this code, so it is better
# to have it here than in a separate module.
#

def index_term_count_exec(rank_method_code, name, config):
    """Rebuild and store the ranking data of an index term count method.

    The weights are recomputed for all records on every call, because the
    weight of one record is determined by the index entries of _other_
    records, so no incremental update is possible.

    rank_method_code - code of the rank method the weights are stored for
    name - not used by this function
    config - rank method configuration, passed on to
             calculate_index_term_count()
    """
    write_message("Recreating index weighting data")
    # Timestamp is taken before the (potentially long) recalculation starts;
    # strftime() without an explicit time tuple formats the current local time.
    started_at = time.strftime("%Y-%m-%d %H:%M:%S")
    intoDB(calculate_index_term_count(config), started_at, rank_method_code)

def calculate_index_term_count(config):
    """Calculate the weight of a record set based on the number of entries
    of a tag from the record in another index (useful for authority files).

    Reads `index_table_name' and `index_term_value_from_tag' from the
    `index_term_count' section of the configuration, fetches every
    (record id, tag value) pair stored for that tag, and uses as weight of
    a record the number of hits stored in the index table for the term
    equal to the tag value.

    @param config: rank method configuration; an object offering
        has_section() and get() in the ConfigParser manner
    @return: dictionary mapping record id to its hit count; records whose
        tag value is empty, not present in the index, or whose hitlist
        cannot be decoded get weight 0
    @raise Exception: when the configuration has no `index_term_count'
        section
    """
    if not config.has_section("index_term_count"):
        # `config' is a parser object, not a file name, so it cannot be
        # concatenated into the message (doing so raised TypeError).
        raise Exception("Config file does not have index_term_count section")

    index = config.get("index_term_count", "index_table_name")
    tag = config.get("index_term_count", "index_term_value_from_tag")
    # check against possible SQL injection:
    # NOTE(review): presumably get_table_update_time() fails for a name that
    # is not an existing table -- confirm against dbquery.
    get_table_update_time(index)
    tag = wash_table_column_name(tag)

    task_sleep_now_if_required(can_stop_too=True)
    write_message("......Processing all records")
    query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \
            (tag[0:2], tag[0:2]) # we checked that tag is safe
    records = list(run_sql(query, (tag,)))
    write_message("Number of records found with the necessary tags: %s" % len(records))

    # The query text does not depend on the record, so build it only once.
    hitlist_query = "SELECT hitlist from %s where term = %%s" % index # we checked that index is a table
    # Many records may carry the same tag value; look each term up once.
    hits_per_term = {}
    rnkset = {}
    for recid, value in records:
        if not value:
            # empty (or NULL) tag value: nothing to look up in the index
            rnkset[recid] = 0
            continue
        if value not in hits_per_term:
            hits = 0
            row = run_sql(hitlist_query, (value,))
            if row and row[0] and row[0][0]:
                # has to be prepared for corrupted data!
                try:
                    hits = len(intbitset(row[0][0]))
                except Exception:
                    # best effort: an undecodable hitlist counts as no hits,
                    # but task interruption is no longer swallowed here
                    hits = 0
            hits_per_term[value] = hits
        rnkset[recid] = hits_per_term[value]
    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset


def index_term_count(run):
    """Entry point of the `index_term_count' rank method.

    Hands the given run over to bibrank_engine() and returns whatever
    the engine returns.
    """
    outcome = bibrank_engine(run)
    return outcome

0 comments on commit 7fb89bb

Please sign in to comment.