From 9b93e943f536f32b412c8f0df10cb9b59c545ce8 Mon Sep 17 00:00:00 2001 From: Samuele Kaplun Date: Fri, 18 Mar 2011 15:38:23 +0100 Subject: [PATCH] WebSearch: improvements for collection admin * Fix collection loop check in webcoll (in get_ancestors and get_descendants) that was broken for some cases. * New fix_collection_scores() function in websearchadminlib.py to correct and uniformize all collection scores in collection_collection table. * Improve speed of perform_checkcollectionstatus by correctly exploiting restricted_collections cache and by checking only if collections have sons without calculating the whole descendants set. --- .../doc/admin/websearch-admin-guide.webdoc | 6 +-- modules/websearch/lib/websearch_webcoll.py | 19 ++++++--- modules/websearch/lib/websearchadminlib.py | 41 ++++++++++++------- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/modules/websearch/doc/admin/websearch-admin-guide.webdoc b/modules/websearch/doc/admin/websearch-admin-guide.webdoc index 8d9c3e67b..ecf4fcbd1 100644 --- a/modules/websearch/doc/admin/websearch-admin-guide.webdoc +++ b/modules/websearch/doc/admin/websearch-admin-guide.webdoc @@ -707,9 +707,9 @@ the Apache groups mentioned in this column.
If no errors was found, OK is displayed for each collection. If an error was found, then an error number and short message are shown. The meaning of the error messages is the -following: 1:Query means that the collection was defined via -a query but also via subcollections too; 2:Query means that -the collection wasn't defined neither via query nor via +following: 1:Conflict means that the collection was defined +via a query but also via subcollections; 2:Empty means +that the collection was defined neither via query nor via subcollections. diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index fe87a78e3..16e0caf19 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -45,6 +45,7 @@ from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui from invenio.bibformat import format_record +from invenio.intbitset import intbitset from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ @@ -208,6 +209,7 @@ def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolo def get_ancestors(self): "Returns list of ancestors of the current collection." 
ancestors = [] + ancestors_ids = intbitset() id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ @@ -216,11 +218,12 @@ def get_ancestors(self): if res: col_ancestor = get_collection(res[0][1]) # looking for loops - if col_ancestor in ancestors: + if self.id in ancestors_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) - raise OverflowError + raise OverflowError("Loop found in collection %s" % self.name) else: ancestors.append(col_ancestor) + ancestors_ids.add(col_ancestor.id) id_son = res[0][0] else: break @@ -250,6 +253,7 @@ def get_sons(self, type='r'): def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] + descendant_ids = intbitset() id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type) @@ -257,12 +261,16 @@ def get_descendants(self, type='r'): for row in res: col_desc = get_collection(row[1]) # looking for loops - if col_desc in descendants: + if self.id in descendant_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) - raise OverflowError + raise OverflowError("Loop found in collection %s" % self.name) else: descendants.append(col_desc) - descendants += col_desc.get_descendants() + descendant_ids.add(col_desc.id) + tmp_descendants = col_desc.get_descendants() + for descendant in tmp_descendants: + descendant_ids.add(descendant.id) + descendants += tmp_descendants return descendants def write_cache_file(self, filename='', filebody=''): @@ -447,7 +455,6 @@ def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_S # apply special filters: if self.name in ['Videos']: # select only videos with movies: - from invenio.intbitset import intbitset recIDs = list(intbitset(recIDs) & \ 
search_pattern(p='collection:"PUBLVIDEOMOVIE"')) # sort some CERN collections specially: diff --git a/modules/websearch/lib/websearchadminlib.py b/modules/websearch/lib/websearchadminlib.py index 05d9f33f7..896dae124 100644 --- a/modules/websearch/lib/websearchadminlib.py +++ b/modules/websearch/lib/websearchadminlib.py @@ -69,6 +69,7 @@ from invenio.access_control_admin import acc_get_action_id from invenio.access_control_config import VIEWRESTRCOLL from invenio.errorlib import register_exception +from invenio.intbitset import intbitset def getnavtrail(previous = ''): """Get the navtrail""" @@ -77,6 +78,14 @@ def getnavtrail(previous = ''): navtrail = navtrail + previous return navtrail +def fix_collection_scores(): + """ + Re-calculate and re-normalize de scores of the collection relationship. + """ + for id_dad in intbitset(run_sql("SELECT id_dad FROM collection_collection")): + for index, id_son in enumerate(run_sql("SELECT id_son FROM collection_collection WHERE id_dad=%s ORDER BY score DESC", (id_dad, ))): + run_sql("UPDATE collection_collection SET score=%s WHERE id_dad=%s AND id_son=%s", (index * 10 + 10, id_dad, id_son[0])) + def perform_modifytranslations(colID, ln, sel_type='', trans=[], confirm=-1, callback='yes'): """Modify the translations of a collection sel_type - the nametype to modify @@ -2462,7 +2471,7 @@ def perform_modifyrestricted(colID, ln, rest='', callback='yes', confirm=-1): def perform_checkcollectionstatus(colID, ln, confirm=0, callback='yes'): """Check the configuration of the collections.""" - from invenio.search_engine import collection_restricted_p + from invenio.search_engine import collection_restricted_p, restricted_collection_cache subtitle = """Collection Status   [?]""" % CFG_SITE_URL output = "" @@ -2477,9 +2486,11 @@ def perform_checkcollectionstatus(colID, ln, confirm=0, callback='yes'): rnk_list = get_def_name('', "rnkMETHOD") actions = [] + restricted_collection_cache.recreate_cache_if_needed() + for (id, name, dbquery, 
nbrecs) in collections: - reg_sons = len(get_col_tree(id, 'r')) - vir_sons = len(get_col_tree(id, 'v')) + reg_sons = col_has_son(id, 'r') + vir_sons = col_has_son(id, 'v') status = "" hosted = "" @@ -2494,14 +2505,12 @@ def perform_checkcollectionstatus(colID, ln, confirm=0, callback='yes'): i8n += "%s, " % lang else: i8n = """None""" - if (reg_sons > 1 and dbquery) or dbquery=="": - status = """1:Query""" - elif dbquery is None and reg_sons == 1: - status = """2:Query""" - elif dbquery == "" and reg_sons == 1: - status = """3:Query""" - - if (reg_sons > 1 or vir_sons > 1): + if reg_sons and dbquery: + status = """1:Conflict""" + elif not dbquery and not reg_sons: + status = """2:Empty""" + + if (reg_sons or vir_sons): subs = """Yes""" else: subs = """No""" @@ -2509,13 +2518,13 @@ def perform_checkcollectionstatus(colID, ln, confirm=0, callback='yes'): if dbquery is None: dbquery = """No""" - restricted = collection_restricted_p(name) + restricted = collection_restricted_p(name, recreate_cache_if_needed=False) if restricted: restricted = """Yes""" if status: - status += """,4:Restricted""" + status += """,3:Restricted""" else: - status += """4:Restricted""" + status += """3:Restricted""" else: restricted = """No""" @@ -2668,6 +2677,10 @@ def perform_checkexternalcollections(colID, ln, icl=None, update="", confirm=0, else: return addadminbox(subtitle, body) +def col_has_son(colID, rtype='r'): + """Return True if the collection has at least one son.""" + return run_sql("SELECT id_son FROM collection_collection WHERE id_dad=%s and type=%s LIMIT 1", (colID, rtype)) != () + def get_col_tree(colID, rtype=''): """Returns a presentation of the tree as a list. TODO: Add loop detection colID - startpoint for the tree