BibIndex: removes illegal characters for Solr

* Adds a new function remove_control_characters in textutils for the purpose of correctly stripping control characters before dispatching to Solr. * Adds a test for the new API function and re-organizes some similar tests into the same class.
jrbl · Jun 6, 2012 · 91be3ea · 91be3ea
1 parent befe685
commit 91be3ea
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 11 deletions.
diff --git a/modules/bibindex/lib/bibindex_engine.py b/modules/bibindex/lib/bibindex_engine.py
@@ -66,7 +66,7 @@
 from invenio.intbitset import intbitset
 from invenio.errorlib import register_exception
 from invenio.htmlutils import remove_html_markup, get_links_in_html_page
-from invenio.textutils import wash_for_utf8, strip_accents
+from invenio.textutils import wash_for_utf8, strip_accents, remove_control_characters
 from invenio.search_engine_utils import get_fieldvalues
 
 if CFG_SOLR_URL:
@@ -274,6 +274,8 @@ def solr_add_fulltext(recid, text):
     """
     if recid:
         try:
+            # Remove any illegal characters
+            text = remove_control_characters(text)
             utext = unicode(text, 'utf-8')
             SOLR_CONNECTION.add(id=recid, fulltext=utext)
             SOLR_CONNECTION.commit()

diff --git a/modules/miscutil/lib/textutils.py b/modules/miscutil/lib/textutils.py
@@ -673,3 +673,25 @@ def strip_accents(x):
     y = re_unicode_uppercase_n.sub("N", y)
     # return UTF-8 representation of the Unicode string:
     return y.encode("utf-8")
+
+def remove_control_characters(text):
+    """
+    Return a copy of the given string in which every character whose
+    code is below 32 has been replaced by a single space.
+
+    Tab (9), line feed (10) and carriage return (13) are kept untouched,
+    as is every character whose code is 32 or above.
+
+    @param text: string to clean up
+    @type text: string
+
+    @return: string with the control characters blanked out
+    @rtype: string
+    """
+    kept_control_codes = (9, 10, 13)
+
+    def _blank_if_control(char):
+        # Look the code up once and decide whether the character survives.
+        code = ord(char)
+        if code < 32 and code not in kept_control_codes:
+            return ' '
+        return char
+
+    return ''.join([_blank_if_control(char) for char in text])
diff --git a/modules/miscutil/lib/textutils_tests.py b/modules/miscutil/lib/textutils_tests.py
@@ -43,10 +43,12 @@
      decode_to_unicode, \
      translate_latex2unicode, \
      translate_to_ascii, \
-     strip_accents
+     strip_accents, \
+     remove_control_characters
 
 from invenio.testutils import make_test_suite, run_test_suite
 
+
 class GuessMinimumEncodingTest(unittest.TestCase):
     """Test functions related to guess_minimum_encoding function."""
     def test_guess_minimum_encoding(self):
@@ -55,6 +57,7 @@ def test_guess_minimum_encoding(self):
         self.assertEqual(guess_minimum_encoding('àèéìòù'), ('\xe0\xe8\xe9\xec\xf2\xf9', 'latin1'))
         self.assertEqual(guess_minimum_encoding('Ιθάκη'), ('Ιθάκη', 'utf8'))
 
+
 class WashForXMLTest(unittest.TestCase):
     """Test functions related to wash_for_xml function."""
 
@@ -194,6 +197,7 @@ def test_illegal_characters_washing_1_1(self):
                                       xml_version='1.1'), '\x08\tsome chars')
         self.assertEqual(wash_for_xml('$b\bar{b}$', xml_version='1.1'), '$b\x08ar{b}$')
 
+
 class WashForUTF8Test(unittest.TestCase):
     def test_normal_legal_string_washing(self):
         """textutils - testing UTF-8 washing on a perfectly normal string"""
@@ -249,6 +253,7 @@ def test_already_utf8_input(self):
         """textutils - washing a Unicode string into UTF-8 binary string"""
         self.assertEqual('Göppert', wash_for_utf8(u'G\xf6ppert', True))
 
+
 class WrapTextInABoxTest(unittest.TestCase):
     """Test functions related to wrap_text_in_a_box function."""
 
@@ -361,7 +366,6 @@ def test_single_new_line_wrap_text_in_a_box(self):
 """
         self.assertEqual(wrap_text_in_a_box("ciao\ncome và?"), result)
 
-
     def test_indented_box_wrap_text_in_a_box(self):
         """textutils - wrap_text_in_a_box indented box."""
         result = """
@@ -410,6 +414,7 @@ def test_real_longtext_wrap_text_in_a_box(self):
 """
         self.assertEqual(wrap_text_in_a_box(text), result)
 
+
 class DecodeToUnicodeTest(unittest.TestCase):
     """Test functions related to decode_to_unicode function."""
     if CHARDET_AVAILABLE:
@@ -421,6 +426,7 @@ def test_decode_to_unicode(self):
     else:
         pass
 
+
 class Latex2UnicodeTest(unittest.TestCase):
     """Test functions related to translating LaTeX symbols to Unicode."""
 
@@ -431,11 +437,12 @@ def test_latex_to_unicode(self):
         self.assertEqual(translate_latex2unicode("\\AAkeson"), u'\u212bkeson')
         self.assertEqual(translate_latex2unicode("$\\mathsl{\\Zeta}$"), u'\U0001d6e7')
 
-class TranslateToAsciiTest(unittest.TestCase):
-    """Test functions related to transliterating text to ascii."""
+
+class TestStripping(unittest.TestCase):
+    """Test for stripping functions like accents and control characters."""
     if UNIDECODE_AVAILABLE:
         def test_text_to_ascii(self):
-            """textutils - translate_to_ascii"""
+            """textutils - transliterate to ascii using unidecode"""
             self.assertEqual(translate_to_ascii(["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"]), \
                                                 ["a i U", "Hohne", "Age Ost Vaer", "normal"])
             self.assertEqual(translate_to_ascii("àèéìòù"), ["aeeiou"])
@@ -445,19 +452,23 @@ def test_text_to_ascii(self):
     else:
         pass
 
-class TestStripAccents(unittest.TestCase):
-    """Test for handling of UTF-8 accents."""
-
     def test_strip_accents(self):
-        """textutils - stripping of accented letters"""
+        """textutils - transliterate to ascii (basic)"""
         self.assertEqual("memememe",
                          strip_accents('mémêmëmè'))
         self.assertEqual("MEMEMEME",
                          strip_accents('MÉMÊMËMÈ'))
 
+    def test_remove_control_characters(self):
+        """textutils - stripping of control characters"""
+        # Tab, line feed and carriage return must pass through unchanged.
+        self.assertEqual("foo\nbar\tfab\n\r",
+                         remove_control_characters('foo\nbar\tfab\n\r'))
+        # Any other character with a code below 32 becomes a single space.
+        self.assertEqual("abc de",
+                         remove_control_characters('abc\02de'))
+        # Empty input yields an empty string.
+        self.assertEqual("", remove_control_characters(''))
+
 TEST_SUITE = make_test_suite(WrapTextInABoxTest, GuessMinimumEncodingTest,
                              WashForXMLTest, WashForUTF8Test, DecodeToUnicodeTest,
-                             Latex2UnicodeTest, TranslateToAsciiTest, TestStripAccents)
+                             Latex2UnicodeTest, TestStripping)
 
 if __name__ == "__main__":
     run_test_suite(TEST_SUITE)