Skip to content

Commit

Permalink
BibIndex: removes illegal characters for Solr
Browse files Browse the repository at this point in the history
* Adds a new function remove_control_characters in textutils for
  the purpose of correctly stripping control characters before
  dispatching to Solr.

* Adds a test for the new API function and re-organizes some
  similar tests into the same class.
  • Loading branch information
jalavik authored and tiborsimko committed Jun 6, 2012
1 parent befe685 commit 91be3ea
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 11 deletions.
4 changes: 3 additions & 1 deletion modules/bibindex/lib/bibindex_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
from invenio.intbitset import intbitset
from invenio.errorlib import register_exception
from invenio.htmlutils import remove_html_markup, get_links_in_html_page
from invenio.textutils import wash_for_utf8, strip_accents
from invenio.textutils import wash_for_utf8, strip_accents, remove_control_characters
from invenio.search_engine_utils import get_fieldvalues

if CFG_SOLR_URL:
Expand Down Expand Up @@ -274,6 +274,8 @@ def solr_add_fulltext(recid, text):
"""
if recid:
try:
# Remove any illegal characters
text = remove_control_characters(text)
utext = unicode(text, 'utf-8')
SOLR_CONNECTION.add(id=recid, fulltext=utext)
SOLR_CONNECTION.commit()
Expand Down
22 changes: 22 additions & 0 deletions modules/miscutil/lib/textutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,3 +673,25 @@ def strip_accents(x):
y = re_unicode_uppercase_n.sub("N", y)
# return UTF-8 representation of the Unicode string:
return y.encode("utf-8")

def remove_control_characters(text):
    r"""
    Replace control characters with a space in the given string
    and return the result.

    Every character whose ordinal is below 32 is replaced by a single
    space, except for tab (9, \t), line feed (10, \n) and carriage
    return (13, \r), which are kept untouched.  All other characters
    are copied as they are, so the result has the same length as the
    input.

    @param text: string to strip characters from
    @type text: string
    @return: copy of text with the control characters replaced by spaces
    @rtype: string
    """
    # The control characters that are kept as they are.
    allowed = '\t\n\r'
    # For a single character, comparing against ' ' is the same test as
    # ord(char) >= 32, without an ord() call per character.
    return ''.join([char if char >= ' ' or char in allowed else ' '
                    for char in text])
31 changes: 21 additions & 10 deletions modules/miscutil/lib/textutils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@
decode_to_unicode, \
translate_latex2unicode, \
translate_to_ascii, \
strip_accents
strip_accents, \
remove_control_characters

from invenio.testutils import make_test_suite, run_test_suite


class GuessMinimumEncodingTest(unittest.TestCase):
"""Test functions related to guess_minimum_encoding function."""
def test_guess_minimum_encoding(self):
Expand All @@ -55,6 +57,7 @@ def test_guess_minimum_encoding(self):
self.assertEqual(guess_minimum_encoding('àèéìòù'), ('\xe0\xe8\xe9\xec\xf2\xf9', 'latin1'))
self.assertEqual(guess_minimum_encoding('Ιθάκη'), ('Ιθάκη', 'utf8'))


class WashForXMLTest(unittest.TestCase):
"""Test functions related to wash_for_xml function."""

Expand Down Expand Up @@ -194,6 +197,7 @@ def test_illegal_characters_washing_1_1(self):
xml_version='1.1'), '\x08\tsome chars')
self.assertEqual(wash_for_xml('$b\bar{b}$', xml_version='1.1'), '$b\x08ar{b}$')


class WashForUTF8Test(unittest.TestCase):
def test_normal_legal_string_washing(self):
"""textutils - testing UTF-8 washing on a perfectly normal string"""
Expand Down Expand Up @@ -249,6 +253,7 @@ def test_already_utf8_input(self):
"""textutils - washing a Unicode string into UTF-8 binary string"""
self.assertEqual('Göppert', wash_for_utf8(u'G\xf6ppert', True))


class WrapTextInABoxTest(unittest.TestCase):
"""Test functions related to wrap_text_in_a_box function."""

Expand Down Expand Up @@ -361,7 +366,6 @@ def test_single_new_line_wrap_text_in_a_box(self):
"""
self.assertEqual(wrap_text_in_a_box("ciao\ncome và?"), result)


def test_indented_box_wrap_text_in_a_box(self):
"""textutils - wrap_text_in_a_box indented box."""
result = """
Expand Down Expand Up @@ -410,6 +414,7 @@ def test_real_longtext_wrap_text_in_a_box(self):
"""
self.assertEqual(wrap_text_in_a_box(text), result)


class DecodeToUnicodeTest(unittest.TestCase):
"""Test functions related to decode_to_unicode function."""
if CHARDET_AVAILABLE:
Expand All @@ -421,6 +426,7 @@ def test_decode_to_unicode(self):
else:
pass


class Latex2UnicodeTest(unittest.TestCase):
"""Test functions related to translating LaTeX symbols to Unicode."""

Expand All @@ -431,11 +437,12 @@ def test_latex_to_unicode(self):
self.assertEqual(translate_latex2unicode("\\AAkeson"), u'\u212bkeson')
self.assertEqual(translate_latex2unicode("$\\mathsl{\\Zeta}$"), u'\U0001d6e7')

class TranslateToAsciiTest(unittest.TestCase):
"""Test functions related to transliterating text to ascii."""

class TestStripping(unittest.TestCase):
"""Test for stripping functions like accents and control characters."""
if UNIDECODE_AVAILABLE:
def test_text_to_ascii(self):
"""textutils - translate_to_ascii"""
"""textutils - transliterate to ascii using unidecode"""
self.assertEqual(translate_to_ascii(["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"]), \
["a i U", "Hohne", "Age Ost Vaer", "normal"])
self.assertEqual(translate_to_ascii("àèéìòù"), ["aeeiou"])
Expand All @@ -445,19 +452,23 @@ def test_text_to_ascii(self):
else:
pass

class TestStripAccents(unittest.TestCase):
"""Test for handling of UTF-8 accents."""

def test_strip_accents(self):
"""textutils - stripping of accented letters"""
"""textutils - transliterate to ascii (basic)"""
self.assertEqual("memememe",
strip_accents('mémêmëmè'))
self.assertEqual("MEMEMEME",
strip_accents('MÉMÊMËMÈ'))

def test_remove_control_characters(self):
    """textutils - stripping of control characters"""
    # Tab, line feed and carriage return must come back unchanged.
    self.assertEqual("foo\nbar\tfab\n\r",
                     remove_control_characters('foo\nbar\tfab\n\r'))
    # Any other character below 32 (here \02) becomes a single space.
    self.assertEqual("abc de",
                     remove_control_characters('abc\02de'))

TEST_SUITE = make_test_suite(WrapTextInABoxTest, GuessMinimumEncodingTest,
WashForXMLTest, WashForUTF8Test, DecodeToUnicodeTest,
Latex2UnicodeTest, TranslateToAsciiTest, TestStripAccents)
Latex2UnicodeTest, TestStripping)

if __name__ == "__main__":
run_test_suite(TEST_SUITE)
Expand Down

0 comments on commit 91be3ea

Please sign in to comment.