Use character-by-character string comparison (snguyenthanh#17)
jcbrockschmidt authored Oct 11, 2020
1 parent e352465 commit 890c391
Showing 5 changed files with 116 additions and 23 deletions.
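In short, this commit changes how censor words are matched. Previously, load_censor_words expanded every censor word into all of its character-substitution variants with itertools.product and stored the resulting strings in CENSOR_WORDSET (the diff notes the default word list took ~5MB+ of memory this way). After this commit, each censor word is stored once as a VaryingString and matched character by character at comparison time. A rough sketch of why the old expansion was costly, using a subset of the CHARS_MAPPING table from the diff (the sample word is only an illustration):

from itertools import product

# Subset of CHARS_MAPPING from the diff.
chars_mapping = {"a": ("a", "@", "*", "4"), "s": ("s", "$", "5")}

def old_generate_patterns(word):
    # Mirrors the removed _generate_patterns_from_word helper.
    combos = [chars_mapping.get(char, (char,)) for char in word]
    return ["".join(pattern) for pattern in product(*combos)]

# One 8-letter word already expands to 4 * 3 * 3 * 4 * 3 * 3 = 1296 variants.
print(len(old_generate_patterns("assassin")))  # 1296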
52 changes: 31 additions & 21 deletions better_profanity/better_profanity.py
@@ -1,20 +1,35 @@
# -*- coding: utf-8 -*-

from itertools import product
from collections.abc import Iterable

from .constants import ALLOWED_CHARACTERS

from .utils import (
read_wordlist,
get_replacement_for_swear_word,
any_next_words_form_swear_word,
get_complete_path_of_file,
get_replacement_for_swear_word,
read_wordlist,
)
from .varying_string import VaryingString


class Profanity:
def __init__(self):
self.CENSOR_WORDSET = set()
def __init__(self, words=None):
"""
Args:
words (Iterable/str): Collection of words or file path for a list of
words to censor. `None` to use the default word list.
Raises:
TypeError: If `words` is not a valid type.
FileNotFoundError: If `words` is a `str` and is not a valid file path.
"""
if (
words is not None
and not isinstance(words, str)
and not isinstance(words, Iterable)
):
raise TypeError("words must be of type str, list, or None")
self.CENSOR_WORDSET = []
self.CHARS_MAPPING = {
"a": ("a", "@", "*", "4"),
"i": ("i", "*", "l", "1"),
@@ -24,14 +39,17 @@ def __init__(self):
"l": ("l", "1"),
"e": ("e", "*", "3"),
"s": ("s", "$", "5"),
"t": ("t", "7",),
"t": ("t", "7"),
}
self.MAX_NUMBER_COMBINATIONS = 1
self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
self._default_wordlist_filename = get_complete_path_of_file(
"profanity_wordlist.txt"
)
self.load_censor_words()
if type(words) == str:
self.load_censor_words_from_file(words)
else:
self.load_censor_words(custom_words=words)

## PUBLIC ##

@@ -62,8 +80,8 @@ def add_censor_words(self, custom_words):
raise TypeError(
"Function 'add_censor_words' only accepts list, tuple or set."
)

self.CENSOR_WORDSET.update(custom_words)
for w in custom_words:
self.CENSOR_WORDSET.append(VaryingString(w, char_map=self.CHARS_MAPPING))

def contains_profanity(self, text):
"""Return True if the input text has any swear words."""
@@ -91,8 +109,8 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None):

# Populate the words into an internal wordset
whitelist_words = set(whitelist_words)
all_censor_words = set()
for word in words:
all_censor_words = []
for word in set(words):
# All words in CENSOR_WORDSET must be in lowercase
word = word.lower()

@@ -103,7 +121,7 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None):
if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

all_censor_words.update(set(self._generate_patterns_from_word(word)))
all_censor_words.append(VaryingString(word, char_map=self.CHARS_MAPPING))

# The default wordlist takes ~5MB+ of memory
self.CENSOR_WORDSET = all_censor_words
@@ -115,14 +133,6 @@ def _count_non_allowed_characters(self, word):
count += 1
return count

def _generate_patterns_from_word(self, word):
"""Return all patterns can be generated from the word."""
combos = [
(char,) if char not in self.CHARS_MAPPING else self.CHARS_MAPPING[char]
for char in iter(word)
]
return ("".join(pattern) for pattern in product(*combos))

def _update_next_words_indices(self, text, words_indices, start_idx):
"""Return a list of next words_indices after the input index."""
if not words_indices:
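Taken together, the better_profanity.py changes mean the Profanity constructor now accepts an optional word source and wraps every censor word in a VaryingString. A hedged sketch of the new constructor paths (the file name below is made up):

from better_profanity import Profanity

pf_default = Profanity()                            # None: load the bundled word list
pf_custom = Profanity(["happy", "jolly", "merry"])  # any iterable of words
# pf_file = Profanity("my_wordlist.txt")            # a str is treated as a word-list path
#                                                   # (missing files raise FileNotFoundError)
# Profanity(123)                                    # any other type raises TypeError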
1 change: 0 additions & 1 deletion better_profanity/constants.py
@@ -6,7 +6,6 @@

from .utils import get_complete_path_of_file


ALLOWED_CHARACTERS = set(ascii_letters)
ALLOWED_CHARACTERS.update(set(digits))
ALLOWED_CHARACTERS.update({"@", "$", "*", '"', "'"})
1 change: 1 addition & 0 deletions better_profanity/utils.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-

import os.path


67 changes: 67 additions & 0 deletions better_profanity/varying_string.py
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-


class VaryingString:
"""Represents a string with varying character representations."""

def __init__(self, string, char_map={}):
"""
Args:
string (str): String to generate variants of.
char_map (dict): Maps characters to substitute characters.
"""
self._original = string

# There is not necessarily a single length for all of this string's variants;
# a character substitution may contain more than one character, or be empty.
self._min_len = 0
self._max_len = 0

# Create list of all possible character combinations.
self._char_combos = []
for char in self._original:
if char in char_map:
self._char_combos.append(char_map[char])
lens = [len(c) for c in char_map[char]]
self._min_len += min(lens)
self._max_len += max(lens)
else:
self._char_combos.append((char,))
self._min_len += 1
self._max_len += 1

def __str__(self):
return self._original

def __eq__(self, other):
if self is other:
return True
elif other.__class__ == VaryingString:
# We have no use case for this yet.
raise NotImplementedError
elif other.__class__ == str:
len_other = len(other)
if len_other < self._min_len or len_other > self._max_len:
return False
# We use a list of slices instead of a single slice to account for
# character substitutions that contain multiple characters.
slices = [other]
for chars in self._char_combos:
new_slices = []
for char in chars:
if not char:
new_slices.extend(slices)
len_char = len(char)
for sl in slices:
if sl[:len_char] == char:
new_slices.append(sl[len_char:])
if len(new_slices) == 0:
return False
slices = new_slices
for sl in slices:
if len(sl) == 0:
return True
return False
else:
return False
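To make the comparison logic above concrete, here is a small worked example (not part of the diff) using a subset of the CHARS_MAPPING table; each == call consumes the candidate string one substitution at a time:

from better_profanity.varying_string import VaryingString

char_map = {"a": ("a", "@", "*", "4"), "s": ("s", "$", "5")}
vs = VaryingString("ass", char_map=char_map)

print(vs == "ass")   # True: "a"->"a", "s"->"s", "s"->"s"
print(vs == "@$5")   # True: "a"->"@", "s"->"$", "s"->"5"
print(vs == "abc")   # False: no substitution for "s" matches "b"
print(vs == "a$$y")  # False: length 4 exceeds the precomputed maximum of 3

Tracking the partial matches as a list of remaining slices, rather than a single index, is what allows a substitution to map to several characters or to an empty string.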
18 changes: 17 additions & 1 deletion tests.py
@@ -2,7 +2,7 @@

import unittest

from better_profanity import profanity
from better_profanity import profanity, Profanity


class ProfanityTest(unittest.TestCase):
@@ -107,6 +107,18 @@ def test_custom_words_doesnt_remove_initial_words(self):
profanity.add_censor_words(["supremacia ariana"])
self.assertEqual(profanity.censor(bad_text), censored_text)

def test_init_with_list(self):
custom_badwords = ["happy", "jolly", "merry"]
Profanity(custom_badwords)
Profanity(set(custom_badwords))
Profanity(tuple(custom_badwords))

def test_init_with_bad_type(self):
with self.assertRaises(TypeError):
Profanity(123)
with self.assertRaises(TypeError):
Profanity(False)


class ProfanityUnicodeTestRussian(unittest.TestCase):
def setUp(self):
@@ -189,6 +201,10 @@ def test_read_wordlist_not_found(self):
with self.assertRaises(FileNotFoundError):
profanity.load_censor_words_from_file("not_found_file.txt")

def test_init_wordlist_not_found(self):
with self.assertRaises(FileNotFoundError):
Profanity("not_found_file.txt")


if __name__ == "__main__":
unittest.main()
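The new constructor tests only assert that construction succeeds or raises; a quick end-to-end check of the same instance (output hedged, assuming the library's default replacement of four censor characters) might look like:

from better_profanity import Profanity

pf = Profanity(["happy", "jolly", "merry"])
print(pf.contains_profanity("Have a jolly good day"))  # True
print(pf.censor("Have a jolly good day"))              # e.g. "Have a **** good day"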
