Use character-by-character string comparison (snguyenthanh#17)
jcbrockschmidt authored Oct 11, 2020
1 parent e352465 commit 890c391
Showing 5 changed files with 116 additions and 23 deletions.
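In short, this commit changes how censor words are matched. Previously, load_censor_words expanded every censor word into all of its character-substitution variants with itertools.product and stored the resulting strings in CENSOR_WORDSET (the diff notes the default word list took ~5MB+ of memory this way). After this commit, each censor word is stored once as a VaryingString and matched character by character at comparison time. A rough sketch of why the old expansion was costly, using a subset of the CHARS_MAPPING table from the diff (the sample word is only an illustration):

from itertools import product

# Subset of CHARS_MAPPING from the diff.
chars_mapping = {"a": ("a", "@", "*", "4"), "s": ("s", "$", "5")}

def old_generate_patterns(word):
    # Mirrors the removed _generate_patterns_from_word helper.
    combos = [chars_mapping.get(char, (char,)) for char in word]
    return ["".join(pattern) for pattern in product(*combos)]

# One 8-letter word already expands to 4 * 3 * 3 * 4 * 3 * 3 = 1296 variants.
print(len(old_generate_patterns("assassin")))  # 1296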
52 changes: 31 additions & 21 deletions better_profanity/better_profanity.py
@@ -1,20 +1,35 @@
# -*- coding: utf-8 -*-

from itertools import product
from collections.abc import Iterable

from .constants import ALLOWED_CHARACTERS

from .utils import (
read_wordlist,
get_replacement_for_swear_word,
any_next_words_form_swear_word,
get_complete_path_of_file,
get_replacement_for_swear_word,
read_wordlist,
)
from .varying_string import VaryingString


class Profanity:
def __init__(self):
self.CENSOR_WORDSET = set()
def __init__(self, words=None):
"""
Args:
words (Iterable/str): Collection of words or file path for a list of
words to censor. `None` to use the default word list.
Raises:
TypeError: If `words` is not a valid type.
FileNotFoundError: If `words` is a `str` and is not a valid file path.
"""
if (
words is not None
and not isinstance(words, str)
and not isinstance(words, Iterable)
):
raise TypeError("words must be of type str, list, or None")
self.CENSOR_WORDSET = []
self.CHARS_MAPPING = {
"a": ("a", "@", "*", "4"),
"i": ("i", "*", "l", "1"),
@@ -24,14 +39,17 @@ def __init__(self):
"l": ("l", "1"),
"e": ("e", "*", "3"),
"s": ("s", "$", "5"),
"t": ("t", "7",),
"t": ("t", "7"),
}
self.MAX_NUMBER_COMBINATIONS = 1
self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
self._default_wordlist_filename = get_complete_path_of_file(
"profanity_wordlist.txt"
)
self.load_censor_words()
if type(words) == str:
self.load_censor_words_from_file(words)
else:
self.load_censor_words(custom_words=words)

## PUBLIC ##

@@ -62,8 +80,8 @@ def add_censor_words(self, custom_words):
raise TypeError(
"Function 'add_censor_words' only accepts list, tuple or set."
)

self.CENSOR_WORDSET.update(custom_words)
for w in custom_words:
self.CENSOR_WORDSET.append(VaryingString(w, char_map=self.CHARS_MAPPING))

def contains_profanity(self, text):
"""Return True if the input text has any swear words."""
@@ -91,8 +109,8 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None):

# Populate the words into an internal wordset
whitelist_words = set(whitelist_words)
all_censor_words = set()
for word in words:
all_censor_words = []
for word in set(words):
# All words in CENSOR_WORDSET must be in lowercase
word = word.lower()

@@ -103,7 +121,7 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None):
if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

all_censor_words.update(set(self._generate_patterns_from_word(word)))
all_censor_words.append(VaryingString(word, char_map=self.CHARS_MAPPING))

# The default wordlist takes ~5MB+ of memory
self.CENSOR_WORDSET = all_censor_words
@@ -115,14 +133,6 @@ def _count_non_allowed_characters(self, word):
count += 1
return count

def _generate_patterns_from_word(self, word):
"""Return all patterns can be generated from the word."""
combos = [
(char,) if char not in self.CHARS_MAPPING else self.CHARS_MAPPING[char]
for char in iter(word)
]
return ("".join(pattern) for pattern in product(*combos))

def _update_next_words_indices(self, text, words_indices, start_idx):
"""Return a list of next words_indices after the input index."""
if not words_indices:
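Taken together, the better_profanity.py changes mean the Profanity constructor now accepts an optional word source and wraps every censor word in a VaryingString. A hedged sketch of the new constructor paths (the file name below is made up):

from better_profanity import Profanity

pf_default = Profanity()                            # None: load the bundled word list
pf_custom = Profanity(["happy", "jolly", "merry"])  # any iterable of words
# pf_file = Profanity("my_wordlist.txt")            # a str is treated as a word-list path
#                                                   # (missing files raise FileNotFoundError)
# Profanity(123)                                    # any other type raises TypeError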
1 change: 0 additions & 1 deletion better_profanity/constants.py
@@ -6,7 +6,6 @@

from .utils import get_complete_path_of_file


ALLOWED_CHARACTERS = set(ascii_letters)
ALLOWED_CHARACTERS.update(set(digits))
ALLOWED_CHARACTERS.update({"@", "$", "*", '"', "'"})
1 change: 1 addition & 0 deletions better_profanity/utils.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-

import os.path


67 changes: 67 additions & 0 deletions better_profanity/varying_string.py
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-


class VaryingString:
"""Represents a string with varying character representations."""

def __init__(self, string, char_map={}):
"""
Args:
string (str): String to generate variants of.
char_map (dict): Maps characters to substitute characters.
"""
self._original = string

# There is not necessarily a single length for all of this string's variants;
# a character substitution may contain more than one character, or be empty.
self._min_len = 0
self._max_len = 0

# Create list of all possible character combinations.
self._char_combos = []
for char in self._original:
if char in char_map:
self._char_combos.append(char_map[char])
lens = [len(c) for c in char_map[char]]
self._min_len += min(lens)
self._max_len += max(lens)
else:
self._char_combos.append((char,))
self._min_len += 1
self._max_len += 1

def __str__(self):
return self._original

def __eq__(self, other):
if self is other:
return True
elif other.__class__ == VaryingString:
# We have no use case for this yet.
raise NotImplementedError
elif other.__class__ == str:
len_other = len(other)
if len_other < self._min_len or len_other > self._max_len:
return False
# We use a list of slices instead of a single slice to account for
# character substitutions that contain multiple characters.
slices = [other]
for chars in self._char_combos:
new_slices = []
for char in chars:
if not char:
new_slices.extend(slices)
len_char = len(char)
for sl in slices:
if sl[:len_char] == char:
new_slices.append(sl[len_char:])
if len(new_slices) == 0:
return False
slices = new_slices
for sl in slices:
if len(sl) == 0:
return True
return False
else:
return False
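To make the comparison logic above concrete, here is a small worked example (not part of the diff) using a subset of the CHARS_MAPPING table; each == call consumes the candidate string one substitution at a time:

from better_profanity.varying_string import VaryingString

char_map = {"a": ("a", "@", "*", "4"), "s": ("s", "$", "5")}
vs = VaryingString("ass", char_map=char_map)

print(vs == "ass")   # True: "a"->"a", "s"->"s", "s"->"s"
print(vs == "@$5")   # True: "a"->"@", "s"->"$", "s"->"5"
print(vs == "abc")   # False: no substitution for "s" matches "b"
print(vs == "a$$y")  # False: length 4 exceeds the precomputed maximum of 3

Tracking the partial matches as a list of remaining slices, rather than a single index, is what allows a substitution to map to several characters or to an empty string.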
18 changes: 17 additions & 1 deletion tests.py
@@ -2,7 +2,7 @@

import unittest

from better_profanity import profanity
from better_profanity import profanity, Profanity


class ProfanityTest(unittest.TestCase):
@@ -107,6 +107,18 @@ def test_custom_words_doesnt_remove_initial_words(self):
profanity.add_censor_words(["supremacia ariana"])
self.assertEqual(profanity.censor(bad_text), censored_text)

def test_init_with_list(self):
custom_badwords = ["happy", "jolly", "merry"]
Profanity(custom_badwords)
Profanity(set(custom_badwords))
Profanity(tuple(custom_badwords))

def test_init_with_bad_type(self):
with self.assertRaises(TypeError):
Profanity(123)
with self.assertRaises(TypeError):
Profanity(False)


class ProfanityUnicodeTestRussian(unittest.TestCase):
def setUp(self):
@@ -189,6 +201,10 @@ def test_read_wordlist_not_found(self):
with self.assertRaises(FileNotFoundError):
profanity.load_censor_words_from_file("not_found_file.txt")

def test_init_wordlist_not_found(self):
with self.assertRaises(FileNotFoundError):
Profanity("not_found_file.txt")


if __name__ == "__main__":
unittest.main()
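The new constructor tests only assert that construction succeeds or raises; a quick end-to-end check of the same instance (output hedged, assuming the library's default replacement of four censor characters) might look like:

from better_profanity import Profanity

pf = Profanity(["happy", "jolly", "merry"])
print(pf.contains_profanity("Have a jolly good day"))  # True
print(pf.censor("Have a jolly good day"))              # e.g. "Have a **** good day"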
