Added support for Armenian Greek & Hebrew

DavidJacobson · Jan 2, 2018 · fe48715 · fe48715
1 parent a561e79
commit fe48715
Showing 1 changed file with 137 additions and 0 deletions.
diff --git a/characters_safetext.py b/characters_safetext.py
@@ -0,0 +1,137 @@
+# This is a file to store all the characters that SafeText will look out for
+
+# Zero width characters are visible when reading, as they take up no space. However, they can be used in fingerprinting.
+# Below is a list of Unicode's zero width characters.
+
+
+ZERO_WIDTH_CHARS = {
+    "SPACE": u'\u200b',
+    "NON-JOINER CODE POINT": u'\u200c',
+    "JOINER CODE POINT": u'\u200d',
+    "NO BREAK SPACE CODE POINT": u'\uFEFF',
+}
+
+# A list of letters that have identical counterparts from other character sets.
+# Please note that these are stored with reference to their english counterparts, i.e. CYRILLIC_b is not be the
+# second letter of the Cyrillic alphabet, rather it is the Cyrillic character that most resembles the letter 'b'.
+# List built from https://en.wikipedia.org/wiki/IDN_homograph_attack + manual inspection of characters.
+HOMOGLYPHS = {  # To quickly verify that these characters are not Latin, enter them in Google with autocomplete.
+    #           The response should be a character set other than Latin.
+    #           The characters are organized by: CHARACTER SET _ [UPPER/LOWER] _ LATIN COUNTERPART
+    #           EG: GREEK_SMALL_B
+    # Cyrillic characters are used in Russian, Belarusian, Ukrainian, Bulgarian, Serbian, Bosnian, Croatian and more.
+    # This character set is the most common used in homoglyph fingerprinting as it has the most characters that are
+    # visually similar to their Latin counterparts
+    "CYRILLIC_a":  u"а",
+    "CYRILLIC_small_b": u"ь",
+    "CYRILLIC_large_b": u"Ъ",
+    "CYRILLIC_c": u"с",
+    "CYRILLIC_d": u"ԁ",
+    "CYRILLIC_e": u"е",
+    "CYRILLIC_small_h": u"һ",
+    "CYRILLIC_large_h": u"Һ",
+    "CYRILLIC_i": u"і",
+    "CYRILLIC_j": u"ј",
+    "CYRILLIC_o": u"о",
+    "CYRILLIC_p": u"р",
+    "CYRILLIC_s": u"ѕ",
+    "CYRILLIC_small_v": u"ѵ",
+    "CYRILLIC_large_v": u"Ѵ",
+    "CYRILLIC_x": u"х",
+    "CYRILLIC_y": u"у",
+
+    "CYRILLIC_A": u"А",
+    "CYRILLIC_small_B": u"в",
+    "CYRILLIC_large_B": u"В",
+    "CYRILLIC_small_C": u"с",
+    "CYRILLIC_large_C": u"С",
+    "CYRILLIC_E": u"Е",
+    "CYRILLIC_small_F": u"ғ",
+    "CYRILLIC_large_F": u"Ғ",
+    "CYRILLIC_small_G": u"ԍ",
+    "CYRILLIC_large_G": u"Ԍ",
+    "CYRILLIC_small_H": u"н",
+    "CYRILLIC_large_H": u"Н",
+    "CYRILLIC_I": u"І",
+    "CYRILLIC_J": u"Ј",
+    "CYRILLIC_small_K": u"к",
+    "CYRILLIC_large_K": u"К",
+    "CYRILLIC_small_M": u"м",
+    "CYRILLIC_large_M": u"М",
+    "CYRILLIC_small_O": u"о",
+    "CYRILLIC_large_O": u"О",
+    "CYRILLIC_P": u"Р",
+    "CYRILLIC_S": u"Ѕ",
+    "CYRILLIC_small_T": u"т",
+    "CYRILLIC_large_T": u"Т",
+    "CYRILLIC_X": u"Х",
+    "CYRILLIC_Y": u"У",
+
+
+    "CYRILLIC_3": u"З",
+    "CYRILLIC_4": u"Ч",
+    "CYRILLIC_6": u"б",
+
+
+    # Greek characters
+    # There are Greek characters that are identical to Cyrillic, German, and Serbian
+    # however this tool is built around Latin.
+
+    "GREEK_c": u"ϲ",
+    "GREEK_i": u"ί",
+    "GREEK_o": u"ο",
+    "GREEK_p": u"ρ",
+    "GREEK_w": u"ω",
+    "GREEK_v": u"ν",
+
+    "GREEK_A": u"Α",
+    "GREEK_B": u"Β",
+    "GREEK_C": u"Ϲ",
+    "GREEK_E": u"Ε",
+    "GREEK_H": u"Η",
+    "GREEK_I": u"Ι",
+    "GREEK_J": u"Ϳ",
+    "GREEK_small_K": u"Κ",
+    "GREEK_large_K": u"κ",
+    "GREEK_small_M": u"Μ",
+    "GREEK_large_M": u"Ϻ",
+    "GREEK_N": u"Ν",
+    "GREEK_O": u"Ο",
+    "GREEK_T": u"Τ",
+    "GREEK_U": u"υ",
+    "GREEK_X": u"Χ",
+    "GREEK_Y": u"Υ",
+    "GREEK_Z": u"Ζ",
+
+    # Armenian characters
+
+    "ARMENIAN_g": u"ց",
+    "ARMENIAN_o": u"օ",
+    "ARMENIAN_j": u"յ",
+    "ARMENIAN_h": u"հ",
+    "ARMENIAN_n": u"ո",
+    "ARMENIAN_u": u"ս",
+    "ARMENIAN_q": u"զ",
+
+    "ARMENIAN_L": u"Լ",
+    "ARMENIAN_O": u"Օ",
+    "ARMENIAN_U": u"Ս",
+    "ARMENIAN_S": u"Տ",
+
+    "ARMENIAN_2": u"Ձ",
+    "ARMENIAN_ALT_ 2": u"շ",
+    "ARMENIAN_3": u"3",
+    "ARMENIAN_4": u"վ",
+
+    # Hebrew Characters
+
+    "HEBREW_i": "וֹ",
+    "HEBREW_n": "ח",
+
+    "HEBREW_O": "ס",
+
+    # Script characters
+
+    "SCRIPT_i": u"í",
+
+}