Skip to content

Commit

Permalink
Added support for Armenian Greek & Hebrew
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidJacobson committed Jan 2, 2018
1 parent a561e79 commit fe48715
Showing 1 changed file with 137 additions and 0 deletions.
137 changes: 137 additions & 0 deletions characters_safetext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# This is a file to store all the characters that SafeText will look out for

# Zero width characters are visible when reading, as they take up no space. However, they can be used in fingerprinting.
# Below is a list of Unicode's zero width characters.


ZERO_WIDTH_CHARS = {
"SPACE": u'\u200b',
"NON-JOINER CODE POINT": u'\u200c',
"JOINER CODE POINT": u'\u200d',
"NO BREAK SPACE CODE POINT": u'\uFEFF',
}

# A list of letters that have identical counterparts from other character sets.
# Please note that these are stored with reference to their english counterparts, i.e. CYRILLIC_b is not be the
# second letter of the Cyrillic alphabet, rather it is the Cyrillic character that most resembles the letter 'b'.
# List built from https://en.wikipedia.org/wiki/IDN_homograph_attack + manual inspection of characters.
HOMOGLYPHS = { # To quickly verify that these characters are not Latin, enter them in Google with autocomplete.
# The response should be a character set other than Latin.
# The characters are organized by: CHARACTER SET _ [UPPER/LOWER] _ LATIN COUNTERPART
# EG: GREEK_SMALL_B
# Cyrillic characters are used in Russian, Belarusian, Ukrainian, Bulgarian, Serbian, Bosnian, Croatian and more.
# This character set is the most common used in homoglyph fingerprinting as it has the most characters that are
# visually similar to their Latin counterparts
"CYRILLIC_a": u"а",
"CYRILLIC_small_b": u"ь",
"CYRILLIC_large_b": u"Ъ",
"CYRILLIC_c": u"с",
"CYRILLIC_d": u"ԁ",
"CYRILLIC_e": u"е",
"CYRILLIC_small_h": u"һ",
"CYRILLIC_large_h": u"Һ",
"CYRILLIC_i": u"і",
"CYRILLIC_j": u"ј",
"CYRILLIC_o": u"о",
"CYRILLIC_p": u"р",
"CYRILLIC_s": u"ѕ",
"CYRILLIC_small_v": u"ѵ",
"CYRILLIC_large_v": u"Ѵ",
"CYRILLIC_x": u"х",
"CYRILLIC_y": u"у",

"CYRILLIC_A": u"А",
"CYRILLIC_small_B": u"в",
"CYRILLIC_large_B": u"В",
"CYRILLIC_small_C": u"с",
"CYRILLIC_large_C": u"С",
"CYRILLIC_E": u"Е",
"CYRILLIC_small_F": u"ғ",
"CYRILLIC_large_F": u"Ғ",
"CYRILLIC_small_G": u"ԍ",
"CYRILLIC_large_G": u"Ԍ",
"CYRILLIC_small_H": u"н",
"CYRILLIC_large_H": u"Н",
"CYRILLIC_I": u"І",
"CYRILLIC_J": u"Ј",
"CYRILLIC_small_K": u"к",
"CYRILLIC_large_K": u"К",
"CYRILLIC_small_M": u"м",
"CYRILLIC_large_M": u"М",
"CYRILLIC_small_O": u"о",
"CYRILLIC_large_O": u"О",
"CYRILLIC_P": u"Р",
"CYRILLIC_S": u"Ѕ",
"CYRILLIC_small_T": u"т",
"CYRILLIC_large_T": u"Т",
"CYRILLIC_X": u"Х",
"CYRILLIC_Y": u"У",


"CYRILLIC_3": u"З",
"CYRILLIC_4": u"Ч",
"CYRILLIC_6": u"б",


# Greek characters
# There are Greek characters that are identical to Cyrillic, German, and Serbian
# however this tool is built around Latin.

"GREEK_c": u"ϲ",
"GREEK_i": u"ί",
"GREEK_o": u"ο",
"GREEK_p": u"ρ",
"GREEK_w": u"ω",
"GREEK_v": u"ν",

"GREEK_A": u"Α",
"GREEK_B": u"Β",
"GREEK_C": u"Ϲ",
"GREEK_E": u"Ε",
"GREEK_H": u"Η",
"GREEK_I": u"Ι",
"GREEK_J": u"Ϳ",
"GREEK_small_K": u"Κ",
"GREEK_large_K": u"κ",
"GREEK_small_M": u"Μ",
"GREEK_large_M": u"Ϻ",
"GREEK_N": u"Ν",
"GREEK_O": u"Ο",
"GREEK_T": u"Τ",
"GREEK_U": u"υ",
"GREEK_X": u"Χ",
"GREEK_Y": u"Υ",
"GREEK_Z": u"Ζ",

# Armenian characters

"ARMENIAN_g": u"ց",
"ARMENIAN_o": u"օ",
"ARMENIAN_j": u"յ",
"ARMENIAN_h": u"հ",
"ARMENIAN_n": u"ո",
"ARMENIAN_u": u"ս",
"ARMENIAN_q": u"զ",

"ARMENIAN_L": u"Լ",
"ARMENIAN_O": u"Օ",
"ARMENIAN_U": u"Ս",
"ARMENIAN_S": u"Տ",

"ARMENIAN_2": u"Ձ",
"ARMENIAN_ALT_ 2": u"շ",
"ARMENIAN_3": u"3",
"ARMENIAN_4": u"վ",

# Hebrew Characters

"HEBREW_i": "וֹ",
"HEBREW_n": "ח",

"HEBREW_O": "ס",

# Script characters

"SCRIPT_i": u"í",

}

0 comments on commit fe48715

Please sign in to comment.