generated from subhalingamd/mypy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconstants.py
89 lines (79 loc) · 3.4 KB
/
constants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
# Fixed seed value; presumably passed to random number generators elsewhere
# in the project for reproducibility -- usage is not visible in this module.
RANDOM_SEED: int = 772

# Length cut-off; NOTE(review): what is measured against it (characters,
# tokens, ...) is decided by the caller -- confirm before changing.
LENGTH_THRESH: int = 8
# Pre-compiled patterns keyed by the kind of text fragment they recognise.
# Each table row is (key, pattern source, compile flags).
REGEX = {
    key: re.compile(pattern, flags)
    for key, pattern, flags in (
        # A literal "\u" followed by hex digits, or any single character whose
        # code point is above 0x84.
        # NOTE(review): ASCII ends at 0x7F -- confirm the 0x84 cut-off is intended.
        ("unicode", r"((?:\\u[0-9A-Fa-f]+)|(?:[^\x00-\x84]))", 0),
        # "#word" / "@word"; the word itself is captured in group 1.
        ("hashtag", r"#(\w+)", 0),
        ("mention", r"@(\w+)", 0),
        # http/https/ftp scheme followed by letters, digits, dots and slashes only.
        ("url", r"(?:https?|ftp)://[a-zA-Z0-9\./]+", 0),
        # The same character three or more times in a row, ignoring case.
        ("repeat", r"(.)\1{2,}", re.IGNORECASE),
        # One or more non-word characters.
        ("delimiter", r"\W+", 0),
        # A run of digits bounded by a non-word character or a string edge;
        # the bounding characters are part of the match.
        ("number", r"(?:^|\W)\d+(?:\W|$)", 0),
    )
}

# Plain symbol -> token lookup tables; they start empty and are filled in
# further down this module from EMOTES and PUNCTUATIONS.
REGEX["emotes"] = {}
REGEX["punctuations"] = {}
# Emoticon spellings grouped under the placeholder token that stands for them.
# Each entry is (token, [emoticon strings]).
EMOTES = [
    (
        "__EMOTE__SMILE",
        [":-)", ":)", "(-:", "(:", ":-3", ":3", ":P", ":-P", ":p", "^-^", "^_^", ":S"],
    ),
    (
        "__EMOTE__LAUGH",
        [
            ":-D", ":D", "X-D", "XD", "xD", "=D", "8D", "8-D",
            "X-p", "xp", "^.^", ":O", "XO", "xO", "=P", "=p",
        ],
    ),
    (
        "__EMOTE__LOVE",
        ["<3", ":\\*", "♥", ";^)", ":*", ":-*", ":X", "*_*"],
    ),
    (
        "__EMOTE__WINK",
        [";-)", ";)", ";-D", ";D", "(-;", "(;", "*)", "*-)", "O.o"],
    ),
    (
        # HACK: open question from the original author -- should ':/' and ':-/'
        # be left out of this group?
        # NOTE(review): '(-;' is also listed under __EMOTE__WINK; when both groups
        # are loaded into one lookup table the later group wins -- confirm intended.
        "__EMOTE__SAD",
        [
            ":-(", ":(", ")-:", "):", ":<", ":-<", ":c", ":-\\",
            "(-;", ":/", ":-/", "X-(", ":-@", "O_O",
        ],
    ),
    (
        "__EMOTE__CRY",
        [":,(", ":'(", ':"(', ":((", ":'-(", ">.<"],
    ),
]
# Fill REGEX["emotes"] so that every emoticon spelling maps to its group token.
# A spelling listed under more than one group keeps the token of the group
# that is processed last, because later assignments overwrite earlier ones.
for name, symbols in EMOTES:
    for symbol in symbols:
        # The emoticon as written, plus a copy with a space between every character.
        REGEX["emotes"][symbol] = name
        REGEX["emotes"][" ".join(symbol)] = name
        if ")" in symbol:
            # Same two spellings with a square bracket in place of ")".
            REGEX["emotes"][symbol.replace(")", "]")] = name
            REGEX["emotes"][" ".join(symbol).replace(")", "]")] = name
        if "(" in symbol:
            # Same two spellings with a square bracket in place of "(".
            REGEX["emotes"][symbol.replace("(", "[")] = name
            REGEX["emotes"][" ".join(symbol).replace("(", "[")] = name
# Punctuation spellings grouped under the placeholder token that stands for them.
# Each entry is (token, [punctuation strings]).
PUNCTUATIONS = [
    # Question marks, including the inverted form.
    ("__PUNC__QUES", ["?", "¿"]),
    # Exclamation marks, including the inverted form.
    ("__PUNC__EXCL", ["!", "¡"]),
    # Ellipsis: three dots, the single-character ellipsis, and spaced dots.
    ("__PUNC__ELLP", ["...", "…", ". . ."]),
]
# Fill REGEX["punctuations"] so that every punctuation spelling maps to its
# group token.
for name, symbols in PUNCTUATIONS:
    for symbol in symbols:
        REGEX["punctuations"][symbol] = name
# (pattern, replacement) pairs for expanding English contractions.
# Patterns are regular expressions; replacements use the ``\g<1>`` group
# reference syntax understood by ``re.sub``. Replacements are raw strings so
# that ``\g`` is not parsed as an (invalid) string escape sequence.
# Presumably applied in list order by the caller: the irregular forms
# ("won't", "can't", "i'm", "ain't") come before the generic suffix rules
# that would otherwise expand them differently -- keep that ordering.
CONTRACTIONS = [
    # Irregular forms.
    (r'won\'t', 'will not'),
    (r'can\'t', 'can not'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    # Generic suffix rules; group 1 is the word stem before the suffix.
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
    # Ampersand and apostrophe-less spellings.
    (r'&', 'and'),
    (r'dont', 'do not'),
    (r'wont', 'will not'),
]
# Maps a literal character to the text that represents it inside a regular
# expression. Every character maps to itself preceded by a backslash, except
# the space, which maps to ``\s`` (a class matching any whitespace character,
# not only a space).
# Values are raw strings so the backslash is never parsed as an (invalid)
# string escape sequence; the lone-backslash key cannot be written raw.
escape_for_regex = {
    '-': r'\-',
    '/': r'\/',
    ':': r'\:',
    ')': r'\)',
    '@': r'\@',
    '<': r'\<',
    ' ': r'\s',
    '=': r'\=',
    '_': r'\_',
    ';': r'\;',
    '[': r'\[',
    '^': r'\^',
    '*': r'\*',
    ']': r'\]',
    '(': r'\(',
    '\\': r'\\',
    '>': r'\>',
    ',': r'\,',
    '.': r'\.',
    '!': r'\!',
    '?': r'\?',
    '+': r'\+',
}