This repository has been archived by the owner on Jun 26, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 63
/
regexp-utils.coffee
191 lines (149 loc) · 8.11 KB
/
regexp-utils.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
_ = require('underscore')
EmojiData = require('emoji-data')
# Character-class fragment made of `\uXXXX-\uXXXX` ranges, all at or above
# U+0080. It is interpolated inside the `[...]` classes built by
# `emailRegex` and `domainRegex`, extending them beyond plain ASCII.
# It is only meaningful inside a character class, never as a pattern on
# its own.
UnicodeEmailChars = '\u0080-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u0300-\u036F\u0370-\u03FF\u0400-\u04FF\u0500-\u052F\u0530-\u058F\u0590-\u05FF\u0600-\u06FF\u0700-\u074F\u0750-\u077F\u0780-\u07BF\u07C0-\u07FF\u0900-\u097F\u0980-\u09FF\u0A00-\u0A7F\u0A80-\u0AFF\u0B00-\u0B7F\u0B80-\u0BFF\u0C00-\u0C7F\u0C80-\u0CFF\u0D00-\u0D7F\u0D80-\u0DFF\u0E00-\u0E7F\u0E80-\u0EFF\u0F00-\u0FFF\u1000-\u109F\u10A0-\u10FF\u1100-\u11FF\u1200-\u137F\u1380-\u139F\u13A0-\u13FF\u1400-\u167F\u1680-\u169F\u16A0-\u16FF\u1700-\u171F\u1720-\u173F\u1740-\u175F\u1760-\u177F\u1780-\u17FF\u1800-\u18AF\u1900-\u194F\u1950-\u197F\u1980-\u19DF\u19E0-\u19FF\u1A00-\u1A1F\u1B00-\u1B7F\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u1E00-\u1EFF\u1F00-\u1FFF\u20D0-\u20FF\u2100-\u214F\u2C00-\u2C5F\u2C60-\u2C7F\u2C80-\u2CFF\u2D00-\u2D2F\u2D30-\u2D7F\u2D80-\u2DDF\u2F00-\u2FDF\u2FF0-\u2FFF\u3040-\u309F\u30A0-\u30FF\u3100-\u312F\u3130-\u318F\u3190-\u319F\u31C0-\u31EF\u31F0-\u31FF\u3200-\u32FF\u3300-\u33FF\u3400-\u4DBF\u4DC0-\u4DFF\u4E00-\u9FFF\uA000-\uA48F\uA490-\uA4CF\uA700-\uA71F\uA800-\uA82F\uA840-\uA87F\uAC00-\uD7AF\uF900-\uFAFF'
# Factory functions for the regular expressions used by the app. Each
# entry is a function: call it to obtain a regex rather than sharing one
# module-level instance, because a JavaScript regex carrying the `g` flag
# keeps `lastIndex` state between calls.
RegExpUtils =

  # Matches an email address; the whole address is capture group 1.
  #
  # It's important that the regex be wrapped in parens, otherwise
  # javascript's RegExp::exec method won't find anything even when the
  # regex matches!
  #
  # It's also important we return a fresh copy of the RegExp every time. A
  # javascript regex is stateful and multiple functions using this method
  # will cause unexpected behavior!
  #
  # See http://tools.ietf.org/html/rfc5322#section-3.4 and
  # https://tools.ietf.org/html/rfc6531 and
  # https://en.wikipedia.org/wiki/Email_address#Local_part
  emailRegex: -> new RegExp("([a-z.A-Z#{UnicodeEmailChars}0-9!#$%&\\'*+\\-/=?^_`{|}~]+@[A-Za-z#{UnicodeEmailChars}0-9.-]+\\.[A-Za-z]{2,63})", 'g')

  # Matches phone numbers shaped like `(ccc) ddd-ddd-dddd xN`.
  # Groups: 1 leading `+`/`(` run (or empty), 2 optional country prefix,
  # 3-5 the 3/3/4 digit groups, 6 optional extension after `x`.
  #
  # http://stackoverflow.com/questions/16631571/javascript-regular-expression-detect-all-the-phone-number-from-the-page-source
  # http://www.regexpal.com/?fam=94521
  # NOTE: This is not exhaustive, and balances what is technically a phone number
  # with what would be annoying to linkify. eg: 12223334444 does not match.
  phoneRegex: -> new RegExp(/([\+\(]+|\b)(?:(\d{1,3}[- ()]*)?)(\d{3})[- )]+(\d{3})[- ]+(\d{4})(?: *x(\d+))?\b/g)

  # Matches a bare domain name at the start of the string (anchored with
  # `^` only; there is no `$`). The trailing TLD quantifier is lazy, so
  # only the shortest acceptable TLD prefix is consumed.
  #
  # http://stackoverflow.com/a/16463966
  # http://www.regexpal.com/?fam=93928
  # NOTE: This does not match full urls with `http` protocol components.
  domainRegex: -> new RegExp("^(?!:\\/\\/)([a-zA-Z#{UnicodeEmailChars}0-9-_]+\\.)*[a-zA-Z#{UnicodeEmailChars}0-9][a-zA-Z#{UnicodeEmailChars}0-9-_]+\\.[a-zA-Z]{2,11}?", 'i')

  # Matches a whitespace character followed by `@name` or `#name`.
  # Group 1 is the sigil, group 2 the name.
  # http://www.regexpal.com/?fam=95875
  hashtagOrMentionRegex: -> new RegExp(/\s([@#])([\w_-]+)/i)

  # Matches a string that is entirely four dot-separated groups of 1-3
  # digits. The numeric range of each group is not validated.
  # https://www.safaribooksonline.com/library/view/regular-expressions-cookbook/9780596802837/ch07s16.html
  ipAddressRegex: -> new RegExp(/^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$/i)

  # Matches `nylas:` followed by one or more non-whitespace characters.
  nylasCommandRegex: -> new RegExp(/nylas:\S+/i)

  # Builds the URL-detection regex (flags `gi`).
  #
  # Options:
  #   matchEntireString - when truthy, a `^` is prepended so the match must
  #     begin at the start of the input. Note that no `$` is appended.
  #
  # Test cases: https://regex101.com/r/pD7iS5/3
  urlRegex: ({matchEntireString} = {}) ->
    # Order matters: this list becomes a regex alternation and earlier
    # entries win (e.g. `com` is tried before `co`).
    commonTlds = ['com', 'org', 'edu', 'gov', 'uk', 'net', 'ca', 'de', 'jp', 'fr', 'au', 'us', 'ru', 'ch', 'it', 'nl', 'se', 'no', 'es', 'mil', 'ly', 'biz', 'ai', 'info', 'to', 'io', 'co', 'eu', 'aero', 'jobs', 'mobi', 'at', 'be', 'br', 'cn']

    parts = [
      '('
      # one of
      '('
      # scheme, ala https://
      '([A-Za-z]{3,9}:(?:\\/\\/))?'
      # username:password (optional)
      '(?:\\w+:\\w+@)?'
      # one of:
      '('
      # domain with common tld. Labels are word characters and dashes
      # only; braces are not hostname characters.
      '(?:(?:[-\\w]+\\.)+(?:' + commonTlds.join('|') + '))'
      # or
      '|'
      # ip address
      '('
      '(\\b25[0-5]\\b|\\b[2][0-4][0-9]\\b|\\b[0-1]?[0-9]?[0-9]\\b)(\\.(\\b25[0-5]\\b|\\b[2][0-4][0-9]\\b|\\b[0-1]?[0-9]?[0-9]\\b)){3}'
      ')'
      ')'
      # port if specified
      '(?::[\\d]{1,5})?'
      # URL Path
      '(?:(?:(?:\\/(?:[-\\w~!$+|.,=:]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?'
      # query strings (a percent-escape is `%` plus exactly two hex
      # digits, in keys as well as in values)
      '(?:(?:\\?(?:[-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?(?:[-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)(?:&(?:[-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?(?:[-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*'
      # Anchor links
      '(?:#(?:[-\\w~!$ |\\/.,*:;=]|%[a-f\\d]{2})*)?'
      # or
      '|'
      # mailto links
      'mailto:\\/*(?:\\w+\\.|[\\-;:&=\\+\\$.,\\w]+@)[A-Za-z0-9\\.\\-]+'
      '|'
      # telephone links
      'tel:'
      ')'
      # optionally followed by:
      '('
      # URL components
      # (last character must not be punctuation, hence two groups)
      '(?:[\\+~%\\/\\.\\w\\-_@]*[\\+~%\\/\\w\\-_]+)?'
      # optionally followed by: a query string and/or a #location
      # (last character must not be punctuation, hence two groups)
      '(?:(\\?[\\-\\+=&;%@\\.\\w_\\#]*[\\#\\-\\+=&;%@\\w_\\/]+)?#?(?:[\'\\$\\&\\(\\)\\*\\+,;=\\.\\!\\/\\\\\\w%-]*[\\/\\\\\\w]+)?)?'
      ')?'
      ')'
    ]

    if matchEntireString
      parts.unshift('^')

    return new RegExp(parts.join(''), 'gi')

  # Test cases: https://regex101.com/r/jD5zC7/2
  # Returns the following capturing groups:
  # 1. start of the opening a tag to href="
  # 2. The contents of the href without quotes
  # 3. the rest of the opening a tag
  # 4. the contents of the a tag
  # 5. the closing tag
  linkTagRegex: -> new RegExp(/(<a.*?href\s*?=\s*?['"])(.*?)(['"].*?>)([\s\S]*?)(<\/a>)/gim)

  # Test cases: https://regex101.com/r/cK0zD8/4
  # Catches link tags which are:
  #   - Non empty
  #   - Not a mailto: link
  # Returns the following capturing groups:
  # 1. start of the opening a tag to href="
  # 2. The contents of the href without quotes
  # 3. the rest of the opening a tag
  # 4. the contents of the a tag
  # 5. the closing tag
  urlLinkTagRegex: -> new RegExp(/(<a.*?href\s*?=\s*?['"])((?!mailto).+?)(['"].*?>)([\s\S]*?)(<\/a>)/gim)

  # Matches an <img> tag whose `src` is double-quoted; group 1 is the src.
  # https://regex101.com/r/zG7aW4/3
  imageTagRegex: -> /<img\s+[^>]*src="([^"]*)"[^>]*>/g

  # Regex that matches our link tracking urls, surrounded by quotes
  # ("link.nylas.com...?redirect=")
  # Test cases: https://regex101.com/r/rB4fO4/3
  # Returns the following capturing groups
  # 1.The redirect url: the actual url you want to visit by clicking a url
  # that matches this regex
  #
  # Only `"` and `'` count as the surrounding quotes; a `|` does not.
  trackedLinkRegex: -> /[\"\']https:\/\/link\.nylas\.com\/link\/.*?\?.*?redirect=([^&\"\']*).*?[\"\']/g

  # Builds a global character-class regex matching one punctuation mark.
  #
  # Options:
  #   exclude - array of entries to leave out. Entries are compared against
  #     the list below verbatim, so escaped marks must be passed in their
  #     escaped form (e.g. '\\-' rather than '-').
  punctuation: ({exclude}={}) ->
    exclude ?= []
    marks = [ '.', ',', '\\/', '#', '!', '$', '%', '^', '&', '*',
      ';', ':', '{', '}', '=', '\\-', '_', '`', '~', '(', ')', '@', '+',
      '?', '>', '<', '\\[', '\\]' ]
    marks = _.difference(marks, exclude).join('')
    return new RegExp("[#{marks}]", 'g')

  # This tests for valid schemes as per RFC 3986
  # We need both http: https: and mailto: and a variety of other schemes.
  # This does not check for invalid usage of the http: scheme. For
  # example, http:bad.com would pass. We do not check for
  # protocol-relative uri's.
  #
  # Regex explanation here: https://regex101.com/r/nR2yL6/2
  # See RFC here: https://tools.ietf.org/html/rfc3986#section-3.1
  # SO discussion: http://stackoverflow.com/questions/10687099/how-to-test-if-a-url-string-is-absolute-or-relative/31991870#31991870
  hasValidSchemeRegex: -> new RegExp('^[a-z][a-z0-9+.-]*:', 'i')

  # Builds a global regex that is an alternation of every string returned
  # by `EmojiData.chars({include_variants: true})`.
  emojiRegex: ->
    # Escape regex metacharacters so each entry is matched literally;
    # an unescaped metacharacter in an entry would otherwise corrupt (or
    # make invalid) the generated alternation.
    escaped = (char.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') for char in EmojiData.chars({include_variants: true}))
    new RegExp("(?:#{escaped.join('|')})", 'g')

  # Matches the start of any `<style` tag, case-insensitively.
  looseStyleTag: -> /<style/gim

  # Regular expression matching javascript function arguments:
  # group 1 is a parenthesised argument list, group 2 the single bare
  # argument of an arrow function.
  # https://regex101.com/r/pZ6zF0/2
  functionArgs: -> /(?:\(\s*([^)]+?)\s*\)|(\w+)\s?=>)/

  # Matches any single character from the set \ / : | ? * > < " #
  # https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
  illegalPathCharactersRegexp: ->
    /[\\\/:|?*><"#]/g

  # Matches a <signature>...</signature> region together with up to two
  # `<br/>` tags directly before it. `[^]` matches any character,
  # newlines included.
  # https://regex101.com/r/nC0qL2/2
  signatureRegex: ->
    new RegExp(/(<br\/>){0,2}<signature>[^]*<\/signature>/)

  # Finds the start of a quoted text region as inserted by N1. This is not
  # a general-purpose quote detection scheme and only works for
  # N1-composed emails.
  n1QuoteStartRegex: ->
    new RegExp(/<\w+[^>]*gmail_quote/i)

  # Matches the separators `.`, `/` and `\`.
  # https://regex101.com/r/jK8cC2/1
  subcategorySplitRegex: ->
    /[./\\]/g
# Expose the RegExpUtils object as this module's export.
module.exports = RegExpUtils