|
| 1 | +import { |
| 2 | + anyOf, |
| 3 | + buildRegExp, |
| 4 | + capture, |
| 5 | + charClass, |
| 6 | + charRange, |
| 7 | + digit, |
| 8 | + endOfString, |
| 9 | + negativeLookahead, |
| 10 | + oneOrMore, |
| 11 | + optional, |
| 12 | + repeat, |
| 13 | + startOfString, |
| 14 | +} from '../index'; |
| 15 | + |
| 16 | +// |
| 17 | +// URL: |
| 18 | +// URL = Scheme ":"["//" Authority]Path["?" Query]["#" Fragment] |
| 19 | +// https://en.wikipedia.org/wiki/URL#External_links |
| 20 | +// |
| 21 | + |
//
// Shared building blocks of the URL regex: character sets first, then the
// literal delimiters that separate the URL components.
//

// Character sets.
const lowercase = charRange('a', 'z');
const uppercase = charRange('A', 'Z');
const hyphen = anyOf('-');
const alphabetical = charClass(lowercase, uppercase);
const specialChars = anyOf('._%+-');
const usernameChars = charClass(lowercase, digit, specialChars);
const hostnameChars = charClass(lowercase, digit, hyphen);
const domainChars = lowercase;

// Literal delimiters.
const schemeSeperator = ':';
const doubleSlash = '//';
const at = '@';
const portSeperator = ':';
const pathSeparator = '/';
const querySeparator = '?';
const fragmentSeparator = '#';
| 40 | + |
//
// Scheme:
// The scheme is the first part of the URL and names the protocol to be used,
// e.g. http, https, ftp, mailto, file, data or irc.
// A URL starts with a scheme, followed by a colon, followed by a scheme-specific part.
//

// A scheme character is a letter or a hyphen.
const schemeChar = charClass(hyphen, alphabetical);

// Three to six scheme characters, optionally followed by an 's'.
const Scheme = [repeat(schemeChar, { min: 3, max: 6 }), optional('s')];

// Anchored, case-insensitive regex that matches a scheme on its own.
const schemeFlags = { global: false, ignoreCase: true };
const scheme = buildRegExp([startOfString, capture(Scheme), endOfString], schemeFlags);
| 53 | + |
// Exercises the anchored scheme regex: three to six letters/hyphens plus an
// optional trailing 's', matched case-insensitively.
test('Matching the Scheme components.', () => {
  expect(scheme).toMatchString('ftp');
  // The ':' separator is not part of the scheme itself.
  expect(scheme).not.toMatchString('ftp:');
  // Shorter than the three-character minimum.
  expect(scheme).not.toMatchString('h');
  // Longer than six characters plus the optional 's'.
  expect(scheme).not.toMatchString('nameiswaytoolong');
  // The ignoreCase flag makes upper-case schemes acceptable.
  expect(scheme).toMatchString('HTTPS');
  expect(scheme).toMatchString('http');
});
| 62 | + |
//
// Authority:
// Authority = [userinfo "@"] host [":" port]
// The authority part of a URL consists of three sub-parts:
//   1. An optional username, followed by an at symbol (@)
//   2. A hostname made of dot-separated labels (e.g. www.google.com or localhost)
//   3. An optional port number, preceded by a colon (:)
//
const userInfo = oneOrMore(usernameChars);
// A single host label: 1 to 63 hostname characters.
const hostlabel = repeat(hostnameChars, { min: 1, max: 63 });
// Final label of a hostname; must sit at the end of the string.
const hostlabelEnd = capture([hostlabel, endOfString]);
// One or more "label." groups, i.e. everything before the final label.
const host = capture([oneOrMore([hostlabel, '.'])]);
const port = [portSeperator, oneOrMore(digit)];

// Optional run of "label." prefixes in front of the final label, so dotted hosts
// such as www.google.com are accepted as well as single-label hosts.
// Built without capture groups so that the group numbering of every regex that
// embeds Authority stays unchanged.
const leadingHostLabels = optional(oneOrMore([hostlabel, '.']));

const Authority = [
  doubleSlash,
  optional([userInfo, at]),
  leadingHostLabels,
  hostlabel,
  optional(port),
];

// Anchored, case-insensitive regex that matches an authority on its own.
const authorityRegex = buildRegExp([startOfString, capture(Authority), endOfString], {
  ignoreCase: true,
});

// Anchored, case-insensitive regex for a dotted hostname: "label." groups plus a final label.
const hostEx = buildRegExp([startOfString, host, hostlabelEnd, endOfString], { ignoreCase: true });
| 84 | + |
// A dotted hostname is accepted; the same hostname with a trailing dot is not.
test('Matching the hostname component.', () => {
  const dottedHost = 'www.google.com';
  const trailingDotHost = 'www.google.com.';

  expect(hostEx).toMatchString(dottedHost);
  expect(hostEx).not.toMatchString(trailingDotHost);
});
| 89 | + |
// Each row is [input, whether authorityRegex is expected to match it].
test('Matching the Authority components.', () => {
  const cases = [
    ['//davidbowie@localhost:8080', true],
    ['//localhost:1234', true],
    ['davidbowie@localhost:1972', false],
    ['nameiswaytoolong', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(authorityRegex).toMatchString(input);
    } else {
      expect(authorityRegex).not.toMatchString(input);
    }
  }
});
| 96 | + |
//
// Path:
// The path is the part of the URL that comes after the authority and before the query.
// It is a sequence of path segments, each introduced by a forward slash (/),
// so a path string always begins with a forward slash.
//

// Characters allowed inside a path segment.
const pathSegmentChar = charClass(lowercase, uppercase, digit, anyOf(':@%._+~#='));

// A separator followed by a possibly empty run of segment characters.
const pathSegment = [pathSeparator, optional(oneOrMore(pathSegmentChar))];

const Path = oneOrMore(pathSegment);

// Anchored, case-insensitive regex that matches a path on its own.
const pathFlags = { global: false, ignoreCase: true };
const path = buildRegExp([startOfString, capture(Path), endOfString], pathFlags);
| 115 | + |
// Each row is [input, whether the path regex is expected to match it].
test('Matching the Path components.', () => {
  const cases = [
    ['/', true],
    ['', false],
    ['/a', true],
    ['a', false],
    ['a/', false],
    ['/a/b', true],
    ['a/b', false],
    ['a/b/', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(path).toMatchString(input);
    } else {
      expect(path).not.toMatchString(input);
    }
  }
});
| 126 | + |
//
// Query:
// The query part of a URL is optional and comes after the path.
// It is separated from the path by a question mark (?).
// The query string is a sequence of field-value pairs separated by an
// ampersand (&) or a semicolon (;).
// Within each pair, field and value are separated by an equals sign (=).
//

const queryKey = oneOrMore(charClass(lowercase, uppercase, digit, anyOf('_-')));
const queryValue = oneOrMore(charClass(lowercase, uppercase, digit, anyOf('_-')));

const queryDelimiter = anyOf('&;');
const equals = '=';

const queryKVPair = buildRegExp([queryKey, equals, queryValue]);

// The first pair is mandatory; every further pair must be preceded by a delimiter.
// This rejects pairs that run together without a delimiter (e.g. "?a=bc=d")
// and a dangling trailing delimiter (e.g. "?a=b&").
const Query = [
  querySeparator,
  queryKVPair,
  optional(oneOrMore([queryDelimiter, queryKVPair])),
];

// Anchored, case-insensitive regex that matches a query string on its own.
const query = buildRegExp([startOfString, capture(Query), endOfString], {
  global: false,
  ignoreCase: true,
});
| 149 | + |
// Each row is [input, whether the query regex is expected to match it].
test('Matching the Query components.', () => {
  const cases = [
    ['', false],
    ['??', false],
    ['?', false],
    ['?a-b', false],
    ['?a=b', true],
    ['?a=b&c=d', true],
    ['a=b&c-d', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(query).toMatchString(input);
    } else {
      expect(query).not.toMatchString(input);
    }
  }
});
| 159 | + |
//
// Fragment:
// The fragment part of a URL is optional and comes after the query.
// It is introduced by a hash (#) and followed by at least one fragment character.
//

// Characters allowed inside a fragment.
const fragmentChar = charClass(lowercase, uppercase, digit, anyOf(':@%._+~#=&'));

const Fragment = [fragmentSeparator, oneOrMore(fragmentChar)];

// Anchored, case-insensitive regex that matches a fragment on its own.
const fragmentFlags = { global: false, ignoreCase: true };
const fragment = buildRegExp([startOfString, capture(Fragment), endOfString], fragmentFlags);
| 175 | + |
// Each row is [input, whether the fragment regex is expected to match it].
test('Matching the Fragment components.', () => {
  const cases = [
    ['', false],
    ['#section1', true],
    ['#', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(fragment).toMatchString(input);
    } else {
      expect(fragment).not.toMatchString(input);
    }
  }
});
| 181 | + |
//
// URL = Scheme ":" ["//" Authority] Path ["?" Query] ["#" Fragment]
// The scheme and the colon are mandatory; authority, query and fragment are optional.
//
const Url = [
  // Scheme is spread in as a required prefix: the grammar above does not make it
  // optional, so inputs that start directly with ':' must not match.
  ...Scheme,
  schemeSeperator,
  optional(Authority),
  Path,
  optional(Query),
  optional(Fragment),
];

// Anchored, case-insensitive regex that matches a complete URL.
const urlRegex = buildRegExp([startOfString, capture(Url), endOfString], {
  ignoreCase: true,
});
| 194 | + |
// Rejects the empty string and a bare scheme, then accepts progressively longer URLs.
test('Matching URL components.', () => {
  const rejected = ['', 'http'];
  const accepted = [
    'http://localhost:8080',
    'http://localhost:8080/users/paul/research/data.json',
    'http://localhost:8080/users/paul/research/data.json?request=regex&email=me',
    'http://localhost:8080/users/paul/research/data.json?request=regex&email=me#section1',
  ];

  for (const input of rejected) {
    expect(urlRegex).not.toMatchString(input);
  }
  for (const input of accepted) {
    expect(urlRegex).toMatchString(input);
  }
});
| 207 | + |
// Email = username "@" hostname "." domain, where the domain has at least two characters.
const emailUser = oneOrMore(usernameChars);
const emailHost = oneOrMore(hostnameChars);
const emailDomain = repeat(domainChars, { min: 2 });

const Email = [emailUser, at, emailHost, '.', emailDomain];

// Anchored, case-insensitive regex that matches a complete email address.
const emailFlags = { ignoreCase: true };
const emailRegex = buildRegExp([startOfString, capture(Email), endOfString], emailFlags);
| 219 | + |
// Each row is [input, whether emailRegex is expected to match it].
test('Matching email addresses.', () => {
  const cases = [
    ['', false],
    ['stevenwilson@porcupinetree.com', true],
    ['stevenwilson@porcupinetree', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(emailRegex).toMatchString(input);
    } else {
      expect(emailRegex).not.toMatchString(input);
    }
  }
});
| 225 | + |
// Matches a complete URL that is not an email address.
//
// The lookahead is evaluated at the start of the input and is built from the
// unanchored Email sequence. Placing it after an already end-anchored URL regex
// would leave nothing for it to inspect, so it could never reject an input.
// Url is composed directly rather than through the pre-built, anchored urlRegex
// to avoid duplicated start/end anchors. capture(Url) keeps the whole URL in
// capture group 1.
const selectLinksNotEmails = buildRegExp(
  [startOfString, negativeLookahead([...Email, endOfString]), capture(Url), endOfString],
  {
    ignoreCase: true,
  },
);
| 232 | + |
// Each row is [input, whether selectLinksNotEmails is expected to match it].
test('Matching URLs.', () => {
  const cases = [
    ['http://localhost:8080', true],
    [
      'http://paul@localhost:8080/users/paul/research/data.json?request=regex&email=me#section1',
      true,
    ],
    ['ftp://data/#January', true],
    ['https:', false],
    ['piotr@riverside.com', false],
    ['http://www.google.com', true],
    ['https://www.google.com?search=regex', true],
    ['www.google.com?search=regex&email=me', false],
    ['mailto://paul@thebeatles.com', true],
    ['ftphttpmailto://neal@nealmorse', false],
  ];

  for (const [input, shouldMatch] of cases) {
    if (shouldMatch) {
      expect(selectLinksNotEmails).toMatchString(input);
    } else {
      expect(selectLinksNotEmails).not.toMatchString(input);
    }
  }
});
0 commit comments