diff --git a/package.json b/package.json index c5315a98..de346bcf 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,8 @@ "node": ">=8" }, "workspaces": [ + "./packages/linkifyjs", + "./packages/linkify-plugin-*/", "./packages/*" ] } diff --git a/packages/linkifyjs/src/core/fsm.js b/packages/linkifyjs/src/core/fsm.js index 72d465a8..9d30b731 100644 --- a/packages/linkifyjs/src/core/fsm.js +++ b/packages/linkifyjs/src/core/fsm.js @@ -10,6 +10,7 @@ * @param {string|class} token to emit */ export function State(token) { + // this.n = null; // DEBUG: State name this.j = {}; // IMPLEMENTATION 1 // this.j = []; // IMPLEMENTATION 2 this.jr = []; @@ -49,11 +50,21 @@ State.prototype = { * transitioned to on the given input regardless of what that input * previously did. * - * @param {string} input character or token to transition on + * @param {string} input character or token type to transition on * @param {Token|State} tokenOrState transition to a matching state * @returns State taken after the given input */ tt(input, tokenOrState) { + if (input instanceof Array) { + // Recursive case + if (input.length === 0) { return; } + const nextState = this.tt(input[0], tokenOrState); + for (let i = 1; i < input.length; i++) { + this.tt(input[i], nextState); + } + return nextState; + } + if (tokenOrState && tokenOrState.j) { // State, default a basic transition this.j[input] = tokenOrState; @@ -92,13 +103,21 @@ State.prototype = { * Utility function to create state without using new keyword (reduced file size * when minified) */ -export const makeState = () => new State(); +export const makeState = (/*name*/) => { + const s = new State(); + // if (name) { s.n = name; } // DEBUG + return s; +}; /** * Similar to previous except it is an accepting state that emits a token * @param {Token} token */ -export const makeAcceptingState = (token) => new State(token); +export const makeAcceptingState = (token/*, name*/) => { + const s = new State(token); + // if (name) { s.n = name; } // DEBUG + return s; +}; /** * Create a transition from startState to nextState via the given character @@ -112,6 +131,7 @@ export const makeT = (startState, input, nextState) => { // IMPLEMENTATION 2: Add to array (slower) // startState.j.push([input, nextState]); + return startState.j[input]; }; /** @@ -127,7 +147,7 @@ export const makeRegexT = (startState, regex, nextState) => { /** * Follow the transition from the given character to the next state * @param {State} state - * @param {Token} input character or other concrete token type to transition + * @param {string|Token} input character or other concrete token type to transition * @returns {?State} the next state, if any */ export const takeT = (state, input) => { @@ -145,8 +165,8 @@ export const takeT = (state, input) => { for (let i = 0; i < state.jr.length; i++) { const regex = state.jr[i][0]; - const nextState = state.jr[i][1]; - if (regex.test(input)) {return nextState;} + const nextState = state.jr[i][1]; // note: might be empty to prevent default jump + if (nextState && regex.test(input)) { return nextState; } } // Nowhere left to jump! Return default, if any return state.jd; @@ -176,6 +196,7 @@ export const makeBatchT = (startState, transitions) => { for (let i = 0; i < transitions.length; i++) { const input = transitions[i][0]; const nextState = transitions[i][1]; + // if (!nextState.n && typeof input === 'string') { nextState.n = input; } // DEBUG makeT(startState, input, nextState); } }; @@ -193,6 +214,7 @@ export const makeBatchT = (startState, transitions) => { * @param {string} str * @param {Token} endStateFactory * @param {Token} defaultStateFactory + * @return {State} the final state */ export const makeChainT = (state, str, endState, defaultStateFactory) => { let i = 0, len = str.length, nextState; @@ -203,7 +225,7 @@ export const makeChainT = (state, str, endState, defaultStateFactory) => { i++; } - if (i >= len) { return []; } // no new tokens were added + if (i >= len) { return state; } // no new tokens were added while (i < len - 1) { nextState = defaultStateFactory(); @@ -213,4 +235,6 @@ export const makeChainT = (state, str, endState, defaultStateFactory) => { } makeT(state, str[len - 1], endState); + // if (!endState.n) { endState.n === str; } // DEBUG + return endState; }; diff --git a/packages/linkifyjs/src/core/parser.js b/packages/linkifyjs/src/core/parser.js index ef3de40b..4fd2c2fc 100644 --- a/packages/linkifyjs/src/core/parser.js +++ b/packages/linkifyjs/src/core/parser.js @@ -5,7 +5,7 @@ overkill). URL format: http://en.wikipedia.org/wiki/URI_scheme - Email format: http://en.wikipedia.org/wiki/Email_address (links to RFC in + Email format: http://en.wikipedia.org/wiki/EmailAddress (links to RFC in reference) @module linkify @@ -29,99 +29,99 @@ import * as mtk from './tokens/multi'; */ export function init() { // The universal starting state. - let S_START = makeState(); + const Start = makeState(); // Intermediate states for URLs. Note that domains that begin with a protocol // are treated slighly differently from those that don't. - let S_PROTOCOL = makeState(); // e.g., 'http:' - let S_MAILTO = makeState(); // 'mailto:' - let S_PROTOCOL_SLASH = makeState(); // e.g., 'http:/'' - let S_PROTOCOL_SLASH_SLASH = makeState(); // e.g.,'http://' - let S_DOMAIN = makeState(); // parsed string ends with a potential domain name (A) - let S_DOMAIN_DOT = makeState(); // (A) domain followed by DOT - let S_TLD = makeAcceptingState(mtk.Url); // (A) Simplest possible URL with no query string - let S_TLD_COLON = makeState(); // (A) URL followed by colon (potential port number here) - let S_TLD_PORT = makeAcceptingState(mtk.Url); // TLD followed by a port number - let S_URL = makeAcceptingState(mtk.Url); // Long URL with optional port and maybe query string - let S_URL_NON_ACCEPTING = makeState(); // URL followed by some symbols (will not be part of the final URL) - let S_URL_OPENBRACE = makeState(); // URL followed by { - let S_URL_OPENBRACKET = makeState(); // URL followed by [ - let S_URL_OPENANGLEBRACKET = makeState(); // URL followed by < - let S_URL_OPENPAREN = makeState(); // URL followed by ( - let S_URL_OPENBRACE_Q = makeAcceptingState(mtk.Url); // URL followed by { and some symbols that the URL can end it - let S_URL_OPENBRACKET_Q = makeAcceptingState(mtk.Url); // URL followed by [ and some symbols that the URL can end it - let S_URL_OPENANGLEBRACKET_Q = makeAcceptingState(mtk.Url); // URL followed by < and some symbols that the URL can end it - let S_URL_OPENPAREN_Q = makeAcceptingState(mtk.Url); // URL followed by ( and some symbols that the URL can end it - let S_URL_OPENBRACE_SYMS = makeState(); // S_URL_OPENBRACE_Q followed by some symbols it cannot end it - let S_URL_OPENBRACKET_SYMS = makeState(); // S_URL_OPENBRACKET_Q followed by some symbols it cannot end it - let S_URL_OPENANGLEBRACKET_SYMS = makeState(); // S_URL_OPENANGLEBRACKET_Q followed by some symbols it cannot end it - let S_URL_OPENPAREN_SYMS = makeState(); // S_URL_OPENPAREN_Q followed by some symbols it cannot end it - let S_EMAIL_DOMAIN = makeState(); // parsed string starts with local email info + @ with a potential domain name (C) - let S_EMAIL_DOMAIN_DOT = makeState(); // (C) domain followed by DOT - let S_EMAIL = makeAcceptingState(mtk.Email); // (C) Possible email address (could have more tlds) - let S_EMAIL_COLON = makeState(); // (C) URL followed by colon (potential port number here) - let S_EMAIL_PORT = makeAcceptingState(mtk.Email); // (C) Email address with a port - let S_MAILTO_EMAIL = makeAcceptingState(mtk.MailtoEmail); // Email that begins with the mailto prefix (D) - let S_MAILTO_EMAIL_NON_ACCEPTING = makeState(); // (D) Followed by some non-query string chars - let S_LOCALPART = makeState(); // Local part of the email address - let S_LOCALPART_AT = makeState(); // Local part of the email address plus @ - let S_LOCALPART_DOT = makeState(); // Local part of the email address plus '.' (localpart cannot end in .) - let S_NL = makeAcceptingState(mtk.Nl); // single new line + const Scheme = makeState(); // e.g., 'mailto' + const SlashScheme = makeState(); // e.g., 'http' + const SlashSchemeColon = makeState(); // e.g., 'http:' + const SlashSchemeColonSlash = makeState(); // e.g., 'http:/' + const UriPrefix = makeState(); // e.g., 'mailto:' or 'http://' + + const Domain = makeState(); // parsed string ends with a potential domain name (A) + const DomainDot = makeState(); // domain followed by DOT + const DomainHyphen = makeState(); // domain followed by hyphen + const DomainDotTld = makeAcceptingState(mtk.Url); // Simplest possible URL with no query string + const DomainDotTldColon = makeState(); // URL followed by colon (potential port number here) + const DomainDotTldColonPort = makeAcceptingState(mtk.Url); // TLD followed by a port number + + const Url = makeAcceptingState(mtk.Url); // Long URL with optional port and maybe query string + const UrlNonaccept = makeState(); // URL followed by some symbols (will not be part of the final URL) + const UrlOpenbrace = makeState(); // URL followed by { + const UrlOpenbracket = makeState(); // URL followed by [ + const UrlOpenanglebracket = makeState(); // URL followed by < + const UrlOpenparen = makeState(); // URL followed by ( + const UrlOpenbraceQ = makeAcceptingState(mtk.Url); // URL followed by { and some symbols that the URL can end it + const UrlOpenbracketQ = makeAcceptingState(mtk.Url); // URL followed by [ and some symbols that the URL can end it + const UrlOpenanglebracketQ = makeAcceptingState(mtk.Url); // URL followed by < and some symbols that the URL can end it + const UrlOpenparenQ = makeAcceptingState(mtk.Url); // URL followed by ( and some symbols that the URL can end it + const UrlOpenbraceSyms = makeState(); // UrlOpenbrace followed by some symbols it cannot end it + const UrlOpenbracketSyms = makeState(); // UrlOpenbracketQ followed by some symbols it cannot end it + const UrlOpenanglebracketSyms = makeState(); // UrlOpenanglebracketQ followed by some symbols it cannot end it + const UrlOpenparenSyms = makeState(); // UrlOpenparenQ followed by some symbols it cannot end it + + const EmailDomain = makeState(); // parsed string starts with local email info + @ with a potential domain name + const EmailDomainDot = makeState(); // domain followed by DOT + const EmailDomainHyphen = makeState(); // parsed string starts with local email info + @ with a potential domain name + const Email = makeAcceptingState(mtk.Email); // Possible email address (could have more tlds) + const EmailColon = makeState(); // URL followed by colon (potential port number here) + const EmailColonPort = makeAcceptingState(mtk.Email); // URL followed by colon and potential port numner + const Localpart = makeState(); // Local part of the email address + const LocalpartAt = makeState(); // Local part of the email address plus @ + const LocalpartAtNum = makeState(); // Local part of the email address plus @ plus a number + const LocalpartDot = makeState(); // Local part of the email address plus '.' (localpart cannot end in .) + + const Nl = makeAcceptingState(mtk.Nl); // single new line // Make path from start to protocol (with '//') - makeT(S_START, tk.NL, S_NL); - makeT(S_START, tk.PROTOCOL, S_PROTOCOL); - makeT(S_START, tk.MAILTO, S_MAILTO); - - makeT(S_PROTOCOL, tk.SLASH, S_PROTOCOL_SLASH); - makeT(S_PROTOCOL_SLASH, tk.SLASH, S_PROTOCOL_SLASH_SLASH); - - // The very first potential domain name - makeT(S_START, tk.TLD, S_DOMAIN); - makeT(S_START, tk.DOMAIN, S_DOMAIN); - makeT(S_START, tk.LOCALHOST, S_TLD); - makeT(S_START, tk.NUM, S_DOMAIN); - - // Force URL for protocol followed by anything sane - makeT(S_PROTOCOL_SLASH_SLASH, tk.TLD, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.DOMAIN, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.NUM, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.LOCALHOST, S_URL); - - // Account for dots and hyphens - // hyphens are usually parts of domain names - makeT(S_DOMAIN, tk.DOT, S_DOMAIN_DOT); - makeT(S_EMAIL_DOMAIN, tk.DOT, S_EMAIL_DOMAIN_DOT); - + makeT(Start, tk.NL, Nl); + makeT(Start, tk.SCHEME, Scheme); + makeT(Start, tk.SLASH_SCHEME, SlashScheme); + makeT(Start, tk.COMPOUND_SCHEME, Scheme); + makeT(Start, tk.COMPOUND_SLASH_SCHEME, SlashScheme); + + // Most transitions after a UriPrefix will be considered URL tokens + makeT(Scheme, tk.COLON, UriPrefix); + makeT(SlashScheme, tk.COLON, SlashSchemeColon); + makeT(SlashSchemeColon, tk.SLASH, SlashSchemeColonSlash); + makeT(SlashSchemeColonSlash, tk.SLASH, UriPrefix); + + // The very first potential domain name + full URL + makeT(Start, tk.LOCALHOST, DomainDotTld); + + // Some transitions from this call are ignored because they're already + // accounted for in the scheme state definitions above + makeMultiT(Start, tk.domain, Domain); + + + // Account for dots and hyphens. Hyphens are usually parts of domain names + // (but not TLDs) + makeT(Domain, tk.DOT, DomainDot); + makeT(Domain, tk.HYPHEN, DomainHyphen); + makeMultiT(Domain, tk.domain, Domain); + makeT(DomainDot, tk.TLD, DomainDotTld); + makeT(DomainDot, tk.UTLD, DomainDotTld); + makeMultiT(DomainDot, tk.domain, Domain); // Hyphen can jump back to a domain name - - // After the first domain and a dot, we can find either a URL or another domain - makeT(S_DOMAIN_DOT, tk.TLD, S_TLD); - makeT(S_DOMAIN_DOT, tk.DOMAIN, S_DOMAIN); - makeT(S_DOMAIN_DOT, tk.NUM, S_DOMAIN); - makeT(S_DOMAIN_DOT, tk.LOCALHOST, S_DOMAIN); - - makeT(S_EMAIL_DOMAIN_DOT, tk.TLD, S_EMAIL); - makeT(S_EMAIL_DOMAIN_DOT, tk.DOMAIN, S_EMAIL_DOMAIN); - makeT(S_EMAIL_DOMAIN_DOT, tk.NUM, S_EMAIL_DOMAIN); - makeT(S_EMAIL_DOMAIN_DOT, tk.LOCALHOST, S_EMAIL_DOMAIN); - - // S_TLD accepts! But the URL could be longer, try to find a match greedily - // The `run` function should be able to "rollback" to the accepting state - makeT(S_TLD, tk.DOT, S_DOMAIN_DOT); - makeT(S_EMAIL, tk.DOT, S_EMAIL_DOMAIN_DOT); + makeMultiT(DomainHyphen, tk.domain, Domain); + makeT(DomainDotTld, tk.DOT, DomainDot); + makeT(DomainDotTld, tk.HYPHEN, DomainHyphen); + makeMultiT(DomainDotTld, tk.domain, Domain); // Become real URLs after `SLASH` or `COLON NUM SLASH` - // Here PSS and non-PSS converge - makeT(S_TLD, tk.COLON, S_TLD_COLON); - makeT(S_TLD, tk.SLASH, S_URL); - makeT(S_TLD_COLON, tk.NUM, S_TLD_PORT); - makeT(S_TLD_PORT, tk.SLASH, S_URL); - makeT(S_EMAIL, tk.COLON, S_EMAIL_COLON); - makeT(S_EMAIL_COLON, tk.NUM, S_EMAIL_PORT); + // Here works with or without scheme:// prefix + makeT(DomainDotTld, tk.COLON, DomainDotTldColon); + makeT(DomainDotTld, tk.SLASH, Url); + makeMultiT(DomainDotTldColon, tk.numeric, DomainDotTldColonPort); + makeT(DomainDotTldColonPort, tk.SLASH, Url); + + // Force URL with scheme prefix followed by anything sane + makeT(UriPrefix, tk.SLASH, Url); + makeMultiT(UriPrefix, tk.domain, Url); // Types of characters the URL can definitely end in - const qsAccepting = [ + const qsAccepting = tk.domain.concat([ tk.AMPERSAND, tk.ASTERISK, tk.AT, @@ -129,22 +129,18 @@ export function init() { tk.BACKTICK, tk.CARET, tk.DOLLAR, - tk.DOMAIN, tk.EQUALS, tk.HYPHEN, - tk.LOCALHOST, tk.NUM, tk.PERCENT, tk.PIPE, tk.PLUS, tk.POUND, - tk.PROTOCOL, tk.SLASH, tk.SYM, tk.TILDE, - tk.TLD, tk.UNDERSCORE - ]; + ]); // Types of tokens that can follow a URL and be part of the query string // but cannot be the very last characters @@ -172,86 +168,73 @@ export function init() { // include the final round bracket. // URL, followed by an opening bracket - makeT(S_URL, tk.OPENBRACE, S_URL_OPENBRACE); - makeT(S_URL, tk.OPENBRACKET, S_URL_OPENBRACKET); - makeT(S_URL, tk.OPENANGLEBRACKET, S_URL_OPENANGLEBRACKET); - makeT(S_URL, tk.OPENPAREN, S_URL_OPENPAREN); + makeT(Url, tk.OPENBRACE, UrlOpenbrace); + makeT(Url, tk.OPENBRACKET, UrlOpenbracket); + makeT(Url, tk.OPENANGLEBRACKET, UrlOpenanglebracket); + makeT(Url, tk.OPENPAREN, UrlOpenparen); // URL with extra symbols at the end, followed by an opening bracket - makeT(S_URL_NON_ACCEPTING, tk.OPENBRACE, S_URL_OPENBRACE); - makeT(S_URL_NON_ACCEPTING, tk.OPENBRACKET, S_URL_OPENBRACKET); - makeT(S_URL_NON_ACCEPTING, tk.OPENANGLEBRACKET, S_URL_OPENANGLEBRACKET); - makeT(S_URL_NON_ACCEPTING, tk.OPENPAREN, S_URL_OPENPAREN); + makeT(UrlNonaccept, tk.OPENBRACE, UrlOpenbrace); + makeT(UrlNonaccept, tk.OPENBRACKET, UrlOpenbracket); + makeT(UrlNonaccept, tk.OPENANGLEBRACKET, UrlOpenanglebracket); + makeT(UrlNonaccept, tk.OPENPAREN, UrlOpenparen); // Closing bracket component. This character WILL be included in the URL - makeT(S_URL_OPENBRACE, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN, tk.CLOSEPAREN, S_URL); - makeT(S_URL_OPENBRACE_Q, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET_Q, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET_Q, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN_Q, tk.CLOSEPAREN, S_URL); - makeT(S_URL_OPENBRACE_SYMS, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET_SYMS, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET_SYMS, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN_SYMS, tk.CLOSEPAREN, S_URL); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracket, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracket, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparen, tk.CLOSEPAREN, Url); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracketQ, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracketQ, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparenQ, tk.CLOSEPAREN, Url); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracketSyms, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracketSyms, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparenSyms, tk.CLOSEPAREN, Url); // URL that beings with an opening bracket, followed by a symbols. - // Note that the final state can still be `S_URL_OPENBRACE_Q` (if the URL only + // Note that the final state can still be `UrlOpenbrace` (if the URL only // has a single opening bracket for some reason). - makeMultiT(S_URL_OPENBRACE, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN, qsAccepting, S_URL_OPENPAREN_Q); - makeMultiT(S_URL_OPENBRACE, qsNonAccepting, S_URL_OPENBRACE_SYMS); - makeMultiT(S_URL_OPENBRACKET, qsNonAccepting, S_URL_OPENBRACKET_SYMS); - makeMultiT(S_URL_OPENANGLEBRACKET, qsNonAccepting, S_URL_OPENANGLEBRACKET_SYMS); - makeMultiT(S_URL_OPENPAREN, qsNonAccepting, S_URL_OPENPAREN_SYMS); + makeMultiT(UrlOpenbrace, qsAccepting, UrlOpenbrace); + makeMultiT(UrlOpenbracket, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracket, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparen, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbrace, qsNonAccepting, UrlOpenbrace); + makeMultiT(UrlOpenbracket, qsNonAccepting, UrlOpenbracketSyms); + makeMultiT(UrlOpenanglebracket, qsNonAccepting, UrlOpenanglebracketSyms); + makeMultiT(UrlOpenparen, qsNonAccepting, UrlOpenparenSyms); // URL that begins with an opening bracket, followed by some symbols - makeMultiT(S_URL_OPENBRACE_Q, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_Q, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_Q, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_Q, qsAccepting, S_URL_OPENPAREN_Q); - makeMultiT(S_URL_OPENBRACE_Q, qsNonAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_Q, qsNonAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_Q, qsNonAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_Q, qsNonAccepting, S_URL_OPENPAREN_Q); - - makeMultiT(S_URL_OPENBRACE_SYMS, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_SYMS, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_SYMS, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_SYMS, qsAccepting, S_URL_OPENPAREN_Q); - makeMultiT(S_URL_OPENBRACE_SYMS, qsNonAccepting, S_URL_OPENBRACE_SYMS); - makeMultiT(S_URL_OPENBRACKET_SYMS, qsNonAccepting, S_URL_OPENBRACKET_SYMS); - makeMultiT(S_URL_OPENANGLEBRACKET_SYMS, qsNonAccepting, S_URL_OPENANGLEBRACKET_SYMS); - makeMultiT(S_URL_OPENPAREN_SYMS, qsNonAccepting, S_URL_OPENPAREN_SYMS); + makeMultiT(UrlOpenbraceQ, qsAccepting, UrlOpenbraceQ); + makeMultiT(UrlOpenbracketQ, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketQ, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenQ, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbraceQ, qsNonAccepting, UrlOpenbraceQ); + makeMultiT(UrlOpenbracketQ, qsNonAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketQ, qsNonAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenQ, qsNonAccepting, UrlOpenparenQ); + + makeMultiT(UrlOpenbraceSyms, qsAccepting, UrlOpenbraceSyms); + makeMultiT(UrlOpenbracketSyms, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketSyms, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenSyms, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbraceSyms, qsNonAccepting, UrlOpenbraceSyms); + makeMultiT(UrlOpenbracketSyms, qsNonAccepting, UrlOpenbracketSyms); + makeMultiT(UrlOpenanglebracketSyms, qsNonAccepting, UrlOpenanglebracketSyms); + makeMultiT(UrlOpenparenSyms, qsNonAccepting, UrlOpenparenSyms); // Account for the query string - makeMultiT(S_URL, qsAccepting, S_URL); - makeMultiT(S_URL_NON_ACCEPTING, qsAccepting, S_URL); + makeMultiT(Url, qsAccepting, Url); + makeMultiT(UrlNonaccept, qsAccepting, Url); - makeMultiT(S_URL, qsNonAccepting, S_URL_NON_ACCEPTING); - makeMultiT(S_URL_NON_ACCEPTING, qsNonAccepting, S_URL_NON_ACCEPTING); + makeMultiT(Url, qsNonAccepting, UrlNonaccept); + makeMultiT(UrlNonaccept, qsNonAccepting, UrlNonaccept); // Email address-specific state definitions // Note: We are not allowing '/' in email addresses since this would interfere // with real URLs - // For addresses with the mailto prefix - // 'mailto:' followed by anything sane is a valid email - makeT(S_MAILTO, tk.TLD, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.DOMAIN, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.NUM, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.LOCALHOST, S_MAILTO_EMAIL); - - // Greedily get more potential valid email values - makeMultiT(S_MAILTO_EMAIL, qsAccepting, S_MAILTO_EMAIL); - makeMultiT(S_MAILTO_EMAIL, qsNonAccepting, S_MAILTO_EMAIL_NON_ACCEPTING); - makeMultiT(S_MAILTO_EMAIL_NON_ACCEPTING, qsAccepting, S_MAILTO_EMAIL); - makeMultiT(S_MAILTO_EMAIL_NON_ACCEPTING, qsNonAccepting, S_MAILTO_EMAIL_NON_ACCEPTING); - // For addresses without the mailto prefix // Tokens allowed in the localpart of the email const localpartAccepting = [ @@ -263,7 +246,6 @@ export function init() { tk.CARET, tk.CLOSEBRACE, tk.DOLLAR, - tk.DOMAIN, tk.EQUALS, tk.HYPHEN, tk.NUM, @@ -276,33 +258,47 @@ export function init() { tk.SLASH, tk.SYM, tk.TILDE, - tk.TLD, tk.UNDERSCORE ]; // Some of the tokens in `localpartAccepting` are already accounted for here and - // will not be overwritten (don't worry) - makeMultiT(S_DOMAIN, localpartAccepting, S_LOCALPART); - makeT(S_DOMAIN, tk.AT, S_LOCALPART_AT); - makeMultiT(S_TLD, localpartAccepting, S_LOCALPART); - makeT(S_TLD, tk.AT, S_LOCALPART_AT); - makeMultiT(S_DOMAIN_DOT, localpartAccepting, S_LOCALPART); + // will not be overwritten + makeMultiT(Domain, localpartAccepting, Localpart); + makeT(Domain, tk.AT, LocalpartAt); + makeMultiT(DomainDotTld, localpartAccepting, Localpart); + makeT(DomainDotTld, tk.AT, LocalpartAt); + makeMultiT(DomainDot, localpartAccepting, Localpart); // Now in localpart of address - // TODO: IP addresses and what if the email starts with numbers? - makeMultiT(S_LOCALPART, localpartAccepting, S_LOCALPART); - makeT(S_LOCALPART, tk.AT, S_LOCALPART_AT); // close to an email address now - makeT(S_LOCALPART, tk.DOT, S_LOCALPART_DOT); - makeMultiT(S_LOCALPART_DOT, localpartAccepting, S_LOCALPART); - makeT(S_LOCALPART_AT, tk.TLD, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.DOMAIN, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.NUM, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.LOCALHOST, S_EMAIL); + makeMultiT(Localpart, tk.domain, Localpart); + makeMultiT(Localpart, localpartAccepting, Localpart); + makeT(Localpart, tk.AT, LocalpartAt); // close to an email address now + makeT(Localpart, tk.DOT, LocalpartDot); + makeMultiT(LocalpartDot, tk.domain, Localpart); + makeMultiT(LocalpartDot, localpartAccepting, Localpart); + makeT(LocalpartAt, tk.LOCALHOST, Email); + makeMultiT(LocalpartAt, tk.domain, EmailDomain); + makeMultiT(LocalpartAtNum, tk.domain, EmailDomain); + + makeT(EmailDomain, tk.DOT, EmailDomainDot); + makeT(EmailDomain, tk.HYPHEN, EmailDomainHyphen); + makeT(EmailDomainDot, tk.TLD, Email); + makeT(EmailDomainDot, tk.UTLD, Email); + makeMultiT(EmailDomainDot, tk.domain, EmailDomain); + + // Hyphen can jump back to a domain name + makeMultiT(EmailDomainHyphen, tk.domain, EmailDomain); + makeT(Email, tk.DOT, EmailDomainDot); + makeT(Email, tk.HYPHEN, EmailDomainHyphen); + makeMultiT(Email, tk.domain, EmailDomain); - // States following `@` defined above + // Become real URLs after `SLASH` or `COLON NUM SLASH` + // Here works with or without scheme:// prefix + makeT(Email, tk.COLON, EmailColon); + makeMultiT(EmailColon, tk.numeric, EmailColonPort); - return S_START; + return Start; } /** @@ -356,13 +352,14 @@ export function run(start, input, tokens) { } if (sinceAccepts < 0) { - - // No accepting state was found, part of a regular text token - // Add all the tokens we looked at to the text tokens array - for (let i = cursor - multiLength; i < cursor; i++) { - textTokens.push(tokens[i]); + // No accepting state was found, part of a regular text token add + // the first text token to the text tokens array and try again from + // the next + cursor -= multiLength; + if (cursor < len) { + textTokens.push(tokens[cursor]); + cursor++; } - } else { // Accepting state! // First close off the textTokens (if available) diff --git a/packages/linkifyjs/src/core/scanner.js b/packages/linkifyjs/src/core/scanner.js index 200af610..e1ac8f08 100644 --- a/packages/linkifyjs/src/core/scanner.js +++ b/packages/linkifyjs/src/core/scanner.js @@ -16,52 +16,74 @@ import { makeChainT } from './fsm'; import * as tk from './tokens/text'; -import tlds from './tlds'; +import { tlds, utlds } from './tlds'; // Note that these two Unicode ones expand into a really big one with Babel +export const ASCII_LETTER = /[a-z]/; export const LETTER = /\p{L}/u; // Any Unicode character with letter data type export const EMOJI = /\p{Emoji}/u; // Any Unicode emoji character -export const EMOJI_VARIATION = /\uFE0F/; // Variation selector, follows heart and others +export const EMOJI_VARIATION = /\ufe0f/; // Variation selector, follows heart and others export const DIGIT = /\d/; export const SPACE = /\s/; /** - * Initialize the scanner character-based state machine for the given start state + * Initialize the scanner character-based state machine for the given start + * state + * @param {[string, boolean][]} customSchemes List of custom schemes, where each + * item is a length-2 tuple with the first element set to the string scheme, and + * the second element set to `true` if the `://` after the scheme is optional * @return {State} scanner starting state */ -export function init(customProtocols = []) { - // Frequently used states - const S_START = makeState(); - const S_NUM = makeAcceptingState(tk.NUM); - const S_DOMAIN = makeAcceptingState(tk.DOMAIN); - const S_DOMAIN_HYPHEN = makeState(); // domain followed by 1 or more hyphen characters - const S_WS = makeAcceptingState(tk.WS); - - const DOMAIN_REGEX_TRANSITIONS = [ - [DIGIT, S_DOMAIN], - [LETTER, S_DOMAIN], - [EMOJI, S_DOMAIN], - [EMOJI_VARIATION, S_DOMAIN] - ]; - - // Create a state which emits a domain token - const makeDomainState = () => { - const state = makeAcceptingState(tk.DOMAIN); - state.j = {'-': S_DOMAIN_HYPHEN }; - state.jr = [...DOMAIN_REGEX_TRANSITIONS]; +export function init(customSchemes = []) { + // Frequently used states (name argument removed during minification) + const Start = makeState('Start'); + const NonAccepting = makeState('NonAccepting'); // must never have any transitions + const Num = makeAcceptingState(tk.NUM, 'Num'); + const Word = makeAcceptingState(tk.WORD, 'Word'); + const UWord = makeAcceptingState(tk.UWORD, 'UWord'); + const Emoji = makeAcceptingState(tk.EMOJIS, 'Emoji'); + const Ws = makeAcceptingState(tk.WS, 'Ws'); + + /** + * Create a state which emits a word token + */ + const makeWordState = (name) => { + const state = makeAcceptingState(tk.WORD, name); + state.jr = [[ASCII_LETTER, Word]]; return state; }; - // Create a state which does not emit a domain state but the usual alphanumeric - // transitions are domains - const makeNearDomainState = (token) => { - const state = makeDomainState(); + /** + * Same as previous, but specific to non-ASCII alphabet words + */ + const makeUWordState = (name) => { + const state = makeAcceptingState(tk.UWORD, name); + state.jr = [[ASCII_LETTER, NonAccepting], [LETTER, UWord]]; + return state; + }; + + /** + * Create a state which does not emit a word but the usual alphanumeric + * transitions are domains + */ + const makeNearWordState = (token, name) => { + const state = makeWordState(name); + state.t = token; + return state; + }; + + /** + * Create a state which does not emit a word but the usual alphanumeric + * transitions are domains + */ + const makeNearUWordState = (token, name) => { + const state = makeUWordState(name); state.t = token; return state; }; // States for special URL symbols that accept immediately after start - makeBatchT(S_START, [ + makeBatchT(Start, [ ["'", makeAcceptingState(tk.APOSTROPHE)], ['{', makeAcceptingState(tk.OPENBRACE)], ['[', makeAcceptingState(tk.OPENBRACKET)], @@ -98,83 +120,83 @@ export function init(customProtocols = []) { // Whitespace jumps // Tokens of only non-newline whitespace are arbitrarily long - makeT(S_START, '\n', makeAcceptingState(tk.NL)); - makeRegexT(S_START, SPACE, S_WS); + makeT(Start, '\n', makeAcceptingState(tk.NL, 'Nl')); + makeRegexT(Start, SPACE, Ws); // If any whitespace except newline, more whitespace! - makeT(S_WS, '\n', makeState()); // non-accepting state - makeRegexT(S_WS, SPACE, S_WS); + makeT(Ws, '\n', makeState()); // non-accepting state + makeRegexT(Ws, SPACE, Ws); // Generates states for top-level domains // Note that this is most accurate when tlds are in alphabetical order for (let i = 0; i < tlds.length; i++) { - makeChainT(S_START, tlds[i], makeNearDomainState(tk.TLD), makeDomainState); + makeChainT(Start, tlds[i], makeNearWordState(tk.TLD), makeWordState); + } + for (let i = 0; i < utlds.length; i++) { + makeChainT(Start, utlds[i], makeNearUWordState(tk.UTLD), makeUWordState); } // Collect the states generated by different protocls - const S_PROTOCOL_FILE = makeDomainState(); - const S_PROTOCOL_FTP = makeDomainState(); - const S_PROTOCOL_HTTP = makeDomainState(); - const S_MAILTO = makeDomainState(); - makeChainT(S_START, 'file', S_PROTOCOL_FILE, makeDomainState); - makeChainT(S_START, 'ftp', S_PROTOCOL_FTP, makeDomainState); - makeChainT(S_START, 'http', S_PROTOCOL_HTTP, makeDomainState); - makeChainT(S_START, 'mailto', S_MAILTO, makeDomainState); - - // Protocol states - const S_PROTOCOL_SECURE = makeDomainState(); - const S_FULL_PROTOCOL = makeAcceptingState(tk.PROTOCOL); // Full protocol ends with COLON - const S_FULL_MAILTO = makeAcceptingState(tk.MAILTO); // Mailto ends with COLON - - // Secure protocols (end with 's') - makeT(S_PROTOCOL_FTP, 's', S_PROTOCOL_SECURE); - makeT(S_PROTOCOL_FTP, ':', S_FULL_PROTOCOL); - makeT(S_PROTOCOL_HTTP, 's', S_PROTOCOL_SECURE); - makeT(S_PROTOCOL_HTTP, ':', S_FULL_PROTOCOL); - - // Become protocol tokens after a COLON - makeT(S_PROTOCOL_FILE, ':', S_FULL_PROTOCOL); - makeT(S_PROTOCOL_SECURE, ':', S_FULL_PROTOCOL); - makeT(S_MAILTO, ':', S_FULL_MAILTO); - - // Register custom protocols - const S_CUSTOM_PROTOCOL = makeDomainState(); - for (let i = 0; i < customProtocols.length; i++) { - makeChainT(S_START, customProtocols[i], S_CUSTOM_PROTOCOL, makeDomainState); + const DefaultScheme = makeNearWordState(tk.SCHEME, 'DefaultScheme'); + const DefaultSlashScheme = makeNearWordState(tk.SLASH_SCHEME, 'DefaultSlashScheme'); + makeChainT(Start, 'file', DefaultScheme, makeWordState); + makeChainT(Start, 'mailto', DefaultScheme, makeWordState); + makeChainT(Start, 'ftp', DefaultSlashScheme, makeWordState); + makeChainT(Start, 'http', DefaultSlashScheme, makeWordState); + + // Secure (https, ftps) protocols (end with 's') + makeT(DefaultSlashScheme, 's', DefaultSlashScheme); + + // Register custom schemes + const CustomScheme = makeNearWordState(tk.SCHEME, 'CustomScheme'); + const CustomSlashScheme = makeNearWordState(tk.SLASH_SCHEME, 'CustomSlashScheme'); + const CustomCompoundScheme = makeAcceptingState(tk.SCHEME, 'CustomCompoundScheme'); + const CustomCompoundSlashScheme = makeAcceptingState(tk.SLASH_SCHEME, 'CustomCompoundSlashScheme'); + customSchemes = customSchemes.sort((a, b) => a[0] > b[0] ? 1 : -1); + for (let i = 0; i < customSchemes.length; i++) { + const schemeParts = customSchemes[i][0].split('-'); + const schemeState = schemeParts.length === 1 + ? (customSchemes[i][1] ? CustomScheme : CustomSlashScheme) + : (customSchemes[i][1] ? CustomCompoundScheme : CustomCompoundSlashScheme); + + let state = Start; + for (let j = 0; j < schemeParts.length; j++) { + let defaultStateFactory = j === 0 ? makeWordState : makeState; + let endState = j === schemeParts.length - 1 ? schemeState : defaultStateFactory(); + state = makeChainT(state, schemeParts[j], endState, defaultStateFactory); + if (schemeParts.length > 1 && j < schemeParts.length - 1) { + state = makeT(state, '-', makeState()); + } + } } - makeT(S_CUSTOM_PROTOCOL, ':', S_FULL_PROTOCOL); - // Localhost - makeChainT(S_START, 'localhost', makeNearDomainState(tk.LOCALHOST), makeDomainState); + // Localhost token + makeChainT(Start, 'localhost', makeNearWordState(tk.LOCALHOST), makeWordState); // Everything else - // DOMAINs make more DOMAINs // Number and character transitions - makeRegexT(S_START, DIGIT, S_NUM); - makeRegexT(S_START, LETTER, S_DOMAIN); - makeRegexT(S_START, EMOJI, S_DOMAIN); - makeRegexT(S_START, EMOJI_VARIATION, S_DOMAIN); - makeRegexT(S_NUM, DIGIT, S_NUM); - makeRegexT(S_NUM, LETTER, S_DOMAIN); // number becomes DOMAIN - makeRegexT(S_NUM, EMOJI, S_DOMAIN); // number becomes DOMAIN - makeRegexT(S_NUM, EMOJI_VARIATION, S_DOMAIN); // number becomes DOMAIN - makeT(S_NUM, '-', S_DOMAIN_HYPHEN); - - // Default domain transitions - makeT(S_DOMAIN, '-', S_DOMAIN_HYPHEN); - makeT(S_DOMAIN_HYPHEN, '-', S_DOMAIN_HYPHEN); - makeRegexT(S_DOMAIN, DIGIT, S_DOMAIN); - makeRegexT(S_DOMAIN, LETTER, S_DOMAIN); - makeRegexT(S_DOMAIN, EMOJI, S_DOMAIN); - makeRegexT(S_DOMAIN, EMOJI_VARIATION, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, DIGIT, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, LETTER, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, EMOJI, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, EMOJI_VARIATION, S_DOMAIN); + makeRegexT(Start, DIGIT, Num); + makeRegexT(Start, ASCII_LETTER, Word); + makeRegexT(Start, LETTER, UWord); + makeRegexT(Start, EMOJI, Emoji); + makeRegexT(Start, EMOJI_VARIATION, Emoji); // This one is sketchy + makeRegexT(Num, DIGIT, Num); + makeRegexT(Word, ASCII_LETTER, Word); + makeRegexT(UWord, ASCII_LETTER, NonAccepting); + makeRegexT(UWord, LETTER, UWord); + makeRegexT(Emoji, EMOJI, Emoji); + makeRegexT(Emoji, EMOJI_VARIATION, Emoji); + + // Account for zero-width joiner for chaining multiple emojis + // Not sure if these are actu + const EmojiJoiner = makeState(); + makeT(Emoji, '\u200d', EmojiJoiner); + makeRegexT(EmojiJoiner, EMOJI, Emoji); + makeRegexT(EmojiJoiner, EMOJI_VARIATION, Emoji); // Set default transition for start state (some symbol) - S_START.jd = makeAcceptingState(tk.SYM); - return S_START; + Start.jd = makeAcceptingState(tk.SYM, 'Sym'); + return Start; } /** @@ -188,10 +210,9 @@ export function init(customProtocols = []) { */ export function run(start, str) { // State machine is not case sensitive, so input is tokenized in lowercased - // form (still returns the regular case though) Uses selective `toLowerCase` - // is used because lowercasing the entire string causes the length and - // character position to vary in some non-English strings with V8-based - // runtimes. + // form (still returns regular case). Uses selective `toLowerCase` because + // lowercasing the entire string causes the length and character position to + // vary in some non-English strings with V8-based runtimes. const iterable = stringToArray(str.replace(/[A-Z]/g, (c) => c.toLowerCase())); const charCount = iterable.length; // <= len if there are emojis, etc const tokens = []; // return value diff --git a/packages/linkifyjs/src/core/tlds.js b/packages/linkifyjs/src/core/tlds.js index 404fa07a..6e588e99 100644 --- a/packages/linkifyjs/src/core/tlds.js +++ b/packages/linkifyjs/src/core/tlds.js @@ -2,10 +2,13 @@ // be as commonly used without the http prefix anyway and linkify will already // force-encode those. +// NOTE: vermögensberater vermögensberatung are special cases because they're +// the only ones in this list that contain non-ASCII characters + // To be updated with the values in this list // http://data.iana.org/TLD/tlds-alpha-by-domain.txt // Version 2021022800, Last Updated Sun Feb 28 07:07:01 2021 UTC -export default 'aaa \ +export const tlds = 'aaa \ aarp \ abarth \ abb \ @@ -1264,6 +1267,8 @@ ve \ vegas \ ventures \ verisign \ +vermögensberater \ +vermögensberatung \ versicherung \ vet \ vg \ @@ -1353,10 +1358,10 @@ zip \ zm \ zone \ zuerich \ -zw \ -vermögensberater-ctb \ -vermögensberatung-pwb \ -ελ \ +zw'.split(' '); + +// Internationalized domain names containing non-ASCII +export const utlds = 'ελ \ ευ \ бг \ бел \ diff --git a/packages/linkifyjs/src/core/tokens/multi.js b/packages/linkifyjs/src/core/tokens/multi.js index e6663d11..df1ebc0a 100644 --- a/packages/linkifyjs/src/core/tokens/multi.js +++ b/packages/linkifyjs/src/core/tokens/multi.js @@ -1,4 +1,4 @@ -import { PROTOCOL, SLASH } from './text'; +import { scheme, COLON } from './text'; import { defaults } from '../options'; /****************************************************************************** @@ -124,13 +124,6 @@ export function createTokenClass(type, props) { return Token; } -/** - Represents an arbitrarily mailto email address with the prefix included - @class MailtoEmail - @extends MultiToken -*/ -export const MailtoEmail = createTokenClass('email', { isLink: true }); - /** Represents a list of tokens making up a valid email address @class Email @@ -175,42 +168,12 @@ export const Url = createTokenClass('url', { @return {string} */ toHref(protocol = defaults.defaultProtocol) { - const tokens = this.tk; - let hasProtocol = false; - let hasSlashSlash = false; - let result = []; - let i = 0; - - // Make the first part of the domain lowercase - // Lowercase protocol - while (tokens[i].t === PROTOCOL) { - hasProtocol = true; - result.push(tokens[i].v); - i++; - } - - // Skip slash-slash - while (tokens[i].t === SLASH) { - hasSlashSlash = true; - result.push(tokens[i].v); - i++; - } - - // Continue pushing characters - for (; i < tokens.length; i++) { - result.push(tokens[i].v); - } - - result = result.join(''); - - if (!(hasProtocol || hasSlashSlash)) { - result = `${protocol}://${result}`; - } - - return result; + // Check if already has a prefix scheme + return this.hasProtocol() ? this.v : `${protocol}://${this.v}`; }, hasProtocol() { - return this.tk[0].t === PROTOCOL; + const tokens = this.tk; + return tokens.length >= 2 && scheme.indexOf(tokens[0].t) >= 0 && tokens[1].t === COLON; } }); diff --git a/packages/linkifyjs/src/core/tokens/text.js b/packages/linkifyjs/src/core/tokens/text.js index 85b6f0ef..a6aa958b 100644 --- a/packages/linkifyjs/src/core/tokens/text.js +++ b/packages/linkifyjs/src/core/tokens/text.js @@ -4,25 +4,43 @@ ******************************************************************************/ // A valid web domain token -export const DOMAIN = 'DOMAIN'; -export const LOCALHOST = 'LOCALHOST'; // special case of domain +export const WORD = 'WORD'; // only contains a-z +export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN -// Valid top-level domain (see tlds.js) +// Special case of word +export const LOCALHOST = 'LOCALHOST'; + +// Valid top-level domain, special case of WORD (see tlds.js) export const TLD = 'TLD'; -// Any sequence of digits 0-9 -export const NUM = 'NUM'; +// Valid IDN TLD, special case of UWORD (see tlds.js) +export const UTLD = 'UTLD'; + +// The scheme portion of a web URI protocol. Supported types include: `mailto`, +// `file`, and user-defined custom protocols. Limited to schemes that contain +// only letters +export const SCHEME = 'SCHEME'; + +// Similar to SCHEME, except makes distinction for schemes that must always be +// followed by `://`, not just `:`. Supported types include `http`, `https`, +// `ftp`, `ftps` +export const SLASH_SCHEME = 'SLASH_SCHEME'; + +// Similar to SCHEME, except contains - +export const COMPOUND_SCHEME = 'COMPOUND_SCHEME'; -// A web URL protocol. Supported types include -// - `http:` -// - `https:` -// - `ftp:` -// - `ftps:` -// - user-defined custom protocols -export const PROTOCOL = 'PROTOCOL'; +// Similar to SLASH_SCHEME, except contains - +export const COMPOUND_SLASH_SCHEME = 'COMPOUND_SLASH_SCHEME'; -// Start of the email URI protocol -export const MAILTO = 'MAILTO'; // mailto: +// TODO: Move this to keyword plugin +// Arbirary words that can keyword links +// export const KEYWORD = 'KEYWORD'; // simple [0-9a-z] +// export const UKEYWORD = 'UKEYWORD'; // containing [0-9\{Letter}] +// export const COMPOUND_KEYWORD = 'COMPOUND_KEYWORD'; // similar to KEYWORD but can have hyphens +// export const COMPOUND_UKEYWORD = 'COMPOUND_UKEYWORD'; // similar to UKEYWORD but can have hyphens + +// Any sequence of digits 0-9 +export const NUM = 'NUM'; // Any number of consecutive whitespace characters that are not newline export const WS = 'WS'; @@ -67,5 +85,19 @@ export const SLASH = 'SLASH'; // / export const TILDE = 'TILDE'; // ~ export const UNDERSCORE = 'UNDERSCORE'; // _ +// Emoji symbol +export const EMOJIS = 'EMOJIS'; + // Default token - anything that is not one of the above export const SYM = 'SYM'; + +// Token collections for grouping similar jumps in the parser +export const numeric = [NUM]; +export const ascii = [WORD, LOCALHOST, TLD, SCHEME, SLASH_SCHEME]; +export const asciinumeric = ascii.concat(NUM); +export const words = ascii.concat(UWORD, UTLD); +export const alphanumeric = words.concat(NUM); +export const domain = words.concat(COMPOUND_SCHEME, COMPOUND_SLASH_SCHEME, NUM, EMOJIS); +export const scheme = [SCHEME, SLASH_SCHEME, COMPOUND_SCHEME, COMPOUND_SLASH_SCHEME]; + +export const collections = { ascii, asciinumeric, words, alphanumeric, domain, scheme }; diff --git a/packages/linkifyjs/src/linkify.js b/packages/linkifyjs/src/linkify.js index 874f4c10..b52e6a64 100644 --- a/packages/linkifyjs/src/linkify.js +++ b/packages/linkifyjs/src/linkify.js @@ -8,7 +8,7 @@ const INIT = { scanner: null, parser: null, pluginQueue: [], - customProtocols: [], + customSchemes: [], initialized: false, }; @@ -21,7 +21,7 @@ export function reset() { INIT.scanner = null; INIT.parser = null; INIT.pluginQueue = []; - INIT.customProtocols = []; + INIT.customSchemes = []; INIT.initialized = false; } @@ -45,18 +45,20 @@ export function registerPlugin(name, plugin) { } /** - * Detect URLs with the following additional protocol. Anything following - * "protocol:" will be considered a link. + * Detect URLs with the following additional protocol. Anything with format + * "protocol://..." will be considered a link. If `optionalSlashSlash` is set to + * `true`, anything with format "protocol:..." will be considered a link. * @param {string} protocol + * @param {boolean} [optionalSlashSlash] if set to true, */ -export function registerCustomProtocol(protocol) { +export function registerCustomProtocol(protocol, optionalSlashSlash = false) { if (INIT.initialized) { - warn(`linkifyjs: already initialized - will not register custom protocol "${protocol}" until you manually call linkify.init(). To avoid this warning, please register all custom protocols before invoking linkify the first time.`); + warn(`linkifyjs: already initialized - will not register custom protocol "${protocol}" until you manually call linkify.init(). To avoid this warning, please register all custom schemes before invoking linkify the first time.`); } - if (!/^[a-z-]+$/.test(protocol)) { - throw Error('linkifyjs: protocols containing characters other than a-z or - (hyphen) are not supported'); + if (!/^[a-z]+(-[a-z]+)*$/.test(protocol)) { + throw Error('linkifyjs: incorrect protocol format.\n 1. Must only contain lowercase ASCII letters or -\n 2. Cannot start or end with -\n 3. - cannot repeat'); } - INIT.customProtocols.push(protocol); + INIT.customSchemes.push([protocol, optionalSlashSlash]); } /** @@ -65,7 +67,7 @@ export function registerCustomProtocol(protocol) { */ export function init() { // Initialize state machines - INIT.scanner = { start: scanner.init(INIT.customProtocols), tokens: scanner.tokens }; + INIT.scanner = { start: scanner.init(INIT.customSchemes), tokens: scanner.tokens }; INIT.parser = { start: parser.init(), tokens: parser.tokens }; const utils = { createTokenClass: parser.tokens.createTokenClass }; diff --git a/packages/linkifyjs/src/plugins/hashtag.js b/packages/linkifyjs/src/plugins/hashtag.js index ed3ee386..e2b47d50 100644 --- a/packages/linkifyjs/src/plugins/hashtag.js +++ b/packages/linkifyjs/src/plugins/hashtag.js @@ -5,37 +5,31 @@ import { registerPlugin } from 'linkifyjs'; export const hashtag = ({ scanner, parser, utils }) => { // Various tokens that may compose a hashtag - const { POUND, DOMAIN, TLD, LOCALHOST, UNDERSCORE } = scanner.tokens; + const { POUND, NUM, UNDERSCORE, words } = scanner.tokens; // The start state - const START_STATE = parser.start; + const Start = parser.start; // Create a new token that class that the parser emits when it finds a hashtag - const Hashtag = utils.createTokenClass('hashtag', { isLink: true }); + const HashtagToken = utils.createTokenClass('hashtag', { isLink: true }); // Take or create a transition from start to the '#' sign (non-accepting) - const HASH_STATE = START_STATE.tt(POUND); + const Hash = Start.tt(POUND); // Take transition from '#' to any text token to yield valid hashtag state - const HASHTAG_STATE = HASH_STATE.tt(DOMAIN, Hashtag); - - // Now that we have the hashtag state, no need to create new states - HASH_STATE.tt(TLD, HASHTAG_STATE); - HASH_STATE.tt(LOCALHOST, HASHTAG_STATE); + const Hashtag = Hash.tt(words, HashtagToken); + Hashtag.tt(NUM, Hashtag); + Hashtag.tt(UNDERSCORE, Hashtag); // Trailing underscore is okay + Hashtag.tt(words, Hashtag); // Account for leading underscore (non-accepting unless followed by domain) - const HASH_UNDERSCORE_STATE = HASH_STATE.tt(UNDERSCORE); - HASH_UNDERSCORE_STATE.tt(UNDERSCORE, HASH_UNDERSCORE_STATE); - HASH_UNDERSCORE_STATE.tt(DOMAIN, HASHTAG_STATE); - HASH_UNDERSCORE_STATE.tt(TLD, HASHTAG_STATE); - HASH_UNDERSCORE_STATE.tt(LOCALHOST, HASHTAG_STATE); - - // Continue the transitions - HASHTAG_STATE.tt(UNDERSCORE, HASHTAG_STATE); - HASHTAG_STATE.tt(DOMAIN, HASHTAG_STATE); - HASHTAG_STATE.tt(TLD, HASHTAG_STATE); - HASHTAG_STATE.tt(LOCALHOST, HASHTAG_STATE); - // Trailing underscore is okay + const HashPrefix = Hash.tt(NUM); + + Hash.tt(UNDERSCORE, HashPrefix); + HashPrefix.tt(NUM, HashPrefix); + HashPrefix.tt(UNDERSCORE, HashPrefix); + HashPrefix.tt(words, Hashtag); + }; diff --git a/packages/linkifyjs/src/plugins/mention.js b/packages/linkifyjs/src/plugins/mention.js index 9ab82c45..75738d20 100644 --- a/packages/linkifyjs/src/plugins/mention.js +++ b/packages/linkifyjs/src/plugins/mention.js @@ -4,10 +4,10 @@ import { registerPlugin } from 'linkifyjs'; export const mention = ({ scanner, parser, utils }) => { - const { DOMAIN, LOCALHOST, TLD, NUM, SLASH, UNDERSCORE, DOT, AT } = scanner.tokens; - const START_STATE = parser.start; + const { numeric, domain, HYPHEN, SLASH, UNDERSCORE, AT } = scanner.tokens; + const Start = parser.start; - const Mention = utils.createTokenClass('mention', { + const MentionToken = utils.createTokenClass('mention', { isLink: true, toHref() { return '/' + this.toString().substr(1); @@ -15,53 +15,27 @@ export const mention = ({ scanner, parser, utils }) => { }); // @ - const AT_STATE = START_STATE.tt(AT); // @ - - // @_, - const AT_SYMS_STATE = AT_STATE.tt(UNDERSCORE); - - // @_* - AT_SYMS_STATE.tt(UNDERSCORE, AT_SYMS_STATE); - AT_SYMS_STATE.tt(DOT, AT_SYMS_STATE); + const At = Start.tt(AT); // @ // Valid mention (not made up entirely of symbols) - const MENTION_STATE = AT_STATE.tt(DOMAIN, Mention); - AT_STATE.tt(TLD, MENTION_STATE); - AT_STATE.tt(LOCALHOST, MENTION_STATE); - AT_STATE.tt(NUM, MENTION_STATE); - - // @[_.]* + valid mention - AT_SYMS_STATE.tt(DOMAIN, MENTION_STATE); - AT_SYMS_STATE.tt(LOCALHOST, MENTION_STATE); - AT_SYMS_STATE.tt(TLD, MENTION_STATE); - AT_SYMS_STATE.tt(NUM, MENTION_STATE); + const Mention = At.tt(domain, MentionToken); + At.tt(numeric, Mention); + At.tt(UNDERSCORE, Mention); // More valid mentions - MENTION_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_STATE.tt(TLD, MENTION_STATE); - MENTION_STATE.tt(NUM, MENTION_STATE); - MENTION_STATE.tt(UNDERSCORE, MENTION_STATE); + Mention.tt(domain, Mention); + Mention.tt(numeric, Mention); + Mention.tt(UNDERSCORE, Mention); + Mention.tt(HYPHEN, Mention); // Mention with a divider - const MENTION_DIVIDER_STATE = MENTION_STATE.tt(SLASH); - MENTION_STATE.tt(SLASH, MENTION_DIVIDER_STATE); - MENTION_STATE.tt(DOT, MENTION_DIVIDER_STATE); - MENTION_STATE.tt(AT, MENTION_DIVIDER_STATE); - - // Mention _ trailing stash plus syms - const MENTION_DIVIDER_SYMS_STATE = MENTION_DIVIDER_STATE.tt(UNDERSCORE); - MENTION_DIVIDER_SYMS_STATE.tt(UNDERSCORE, MENTION_DIVIDER_SYMS_STATE); + const MentionDivider = Mention.tt(SLASH); // Once we get a word token, mentions can start up again - MENTION_DIVIDER_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(TLD, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(NUM, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(TLD, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(NUM, MENTION_STATE); + MentionDivider.tt(domain, Mention); + MentionDivider.tt(numeric, Mention); + MentionDivider.tt(UNDERSCORE, Mention); + MentionDivider.tt(HYPHEN, Mention); }; registerPlugin('mention', mention); diff --git a/packages/linkifyjs/src/plugins/ticket.js b/packages/linkifyjs/src/plugins/ticket.js index df3864f9..b6301fe3 100644 --- a/packages/linkifyjs/src/plugins/ticket.js +++ b/packages/linkifyjs/src/plugins/ticket.js @@ -6,12 +6,12 @@ import { registerPlugin } from 'linkifyjs'; export const ticket = ({ scanner, parser, utils }) => { // TODO: Add cross-repo style tickets? e.g., Hypercontext/linkifyjs#42 // Is that even feasible? - const { POUND, NUM } = scanner.tokens; - const START_STATE = parser.start; - const Ticket = utils.createTokenClass('ticket', { isLink: true }); + const { POUND, numeric } = scanner.tokens; + const Start = parser.start; + const TicketToken = utils.createTokenClass('ticket', { isLink: true }); - const HASH_STATE = START_STATE.tt(POUND); - HASH_STATE.tt(NUM, Ticket); + const Hash = Start.tt(POUND); + Hash.tt(numeric, TicketToken); }; registerPlugin('ticket', ticket); diff --git a/test/benchmarks.js b/test/benchmarks.js index 13470b1b..7de244a4 100644 --- a/test/benchmarks.js +++ b/test/benchmarks.js @@ -3,8 +3,8 @@ var ITERATIONS = 500; function bench1(linkify) { // eslint-disable-next-line no-debugger debugger; // prevents V8 optimization - delete require.cache[require.resolve('../lib/linkify')]; - linkify = require('../lib/linkify'); + delete require.cache[require.resolve('linkifyjs')]; + linkify = require('linkifyjs'); // linkify.init(); linkify.find(''); // delete require.cache[require.resolve('moment')]; @@ -252,7 +252,7 @@ mailto:bar`); [bench1, bench2].forEach((bench) => { debugger; // var usageInitial = process.memoryUsage(); - var linkify = require('../lib/linkify'); + var linkify = require('linkifyjs'); linkify.init(); // var usageLinkify = process.memoryUsage(); diff --git a/test/spec/core/parser.test.js b/test/spec/core/parser.test.js index 2bb5ec26..183a8f56 100644 --- a/test/spec/core/parser.test.js +++ b/test/spec/core/parser.test.js @@ -1,7 +1,7 @@ const { expect } = require('chai'); const scanner = require('linkifyjs/src/core/scanner'); const parser = require('linkifyjs/src/core/parser'); -const { Text, Url, Email, MailtoEmail } = require('linkifyjs/src/core/tokens/multi'); +const { Text, Url, Email } = require('linkifyjs/src/core/tokens/multi'); /** [0] - Original text to parse (should tokenize first) @@ -106,15 +106,15 @@ const tests = [ ['Emails cannot have two dots, e.g.: nick..', 'f@yahoo.ca'] ], [ 'The `mailto:` part should be included in mailto:this.is.a.test@yandex.ru', - [Text, MailtoEmail], + [Text, Url], ['The `mailto:` part should be included in ', 'mailto:this.is.a.test@yandex.ru'] ], [ 'mailto:echalk-dev@logicify.com?Subject=Hello%20again is another test', - [MailtoEmail, Text], + [Url, Text], ['mailto:echalk-dev@logicify.com?Subject=Hello%20again', ' is another test'] ], [ 'Mailto is greedy mailto:localhost?subject=Hello%20World.', - [Text, MailtoEmail, Text], + [Text, Url, Text], ['Mailto is greedy ', 'mailto:localhost?subject=Hello%20World', '.'] ], [ 'Emails like: test@42.domain.com and test@42.abc.11.domain.com should be matched in its entirety.', @@ -196,6 +196,22 @@ const tests = [ 'o\'malley@example.com.au', // Email with apostrophe [Email], ['o\'malley@example.com.au'] + ], [ + 'foohttp://example.com bar', + [Text, Url, Text], + ['foohttp://', 'example.com', ' bar'], + ], [ + 'テストhttp://example.comテスト', + [Text, Url], + ['テスト', 'http://example.comテスト'], + ], [ + 'file:/etc/motd', + [Url], + ['file:/etc/motd'] + ], [ + 'file:///etc/motd', + [Url], + ['file:///etc/motd'] ] ]; @@ -216,6 +232,7 @@ describe('linkifyjs/core/parser#run()', () => { }); } + // eslint-disable-next-line mocha/no-setup-in-describe tests.map(makeTest, this); it('Correctly sets start and end indexes', () => { diff --git a/test/spec/core/scanner.test.js b/test/spec/core/scanner.test.js index dc84d451..c0130a2b 100644 --- a/test/spec/core/scanner.test.js +++ b/test/spec/core/scanner.test.js @@ -29,58 +29,86 @@ const tests = [ ['&?<>(', [t.AMPERSAND, t.QUERY, t.OPENANGLEBRACKET, t.CLOSEANGLEBRACKET, t.OPENPAREN], ['&', '?', '<', '>', '(']], ['([{}])', [t.OPENPAREN, t.OPENBRACKET, t.OPENBRACE, t.CLOSEBRACE, t.CLOSEBRACKET, t.CLOSEPAREN], ['(', '[', '{', '}', ']', ')']], ['!,;\'', [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', '\'']], - ['hello', [t.DOMAIN], ['hello']], - ['Hello123', [t.DOMAIN], ['Hello123']], - ['hello123world', [t.DOMAIN], ['hello123world']], + ['hello', [t.WORD], ['hello']], + ['Hello123', [t.WORD, t.NUM], ['Hello', '123']], + ['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']], ['0123', [t.NUM], ['0123']], - ['123abc', [t.DOMAIN], ['123abc']], - ['http', [t.DOMAIN], ['http']], - ['http:', [t.PROTOCOL], ['http:']], - ['https:', [t.PROTOCOL], ['https:']], - ['files:', [t.DOMAIN, t.COLON], ['files', ':']], - ['file//', [t.DOMAIN, t.SLASH, t.SLASH], ['file', '/', '/']], - ['ftp://', [t.PROTOCOL, t.SLASH, t.SLASH], ['ftp:', '/', '/']], - ['mailto', [t.DOMAIN], ['mailto']], - ['mailto:', [t.MAILTO], ['mailto:']], - ['c', [t.DOMAIN], ['c']], + ['123abc', [t.NUM, t.TLD], ['123', 'abc']], + ['http', [t.SLASH_SCHEME], ['http']], + ['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']], + ['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']], + ['files:', [t.WORD, t.COLON], ['files', ':']], + ['file//', [t.SCHEME, t.SLASH, t.SLASH], ['file', '/', '/']], + ['ftp://', [t.SLASH_SCHEME, t.COLON, t.SLASH, t.SLASH], ['ftp', ':', '/', '/']], + ['mailto', [t.SCHEME], ['mailto']], + ['mailto:', [t.SCHEME, t.COLON], ['mailto', ':']], + ['c', [t.WORD], ['c']], ['co', [t.TLD], ['co']], ['com', [t.TLD], ['com']], - ['comm', [t.DOMAIN], ['comm']], - ['abc 123 DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.DOMAIN], ['abc', ' ', '123', ' ', 'DoReMi']], - ['abc 123 \n DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.NL, t.WS, t.DOMAIN], ['abc', ' ', '123', ' ', '\n', ' ', 'DoReMi']], - ['local', [t.DOMAIN], ['local']], + ['comm', [t.WORD], ['comm']], + ['vermögensberater السعودية москва', [t.TLD, t.WS, t.UTLD, t.WS, t.UTLD], ['vermögensberater', ' ', 'السعودية', ' ', 'москва']], + ['abc 123 DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.WORD], ['abc', ' ', '123', ' ', 'DoReMi']], + ['abc 123 \n DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.NL, t.WS, t.WORD], ['abc', ' ', '123', ' ', '\n', ' ', 'DoReMi']], + ['local', [t.WORD], ['local']], ['localhost', [t.LOCALHOST], ['localhost']], - ['localhosts', [t.DOMAIN], ['localhosts']], - ['500px', [t.DOMAIN], ['500px']], - ['500-px', [t.DOMAIN], ['500-px']], - ['-500px', [t.HYPHEN, t.DOMAIN], ['-', '500px']], - ['500px-', [t.DOMAIN, t.HYPHEN], ['500px', '-']], - ['123-456', [t.DOMAIN], ['123-456']], + ['localhosts', [t.WORD], ['localhosts']], + ['500px', [t.NUM, t.WORD], ['500', 'px']], + ['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']], + ['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']], + ['500px-', [t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']], + ['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']], ['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp - ['çïrâ.ca', [t.DOMAIN, t.DOT, t.TLD], ['çïrâ', '.', 'ca']], - ['www.🍕💩.ws', [t.DOMAIN, t.DOT, t.DOMAIN, t.DOT, t.TLD], ['www', '.', '🍕💩', '.', 'ws']], + ['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']], + ['❤️💚', [t.EMOJIS], ['❤️💚']], + ['👊🏿🧑🏼‍🔬🌚', [t.EMOJIS], ['👊🏿🧑🏼‍🔬🌚']], // contains zero-width joiner \u200d + ['www.🍕💩.ws', [t.WORD, t.DOT, t.EMOJIS, t.DOT, t.TLD], ['www', '.', '🍕💩', '.', 'ws']], [ 'za̡͊͠͝lgό.gay', // May support diacritics in the future if someone complains - [t.TLD, t.SYM, t.SYM, t.SYM, t.SYM, t.DOMAIN, t.DOT, t.TLD], - ['za', '͠', '̡', '͊', '͝', 'lgό','.','gay'] + [t.TLD, t.SYM, t.SYM, t.SYM, t.SYM, t.WORD, t.UWORD, t.DOT, t.TLD], + ['za', '͠', '̡', '͊', '͝', 'lg', 'ό','.','gay'] ], [ 'Direniş İzleme Grubu\'nun', - [t.DOMAIN, t.WS, t.DOMAIN, t.WS, t.DOMAIN, t.APOSTROPHE, t.DOMAIN], - ['Direniş', ' ', 'İzleme', ' ', 'Grubu', '\'', 'nun'] + [t.WORD, t.UWORD, t.WS, t.UWORD, t.WORD, t.WS, t.WORD, t.APOSTROPHE, t.WORD], + ['Direni', 'ş', ' ', 'İ', 'zleme', ' ', 'Grubu', '\'', 'nun'] ], [ 'example.com   テスト', // spaces are ideographic space - [t.DOMAIN, t.DOT, t.TLD, t.WS, t.DOMAIN], + [t.WORD, t.DOT, t.TLD, t.WS, t.UWORD], ['example', '.', 'com', '   ', 'テスト'] ], [ '#АБВ_бв #한글 #سلام', - [t.POUND, t.DOMAIN, t.UNDERSCORE, t.DOMAIN, t.WS, t.POUND, t.DOMAIN, t.WS, t.POUND, t.DOMAIN], + [t.POUND, t.UWORD, t.UNDERSCORE, t.UWORD, t.WS, t.POUND, t.UWORD, t.WS, t.POUND, t.UWORD], ['#', 'АБВ', '_', 'бв', ' ', '#', '한글', ' ', '#', 'سلام'] + ], + [ + 'テストexample.comテスト', + [t.UWORD, t.WORD, t.DOT, t.TLD, t.UWORD], + ['テスト', 'example', '.', 'com', 'テスト'] + ], + [ + 'テストhttp://example.comテスト', + [t.UWORD, t.SLASH_SCHEME, t.COLON, t.SLASH, t.SLASH, t.WORD, t.DOT, t.TLD, t.UWORD], + ['テスト', 'http', ':', '/', '/', 'example', '.', 'com', 'テスト'] ] ]; +const customSchemeTests = [ + ['stea', [t.WORD], ['stea']], + ['steam', [t.SCHEME], ['steam']], + ['steams', [t.WORD], ['steams']], + ['view', [t.WORD], ['view']], + ['view-', [t.WORD, t.HYPHEN], ['view', '-']], + ['view-s', [t.WORD, t.HYPHEN, t.WORD], ['view', '-', 's']], + ['view-sour', [t.WORD, t.HYPHEN, t.WORD], ['view', '-', 'sour']], + ['view-source', [t.SLASH_SCHEME], ['view-source']], + ['view-sources', [t.SLASH_SCHEME, t.WORD], ['view-source', 's']], // This is an unfortunate consequence :( + ['fb', [t.SLASH_SCHEME], ['fb']], + ['twitter sux', [t.SLASH_SCHEME, t.WS, t.WORD], ['twitter', ' ', 'sux']], + ['ms-settings', [t.SCHEME], ['ms-settings']], +]; + describe('linkifyjs/core/scanner#run()', () => { let start; @@ -103,7 +131,7 @@ describe('linkifyjs/core/scanner#run()', () => { it('Correctly sets start and end indexes', () => { expect(scanner.run(start, 'Hello, World!')).to.eql([ - { t: t.DOMAIN, v: 'Hello', s: 0, e: 5 }, + { t: t.WORD, v: 'Hello', s: 0, e: 5 }, { t: t.COMMA, v: ',', s: 5, e: 6 }, { t: t.WS, v: ' ', s: 6, e: 7 }, { t: t.TLD, v: 'World', s: 7, e: 12 }, @@ -112,22 +140,35 @@ describe('linkifyjs/core/scanner#run()', () => { }); describe('Custom protocols', () => { - before(() => { start = scanner.init(['twitter', 'fb', 'steam']); }); + + before(() => { + start = scanner.init([ + ['twitter', false], + ['fb', false], + ['steam', true], + ['view-source', false], + ['ms-settings', true] + ]); + }); + + // eslint-disable-next-line mocha/no-setup-in-describe + customSchemeTests.map(makeTest, this); it('Correctly tokenizes a full custom protocols', () => { expect(scanner.run(start, 'steam://hello')).to.eql([ - { t: t.PROTOCOL, v: 'steam:', s: 0, e: 6 }, + { t: t.SCHEME, v: 'steam', s: 0, e: 5 }, + { t: t.COLON, v: ':', s: 5, e: 6 }, { t: t.SLASH, v: '/', s: 6, e: 7 }, { t: t.SLASH, v: '/', s: 7, e: 8 }, - { t: t.DOMAIN, v: 'hello', s: 8, e: 13 } + { t: t.WORD, v: 'hello', s: 8, e: 13 } ]); }); - it('Classifies partial custom protocols as domains', () => { + it('Classifies partial schemes', () => { expect(scanner.run(start, 'twitter sux')).to.eql([ - { t: t.DOMAIN, v: 'twitter', s: 0, e: 7 }, + { t: t.SLASH_SCHEME, v: 'twitter', s: 0, e: 7 }, { t: t.WS, v: ' ', s: 7, e: 8 }, - { t: t.DOMAIN, v: 'sux', s: 8, e: 11 } + { t: t.WORD, v: 'sux', s: 8, e: 11 } ]); }); }); diff --git a/test/spec/core/tokens/multi.test.js b/test/spec/core/tokens/multi.test.js index a4126bdd..44952cb3 100644 --- a/test/spec/core/tokens/multi.test.js +++ b/test/spec/core/tokens/multi.test.js @@ -16,48 +16,38 @@ describe('linkifyjs/core/tokens/multi', () => { describe('Url', () => { let input1 = 'Ftps://www.github.com/Hypercontext/linkify'; - let input2 = '//Amazon.ca/Sales'; - let input3 = 'co.co?o=%2D&p=@gc#wat'; - let url1, url2, url3; + let input2 = 'co.co/?o=%2D&p=@gc#wat'; + let url1, url2; before(() => { const urlTextTokens1 = scanner.run(scannerStart, input1); const urlTextTokens2 = scanner.run(scannerStart, input2); - const urlTextTokens3 = scanner.run(scannerStart, input3); url1 = new mtk.Url(input1, urlTextTokens1); url2 = new mtk.Url(input2, urlTextTokens2); - url3 = new mtk.Url(input3, urlTextTokens3); }); describe('#isLink', () => { it('Is true in all cases', () => { expect(url1.isLink).to.be.ok; expect(url2.isLink).to.be.ok; - expect(url3.isLink).to.be.ok; }); }); describe('#toString()', () => { it('Returns the exact URL text', () => { expect(url1.toString()).to.be.eql('Ftps://www.github.com/Hypercontext/linkify'); - expect(url2.toString()).to.be.eql('//Amazon.ca/Sales'); - expect(url3.toString()).to.be.eql('co.co?o=%2D&p=@gc#wat'); + expect(url2.toString()).to.be.eql('co.co/?o=%2D&p=@gc#wat'); }); }); describe('#toHref()', () => { - it('Keeps the protocol the same as the original URL (and lowercases it)', () => { + it('Keeps the protocol the same as the original URL', () => { expect(url1.toHref()).to.be.eql('Ftps://www.github.com/Hypercontext/linkify'); }); - it('Lowercases the domain name only and leaves off the protocol if the URL begins with "//"', () => { - expect(url2.toHref()).to.be.eql('//Amazon.ca/Sales'); - }); - it('Adds a default protocol, if required', () => { - expect(url3.toHref()).to.be.eql('http://co.co?o=%2D&p=@gc#wat'); - expect(url3.toHref('ftp')).to.be.eql('ftp://co.co?o=%2D&p=@gc#wat'); + expect(url2.toHref()).to.be.eql('http://co.co/?o=%2D&p=@gc#wat'); }); }); @@ -73,23 +63,14 @@ describe('linkifyjs/core/tokens/multi', () => { end: input1.length }); - expect(url2.toObject()).to.be.eql({ + expect(url2.toObject('https')).to.be.eql({ type: 'url', value: input2, - href: input2, + href: 'https://co.co/?o=%2D&p=@gc#wat', isLink: true, start: 0, end: input2.length }); - - expect(url3.toObject('https')).to.be.eql({ - type: 'url', - value: input3, - href: 'https://co.co?o=%2D&p=@gc#wat', - isLink: true, - start: 0, - end: input3.length - }); }); }); @@ -99,7 +80,6 @@ describe('linkifyjs/core/tokens/multi', () => { }); it('Tests false when there is no protocol', () => { expect(url2.hasProtocol()).to.not.be.ok; - expect(url3.hasProtocol()).to.not.be.ok; }); }); @@ -140,7 +120,7 @@ describe('linkifyjs/core/tokens/multi', () => { before(() => { const emailTextTokens = scanner.run(scannerStart, input); - email = new mtk.MailtoEmail(input, emailTextTokens); + email = new mtk.Url(input, emailTextTokens); }); describe('#isLink', () => { diff --git a/test/spec/linkifyjs.test.js b/test/spec/linkifyjs.test.js index 9f57f7f1..1d7054ca 100644 --- a/test/spec/linkifyjs.test.js +++ b/test/spec/linkifyjs.test.js @@ -1,7 +1,71 @@ /* eslint-disable mocha/no-setup-in-describe */ +const { expect } = require('chai'); const linkify = require('linkifyjs/src/linkify'); +const ticketPlugin = ({ scanner, parser, utils }) => { + const { POUND, numeric } = scanner.tokens; + const TicketToken = utils.createTokenClass('ticket', { isLink: true }); + const Hash = parser.start.tt(POUND); + Hash.tt(numeric, TicketToken); +}; + describe('linkifyjs', () => { + describe('registerPlugin', () => { + beforeEach(() => { + linkify.registerPlugin('ticket', ticketPlugin); + }); + + it('Detects tickets after applying', () => { + expect(linkify.test('#123', 'ticket')).to.be.ok; + }); + + it('Logs a warning if registering same plugin twice', () => { + linkify.registerPlugin('ticket', ticketPlugin); + expect(linkify.test('#123', 'ticket')).to.be.ok; + }); + + it('Logs a warning if already initialized', () => { + linkify.init(); + linkify.registerPlugin('ticket2', ticketPlugin); + }); + }); + + describe('registerCustomProtocol', () => { + beforeEach(() => { + linkify.registerCustomProtocol('instagram', true); + linkify.registerCustomProtocol('view-source'); + }); + + it('Detects basic protocol', () => { + expect(linkify.test('instagram:user/nfrasser', 'url')).to.be.ok; + }); + + it('Detects basic protocol with slash slash', () => { + expect(linkify.test('instagram://user/nfrasser', 'url')).to.be.ok; + }); + + it('Detects compound protocol', () => { + expect(linkify.test('view-source://http://github.com/', 'url')).to.be.ok; + }); + + it('Does not detect protocol with non-optional //', () => { + expect(linkify.test('view-source:http://github.com/', 'url')).to.not.be.ok; + }); + + it('Does not detect custom protocol if already initialized', () => { + linkify.init(); + linkify.registerCustomProtocol('fb'); + expect(linkify.test('fb://feed')).to.not.be.ok; + }); + + it('Throws error when protocol has invalid format', () => { + expect(() => linkify.registerCustomProtocol('-')).to.throw(); + expect(() => linkify.registerCustomProtocol('-fb')).to.throw(); + expect(() => linkify.registerCustomProtocol('fb-')).to.throw(); + expect(() => linkify.registerCustomProtocol('git+https')).to.throw(); // this may work in the future + }); + }); + describe('tokenize', () => { it('is a function', () => { expect(linkify.tokenize).to.be.a('function'); @@ -15,9 +79,40 @@ describe('linkifyjs', () => { it('is a function', () => { expect(linkify.find).to.be.a('function'); }); + it('takes a single argument', () => { expect(linkify.find.length).to.be.eql(1); // type is optional }); + + it('Find nothing in an empty string', () => { + expect(linkify.find('')).to.deep.eql([]); + }); + + it('Find nothing in a string with no links', () => { + expect(linkify.find('Hello World!')).to.deep.eql([]); + }); + + it('Find the link', () => { + expect(linkify.find('hello.world!')).to.deep.eql([{ + type: 'url', + value: 'hello.world', + href: 'http://hello.world', + isLink: true, + start: 0, + end: 11 + }]); + }); + + it('Find the link of the specific type', () => { + expect(linkify.find('For help with github.com, please contact support@example.com', 'email')).to.deep.eql([{ + type: 'email', + value: 'support@example.com', + href: 'mailto:support@example.com', + isLink: true, + start: 41, + end: 60 + }]); + }); }); describe('test', () => { @@ -38,7 +133,7 @@ describe('linkifyjs', () => { ['test+4@uwaterloo.ca', true], ['test+4@uwaterloo.ca', false, 'url'], ['test+4@uwaterloo.ca', true, 'email'], - ['mailto:test+5@uwaterloo.ca', true, 'email'], + ['mailto:test+5@uwaterloo.ca', true, 'url'], ['t.co', true], ['t.co g.co', false], // can only be one ['test@g.co t.co', false] // can only be one diff --git a/test/spec/plugins/hashtag.test.js b/test/spec/plugins/hashtag.test.js index 396015a8..c8dc1120 100644 --- a/test/spec/plugins/hashtag.test.js +++ b/test/spec/plugins/hashtag.test.js @@ -13,9 +13,12 @@ describe('plugins/hashtag', () => { }); describe('after plugin is applied', () => { - it ('can parse hashtags after applying the plugin', () => { + beforeEach(() => { linkify.registerPlugin('hashtag', hashtag); - expect(linkify.find('There is a #hashtag #YOLO-2015 #__swag__ and #1234 and #%^&*( #_ #__ should not work')) + }); + + it ('can parse hashtags after applying the plugin', () => { + expect(linkify.find('There is a #hashtag #YOLO_2015 #__swag__ and #1234 and #%^&*( #_ #__ should not work')) .to.be.eql([{ type: 'hashtag', value: '#hashtag', @@ -25,8 +28,8 @@ describe('plugins/hashtag', () => { end: 19 }, { type: 'hashtag', - value: '#YOLO-2015', - href: '#YOLO-2015', + value: '#YOLO_2015', + href: '#YOLO_2015', isLink: true, start: 20, end: 30 @@ -38,12 +41,50 @@ describe('plugins/hashtag', () => { start: 31, end: 40 }]); + }); + it('Works with basic hashtags', () => { expect(linkify.test('#wat', 'hashtag')).to.be.ok; + }); + + it('Works with trailing underscores', () => { expect(linkify.test('#bug_', 'hashtag')).to.be.ok; + }); + + it('Works with underscores', () => { expect(linkify.test('#bug_test', 'hashtag')).to.be.ok; + }); + + it('Works with double underscores', () => { expect(linkify.test('#bug__test', 'hashtag')).to.be.ok; + }); + + it('Works with number prefix', () => { + expect(linkify.test('#123abc', 'hashtag')).to.be.ok; + }); + + it('Works with number/underscore prefix', () => { + expect(linkify.test('#123_abc', 'hashtag')).to.be.ok; + }); + + it('Works with Hangul characters', () => { + expect(linkify.test('#일상', 'hashtag')).to.be.ok; + }); + + it('Works with Cyrillic characters', () => { + expect(linkify.test('#АБВ_бв', 'hashtag')).to.be.ok; + }); + + it('Works with Arabic characters', () => { + expect(linkify.test('#سلام', 'hashtag')).to.be.ok; + }); + + it('Does not work with just numbers', () => { expect(linkify.test('#987', 'hashtag')).to.not.be.ok; }); + + it('Does not work with just numbers and underscore', () => { + expect(linkify.test('#987_654', 'hashtag')).to.not.be.ok; + }); }); }); diff --git a/test/spec/plugins/mention.test.js b/test/spec/plugins/mention.test.js index f75481ae..163e505b 100644 --- a/test/spec/plugins/mention.test.js +++ b/test/spec/plugins/mention.test.js @@ -1,3 +1,4 @@ +const { expect } = require('chai'); const linkify = require('linkifyjs'); const { mention } = require('linkifyjs/src/plugins/mention'); @@ -9,7 +10,7 @@ describe('plugins/mention', () => { .to.be.eql([]); expect(linkify.test('@wat', 'mention')).to.not.be.ok; - expect(linkify.test('@987', 'mention')).to.not.be.ok; + expect(linkify.test('@007', 'mention')).to.not.be.ok; }); describe('after plugin is applied', () => { @@ -76,36 +77,6 @@ describe('plugins/mention', () => { }]); }); - it('parses mentions with email syntax', () => { - expect(linkify.find('Hey @developers@soapbox')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox', - href: '/developers@soapbox', - isLink: true, - start: 4, - end: 23 - }]); - - expect(linkify.find('Hey @developers@soapbox.example.com')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox.example.com', - href: '/developers@soapbox.example.com', - isLink: true, - start: 4, - end: 35 - }]); - - expect(linkify.find('Hey @developers@soapbox you can mail me at someone@soapbox')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox', - href: '/developers@soapbox', - isLink: true, - start: 4, - end: 23 - }]); - - }); - it('parses github team-style mentions with slashes', () => { expect(linkify.find('Hey @500px/web please review this')).to.deep.equal([{ type: 'mention', @@ -135,29 +106,29 @@ describe('plugins/mention', () => { }]); }); - it('parses mentions with dots', () => { + it('parses mentions with dots (ignores past the dots)', () => { expect(linkify.find('Hey @john.doe please review this')).to.deep.equal([{ type: 'mention', - value: '@john.doe', - href: '/john.doe', + value: '@john', + href: '/john', isLink: true, start: 4, - end: 13 + end: 9 }]); }); it('ignores extra dots at the end of mentions', () => { - expect(linkify.find('We should get ...@soapbox._developers.@soapbox.cs.... to be awesome')).to.deep.equal([{ + expect(linkify.find('We should get ...@soapbox-_developers.@soapbox_cs.... to be awesome')).to.deep.equal([{ type: 'mention', - value: '@soapbox._developers', - href: '/soapbox._developers', + value: '@soapbox-_developers', + href: '/soapbox-_developers', isLink: true, start: 17, end: 37 }, { type: 'mention', - value: '@soapbox.cs', - href: '/soapbox.cs', + value: '@soapbox_cs', + href: '/soapbox_cs', isLink: true, start: 38, end: 49 @@ -169,7 +140,7 @@ describe('plugins/mention', () => { }); it('ignores text only made up of symbols', () => { - expect(linkify.find('Is @- or @__ a person? What about @%_% no, probably not')).to.deep.equal([]); + expect(linkify.find('Is @- or @~! a person? What about @%_% no, probably not')).to.deep.equal([]); }); it('ignores punctuation at the end of mentions', () => { @@ -221,6 +192,26 @@ describe('plugins/mention', () => { end: 25 }]); }); + + it('detects trailing hyphen', () => { + expect(linkify.test('@123-', 'mention')).to.be.ok; + }); + + it('detects interjecting hyphen', () => { + expect(linkify.test('@123-abc', 'mention')).to.be.ok; + }); + + it('detects single underscore', () => { + expect(linkify.test('@_', 'mention')).to.be.ok; + }); + + it('detects multiple underscore', () => { + expect(linkify.test('@__', 'mention')).to.be.ok; + }); + + it('ignores interjecting dot', () => { + expect(linkify.test('@hello.world', 'mention')).to.not.be.ok; + }); }); afterEach(() => { linkify.reset(); });