Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit e3187ed

Browse files
authored
Fix links being parsed as markdown links improperly (#7200)
* Fix links being parsed as markdown links improperly Fixes #4674 * Fix a typo * Fix overriding too much stuff * Fix parsing * Remove useless console.log * Remove unnecessary emph function * Properly fix tests * Add some better docs * Add missing license header
1 parent 8fe582b commit e3187ed

File tree

3 files changed

+268
-4
lines changed

3 files changed

+268
-4
lines changed

src/Markdown.ts

Lines changed: 125 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717

1818
import * as commonmark from 'commonmark';
1919
import { escape } from "lodash";
20+
import { logger } from 'matrix-js-sdk/src/logger';
21+
import * as linkify from 'linkifyjs';
2022

2123
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
2224

@@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
2931
link: (node: commonmark.Node, entering: boolean) => void;
3032
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
3133
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
34+
text: (node: commonmark.Node) => void;
35+
out: (text: string) => void;
36+
emph: (node: commonmark.Node) => void;
3237
}
3338

3439
function isAllowedHtmlTag(node: commonmark.Node): boolean {
@@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
6166
return par.firstChild != par.lastChild;
6267
}
6368

69+
/**
 * Collects the text that follows `node` on the same line, up to the first space.
 *
 * Starting at `node` itself, this follows the `next` sibling chain and
 * concatenates the literals of `text` nodes. Siblings of any other type are
 * stepped over without contributing anything (their children are not visited).
 * Collection stops at whichever comes first:
 *  - the end of the sibling chain,
 *  - a `softbreak` or `linebreak` node,
 *  - a space inside a text literal (the part before the space is kept).
 *
 * @param node the node to start from
 * @returns the collected text, or an empty string if there is none
 */
function getTextUntilEndOrLinebreak(node: commonmark.Node): string {
    let text = '';
    for (
        let current: commonmark.Node | null = node;
        current !== null && current.type !== 'softbreak' && current.type !== 'linebreak';
        current = current.next
    ) {
        // Only text nodes with a non-empty literal contribute to the result.
        if (current.type !== 'text' || !current.literal) {
            continue;
        }
        const spaceIndex = current.literal.indexOf(' ');
        if (spaceIndex === -1) {
            text += current.literal;
            continue;
        }
        // A space ends the run: keep what precedes it and stop scanning entirely.
        text += current.literal.slice(0, spaceIndex);
        break;
    }
    return text;
}
95+
6496
/**
6597
* Class that wraps commonmark, adding the ability to see whether
6698
* a given message actually uses any markdown syntax or whether
@@ -70,11 +102,103 @@ export default class Markdown {
70102
private input: string;
71103
private parsed: commonmark.Node;
72104

73-
constructor(input) {
105+
/**
 * Stores the raw input and the commonmark AST parsed from it, after the AST
 * has been passed through `repairLinks`.
 * @param input the markdown source text to parse
 */
constructor(input: string) {
    this.input = input;

    const ast = new commonmark.Parser().parse(this.input);
    this.parsed = this.repairLinks(ast);
}
112+
113+
/**
 * Repairs bare links whose underscores commonmark has turned into emphasis.
 *
 * If you write a link like the example below:
 *     https://my_weird-link_domain.domain.com
 * it would otherwise be parsed to something like this:
 *     <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
 * This method walks the parsed AST and, when an `emph` node comes right after
 * a text node whose accumulated text contains a link, and gluing the pieces
 * back together still yields exactly one link, replaces the emphasis with a
 * literal `_…_` text node so the link is not emphasised until it actually ends.
 * See: https://github.com/vector-im/element-web/issues/4674
 *
 * @param parsed the root node of the AST to repair; it is modified in place
 * @returns the same `parsed` node that was passed in
 */
private repairLinks(parsed: commonmark.Node): commonmark.Node {
    const walker = parsed.walker();
    let event: commonmark.NodeWalkingStep | null = null;
    // Text seen so far on the current line / since the last lone-space text node.
    let text = '';
    let isInPara = false;
    let previousNode: commonmark.Node | null = null;
    // Set when an `emph` node was repaired on entry and must be dropped on exit.
    let shouldUnlinkEmphasisNode = false;
    while ((event = walker.next())) {
        const { node, entering } = event;
        if (node.type === 'paragraph') {
            isInPara = entering;
        }
        if (isInPara) {
            // Clear saved string when line ends
            if (
                node.type === 'softbreak' ||
                node.type === 'linebreak' ||
                // Also start calculating the text from the beginning on any spaces
                (node.type === 'text' && node.literal === ' ')
            ) {
                text = '';
            }
            if (node.type === 'text') {
                text += node.literal;
            }
            // We should not do this if previous node was not a textnode, as we can't combine it then.
            if (node.type === 'emph' && previousNode !== null && previousNode.type === 'text') {
                if (entering) {
                    const precedingText = previousNode;
                    const firstChild = node.firstChild;
                    const foundLinks = linkify.find(text);
                    for (const { value } of foundLinks) {
                        // After a successful repair the literal is '', so further links are skipped.
                        if (firstChild !== null && firstChild.literal) {
                            /**
                             * NOTE: This technically should unlink the emph node and create LINK nodes instead,
                             * adding all the next elements as siblings, but this solution seems to work well
                             * and is hopefully slightly easier to understand too.
                             *
                             * NOTE(review): only the first child's literal is restored as `_…_`, and the emph
                             * node is only removed on exit when its last walked child is a text node. Emphasis
                             * with several children is presumably not handled correctly here; verify.
                             */
                            const nonEmphasizedText = `_${firstChild.literal}_`;
                            const f = getTextUntilEndOrLinebreak(node);
                            const newText = value + nonEmphasizedText + f;
                            const newLinks = linkify.find(newText);
                            // Should always find only one link here, if it finds more it means that the algorithm is broken
                            if (newLinks.length === 1) {
                                const emphasisTextNode = new commonmark.Node('text');
                                emphasisTextNode.literal = nonEmphasizedText;
                                precedingText.insertAfter(emphasisTextNode);
                                firstChild.literal = '';
                                // Move the (now empty) `em` node in front of the new text node so that
                                // the walker, which is already inside it, reaches the new text node next.
                                node.unlink();
                                precedingText.insertAfter(node);
                                shouldUnlinkEmphasisNode = true;
                            } else {
                                logger.error(
                                    "Markdown links escaping found too many links for following text: ",
                                    text,
                                );
                                logger.error(
                                    "Markdown links escaping found too many links for modified text: ",
                                    newText,
                                );
                            }
                        }
                    }
                } else if (shouldUnlinkEmphasisNode) {
                    // Leaving a repaired `em` node: remove its opening and closing from the tree.
                    node.unlink();
                    shouldUnlinkEmphasisNode = false;
                }
            }
        }
        previousNode = node;
    }
    return parsed;
}
79203

80204
isPlainText(): boolean {
@@ -120,9 +244,7 @@ export default class Markdown {
120244
// you can nest them.
121245
//
122246
// Let's try sending with <p/>s anyway for now, though.
123-
124247
const realParagraph = renderer.paragraph;
125-
126248
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
127249
// If there is only one top level node, just return the
128250
// bare text: it's a single line of text and so should be

test/Markdown-test.ts

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
Copyright 2021 The Matrix.org Foundation C.I.C.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
import * as linkifyjs from 'linkifyjs';
17+
import Markdown from "../src/Markdown";
18+
import matrixLinkify from '../src/linkify-matrix';
19+
20+
// The Matrix linkifier plugins have to be registered before these tests run.
beforeAll(function registerLinkifyPlugins() {
    matrixLinkify(linkifyjs);
});
24+
25+
describe("Markdown parser test", () => {
    describe("fixing HTML links", () => {
        // Links and aliases containing underscores that must not become emphasis.
        const ROOM_ALIAS = "#_foonetic_xkcd:matrix.org";
        const GOOGLE_URL = "http://google.com/_thing_";
        const MATRIX_URL = "https://matrix.org/_matrix/client/foo/123_";
        const IMAGE_URL = "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg";
        const RIOT_URL = "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org";

        // Each section is a heading line followed by its entries; sections are
        // separated by a blank line in the markdown source.
        const SECTIONS: Array<[string, string[]]> = [
            ["Test1:", [ROOM_ALIAS, GOOGLE_URL, MATRIX_URL, ROOM_ALIAS]],
            ["Test1A:", [ROOM_ALIAS, GOOGLE_URL, MATRIX_URL, ROOM_ALIAS]],
            ["Test2:", [IMAGE_URL, IMAGE_URL]],
            ["Test3:", [RIOT_URL, RIOT_URL]],
        ];

        const renderHtml = (source: string) => new Markdown(source).toHTML();

        // Builds the markdown source, passing every entry (not the headings) through `wrap`.
        const buildSource = (wrap: (entry: string) => string): string =>
            SECTIONS
                .map(([heading, entries]) => [heading, ...entries.map(wrap)].join("\n"))
                .join("\n\n");

        // Builds the expected HTML: one <p> per section, lines joined by <br />, trailing newline.
        const buildExpectedHtml = (format: (entry: string) => string): string =>
            SECTIONS
                .map(([heading, entries]) => `<p>${[heading, ...entries.map(format)].join("<br />")}</p>`)
                .join("\n") + "\n";

        const testString = buildSource((entry) => entry);

        it('tests that links are getting properly HTML formatted', () => {
            const expectedResult = buildExpectedHtml((entry) => entry);
            expect(renderHtml(testString)).toEqual(expectedResult);
        });
        it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
            const autolinkSource = buildSource((entry) => `<${entry}>`);
            /**
             * NOTE: It is not entirely clear whether the "<" and ">" should stay visible
             * for #_foonetic_xkcd:matrix.org, but this is the output that is produced.
             */
            const expectedResult = buildExpectedHtml((entry) => (
                entry === ROOM_ALIAS
                    ? `&lt;${entry}&gt;`
                    : `<a href="${entry}">${entry}</a>`
            ));
            expect(renderHtml(autolinkSource)).toEqual(expectedResult);
        });

        it('expects that links in codeblock are not modified', () => {
            const expectedResult = [
                '<pre><code class="language-Test1:">' + ROOM_ALIAS,
                GOOGLE_URL,
                MATRIX_URL,
                ROOM_ALIAS,
                '',
                'Test1A:',
                ROOM_ALIAS,
                GOOGLE_URL,
                MATRIX_URL,
                ROOM_ALIAS,
                '',
                'Test2:',
                IMAGE_URL,
                IMAGE_URL,
                '',
                'Test3:',
                RIOT_URL,
                RIOT_URL + '```',
                '</code></pre>',
                '',
            ].join('\n');
            expect(renderHtml("```" + testString + "```")).toEqual(expectedResult);
        });

        it('expects that links in one line will be "escaped" properly', () => {
            // Two links on one line, separated by a single space.
            const linkPair = IMAGE_URL + " " + IMAGE_URL;
            const twoLineSource = [linkPair, linkPair].join('\n');
            const expectedResult = [linkPair, linkPair].join('<br />');
            expect(renderHtml(twoLineSource)).toEqual(expectedResult);
        });
    });
});

test/editor/deserialize-test.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,6 @@ describe('editor/deserialize', function() {
197197
it('code block with no trailing text', function() {
198198
const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
199199
const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
200-
console.log(parts);
201200
expect(parts.length).toBe(5);
202201
expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
203202
expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });

0 commit comments

Comments
 (0)