Skip to content

Commit 374188f

Browse files
authored
fix: Preserve invalid nested A tags in AST (see #215 for detail)
1 parent 8a98795 commit 374188f

File tree

4 files changed

+109
-37
lines changed

4 files changed

+109
-37
lines changed

.gitattributes

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# These settings are for any web project
2+
3+
# Handle line endings automatically for files detected as text
4+
# and leave all files detected as binary untouched.
5+
* text=auto
6+
7+
# Force every file matching *.* to have Unix (LF) line endings, so Windows does not break them
8+
*.* text eol=lf
9+
10+
# Force Windows (CRLF) line endings for files under /.idea/
11+
/.idea/* text eol=crlf
12+
13+
#
14+
## These files are binary and should be left untouched
15+
#
16+
17+
# (binary is a macro for -text -diff)
18+
*.png binary
19+
*.jpg binary
20+
*.jpeg binary
21+
*.gif binary
22+
*.ico binary
23+
*.mov binary
24+
*.mp4 binary
25+
*.mp3 binary
26+
*.flv binary
27+
*.fla binary
28+
*.swf binary
29+
*.gz binary
30+
*.zip binary
31+
*.7z binary
32+
*.ttf binary
33+
*.eot binary
34+
*.woff binary
35+
*.pyc binary
36+
*.pdf binary
37+
*.ez binary
38+
*.bz2 binary
39+
*.swp binary

src/nodes/html.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,10 @@ const kElementsClosedByClosing = {
982982
export interface Options {
983983
lowerCaseTagName: boolean;
984984
comment: boolean;
985+
/**
986+
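* When true, an open A tag is terminated before a nested A tag starts; otherwise nested A tags are preserved as written. @see PR #215 for explanation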
987+
*/
988+
fixNestedATags?: boolean;
985989
parseNoneClosedTags?: boolean;
986990
blockTextElements: {
987991
[tag: string]: boolean;
@@ -1036,7 +1040,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
10361040
let match: RegExpExecArray;
10371041
// https://github.com/taoqf/node-html-parser/issues/38
10381042
data = `<${frameflag}>${data}</${frameflag}>`;
1039-
const { lowerCaseTagName } = options;
1043+
const { lowerCaseTagName, fixNestedATags } = options;
10401044

10411045
const dataEndPos = data.length - (frameflag.length + 2);
10421046
const frameFlagOffset = frameflag.length + 2;
@@ -1097,7 +1101,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
10971101
}
10981102

10991103
// Prevent nested A tags by terminating the last A and starting a new one (see issue #144)
1100-
if (tagName === 'a' || tagName === 'A') {
1104+
if (fixNestedATags && (tagName === 'a' || tagName === 'A')) {
11011105
if (noNestedTagIndex !== undefined) {
11021106
stack.splice(noNestedTagIndex);
11031107
currentParent = arr_back(stack);
@@ -1142,7 +1146,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
11421146
// Handle closing tags or self-closed elements (i.e. </tag> or <br>)
11431147
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) {
11441148
while (true) {
1145-
if (tagName === 'a' || tagName === 'A') noNestedTagIndex = undefined;
1149+
if (noNestedTagIndex != null && (tagName === 'a' || tagName === 'A')) noNestedTagIndex = undefined;
11461150
if (currentParent.rawTagName === tagName) {
11471151
// Update range end for closed tag
11481152
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];

test/tests/issues/144.js

Lines changed: 0 additions & 34 deletions
This file was deleted.

test/tests/nested-a-tag.js

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
const { parse, NodeType } = require('@test/test-target');
2+
3+
describe('Nested A Tags', function () {
4+
it('Tags preserved by default', function () {
5+
const html = `<A href="#"><b>link <a href="#">nested link</a> end</b></A>`;
6+
7+
const root = parse(html);
8+
9+
root.innerHTML.should.eql(`<A href="#"><b>link <a href="#">nested link</a> end</b></A>`);
10+
root.childNodes.length.should.eql(1);
11+
12+
const a1 = root.childNodes[0];
13+
a1.tagName.should.eql('A');
14+
a1.nodeType.should.eql(NodeType.ELEMENT_NODE);
15+
a1.childNodes.length.should.eql(1);
16+
17+
const b = a1.childNodes[0];
18+
b.tagName.should.eql('B');
19+
b.childNodes.length.should.eql(3);
20+
b.text.should.eql('link nested link end');
21+
22+
const a2 = b.childNodes[1];
23+
a2.tagName.should.eql('A');
24+
a2.nodeType.should.eql(NodeType.ELEMENT_NODE);
25+
a2.childNodes.length.should.eql(1);
26+
a2.childNodes[0].nodeType.should.eql(NodeType.TEXT_NODE);
27+
a2.text.should.eql('nested link');
28+
29+
const endText = b.childNodes[2];
30+
endText.nodeType.should.eql(NodeType.TEXT_NODE);
31+
endText.textContent.should.eql(' end');
32+
});
33+
34+
it('Tags fixed with fixNestedATags option', function () {
35+
const html = `<A href="#"><b>link <a href="#">nested link</a> end</b></A>`;
36+
37+
const root = parse(html, { fixNestedATags: true });
38+
39+
root.innerHTML.should.eql(`<A href="#"><b>link </b></A><a href="#">nested link</a> end`);
40+
root.childNodes.length.should.eql(3);
41+
42+
const a1 = root.childNodes[0];
43+
a1.tagName.should.eql('A');
44+
a1.nodeType.should.eql(NodeType.ELEMENT_NODE);
45+
a1.childNodes.length.should.eql(1);
46+
47+
const b = a1.childNodes[0];
48+
b.tagName.should.eql('B');
49+
b.childNodes.length.should.eql(1);
50+
b.text.should.eql('link ');
51+
52+
const a2 = root.childNodes[1];
53+
a2.tagName.should.eql('A');
54+
a2.nodeType.should.eql(NodeType.ELEMENT_NODE);
55+
a2.childNodes.length.should.eql(1);
56+
a2.childNodes[0].nodeType.should.eql(NodeType.TEXT_NODE);
57+
a2.text.should.eql('nested link');
58+
59+
const endText = root.childNodes[2];
60+
endText.nodeType.should.eql(NodeType.TEXT_NODE);
61+
endText.textContent.should.eql(' end');
62+
});
63+
});

0 commit comments

Comments
 (0)