Skip to content

Commit

Permalink
refactor(strip_html): re-implementation (#232)
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW authored Sep 20, 2020
1 parent 19315b8 commit 57f70d9
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 3 deletions.
119 changes: 118 additions & 1 deletion lib/strip_html.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,120 @@
'use strict';

module.exports = require('striptags');
// Parser states of the single-pass scanner in striptags().
const STATE_PLAINTEXT = Symbol('plaintext');
const STATE_HTML = Symbol('html');
const STATE_COMMENT = Symbol('comment');

/**
 * Strip HTML tags and comments from a string, keeping only the text content.
 *
 * The input is scanned once, character by character, with a three-state
 * machine (plain text, inside a tag, inside a comment):
 *  - '<' and '>' inside a quoted attribute value do not open or close a tag;
 *  - extra '<' inside a tag (e.g. '<div<>>') are balanced against '>' before
 *    the tag is considered closed;
 *  - a '<' immediately followed by a space or a newline is not a tag and is
 *    copied to the output together with that whitespace character;
 *  - a tag or comment that is still open when the input ends is discarded.
 *
 * @param {string} [html=''] - Markup to strip. `null` and `undefined` are
 *   treated as an empty string.
 * @returns {string} The input with tags and comments removed.
 */
function striptags(html = '') {
  // The default parameter only covers `undefined`; guard `null` as well so the
  // destructuring below does not throw.
  const input = html == null ? '' : html;

  let state = STATE_PLAINTEXT;
  let tag_buffer = '';
  let depth = 0;
  let in_quote_char = '';
  let output = '';

  const { length } = input;

  for (let idx = 0; idx < length; idx++) {
    const char = input[idx];

    if (state === STATE_PLAINTEXT) {
      switch (char) {
        case '<':
          state = STATE_HTML;
          tag_buffer += char;
          break;

        default:
          output += char;
          break;
      }
    } else if (state === STATE_HTML) {
      switch (char) {
        case '<':
          // ignore '<' if inside a quote
          if (in_quote_char) break;

          // we're seeing a nested '<'
          depth++;
          break;

        case '>':
          // ignore '>' if inside a quote
          if (in_quote_char) break;

          // something like this is happening: '<<>>'
          if (depth) {
            depth--;
            break;
          }

          // this is closing the tag in tag_buffer
          in_quote_char = '';
          state = STATE_PLAINTEXT;
          tag_buffer = '';
          break;

        case '"':
        case '\'':
          // catch both single and double quotes: the quote that opened the
          // value closes it, the other kind is plain content of the value
          if (char === in_quote_char) {
            in_quote_char = '';
          } else {
            in_quote_char = in_quote_char || char;
          }

          tag_buffer += char;
          break;

        case '-':
          if (tag_buffer === '<!-') {
            // second dash of '<!--': this is a comment, not a tag
            state = STATE_COMMENT;
            // nested '<' counted so far belong to the abandoned tag; do not
            // let them leak into the next tag
            depth = 0;
          }

          tag_buffer += char;
          break;

        case ' ':
        case '\n':
          if (tag_buffer === '<') {
            // '<' directly followed by whitespace is not a tag: emit it
            // verbatim, preserving the actual whitespace character
            state = STATE_PLAINTEXT;
            output += `<${char}`;
            tag_buffer = '';
            // drop any nested '<' counted before bailing out of the tag
            depth = 0;

            break;
          }

          tag_buffer += char;
          break;

        default:
          tag_buffer += char;
          break;
      }
    } else if (state === STATE_COMMENT) {
      switch (char) {
        case '>':
          if (tag_buffer.slice(-2) === '--') {
            // close the comment
            state = STATE_PLAINTEXT;
          }

          // only the characters since the last '>' matter for detecting '-->'
          tag_buffer = '';
          break;

        default:
          tag_buffer += char;
          break;
      }
    }
  }

  return output;
}

module.exports = striptags;
3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@
"highlight.js": "^10.0.0",
"htmlparser2": "^4.0.0",
"prismjs": "^1.17.1",
"strip-indent": "^3.0.0",
"striptags": "^3.1.1"
"strip-indent": "^3.0.0"
},
"engines": {
"node": ">=10.13.0"
Expand Down
54 changes: 54 additions & 0 deletions test/strip_html.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
'use strict';

const stripHTML = require('../lib/strip_html');

describe('stripHTML', () => {
  // Table of specs: [title, input passed to stripHTML, expected result].
  // Each row is registered as its own `it` case, in this order.
  const specs = [
    [
      'should not strip invalid tags',
      'lorem ipsum < a> < div>',
      'lorem ipsum < a> < div>'
    ],
    [
      'should remove simple HTML tags',
      '<a href="">lorem <strong>ipsum</strong></a>',
      'lorem ipsum'
    ],
    [
      'should remove comments',
      '<!-- lorem -- ipsum -- --> dolor sit amet',
      ' dolor sit amet'
    ],
    [
      'should strip tags within comments',
      '<!-- <strong>lorem ipsum</strong> --> dolor sit',
      ' dolor sit'
    ],
    [
      'should not fail with nested quotes',
      '<article attr="foo \'bar\'">lorem</article> ipsum',
      'lorem ipsum'
    ],
    [
      'should strip extra < within tags',
      '<div<>>lorem ipsum</div>',
      'lorem ipsum'
    ],
    [
      'should strip <> within quotes',
      '<a href="<script>">lorem ipsum</a>',
      'lorem ipsum'
    ]
  ];

  for (const [title, input, expected] of specs) {
    it(title, () => {
      stripHTML(input).should.eql(expected);
    });
  }
});

0 comments on commit 57f70d9

Please sign in to comment.