From 732602d0506a851b2135cab8083523ccac692ea4 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 20 Sep 2020 22:03:50 +0800 Subject: [PATCH] refactor(strip_html): remove striptags deps --- lib/strip_html.js | 119 +++++++++++++++++++++++++++++++++++++++- package.json | 3 +- test/strip_html.spec.js | 54 ++++++++++++++++++ 3 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 test/strip_html.spec.js diff --git a/lib/strip_html.js b/lib/strip_html.js index 804647cb..4c4618ad 100644 --- a/lib/strip_html.js +++ b/lib/strip_html.js @@ -1,3 +1,120 @@ 'use strict'; -module.exports = require('striptags'); +const STATE_PLAINTEXT = Symbol('plaintext'); +const STATE_HTML = Symbol('html'); +const STATE_COMMENT = Symbol('comment'); + +function striptags(html = '') { + let state = STATE_PLAINTEXT; + let tag_buffer = ''; + let depth = 0; + let in_quote_char = ''; + let output = ''; + + const { length } = html; + + for (let idx = 0; idx < length; idx++) { + const char = html[idx]; + + if (state === STATE_PLAINTEXT) { + switch (char) { + case '<': + state = STATE_HTML; + tag_buffer = tag_buffer + char; + break; + + default: + output += char; + break; + } + } else if (state === STATE_HTML) { + switch (char) { + case '<': + // ignore '<' if inside a quote + if (in_quote_char) break; + + // we're seeing a nested '<' + depth++; + break; + + case '>': + // ignore '>' if inside a quote + if (in_quote_char) { + break; + } + + // something like this is happening: '<<>>' + if (depth) { + depth--; + + break; + } + + // this is closing the tag in tag_buffer + in_quote_char = ''; + state = STATE_PLAINTEXT; + // tag_buffer += '>'; + + tag_buffer = ''; + break; + + case '"': + case '\'': + // catch both single and double quotes + + if (char === in_quote_char) { + in_quote_char = ''; + } else { + in_quote_char = in_quote_char || char; + } + + tag_buffer = tag_buffer + char; + break; + + case '-': + if (tag_buffer === '': + if (tag_buffer.slice(-2) === '--') { + // close the comment + state = STATE_PLAINTEXT; + } + + tag_buffer = ''; + break; + + default: + tag_buffer = tag_buffer + char; + break; + } + } + } + + return output; +} + +module.exports = striptags; diff --git a/package.json b/package.json index 9830eaff..3f1b7eee 100644 --- a/package.json +++ b/package.json @@ -50,8 +50,7 @@ "highlight.js": "^10.0.0", "htmlparser2": "^4.0.0", "prismjs": "^1.17.1", - "strip-indent": "^3.0.0", - "striptags": "^3.1.1" + "strip-indent": "^3.0.0" }, "engines": { "node": ">=10.13.0" diff --git a/test/strip_html.spec.js b/test/strip_html.spec.js new file mode 100644 index 00000000..78336b7d --- /dev/null +++ b/test/strip_html.spec.js @@ -0,0 +1,54 @@ +'use strict'; + +const stripHTML = require('../lib/strip_html'); + +describe('stripHTML', () => { + it('should not strip invalid tags', () => { + const text = 'lorem ipsum < a> < div>'; + + stripHTML(text).should.eql(text); + }); + + it('should remove simple HTML tags', () => { + const html = 'lorem ipsum'; + const text = 'lorem ipsum'; + + stripHTML(html).should.eql(text); + }); + + it('should remove comments', () => { + const html = ' dolor sit amet'; + const text = ' dolor sit amet'; + + stripHTML(html).should.eql(text); + }); + + it('should strip tags within comments', () => { + const html = ' dolor sit'; + const text = ' dolor sit'; + + stripHTML(html).should.eql(text); + }); + + + it('should not fail with nested quotes', () => { + const html = '
lorem
ipsum'; + const text = 'lorem ipsum'; + + stripHTML(html).should.eql(text); + }); + + it('should strip extra < within tags', () => { + const html = '>lorem ipsum'; + const text = 'lorem ipsum'; + + stripHTML(html).should.eql(text); + }); + + it('should strip <> within quotes', () => { + const html = 'lorem ipsum'; + const text = 'lorem ipsum'; + + stripHTML(html).should.eql(text); + }); +});