-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathhtml-parser.js
126 lines (111 loc) · 3.4 KB
/
html-parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'use strict';
const URL = require('url').URL; // for backwards compability (node < 10)
const cheerio = require('cheerio');
/**
 * Gathers the `data` of every child node of the elements matched by
 * `selector` that do not carry a `src` attribute.
 *
 * @param {Function} $ - Loaded cheerio instance.
 * @param {string} selector - Selector of the elements to inspect.
 * @returns {Array} One entry per child node, in document order.
 */
function collectInlineText($, selector) {
  const chunks = [];
  $(selector).each((index, el) => {
    if (typeof el.attribs.src !== 'undefined') {
      return; // externally sourced element, nothing inline to collect
    }
    el.children.forEach((child) => {
      chunks.push(child.data);
    });
  });
  return chunks;
}

/**
 * Resolves the given attribute of every matched element against the page URL.
 *
 * Resolution is delegated to `new URL(reference, base)`, so document-relative
 * paths ("css/a.css", "../a.js"), root-relative paths ("/a.css"),
 * protocol-relative references ("//cdn/a.css") and absolute URLs of any
 * scheme (including "data:") are all handled by the URL parser itself instead
 * of by string concatenation.
 *
 * @param {Function} $ - Loaded cheerio instance.
 * @param {string} selector - Selector of the elements to inspect.
 * @param {string} attribute - Attribute holding the reference ("href"/"src").
 * @param {string} pageUrl - Absolute URL of the page the markup came from.
 * @param {Function} [accept] - Optional predicate; elements for which it
 *   returns a falsy value are skipped.
 * @returns {Array<{url: URL}>} One entry per accepted element that has the attribute.
 * @throws {TypeError} If `pageUrl` or one of the references cannot be parsed.
 */
function collectResolvedURLs($, selector, attribute, pageUrl, accept) {
  // Parsed up front so an invalid page URL fails even when nothing matches.
  const base = new URL(pageUrl);
  const resources = [];
  $(selector).each((index, el) => {
    const reference = el.attribs[attribute];
    if (typeof reference === 'undefined') {
      return;
    }
    if (accept && !accept(el)) {
      return;
    }
    resources.push({ url: new URL(reference, base) });
  });
  return resources;
}

/** Attributes that may carry the identifier of a <meta> element. */
const META_KEY_ATTRIBUTES = ['name', 'property', 'itemprop', 'http-equiv'];

/**
 * Thin wrapper around a cheerio document that extracts inline code,
 * referenced resources and meta information from an HTML string.
 */
class HTMLParser {
  /**
   * @param {string} html - Markup handed to `cheerio.load`.
   */
  constructor(html) {
    this.html = html;
    this.$ = cheerio.load(this.html);
  }

  /**
   * @returns {Array} Child node data of every <style> element without a
   *   `src` attribute.
   */
  getInlineCSS() {
    return collectInlineText(this.$, 'style');
  }

  /**
   * @returns {Array} Child node data of every <script> element without a
   *   `src` attribute.
   */
  getInlineJS() {
    return collectInlineText(this.$, 'script');
  }

  /**
   * Lists the stylesheets referenced via <link rel="stylesheet" href="...">.
   *
   * @param {string} url - Absolute URL of the page; base for relative hrefs.
   * @returns {Array<{url: URL}>} Resolved stylesheet URLs in document order.
   * @throws {TypeError} If `url` or an href cannot be parsed as a URL.
   */
  getStylesheetURLs(url) {
    return collectResolvedURLs(
      this.$,
      'link',
      'href',
      url,
      (el) => el.attribs.rel === 'stylesheet'
    );
  }

  /**
   * Lists the scripts referenced via <script src="...">.
   *
   * @param {string} url - Absolute URL of the page; base for relative srcs.
   * @returns {Array<{url: URL}>} Resolved script URLs in document order.
   * @throws {TypeError} If `url` or a src cannot be parsed as a URL.
   */
  getJavascriptURLs(url) {
    return collectResolvedURLs(this.$, 'script', 'src', url);
  }

  /**
   * Lists the images referenced via <img src="...">.
   *
   * @param {string} url - Absolute URL of the page; base for relative srcs.
   * @returns {Array<{url: URL}>} Resolved image URLs in document order.
   * @throws {TypeError} If `url` or a src cannot be parsed as a URL.
   */
  getImageURLs(url) {
    return collectResolvedURLs(this.$, 'img', 'src', url);
  }

  /**
   * @returns {Array<string>} Raw (unresolved) `src` values of all <iframe>
   *   elements that have one.
   */
  getiFrames() {
    const iframes = [];
    this.$('iframe').each((index, el) => {
      if (typeof el.attribs.src !== 'undefined') {
        iframes.push(el.attribs.src);
      }
    });
    return iframes;
  }

  /**
   * Groups the `content` values of <meta> elements by their identifier.
   *
   * The identifier is the value of the first attribute on the element (in
   * attribute order) that is one of name / property / itemprop / http-equiv.
   * Elements without any of those attributes (e.g. <meta charset>) are
   * skipped, because they have no identifier to group under.
   *
   * @returns {Object<string, Array>} Identifier -> list of `content` values
   *   (an entry is `undefined` when the element has no `content` attribute).
   */
  getMetaData() {
    const metatags = {};
    this.$('meta').each((index, el) => {
      const key = Object.keys(el.attribs).find((attr) => META_KEY_ATTRIBUTES.includes(attr));
      if (typeof key === 'undefined') {
        return;
      }
      const name = el.attribs[key];
      const value = el.attribs['content'];
      // Only own properties count: identifiers such as "constructor" or
      // "toString" must not resolve to members inherited from Object.prototype.
      if (!Object.prototype.hasOwnProperty.call(metatags, name)) {
        // defineProperty (rather than assignment) so that "__proto__" becomes
        // a regular own key instead of triggering the prototype setter.
        Object.defineProperty(metatags, name, {
          value: [],
          writable: true,
          enumerable: true,
          configurable: true,
        });
      }
      metatags[name].push(value);
    });
    return metatags;
  }

  /**
   * @returns {Array} `href` values of all <link rel="dns-prefetch"> elements
   *   (an entry is `undefined` when the element has no `href`).
   */
  checkPrefetching() {
    const prefetching = [];
    this.$('link').each((index, el) => {
      if (el.attribs.rel === 'dns-prefetch') {
        prefetching.push(el.attribs.href);
      }
    });
    return prefetching;
  }
}
module.exports = HTMLParser;