Skip to content

Commit dd912d2

Browse files
authored
Merge pull request #77 from du201/master
Add support for HTML file to extract-regexes.pl
2 parents 2cb9a92 + dc8c5f0 commit dd912d2

File tree

7 files changed

+98
-4
lines changed

7 files changed

+98
-4
lines changed

configure

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ if ($INSTALL_PKG_DEPS) {
3939
my @rxxr2Packages_ubuntu = ("ocaml");
4040
my @wustholzPackages_ubuntu = ("default-jdk");
4141
my @shenPackages_ubuntu = ("maven");
42-
my @dynamicAnalysisPackages_ubuntu = ("nodejs", "php-cli", "ruby", "cargo", "golang-go");
42+
my @dynamicAnalysisPackages_ubuntu = ("nodejs", "php-cli", "ruby", "cargo", "golang-go", "python3-bs4");
4343
my @requiredPackages_ubuntu = (@miscPackages_ubuntu, @rxxr2Packages_ubuntu, @wustholzPackages_ubuntu, @shenPackages_ubuntu, @dynamicAnalysisPackages_ubuntu);
4444

4545
&log("Installing dependencies");
@@ -90,7 +90,6 @@ sub configureDetectors {
9090
&configureWustholz();
9191
&configureWeideman();
9292
&configureShen();
93-
9493
chdir "$ENV{VULN_REGEX_DETECTOR_ROOT}" or die "Error, chdir failed: $!\n";
9594
return;
9695
}

src/extract/extract-regexes.pl

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
my %language2extractor = (
2929
"javascript" => "$pref/javascript/extract-regexps.js",
3030
"python" => "$pref/python/python-extract-regexps-wrapper.pl",
31+
"html" => "$pref/html/extract-regexps-html.py"
3132
);
3233

3334
for my $lang (keys %language2extractor) {
@@ -64,10 +65,13 @@
6465
}
6566

6667
# Invoke the appropriate extractor.
68+
# If HTML file, extract the js part to create a new js file and pipeline it into the js extractor
69+
6770
my $extractor = $language2extractor{$language};
6871
if ($extractor and -x $extractor) {
6972
print STDERR "$extractor '$json->{file}'\n";
7073
my $result = decode_json(`$extractor '$json->{file}' 2>/dev/null`);
74+
7175
# Add the language to the output to simplify pipelining.
7276
$result->{language} = $language;
7377
print STDOUT encode_json($result) . "\n";
@@ -95,6 +99,7 @@ sub determineLanguage {
9599

96100
# Check the 'file' command's guess.
97101
my ($rc, $out) = &cmd("file $file");
102+
98103
#print "rc $rc out $out\n";
99104
if ($rc eq 0) {
100105
if ($out =~ m/(\s|\/)node(js)?\s/i) {
@@ -103,6 +108,9 @@ sub determineLanguage {
103108
elsif ($out =~ m/\sPython\s/i) {
104109
$language = "python";
105110
}
111+
elsif ($out =~ m/\sHTML\s/i) {
112+
$language = "html";
113+
}
106114
}
107115
# Did it work?
108116
if ($language ne $UNKNOWN_LANGUAGE) {
@@ -112,6 +120,7 @@ sub determineLanguage {
112120
return $language;
113121
}
114122

123+
115124
sub extension2language {
116125
my ($ext) = @_;
117126

@@ -122,6 +131,9 @@ sub extension2language {
122131
elsif (lc $ext eq "py") {
123132
$language = "python";
124133
}
134+
elsif (lc $ext eq "html") {
135+
$language = "html";
136+
}
125137

126138
return $language;
127139
}
@@ -132,4 +144,4 @@ sub cmd {
132144
my $rc = $? >> 8;
133145

134146
return ($rc, $out);
135-
}
147+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
# Description: This script takes the path of an HTML file from extract-regexes.pl, finds all
3+
# the script tags, and combines the JavaScript in them into a temporary JS file. It then
4+
# invokes extract-regexes.pl on that temporary file so that the JavaScript extractor
5+
# processes it, and reports the result under the original HTML path. The temporary files
6+
# are deleted by this script once the extraction has finished.
7+
8+
from bs4 import BeautifulSoup
9+
import sys
10+
import subprocess
11+
import json
12+
import tempfile
13+
import os
14+
15+
def extract_js(file_path):
    """Return the inline JavaScript of every <script> tag in an HTML file.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A single string holding the contents of all inline scripts, in
        document order, separated by newlines. Empty string if the file
        has no inline scripts.
    """
    # Read bytes rather than text so the parser can detect the document's
    # encoding instead of relying on the locale default.
    with open(file_path, 'rb') as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    fragments = []
    for script in soup.find_all('script'):
        # .string is None for tags without a single text child, e.g. external
        # scripts (<script src="..."></script>) and empty tags; skip those
        # instead of failing on str + None.
        code = script.string
        if code:
            fragments.append(code)

    # Join with a newline so that a fragment lacking a trailing newline or
    # semicolon cannot fuse with the first statement of the next one.
    return '\n'.join(fragments)
24+
25+
def extract_regexes(js_from_html, file_path):
    """Run the regex extractor on JavaScript that was pulled out of an HTML file.

    The JavaScript is written to a temporary .js file, and a temporary JSON
    query naming that file is passed to extract-regexes.pl (located under
    $VULN_REGEX_DETECTOR_ROOT). Both temporary files are removed before
    returning, including when an error occurs.

    Args:
        js_from_html: JavaScript source text to analyse.
        file_path: Path of the original HTML file; reported as 'file' in the
            result in place of the temporary .js path.

    Returns:
        The extractor's JSON output as a string, with 'file' set to file_path.

    Raises:
        KeyError: If VULN_REGEX_DETECTOR_ROOT is not set.
        json.JSONDecodeError: If the extractor does not print valid JSON.
    """
    js_path = None
    query_path = None
    try:
        # delete=False: the files must outlive these handles so the child
        # process can open them by name; they are removed in the finally.
        with tempfile.NamedTemporaryFile(suffix='.js', mode='w+t', delete=False) as js_tempfile:
            js_path = js_tempfile.name
            js_tempfile.write(js_from_html)

        # create temp json file to pass to the meta-program
        with tempfile.NamedTemporaryFile(suffix='.json', mode='w+t', delete=False) as json_tempfile:
            query_path = json_tempfile.name
            json.dump({"file": js_path, "language": "javascript"}, json_tempfile)

        extractor = os.path.join(os.environ['VULN_REGEX_DETECTOR_ROOT'],
                                 'src/extract/extract-regexes.pl')
        output = subprocess.run([extractor, query_path],
                                capture_output=True, text=True)
    finally:
        # delete the temp js and json file, even if something above failed
        for path in (js_path, query_path):
            if path is not None and os.path.exists(path):
                os.remove(path)

    output_json = json.loads(output.stdout)
    output_json['file'] = file_path
    return json.dumps(output_json)
47+
48+
if __name__ == '__main__':
    # Entry point: extract-regexes.pl runs this script with the HTML file
    # path as the first argument and reads the JSON result from stdout.
    if len(sys.argv) < 2:
        # Fail with a usage message instead of an IndexError traceback.
        sys.exit('Usage: {} <file.html>'.format(sys.argv[0]))

    file_path = sys.argv[1]
    js_from_html = extract_js(file_path)

    # call the meta-program; no trailing newline is added to its output
    print(extract_regexes(js_from_html, file_path), end='')
53+
54+

src/extract/test/html/jsonFile.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"file": "./test/html/t.html", "language": "html"}

src/extract/test/html/t.html

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<header>
4+
<script>
5+
let script_var = 5;
6+
console.log('test');
7+
'abc'.match(/def/);
8+
new RegExp('aaa');
9+
</script>
10+
11+
<script>
12+
var re = /abcsdxxx/;
13+
</script>
14+
15+
</header>
16+
17+
<body>
18+
<h1>My First Heading</h1>
19+
<p>My first paragraph.</p>
20+
<script>
21+
var re = /abcsdsdfdf/;
22+
</script>
23+
</body>
24+
25+
</html>
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"file":"test/js/t.js"}
1+
{"file": "./test/javascript/t.js", "language": "javascript"}

src/extract/test/javascript/t.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@
33
var re = /abc/;
44
'abc'.match(/def/);
55
new RegExp('aaa');
6+
7+
var re_string = '\\w+';
8+
new RegExp(re_string);

0 commit comments

Comments
 (0)