Skip to content

Add support for HTML file to extract-regexes.pl #77

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ if ($INSTALL_PKG_DEPS) {
my @rxxr2Packages_ubuntu = ("ocaml");
my @wustholzPackages_ubuntu = ("default-jdk");
my @shenPackages_ubuntu = ("maven");
my @dynamicAnalysisPackages_ubuntu = ("nodejs", "php-cli", "ruby", "cargo", "golang-go");
my @dynamicAnalysisPackages_ubuntu = ("nodejs", "php-cli", "ruby", "cargo", "golang-go", "python3-bs4");
my @requiredPackages_ubuntu = (@miscPackages_ubuntu, @rxxr2Packages_ubuntu, @wustholzPackages_ubuntu, @shenPackages_ubuntu, @dynamicAnalysisPackages_ubuntu);

&log("Installing dependencies");
Expand Down Expand Up @@ -90,7 +90,6 @@ sub configureDetectors {
&configureWustholz();
&configureWeideman();
&configureShen();

chdir "$ENV{VULN_REGEX_DETECTOR_ROOT}" or die "Error, chdir failed: $!\n";
return;
}
Expand Down
14 changes: 13 additions & 1 deletion src/extract/extract-regexes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# Maps each supported language name to the extractor program that is run
# for source files of that language. All paths are rooted at $pref.
my %language2extractor = (
"javascript" => "$pref/javascript/extract-regexps.js",
"python" => "$pref/python/python-extract-regexps-wrapper.pl",
"html" => "$pref/html/extract-regexps-html.py"
);

for my $lang (keys %language2extractor) {
Expand Down Expand Up @@ -64,10 +65,13 @@
}

# Invoke the appropriate extractor.
# If HTML file, extract the js part to create a new js file and pipeline it into the js extractor

my $extractor = $language2extractor{$language};
if ($extractor and -x $extractor) {
print STDERR "$extractor '$json->{file}'\n";
my $result = decode_json(`$extractor '$json->{file}' 2>/dev/null`);

# Add the language to the output to simplify pipelining.
$result->{language} = $language;
print STDOUT encode_json($result) . "\n";
Expand Down Expand Up @@ -95,6 +99,7 @@ sub determineLanguage {

# Check the 'file' command's guess.
my ($rc, $out) = &cmd("file $file");

#print "rc $rc out $out\n";
if ($rc eq 0) {
if ($out =~ m/(\s|\/)node(js)?\s/i) {
Expand All @@ -103,6 +108,9 @@ sub determineLanguage {
elsif ($out =~ m/\sPython\s/i) {
$language = "python";
}
elsif ($out =~ m/\sHTML\s/i) {
$language = "html";
}
}
# Did it work?
if ($language ne $UNKNOWN_LANGUAGE) {
Expand All @@ -112,6 +120,7 @@ sub determineLanguage {
return $language;
}


sub extension2language {
my ($ext) = @_;

Expand All @@ -122,6 +131,9 @@ sub extension2language {
elsif (lc $ext eq "py") {
$language = "python";
}
elsif (lc $ext eq "html") {
$language = "html";
}

return $language;
}
Expand All @@ -132,4 +144,4 @@ sub cmd {
my $rc = $? >> 8;

return ($rc, $out);
}
}
54 changes: 54 additions & 0 deletions src/extract/src/html/extract-regexps-html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# Description: This file takes an HTML file from extract-regexes.pl, finds all the script
# tags, and combines the JS in them into a temporary JS file. It then invokes
# extract-regexes.pl on that temporary file (via a temporary JSON request file) so that the
# JavaScript extractor processes it, rewrites the "file" field of the result to the original
# HTML path, and prints the result. Both temporary files are deleted by this script.

from bs4 import BeautifulSoup
import sys
import subprocess
import json
import tempfile
import os

def extract_js(file_path):
    """Collect the inline JavaScript of every <script> tag in an HTML file.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A string holding the contents of all inline <script> tags in document
        order, separated by newlines. Empty string if there is no inline script.
    """
    with open(file_path) as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    # Tag.string is None when the tag does not have exactly one text child,
    # e.g. <script src="..."></script>; such tags carry no inline JS to extract.
    snippets = [script.string
                for script in soup.find_all('script')
                if script.string is not None]

    # Join with newlines so the last statement (or a trailing // comment) of one
    # block cannot fuse with the first statement of the next block.
    return '\n'.join(snippets)

def extract_regexes(js_from_html, file_path):
    """Run the regex-extraction meta-program on JavaScript taken from an HTML file.

    The JavaScript is written to a temporary .js file, and a temporary .json
    request file pointing at it is handed to src/extract/extract-regexes.pl
    (located under $VULN_REGEX_DETECTOR_ROOT).

    Args:
        js_from_html: JavaScript source text to analyse.
        file_path: Path of the originating HTML file; reported as "file" in the result.

    Returns:
        The meta-program's JSON output, re-serialised as a string, with its
        "file" field replaced by file_path.

    Raises:
        KeyError: if VULN_REGEX_DETECTOR_ROOT is not set.
        OSError: if the meta-program cannot be executed.
        json.JSONDecodeError: if the meta-program did not print valid JSON.
    """
    extractor = os.path.join(os.environ['VULN_REGEX_DETECTOR_ROOT'],
                             'src/extract/extract-regexes.pl')

    temp_paths = []
    try:
        with tempfile.NamedTemporaryFile(suffix='.js', mode='w+t', delete=False) as js_tempfile:
            temp_paths.append(js_tempfile.name)
            js_tempfile.write(js_from_html)

        # create temp json file to pass to the meta-program
        with tempfile.NamedTemporaryFile(suffix='.json', mode='w+t', delete=False) as json_tempfile:
            temp_paths.append(json_tempfile.name)
            json.dump({"file": js_tempfile.name, "language": "javascript"}, json_tempfile)

        output = subprocess.run([extractor, json_tempfile.name],
                                capture_output=True, text=True)
    finally:
        # Delete the temp js and json files even if the meta-program could not
        # be started, so that a failure never leaves files behind.
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: do not mask the original error.
                pass

    output_json = json.loads(output.stdout)
    output_json['file'] = file_path
    return json.dumps(output_json)

# Script entry point: the first command-line argument is the HTML file to analyse.
file_path = sys.argv[1]

# Gather the inline JavaScript found in the HTML file.
js_from_html = extract_js(file_path)

# call the meta-program and emit its JSON result without a trailing newline
sys.stdout.write(extract_regexes(js_from_html, file_path))


1 change: 1 addition & 0 deletions src/extract/test/html/jsonFile.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"file": "./test/html/t.html", "language": "html"}
25 changes: 25 additions & 0 deletions src/extract/test/html/t.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<!DOCTYPE html>
<!-- Test fixture: three inline script blocks (two in the head, one in the body) containing regexes. -->
<html>
<head>
<script>
let script_var = 5;
console.log('test');
'abc'.match(/def/);
new RegExp('aaa');
</script>

<script>
var re = /abcsdxxx/;
</script>

</head>

<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<script>
var re = /abcsdsdfdf/;
</script>
</body>

</html>
2 changes: 1 addition & 1 deletion src/extract/test/javascript/jsonFile.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"file":"test/js/t.js"}
{"file": "./test/javascript/t.js", "language": "javascript"}
3 changes: 3 additions & 0 deletions src/extract/test/javascript/t.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
var re = /abc/;
'abc'.match(/def/);
new RegExp('aaa');

var re_string = '\\w+';
new RegExp(re_string);