-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathvectorize_with_raw_features.py
executable file
·65 lines (51 loc) · 1.93 KB
/
vectorize_with_raw_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from idpanel.training.vectorization import vectorize
from idpanel.training.features import load_raw_features
from idpanel.labels import load_labels
from idpanel.blacklist import labels_to_ignore
import json
from multiprocessing.pool import Pool
from multiprocessing import cpu_count
# Per-process state: both names are assigned by preload_process() (used as
# the multiprocessing Pool initializer) and read by compute_vectors().
# They stay None until preload_process() has run in the current process.
_raw_features = None
_sites = None
def preload_process(sites):
    """Initialise the module-level state used by compute_vectors().

    Passed to ``Pool`` as ``initializer`` so it runs once in each worker
    process: it loads the raw features via ``load_raw_features()`` and
    stores them, together with *sites*, in module globals.

    Args:
        sites: mapping indexed by site key; stored as-is and later looked
            up by ``compute_vectors(site)``.
    """
    global _raw_features, _sites
    # Right-hand side is evaluated first, so nothing is assigned if the
    # feature load raises.
    _raw_features, _sites = load_raw_features(), sites
def compute_vectors(site):
    """Vectorize a single site using the state set up by preload_process().

    Args:
        site: key into the ``_sites`` mapping.

    Returns:
        A ``(vector, site)`` tuple, where ``vector`` is the output of
        ``vectorize`` materialised as a list. The key is returned alongside
        the vector so a caller consuming results out of order can tell
        which site each vector belongs to.
    """
    # Only reads the module globals, so no ``global`` declaration is needed.
    site_data = _sites[site]
    vector = list(vectorize(_raw_features, site_data))
    return vector, site
def read_prevectors(path):
    """Read newline-delimited JSON records from *path*.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, every other line is parsed with
    ``json.loads``.

    Returns:
        List of decoded records, in file order.
    """
    with open(path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]


def group_by_base_url(data_points):
    """Group prevector records by their ``base_url``.

    Args:
        data_points: iterable of dicts with the keys ``base_url``,
            ``offset``, ``code`` and ``content_ssdeep``; the first record
            seen for each ``base_url`` must also carry ``label``.

    Returns:
        ``(sites, site_labels)`` where ``sites[base_url][offset]`` is
        ``{"code": ..., "content_ssdeep": ...}`` and
        ``site_labels[base_url]`` is the label of the first record seen for
        that base URL (later records do not override it).
    """
    sites = {}
    site_labels = {}
    for dp in data_points:
        base_url = dp['base_url']
        if base_url not in sites:
            sites[base_url] = {}
            site_labels[base_url] = dp['label']
        sites[base_url][dp['offset']] = {
            "code": dp['code'],
            "content_ssdeep": dp['content_ssdeep'],
        }
    return sites, site_labels


def main():
    """Vectorize every non-ignored site and write raw_feature_vectors.json.

    Reads ``prevectors.json`` from the current directory, groups the
    records by base URL, vectorizes each site in a process pool and dumps
    ``{"labels": ..., "names": ..., "vectors": ...}`` as JSON.
    """
    # print(...) with a single argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Loading prevectors")
    data_points = read_prevectors("prevectors.json")

    # NOTE(review): the return value is not used anywhere in this script;
    # the call is kept in case load_labels() has side effects - confirm
    # and remove if it does not.
    load_labels()
    raw_features = load_raw_features()
    print("Loaded {0} features".format(len(raw_features)))

    print("Grouping prevectors by base_url")
    sites, site_labels = group_by_base_url(data_points)

    print("Vectorizing {0} base urls".format(len(sites)))
    # Drop ignored labels *before* dispatching so no worker time is spent
    # on vectors that would be thrown away.
    wanted = [
        site for site in sites
        if site_labels[site] not in labels_to_ignore
    ]

    labels = []
    names = []
    vectors = []
    pool = Pool(processes=cpu_count(), initializer=preload_process,
                initargs=(sites,))
    try:
        for vector, site in pool.imap_unordered(compute_vectors, wanted):
            vectors.append(vector)
            labels.append(site_labels[site])
            names.append(site)
            print("Vector for {0} completed".format(site))
    finally:
        # Always release the worker processes. On the success path every
        # result has already been consumed, so terminate() discards nothing;
        # on failure it stops outstanding work instead of waiting for it.
        pool.terminate()
        pool.join()

    with open("raw_feature_vectors.json", "w") as f:
        json.dump({"labels": labels, "names": names, "vectors": vectors}, f)


if __name__ == "__main__":
    main()