Skip to content

Commit 2ef415b

Browse files
committed
Merge pull request #27 from mlsecproject/releaseprep
Release v0.1
2 parents def42ed + a682363 commit 2ef415b

29 files changed

+2662949
-240
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,5 @@ venv
5151
# other data files
5252
crop.json
5353
harvest.csv
54+
55+
.ipynb_checkpoints

MANIFEST.in

Lines changed: 0 additions & 1 deletion
This file was deleted.

README.md

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,26 @@ combine
33

44
Combine gathers OSINT Threat Intelligence Feeds
55

6-
You can run the original harvest.py tool with a cmd line like this:
6+
You can run the core tool with `combine.py`:
7+
```
8+
usage: combine.py [-h] [-t TYPE] [-f FILE] [-d] [-e] [--tiq-test]
9+
10+
optional arguments:
11+
-h, --help show this help message and exit
12+
-t TYPE, --type TYPE Specify output type. Currently supported: CSV
13+
-f FILE, --file FILE Specify output file. Defaults to harvest.FILETYPE
14+
-d, --delete Delete intermediate files
15+
-e, --enrich Enrich data
16+
--tiq-test Output in tiq-test format
17+
```
18+
19+
Alternatively, you can run each phase individually:
20+
721

822
````
923
python reaper.py
1024
python thresher.py
25+
python winnower.py
1126
python baler.py
1227
`````
1328
@@ -37,6 +52,20 @@ An output example:
3752
"bgr.runk.pl","FQDN","outbound","mtc_malwaredns","Malware","2014-06-01"
3853
```
3954
55+
The output can optionally be filtered and enriched with additional data by passing the `-e`/`--enrich` option. Enriched output looks like the following:
56+
```
57+
"entity","type","direction","source","notes","date","asnumber","asname","country","host","rhost"
58+
"1.234.23.28","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","9318","Hanaro Telecom Inc.","KR",,
59+
"1.234.35.198","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","9318","Hanaro Telecom Inc.","KR",,
60+
"1.25.36.76","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4837","CNCGROUP China169 Backbone","CN",,
61+
"1.93.1.162","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4808","CNCGROUP IP network China169 Beijing Province Network","CN",,
62+
"1.93.44.147","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4808","CNCGROUP IP network China169 Beijing Province Network","CN",,
63+
"100.42.218.250","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","18450","WebNX, Inc.","US",,"100-42-218-250.static.webnx.com"
64+
"100.42.55.2","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"stats.wren.arvixe.com"
65+
"100.42.55.220","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"stats.warthog.arvixe.com"
66+
"100.42.58.137","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"100.42.58.137-static.reverse.mysitehosted.com"
67+
```
68+
4069
### Copyright Info
4170
4271
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -52,3 +81,19 @@ all copies or substantial portions of the Software.
5281
Copyright 2014 MLSec Project
5382
5483
Licensed under GPLv3 - https://github.com/mlsecproject/combine/blob/master/LICENSE
84+
85+
### DNSDB used under license
86+
87+
Copyright (c) 2013 by Farsight Security, Inc.
88+
89+
Licensed under the Apache License, Version 2.0 (the "License");
90+
you may not use this file except in compliance with the License.
91+
You may obtain a copy of the License at
92+
93+
http://www.apache.org/licenses/LICENSE-2.0
94+
95+
Unless required by applicable law or agreed to in writing, software
96+
distributed under the License is distributed on an "AS IS" BASIS,
97+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
98+
See the License for the specific language governing permissions and
99+
limitations under the License.

baler.py

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,64 @@
1+
import ConfigParser
12
import csv
3+
import datetime as dt
4+
import gzip
25
import json
6+
import os
7+
import sys
38

49

5-
def bale_csv(harvest, output_file):
10+
def tiq_output(reg_file, enr_file):
    """Write regular and enriched data as dated GZip CSVs in a tiq-test tree.

    The target root is the 'tiq_directory' option of the [Baler] section of
    'combine.cfg' with 'data' appended. Beneath it, rows are split by their
    direction field (index 2 of each row) into::

        raw/public_inbound/YYYYMMDD.csv.gz
        raw/public_outbound/YYYYMMDD.csv.gz
        enriched/public_inbound/YYYYMMDD.csv.gz
        enriched/public_outbound/YYYYMMDD.csv.gz

    Args:
        reg_file: Path of the JSON file holding the regular rows.
        enr_file: Path of the JSON file holding the enriched rows.

    Writing is best-effort: a failure on one output file is reported on
    stderr and the remaining files are still attempted. Errors while reading
    the configuration or the input files are not caught here.
    """
    config = ConfigParser.ConfigParser()
    config.read('combine.cfg')
    tiq_dir = os.path.join(config.get('Baler', 'tiq_directory'), 'data')
    today = dt.datetime.today().strftime('%Y%m%d')

    with open(reg_file, 'rb') as f:
        reg_data = json.load(f)

    with open(enr_file, 'rb') as f:
        enr_data = json.load(f)

    sys.stderr.write('Preparing tiq directory structure under %s\n' % tiq_dir)
    # Check every leaf directory rather than only the root, so a partially
    # populated tree (root present, sub-directories missing) is repaired.
    for category in ('raw', 'enriched'):
        for direction in ('inbound', 'outbound'):
            leaf = os.path.join(tiq_dir, category, 'public_' + direction)
            if not os.path.isdir(leaf):
                os.makedirs(leaf)

    outputs = (
        ('raw', reg_data, bale_reg_csvgz),
        ('enriched', enr_data, bale_enr_csvgz),
    )
    for category, data, write_csvgz in outputs:
        for direction in ('inbound', 'outbound'):
            rows = [row for row in data if row[2] == direction]
            output_file = os.path.join(tiq_dir, category,
                                       'public_' + direction, today + '.csv.gz')
            try:
                write_csvgz(rows, output_file)
            except Exception as e:
                # Stay best-effort, but tell the user instead of failing silently.
                sys.stderr.write('Could not write %s: %s\n' % (output_file, e))
47+
48+
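# HACK: the bale_* writers below are near-duplicates that differ only in their header row and in whether the output is gzip-compressed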
49+
50+
def bale_reg_csvgz(harvest, output_file):
    """Write regular rows to a GZip-compressed CSV with every field quoted.

    Args:
        harvest: Rows handed unchanged to csv's ``writerows``; presumably six
            fields each, matching the header row (not checked here).
        output_file: Path of the .csv.gz file to create or overwrite.
    """
    sys.stderr.write('Output regular data as GZip CSV to %s\n' % output_file)
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date')
    with gzip.open(output_file, 'wb') as gz_handle:
        writer = csv.writer(gz_handle, quoting=csv.QUOTE_ALL)
        # header row first, then the data rows
        writer.writerow(columns)
        writer.writerows(harvest)
59+
60+
def bale_reg_csv(harvest, output_file):
61+
sys.stderr.write('Output regular data as CSV to %s\n' % output_file)
662
with open(output_file, 'wb') as csv_file:
763
bale_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
864

@@ -11,11 +67,33 @@ def bale_csv(harvest, output_file):
1167
bale_writer.writerows(harvest)
1268

1369

70+
def bale_enr_csv(harvest, output_file):
    """Write enriched rows to a plain CSV file with every field quoted.

    Args:
        harvest: Rows handed unchanged to csv's ``writerows``; presumably
            eleven fields each, matching the header row (not checked here).
        output_file: Path of the CSV file to create or overwrite.
    """
    sys.stderr.write('Output enriched data as CSV to %s\n' % output_file)
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date',
               'asnumber', 'asname', 'country', 'host', 'rhost')
    with open(output_file, 'wb') as out_handle:
        writer = csv.writer(out_handle, quoting=csv.QUOTE_ALL)
        # header row first, then the data rows
        writer.writerow(columns)
        writer.writerows(harvest)
79+
80+
def bale_enr_csvgz(harvest, output_file):
    """Write enriched rows to a GZip-compressed CSV with every field quoted.

    Args:
        harvest: Rows handed unchanged to csv's ``writerows``; presumably
            eleven fields each, matching the header row (not checked here).
        output_file: Path of the .csv.gz file to create or overwrite.
    """
    sys.stderr.write('Output enriched data as GZip CSV to %s\n' % output_file)
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date',
               'asnumber', 'asname', 'country', 'host', 'rhost')
    with gzip.open(output_file, 'wb') as gz_handle:
        writer = csv.writer(gz_handle, quoting=csv.QUOTE_ALL)
        # header row first, then the data rows
        writer.writerow(columns)
        writer.writerows(harvest)
88+
89+
1490
def bale(input_file, output_file, output_format):
    """Load processed data from a JSON file and write it in the chosen format.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path handed to the selected writer function.
        output_format: Name of the output format, matched case-insensitively.
            Only 'csv' is currently registered.

    Raises:
        KeyError: If no writer is registered for ``output_format``.
    """
    sys.stderr.write('Reading processed data from %s\n' % input_file)
    with open(input_file, 'rb') as f:
        harvest = json.load(f)

    # TODO: also need plugins here (cf. #23)
    format_funcs = {'csv': bale_reg_csv}
    # combine.py accepts both 'csv' and 'CSV' as the type, so normalise the
    # case before dispatching; otherwise 'CSV' fails with a KeyError.
    format_funcs[output_format.lower()](harvest, output_file)

2199

combine.cfg

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[Reaper]
2+
inbound_urls = inbound_urls.txt
3+
outbound_urls = outbound_urls.txt
4+
5+
[Winnower]
6+
dnsdb_server =
7+
dnsdb_api =
8+
enrich_dns = 1
9+
enrich_ip = 0
10+
11+
[Baler]
12+
tiq_directory = tiq_test
13+
winnow = 1

combine.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python
"""Run the Combine pipeline end to end: reap, thresh, optionally winnow, bale.

Intermediate results go to harvest.json and crop.json (plus enrich.json when
enrichment is requested); the final output goes to the file given with -f, or
harvest.<type> by default.
"""

import argparse
import os
import sys

# Combine components
from reaper import reap
from thresher import thresh
from baler import bale, tiq_output
from winnower import winnow

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--type', help="Specify output type. Currently supported: CSV")
parser.add_argument('-f', '--file', help="Specify output file. Defaults to harvest.FILETYPE")
parser.add_argument('-d', '--delete', help="Delete intermediate files", action="store_true")
parser.add_argument('-e', '--enrich', help="Enrich data", action="store_true")
parser.add_argument('--tiq-test', help="Output in tiq-test format", action="store_true")
args = parser.parse_args()

possible_types = ['csv', 'CSV']

if not args.type:
    out_type = 'csv'
elif args.type not in possible_types:
    sys.exit('Invalid file type specified. Possible types are: %s' % possible_types)
else:
    # bale() registers its writers under lower-case names, so normalise here;
    # this also keeps the default file extension lower-case.
    out_type = args.type.lower()

if args.file:
    out_file = args.file
else:
    out_file = 'harvest.'+out_type

reap('harvest.json')
thresh('harvest.json', 'crop.json')
if args.enrich:
    winnow('crop.json', 'crop.json', 'enrich.json')
bale('crop.json', out_file, out_type)

# argparse stores '--tiq-test' as 'tiq_test'; 'args.tiq-test' would be parsed
# as the subtraction 'args.tiq - test' and fail at runtime.
if args.tiq_test:
    tiq_output('crop.json', 'enrich.json')

if args.delete:
    # be careful with this when we support a JSON output type
    os.remove('harvest.json')
    os.remove('crop.json')

combine_inbound.cfg

Lines changed: 0 additions & 68 deletions
This file was deleted.

combine_outbound.cfg

Lines changed: 0 additions & 74 deletions
This file was deleted.

data/GeoIP.dat

685 KB
Binary file not shown.

0 commit comments

Comments
 (0)