-
Notifications
You must be signed in to change notification settings - Fork 10
/
collector.py
113 lines (90 loc) · 3.17 KB
/
collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018, Marcelo Jorge Vieira <metal@alucinados.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
# for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import logging
import os
from elasticsearch_dsl.connections import connections
from collector.models import setup_index_template, setup_index
from collector.tse import TSE
from collector.tse_headers import year_headers
# Fallback target for the ``--download-dir`` option: ``~/Downloads/tse`` with
# the ``~`` expanded to the current user's home and the result made absolute.
DEFAULT_DOWNLOAD_DIRECTORY = os.path.abspath(
    os.path.expanduser('~/Downloads/tse'),
)
def run(args):
    """Collect data for the requested election years.

    Configures logging, opens the default Elasticsearch connection, installs
    the index template and then, for every year, sets up its index, downloads
    and extracts the TSE files and calls ``TSE.all_candidates``.

    Args:
        args: Parsed command line namespace. The attributes read here are
            ``log_level``, ``es_host``, ``es_port``, ``year`` and
            ``download_directory``. ``year`` is a comma-separated string of
            years; when it is empty or ``None`` every key of
            ``year_headers`` is collected.
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=args.log_level)

    # mute elasticsearch INFO logs
    logging.getLogger('elasticsearch').setLevel('ERROR')

    # Define a default ElasticSearch client
    es_hosts = [dict(host=args.es_host, port=args.es_port)]
    connections.create_connection(hosts=es_hosts, timeout=30)

    # Setup elastic search indices once before starting
    setup_index_template()

    if args.year:
        # Tolerate blanks and stray commas in the list, e.g.
        # "2014, 2018," -> ['2014', '2018'], so that no padded or empty
        # "year" is handed to setup_index() / TSE().
        years = [
            year.strip() for year in args.year.split(',') if year.strip()
        ]
    else:
        years = year_headers.keys()

    # Collect!
    for year in years:
        setup_index(year)
        tse = TSE(year, path=args.download_directory)
        tse.download_and_extract(remove_tmp_dir=False, remove_zip=False)
        tse.all_candidates()
def main():
    """Parse the command line and launch the collector.

    Builds the argument parser (download directory, log level, Elasticsearch
    host and port, election year), parses ``sys.argv`` and hands the
    resulting namespace to ``run``.
    """
    parser = argparse.ArgumentParser(description='Data Collector')

    # Names of the standard logging levels, obtained through the public
    # ``logging.getLevelName`` API instead of the private
    # ``logging._levelToName`` mapping (same names, same order).
    log_levels = [
        logging.getLevelName(level)
        for level in (
            logging.CRITICAL,
            logging.ERROR,
            logging.WARNING,
            logging.INFO,
            logging.DEBUG,
            logging.NOTSET,
        )
    ]

    parser.add_argument(
        '-d', '--download-dir',
        dest='download_directory',
        action='store',
        default=DEFAULT_DOWNLOAD_DIRECTORY,
        help='Directory where files will be downloaded',
    )

    parser.add_argument(
        '-l', '--log-level',
        default='CRITICAL',
        choices=log_levels,
        # Upper-case the value before it is checked against ``choices`` so
        # that e.g. "debug" is accepted.
        type=str.upper,
        help=f'Log verbosity level: {", ".join(log_levels)}',
    )

    parser.add_argument(
        '-eh', '--es-host',
        action='store',
        default='localhost',
        help='the elasticsearch host (default: localhost)',
    )

    parser.add_argument(
        '-ep', '--es-port',
        action='store',
        # Convert to int so a port given on the command line has the same
        # type as the default and invalid values are rejected by argparse.
        type=int,
        default=9200,
        help='the elasticsearch port (default: 9200)',
    )

    parser.add_argument(
        '-y', '--year',
        action='store',
        default=None,
        help='the election year',
    )

    run(parser.parse_args())
# Script entry point: launch the collector only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()