crawl_all.py
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Florian Maurer, Christian Rieke
#
# SPDX-License-Identifier: AGPL-3.0-or-later
import logging
import os.path as osp
import sys
from glob import glob
from pathlib import Path

log = logging.getLogger("crawler")
log.setLevel(logging.INFO)


def import_and_exec(module, schema_name):
    """
    Imports and executes the main(schema_name) function of the given module.
    A module must reside in the crawler folder.
    """
    try:
        # a non-empty fromlist makes __import__ return the submodule itself
        # instead of the top-level "crawler" package
        imported_module = __import__(f"crawler.{module}", fromlist=["main"])
        imported_module.main(schema_name)
        log.info(f"executed main from {module}")
    except AttributeError as e:
        log.error(repr(e))
    except Exception as e:
        log.error(f"could not import/execute main of crawler: {module} - {e}")


def get_available_crawlers():
    crawler_path = osp.join(osp.dirname(__file__), "crawler")
    crawlers = []
    for f in glob(crawler_path + "/*.py"):
        crawler = osp.basename(f)[:-3]
        # skip helper modules and crawlers without publicly available data
        if crawler not in [
            "__init__",
            "base_crawler",
            "config",
            "config_example",
            "axxteq",
            "enet",
            "dwd",
        ]:
            crawlers.append(crawler)
    crawlers.sort()
    return crawlers


if __name__ == "__main__":
    sys.path.append(str(Path().absolute()) + "/crawler")
    logging.basicConfig()
    # get_available_crawlers() already filters out crawlers without
    # publicly available data and returns the rest sorted by name
    for crawler_name in get_available_crawlers():
        log.info(f"executing crawler {crawler_name}")
        schema_name = crawler_name.replace("_crawler", "")
        # the move to schemas does not allow multiple GIS-based databases,
        # so all GIS-based crawlers now write into the public schema
        if schema_name == "nuts_mapper":
            schema_name = "public"
        import_and_exec(crawler_name, schema_name)
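
For context, here is a minimal sketch of the contract each crawler module is expected to fulfil: a module in the `crawler` folder exposing a `main(schema_name)` entry point. The module name `example_crawler` and its body are assumptions for illustration, not part of this repository.

#!/usr/bin/env python3
# crawler/example_crawler.py -- hypothetical module, for illustration only
import logging

log = logging.getLogger("crawler")


def main(schema_name):
    """Entry point called by crawl_all.py with the target schema name."""
    # a real crawler would fetch its data source and write the results
    # into the given database schema; here we only log the call to show
    # the expected interface
    log.info(f"crawling into schema {schema_name}")

Because crawl_all.py derives the schema name by stripping the `_crawler` suffix from the file name, this module would be invoked as `main("example")`.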