-
Notifications
You must be signed in to change notification settings - Fork 0
/
audit_script.py
119 lines (102 loc) · 4.02 KB
/
audit_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pprint
import re
from collections import defaultdict

# cElementTree was removed in Python 3.9; fall back to the plain module there.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
# Count how often each XML tag occurs in the file
def count_tags(filename):
    """Return a dict mapping every element tag in *filename* to its count.

    *filename* is handed straight to ``ET.iterparse``; each element the
    parser yields is tallied once under its tag name.
    """
    counts = {}
    for _, node in ET.iterparse(filename):
        counts[node.tag] = counts.get(node.tag, 0) + 1
    return counts
# Collect the attribute names that appear on each kind of XML tag
def find_attributes(filename):
    """Return a defaultdict mapping tag name -> set of attribute names seen.

    A tag whose elements never carry any attribute gets no entry at all.
    """
    attrs_by_tag = defaultdict(set)
    for _, node in ET.iterparse(filename):
        # Guard so that attribute-less elements do not create an empty entry.
        if node.attrib:
            attrs_by_tag[node.tag].update(node.attrib.keys())
    return attrs_by_tag
# Overview of the keys and values carried by "<tag>" elements inside ways
def find_tag_keys_vals(filename):
    """Return a defaultdict mapping each tag key to the set of its values.

    Only ``tag`` elements found under a ``way`` element are considered.
    Raises ``KeyError`` if such a tag lacks a ``k`` or ``v`` attribute.
    """
    values_by_key = defaultdict(set)
    for _, node in ET.iterparse(filename):
        if node.tag != "way":
            continue
        for tag in node.iter("tag"):
            attrs = tag.attrib
            values_by_key[attrs["k"]].add(attrs["v"])
    return values_by_key
# Similar to course material -------------------------------------------------
# Key made up only of lowercase letters and underscores, e.g. "highway".
lower = re.compile(r'^([a-z]|_)*$')
# Two such lowercase parts joined by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character that is considered problematic inside a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def process_map(filename):
    """Classify the key of every ``tag`` element in *filename*.

    Returns a dict with the counters "lower", "lower_colon",
    "problemchars" and "other", as updated by ``key_type``.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys


def key_type(element, keys):
    """Increment the counter in *keys* that fits *element*'s ``k`` attribute.

    Elements whose tag is not "tag" leave *keys* untouched.  Keys that fit
    none of the three patterns are printed and counted under "other".
    The same *keys* dict is mutated in place and returned.
    Raises ``KeyError`` if a ``tag`` element has no ``k`` attribute.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib["k"]
    if lower.match(key):
        keys["lower"] += 1
    elif lower_colon.match(key):
        keys["lower_colon"] += 1
    elif problemchars.search(key):
        # search(), not match(): the pattern is unanchored, and match() would
        # only test the first character, missing e.g. the space in "bad key".
        keys["problemchars"] += 1
    else:
        print(key)
        keys["other"] += 1
    return keys
# Ends here ---------------------------------------------------------------------------
# Finding tag keys with multiple fields such as "addr:housenumber"
def secondary_tags(filename):
    """Group two-part tag keys by their prefix.

    For every ``tag`` element whose ``k`` attribute contains exactly one
    colon ("addr:housenumber"), the part after the colon is added to the
    set stored under the part before it.  Returns a defaultdict of sets.

    Only the ``k`` attribute is inspected; tag values (``v``) such as URLs
    or times that happen to contain a colon are not treated as keys.
    """
    compile_seconds = defaultdict(set)
    for _, element in ET.iterparse(filename):
        if element.tag != "tag":
            continue
        key = element.get("k")
        if key is None:
            # A tag without a key has nothing to split.
            continue
        key_pairs = key.split(":")
        if len(key_pairs) == 2:
            primary, secondary = key_pairs
            compile_seconds[primary].add(secondary)
    return compile_seconds
# Overview of fields containing street names
def street_names(filename):
    """Return the values of street-like tags found on nodes and ways.

    A tag is collected when its key is "addr:street", "addr:full" or
    "name".  Values are listed in the order encountered, duplicates kept.
    """
    street_keys = ("addr:street", "addr:full", "name")
    found = []
    for _, node in ET.iterparse(filename):
        if node.tag not in ("node", "way"):
            continue
        found.extend(
            tag.attrib["v"]
            for tag in node.iter("tag")
            if tag.get("k") in street_keys
        )
    return found
# Overview of fields containing phone numbers
def phone_numbers(filename):
    """Return the values of phone-related tags found on nodes and ways.

    A tag is collected when its key is "contact:phone", "contact:mobile"
    or "phone".  Values are listed in the order encountered.
    """
    phone_keys = {"contact:phone", "contact:mobile", "phone"}
    collected = []
    for _, node in ET.iterparse(filename):
        if node.tag in ("node", "way"):
            matches = [t for t in node.iter("tag") if t.get("k") in phone_keys]
            for tag in matches:
                # TODO: add check for mobile or not and create dictionary
                collected.append(tag.attrib["v"])
    return collected
# Auditing source names and their counts
def source_names(filename):
    """Return a dict mapping each "source" tag value to how often it occurs.

    Only ``tag`` elements under ``node`` or ``way`` elements are counted.
    """
    tally = {}
    for _, node in ET.iterparse(filename):
        if node.tag != "node" and node.tag != "way":
            continue
        for tag in node.iter("tag"):
            if tag.get("k") != "source":
                continue
            origin = tag.get("v")
            tally[origin] = tally.get(origin, 0) + 1
    return tally
# Checking out postal codes
def post_codes(filename):
    """Return the set of distinct "addr:postcode" values on nodes and ways."""
    seen = set()
    for _, node in ET.iterparse(filename):
        if node.tag in ("node", "way"):
            seen |= {
                tag.get("v")
                for tag in node.iter("tag")
                if tag.get("k") == "addr:postcode"
            }
    return seen
# Pretty-print the per-tag counts for "delhi_map.osm".
# NOTE: this is a module-level statement, so it runs whenever the file is
# executed or imported, and it needs "delhi_map.osm" to be readable.
pprint.pprint(count_tags("delhi_map.osm"))