Skip to content

Commit 09c0ce9

Browse files
committed
Encoding
- Add option `--encoding` to control file input and output encoding
1 parent 8f41b19 commit 09c0ce9

File tree

9 files changed, with 53 additions and 14 deletions.

dsc_datatool/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ def process(self, datasets)
4949
generators = {}
5050
transformers = {}
5151
process_dataset = {}
52+
encoding = 'utf-8'
5253

5354

5455
class Dataset(object):
@@ -339,6 +340,8 @@ def _process(datasets, generators, transformers, outputs):
339340
help='Set the special DSC skipped key. (default to "-:SKIPPED:-")')
340341
parser.add_argument('--skipped-sum-key', nargs=1, default='-:SKIPPED_SUM:-',
341342
help='Set the special DSC skipped sum key. (default to "-:SKIPPED_SUM:-")')
343+
parser.add_argument('--encoding', nargs=1, default='utf-8',
344+
help='Encoding to use for all files, default utf-8.')
342345
parser.add_argument('-v', '--verbose', action='count', default=0,
343346
help='Increase the verbose level, can be given multiple times.')
344347
parser.add_argument('-V', '--version', action='version', version='%(prog)s v'+__version__,

dsc_datatool/generator/client_subnet_authority.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from urllib.request import Request, urlopen
1414
from io import StringIO
1515

16-
from dsc_datatool import Generator, Dataset, Dimension, args
16+
from dsc_datatool import Generator, Dataset, Dimension, args, encoding
1717

1818

1919
_whois2rir = {
@@ -97,7 +97,7 @@ def __init__(self, opts):
9797
if not isinstance(csvs, list):
9898
csvs = [ csvs ]
9999
for file in csvs:
100-
with open(file, newline='') as csvfile:
100+
with open(file, newline='', encoding=encoding) as csvfile:
101101
self._read(csvfile)
102102
elif opts.get('fetch', 'no').lower() == 'yes':
103103
urls = opts.get('url', [ urlv4, urlv6 ])

dsc_datatool/input/dat.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
import re
1111

12-
from dsc_datatool import Input, Dataset, Dimension, process_dataset
12+
from dsc_datatool import Input, Dataset, Dimension, process_dataset, encoding
1313

1414

1515
_dataset1d = [
@@ -78,7 +78,7 @@ def process(self, dir):
7878

7979
def process1d(self, file, name):
8080
datasets = []
81-
with open(file, 'r') as f:
81+
with open(file, 'r', encoding=encoding) as f:
8282
for l in f.readlines():
8383
if re.match(r'^#', l):
8484
continue
@@ -103,7 +103,7 @@ def process1d(self, file, name):
103103

104104
def process2d(self, file, name, field):
105105
datasets = []
106-
with open(file, 'r') as f:
106+
with open(file, 'r', encoding=encoding) as f:
107107
for l in f.readlines():
108108
if re.match(r'^#', l):
109109
continue
@@ -135,7 +135,7 @@ def process2d(self, file, name, field):
135135

136136
def process3d(self, file, name, first, second):
137137
datasets = []
138-
with open(file, 'r') as f:
138+
with open(file, 'r', encoding=encoding) as f:
139139
for l in f.readlines():
140140
if re.match(r'^#', l):
141141
continue

dsc_datatool/output/influxdb.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import sys
1212
import atexit
1313

14-
from dsc_datatool import Output, args
14+
from dsc_datatool import Output, args, encoding
1515

1616

1717
_re = re.compile(r'([,=\s])')
@@ -67,9 +67,9 @@ def __init__(self, opts):
6767
append = opts.get('append', False)
6868
if file:
6969
if append:
70-
self.fh = open(file, 'a')
70+
self.fh = open(file, 'a', encoding=encoding)
7171
else:
72-
self.fh = open(file, 'w')
72+
self.fh = open(file, 'w', encoding=encoding)
7373
atexit.register(self.close)
7474
else:
7575
self.fh = sys.stdout

dsc_datatool/output/prometheus.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import sys
1212
import atexit
1313

14-
from dsc_datatool import Output, args
14+
from dsc_datatool import Output, args, encoding
1515

1616

1717
_re = re.compile(r'([\\\n"])')
@@ -52,9 +52,9 @@ def __init__(self, opts):
5252
append = opts.get('append', False)
5353
if file:
5454
if append:
55-
self.fh = open(file, 'a')
55+
self.fh = open(file, 'a', encoding=encoding)
5656
else:
57-
self.fh = open(file, 'w')
57+
self.fh = open(file, 'w', encoding=encoding)
5858
atexit.register(self.close)
5959
else:
6060
self.fh = sys.stdout

dsc_datatool/transformer/labler.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
import yaml
1111

12-
from dsc_datatool import Transformer
12+
from dsc_datatool import Transformer, encoding
1313

1414

1515
def _process(label, d):
@@ -43,7 +43,7 @@ def __init__(self, opts):
4343
Transformer.__init__(self, opts)
4444
if not 'yaml' in opts:
4545
raise Exception('yaml=file option required')
46-
f = open(opts.get('yaml'), 'r')
46+
f = open(opts.get('yaml'), 'r', encoding=encoding)
4747
try:
4848
self.label = yaml.full_load(f)
4949
except AttributeError:

tests/test.sh

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,3 +55,21 @@ dsc-datatool \
5555

5656
sort -s "$base/test.gold3" > "$base/test.gold3.tmp"
5757
diff -u "$base/test.gold3.tmp" "$base/test.out"
58+
59+
dsc-datatool \
60+
-vvv \
61+
-s test-server-åäö \
62+
-n test-node \
63+
--output ";InfluxDB;dml=1;database=dsc" \
64+
--transform ";Labler;*;yaml=$base/labler.yaml" \
65+
--transform ";ReRanger;rcode_vs_replylen;range=/64;pad_to=5" \
66+
--transform ";ReRanger;qtype_vs_qnamelen;range=/16;pad_to=3" \
67+
--transform ";ReRanger;client_port_range;key=low;range=/2048;pad_to=5" \
68+
--transform ";ReRanger;edns_bufsiz,priming_queries;key=low;range=/512;pad_to=5;allow_invalid_keys=1" \
69+
--transform ";ReRanger;priming_responses;key=low;range=/128;pad_to=4" \
70+
--transform ";NetRemap;client_subnet,client_subnet2,client_addr_vs_rcode,ipv6_rsn_abusers;net=16" \
71+
--generator ";client_subnet_authority;csv=$base/ipv4-address-space.csv;csv=$base/ipv6-unicast-address-assignments.csv" \
72+
--xml "$base/utf8.xml" | sort -s > "$base/test4.out"
73+
74+
sort -s "$base/test4.gold" > "$base/test4.gold.tmp"
75+
diff -u "$base/test4.gold.tmp" "$base/test4.out"

tests/test4.gold

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
# CONTEXT-DATABASE: dsc
2+
# DML
3+
pcap_stats,server=test-server-åäö,node=test-node,ifname=eth0åäö,pcap_stat=filter_received value=5625 1563520560000000000
4+
pcap_stats,server=test-server-åäö,node=test-node,ifname=eth0åäö,pcap_stat=kernel_dropped value=731 1563520560000000000
5+
pcap_stats,server=test-server-åäö,node=test-node,ifname=eth0åäö,pcap_stat=pkts_captured value=4894 1563520560000000000

tests/utf8.xml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
1+
<dscdata>
2+
<array name="pcap_stats" dimensions="2" start_time="1563520560" stop_time="1563520621">
3+
<dimension number="1" type="ifname"/>
4+
<dimension number="2" type="pcap_stat"/>
5+
<data>
6+
<ifname val="eth0åäö">
7+
<pcap_stat val="filter_received" count="5625"/>
8+
<pcap_stat val="pkts_captured" count="4894"/>
9+
<pcap_stat val="kernel_dropped" count="731"/>
10+
</ifname>
11+
</data>
12+
</array>
13+
</dscdata>

0 commit comments

Comments
 (0)