
Commit f0946e6

Refactor scraper_template

1 parent ed0c0e8 commit f0946e6

1 file changed

scraper_template.py

Lines changed: 41 additions & 37 deletions
@@ -61,67 +61,71 @@ def parse_arguments(description='', features=''):
     return parser.parse_args().conf


-DataTypes = {
-    'bool': '$bool',
-    'str': '$cat_string',
-    'num': '$num',
-    'datetime': '$num_datetime'
-}
-
-
-def features(ini, **kwargs):
+def get_features(params, **kwargs):
+    """Returns string in PolyAnalyst's json format."""
     return json.dumps(
         {
             'columns': [{'name': k, 'type': v} for k, v in kwargs.items()],
-            'params': ini
-        },
-        indent=4
+            'params': params
+        }
     )


 def parse_ini(ini):
-    """Converts ini file string to dict."""
-    configparser.ConfigParser.optionxform = str  # make parser case insensitive
+    """Returns keys from default section of ini file as a dict."""
     parser = configparser.ConfigParser(allow_no_value=True)
+    parser.optionxform = str  # make parser case-sensitive
     parser.read_string(ini)

-    return {k: v for k, v in parser['DEFAULT'].items()}
+    return dict(parser['DEFAULT'])
+
+
+# dict of Internet Source's supported data types
+DataTypes = {
+    'bool': '$bool',
+    'str': '$cat_string',
+    'num': '$num',
+    'datetime': '$num_datetime'
+}


 def write(path, url, content, title, **kwargs):
-    encoded = base64.standard_b64encode(content).decode('ascii')
+    """Writes json file with PolyAnalyst's result format."""
+    data = {
+        'docs': [
+            {
+                'url': url,
+                'docurl': url,
+                'title': title,
+                'mime': 'text/html',
+                'content': base64.standard_b64encode(content).decode('ascii'),
+                'columns': kwargs,
+                'files': {},
+            }
+        ]
+    }

     with open(path, mode='w', encoding='utf_8') as f:
-        data = {
-            'docs': [
-                {
-                    'url': url,
-                    'docurl': url,
-                    'title': title,
-                    'mime': 'text/html',
-                    'content': encoded,
-                    'columns': kwargs,
-                    'files': {},
-                }
-            ]
-        }
-        json.dump(data, f, indent=4)
+        json.dump(data, f)


 def main(data):
     write(
-        data['output_folder'] + '\example_result.json',
-        'http://example.com',
-        'Example text content'.encode('utf_8'),
-        'Example title',
+        path=data['output_folder'] + '\example_result.json',
+        url='http://example.com',
+        content=b'Example text content',
+        title='Example title',
+        ExtraColumn=data['params']['value_for_ExtraColumn'],
     )


 if __name__ == '__main__':
-    descr = 'web scraper template'
-    file = parse_arguments(descr, features(''))
+    description = 'web scraper template'
+    ini = '[DEFAULT]\nvalue_for_ExtraColumn=default value'
+    features = get_features(ini, ExtraColumn=DataTypes['str'])
+    file = parse_arguments(description, features)

-    data = json.loads(file.read())
+    data = json.load(file)
     data['params'] = parse_ini(data['params'])

     main(data)
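
For reference, a minimal sketch of the configuration round trip the refactored helpers imply. This is not part of the commit: the two functions are copied from the template above, and the idea that PolyAnalyst hands the chosen parameter values back as ini text in data['params'] is inferred from this template alone.

    import configparser
    import json


    def get_features(params, **kwargs):
        """Returns string in PolyAnalyst's json format."""
        return json.dumps(
            {
                'columns': [{'name': k, 'type': v} for k, v in kwargs.items()],
                'params': params
            }
        )


    def parse_ini(ini):
        """Returns keys from default section of ini file as a dict."""
        parser = configparser.ConfigParser(allow_no_value=True)
        parser.optionxform = str  # keep option names case-sensitive
        parser.read_string(ini)
        return dict(parser['DEFAULT'])


    # Describe one extra string column and its default parameter value.
    ini = '[DEFAULT]\nvalue_for_ExtraColumn=default value'
    print(get_features(ini, ExtraColumn='$cat_string'))
    # -> {"columns": [{"name": "ExtraColumn", "type": "$cat_string"}],
    #     "params": "[DEFAULT]\nvalue_for_ExtraColumn=default value"}

    # Parsing the same ini text back recovers the dict that main() reads.
    print(parse_ini(ini))
    # -> {'value_for_ExtraColumn': 'default value'}

The parser.optionxform = str line matters here: with the default optionxform, configparser lowercases option names, so the data['params']['value_for_ExtraColumn'] lookup in main() would raise KeyError.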
