Skip to content

Commit 576d3db

Browse files
committed
Merge pull request #46 from kmike/unicode-support
[MRG] scrapely.tool: add support for non-ascii <text> and <data> arguments
2 parents (1addba6 + 4e43934), commit 576d3db

File tree

1 file changed

+17
-14
lines changed

1 file changed

+17
-14
lines changed

scrapely/tool.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -43,21 +43,21 @@ def do_t(self, line):
4343
"""t <template> <text> - test selection text"""
4444
template_id, criteria = line.split(' ', 1)
4545
t = self._load_template(template_id)
46-
criteria = parse_criteria(criteria)
46+
criteria = self._parse_criteria(criteria)
4747
tm = TemplateMaker(t)
4848
selection = apply_criteria(criteria, tm)
4949
for n, i in enumerate(selection):
5050
print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
5151

5252
def do_a(self, line):
5353
"""a <template> <data> [-n number] [-f field]- add or test annotation
54-
54+
5555
Add a new annotation (if -f is passed) or test what would be annotated
5656
otherwise
5757
"""
5858
template_id, criteria = line.split(' ', 1)
5959
t = self._load_template(template_id)
60-
criteria = parse_criteria(criteria)
60+
criteria = self._parse_criteria(criteria)
6161
tm = TemplateMaker(t)
6262
selection = apply_criteria(criteria, tm)
6363
if criteria.field:
@@ -78,7 +78,7 @@ def do_al(self, template_id):
7878
t = self._load_template(template_id)
7979
tm = TemplateMaker(t)
8080
for n, (a, i) in enumerate(tm.annotations()):
81-
print "[%s-%d] (%s) %r" % (template_id, n, a['annotations']['content'],
81+
print "[%s-%d] (%s) %r" % (template_id, n, a['annotations']['content'],
8282
remove_annotation(tm.selected_data(i)))
8383

8484
def do_s(self, url):
@@ -126,21 +126,24 @@ def _save_templates(self, templates):
126126
with open(self.filename, 'w') as f:
127127
templates = [page_to_dict(t) for t in templates]
128128
return json.dump({'templates': templates}, f)
129-
129+
130+
def _parse_criteria(self, criteria_str):
131+
"""Parse the given criteria string and returns a criteria object"""
132+
p = optparse.OptionParser()
133+
p.add_option('-f', '--field', help='field to annotate')
134+
p.add_option('-n', '--number', type="int", help='number of result to select')
135+
o, a = p.parse_args(shlex.split(criteria_str))
136+
137+
encoding = getattr(self.stdin, 'encoding', None) or sys.stdin.encoding
138+
o.text = ' '.join(a).decode(encoding or 'ascii')
139+
return o
140+
141+
130142
def parse_at(ta_line):
131143
p = optparse.OptionParser()
132144
p.add_option('-e', '--encoding', help='page encoding')
133145
return p.parse_args(shlex.split(ta_line))
134146

135-
def parse_criteria(criteria_str):
136-
"""Parse the given criteria string and returns a criteria object"""
137-
p = optparse.OptionParser()
138-
p.add_option('-f', '--field', help='field to annotate')
139-
p.add_option('-n', '--number', type="int", help='number of result to select')
140-
o, a = p.parse_args(shlex.split(criteria_str))
141-
o.text = ' '.join(a)
142-
return o
143-
144147
def apply_criteria(criteria, tm):
145148
"""Apply the given criteria object to the given template"""
146149
func = best_match(criteria.text) if criteria.text else lambda x, y: False

0 commit comments

Comments
 (0)