support CJK string annotation; print CJK strings readably in scrapely.tool's output #45

Open
wants to merge 3 commits into master
38 changes: 31 additions & 7 deletions scrapely/tests/test_template.py
@@ -1,11 +1,25 @@
# encoding: utf8

from unittest import TestCase

from scrapely.htmlpage import HtmlPage
from scrapely.template import TemplateMaker, FragmentNotFound, \
FragmentAlreadyAnnotated, best_match
from scrapely.extraction import InstanceBasedLearningExtractor

class TemplateMakerTest(TestCase):

class BaseTestCase(TestCase):
PAGE = HtmlPage("http://www.example.com", body=u'')

def _matches(self, text):
bm = best_match(text)
matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body]
matches = [x for x in matches if x[0]]
matches.sort(reverse=True)
return [self.PAGE.fragment_data(x[1]) for x in matches]


class TemplateMakerTest(BaseTestCase):

PAGE = HtmlPage("http://www.example.com", body=u"""
<html>
@@ -72,9 +86,19 @@ def test_best_match(self):
self.assertEquals(self._matches('text to annotate'),
['Some text to annotate here', 'Another text to annotate there'])

def _matches(self, text):
bm = best_match(text)
matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body]
matches = [x for x in matches if x[0]]
matches.sort(reverse=True)
return [self.PAGE.fragment_data(x[1]) for x in matches]

class TemplateMakerCJKTest(BaseTestCase):

PAGE = HtmlPage("http://www.example.com", body=u"""
<html>
<body>
<h1>标题</h1>
<p>段落</p>
<h2>另一个标题</h2>
<p>另一个段落</p>
</body>
</html>
""")

def test_best_match(self):
self.assertEquals(self._matches(u'标题'), [u'标题', u'另一个标题'])
40 changes: 40 additions & 0 deletions scrapely/tests/test_tool.py
@@ -0,0 +1,40 @@
# encoding: utf8

from unittest import TestCase

from scrapely.htmlpage import HtmlPage
from scrapely.template import TemplateMaker
from scrapely.tool import parse_criteria, apply_criteria, readable_repr


class ToolCJKTestCase(TestCase):

PAGE = HtmlPage("http://www.example.com", body=u"""
<html>
<body>
<h1>标题</h1>
<p>段落</p>
<h2>另一个标题</h2>
<p>另一个段落</p>
</body>
</html>
""")

def test_apply_criteria_should_support_cjk_chars(self):
criteria = parse_criteria('标题')
tm = TemplateMaker(self.PAGE)

selection = apply_criteria(criteria, tm)

self.assertEqual(selection, [6, 14])
self.assertEqual(tm.selected_data(6), u'<h1>标题</h1>')
self.assertEqual(tm.selected_data(14), u'<h2>另一个标题</h2>')


class ReadableReprTextCase(TestCase):

def test_readable_repr(self):
cjk = u'cjk\t中日韩\n\\u535a'
readable = u"u'cjk\\t中日韩\\n\\\\u535a'"

self.assertEqual(readable_repr(cjk), readable)
55 changes: 48 additions & 7 deletions scrapely/tool.py
@@ -6,6 +6,21 @@
from scrapely.template import TemplateMaker, best_match
from scrapely.extraction import InstanceBasedLearningExtractor

REPR_UNICODE_CHAR = re.compile(r'(?<!\\)(\\u[0-9a-f]{4,4})')


def readable_repr(obj):
'''Return a printing-friendly unicode repr string.

Make CJK characters readable, like ASCII, when printed.
'''
def replace_unicode_char(repr_char):
return unichr(int(str(repr_char.group())[2:], base=16))

repr_string = repr(obj)
return REPR_UNICODE_CHAR.sub(replace_unicode_char, repr_string)
Member:
Is it different from repr_bytesting.decode('unicode-escape') ?

Member:
what is repr_bytesting?

Member:
it is repr_string

Author:
@kmike, it's different. decode('unicode-escape') restores the whole string; readable_repr restores only CJK characters (all four-byte characters, actually), not '\n', '\t', '\\', etc.
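
For illustration, a minimal Python 2 sketch of the difference described above (assumptions: this pull request is applied so readable_repr is importable from scrapely.tool, and stdout can display CJK):

# -*- coding: utf-8 -*-
from scrapely.tool import readable_repr

s = u'cjk\t中日韩\n\\u535a'

# decode('unicode-escape') undoes *every* escape that repr() produced, so
# the tab and newline come back as real control characters and the output
# spans several lines:
print repr(s).decode('unicode-escape')

# readable_repr() restores only the \uXXXX escapes (the CJK characters);
# \t, \n and \\ stay escaped, so the repr stays on a single line:
print readable_repr(s)
# -> u'cjk\t中日韩\n\\u535a'   (escapes kept as literal backslashes, CJK restored)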

Member:
Hey @xyb,

Thanks for the fix and the explanation. This approach makes sense; it is basically undoing what Python 2.x repr does for unicode strings (when we don't want to print newlines, etc.).

A couple of notes:

  1. Your regex doesn't catch all symbols that can be safely decoded, e.g. ² (\xb2) or £ (\xa3) would be nice to see in the output;
  2. The 'readable_repr' name is a bit confusing because in Python 2.x repr must be a bytestring, while readable_repr returns unicode. What do you think about calling it e.g. 'unicode_repr'?

The best fix for this issue would be to port scrapely to Python 3 - it doesn't escape non-ASCII letters and symbols in the repr of unicode strings, but w3lib must be ported before that :)
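
A rough sketch of what the reviewer's two notes might look like together; the wider regex and the unicode_repr name are hypothetical, not part of this pull request:

import re

# Assumption: Python 2.x. Besides \uXXXX, also restore \xXX escapes that
# are not themselves escaped, so symbols such as u'\xb2' (²) and u'\xa3' (£)
# print readably too.
REPR_ESCAPED_CHAR = re.compile(r'(?<!\\)\\(u[0-9a-f]{4}|x[0-9a-f]{2})')

def unicode_repr(obj):
    '''Like repr(), but return unicode with non-ASCII characters restored.'''
    def restore(match):
        # match.group(1) is e.g. 'u4e2d' or 'xb2'; drop the type letter
        # and convert the hex digits back into the character they encode.
        return unichr(int(match.group(1)[1:], 16))
    return REPR_ESCAPED_CHAR.sub(restore, repr(obj))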



class IblTool(cmd.Cmd):

prompt = 'scrapely> '
@@ -17,6 +32,8 @@ def __init__(self, filename, **kw):
def do_ta(self, line):
"""ta <url> [--encoding ENCODING] - add template"""
opts, (url,) = parse_at(line)
if assert_or_print(url, "missing url"):
return
t = url_to_page(url, opts.encoding)
templates = self._load_templates()
templates.append(t)
@@ -31,7 +48,11 @@ def do_tl(self, line):

def do_td(self, template_id):
"""dt <template> - delete template"""
if assert_or_print(template_id, "missing template id"):
return
templates = self._load_templates()
if assert_or_print(templates, "no templates available"):
return
try:
del templates[int(template_id)]
self._save_templates(templates)
@@ -41,22 +62,35 @@ def do_td(self, template_id):

def do_t(self, line):
"""t <template> <text> - test selection text"""
if assert_or_print(line, "missing template id or selection text"):
return
if assert_or_print(' ' in line, "missing template id or selection text"):
return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
for n, i in enumerate(selection):
print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
print "[%d] %s" % (n,
readable_repr(remove_annotation(tm.selected_data(i))))

def do_a(self, line):
"""a <template> <data> [-n number] [-f field]- add or test annotation

Add a new annotation (if -f is passed) or test what would be annotated
otherwise
"""
if assert_or_print(line, "missing template id and selection text"):
return
if assert_or_print(' ' in line, "missing template id or selection text"):
return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
@@ -65,27 +99,31 @@ def do_a(self, line):
index = selection[0]
tm.annotate_fragment(index, criteria.field)
self._save_template(template_id, tm.get_template())
print "[new] (%s) %r" % (criteria.field,
remove_annotation(tm.selected_data(index)))
print "[new] (%s) %s" % (criteria.field,
readable_repr(remove_annotation(tm.selected_data(index))))
else:
for n, i in enumerate(selection):
print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
print "[%d] %s" % (n, readable_repr(remove_annotation(tm.selected_data(i))))

def do_al(self, template_id):
"""al <template> - list annotations"""
if assert_or_print(template_id, "missing template id"):
return
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
tm = TemplateMaker(t)
for n, (a, i) in enumerate(tm.annotations()):
print "[%s-%d] (%s) %r" % (template_id, n, a['annotations']['content'],
remove_annotation(tm.selected_data(i)))
print "[%s-%d] (%s) %s" % (template_id, n, a['annotations']['content'],
readable_repr(remove_annotation(tm.selected_data(i))))

def do_s(self, url):
"""s <url> - scrape url"""
templates = self._load_templates()
if assert_or_print(templates, "no templates available"):
return
if assert_or_print(url, "missing url"):
return
# fall back to the template encoding if none is specified
page = url_to_page(url, default_encoding=templates[0].encoding)
ex = InstanceBasedLearningExtractor((t, None) for t in templates)
@@ -143,7 +181,10 @@ def parse_criteria(criteria_str):

def apply_criteria(criteria, tm):
"""Apply the given criteria object to the given template"""
func = best_match(criteria.text) if criteria.text else lambda x, y: False
text = criteria.text
if text and isinstance(text, str):
text = text.decode(tm.get_template().encoding or 'utf-8')
Member:
I believe this is the wrong place to decode criteria.text, and the encoding it is decoded from is incorrect - it should be decoded from IblTool.stdin.encoding, so it makes sense to do the decoding in IblTool itself. See #46.

func = best_match(text) if text else lambda x, y: False
sel = tm.select(func)
if criteria.number is not None:
if criteria.number < len(sel):
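
A possible shape of the reviewer's suggestion above, as a sketch only (not part of this pull request): decode the raw command line once inside IblTool, from the tool's stdin encoding, so apply_criteria only ever sees unicode. precmd is cmd.Cmd's standard hook for this; the 'utf-8' fallback is an assumption.

import cmd

class IblTool(cmd.Cmd):
    # Sketch only; the real IblTool in scrapely/tool.py has more setup.

    def precmd(self, line):
        # Assumption: Python 2, where the command line arrives as a byte
        # string. Decode it here, from the interactive session's stdin
        # encoding, instead of decoding criteria.text in apply_criteria.
        if isinstance(line, str):
            encoding = getattr(self.stdin, 'encoding', None) or 'utf-8'
            line = line.decode(encoding)
        return line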