Skip to content

Move annotations from GATE to WebAnnotator format #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Sep 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ notebooks/*.ipynb
notebooks/*.marisa
notebooks/*.wapiti
notebooks/*.crfsuite
webstruct_data/corpus/random_pages/wa/*.html
webstruct_data/corpus/us_contact_pages/cleaned
example/_data/*
example/*.joblib
example/*.html
example/*.html
9 changes: 6 additions & 3 deletions example/ner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,14 @@ def load_webstruct_data() -> List:
)

trees2 = webstruct.load_trees(
str(WEBSTRUCT_DATA / "corpus/us_contact_pages/annotated/*.xml"),
loader=gate_loader
str(WEBSTRUCT_DATA / "corpus/us_contact_pages/wa/*.html"),
loader=wa_loader
Copy link
Member

@kmike kmike Sep 15, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring is no longer valid, and there is no need to create gate_loader.

)
trees = chain(trees1, trees2)
return list(pages_progress(trees, desc="Loading webstruct default annotated data"))
return list(pages_progress(
trees,
desc="Loading webstruct default annotated data"
))


def load_countries() -> Set[str]:
Expand Down
53 changes: 53 additions & 0 deletions webstruct/annotation_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
import argparse

import webstruct.loaders
from webstruct.webannotator import EntityColors, to_webannotator


def main():
    """Convert one GATE-annotated file to WebAnnotator format.

    Command-line options:

    * ``--GATE``: path to the input file annotated in GATE format;
    * ``--sample``: path to a WebAnnotator file whose entity types and
      colors are reused for the output;
    * ``--WebAnnotator``: path the converted document is written to;
    * ``--loglevel``: name of a ``logging`` level (default ``INFO``).
    """
    cmdline = argparse.ArgumentParser(description=('utility '
                                                   'to convert annotations '
                                                   'from GATE format to '
                                                   'WebAnnotator format'))
    cmdline.add_argument('--GATE',
                         help='path to file annotated in GATE format',
                         type=str,
                         required=True)
    cmdline.add_argument('--sample',
                         help=('path to file annotated in WebAnnotator format '
                               'for colors and entities transfer'),
                         type=str,
                         required=True)
    cmdline.add_argument('--WebAnnotator',
                         help='path to result file in WebAnnotator format',
                         type=str,
                         required=True)
    cmdline.add_argument('--loglevel',
                         help='logging level',
                         type=str,
                         default='INFO')
    args = cmdline.parse_args()

    # A space separates the level from the path so records read
    # "[INFO] /path/file.py:12 ..." instead of "[INFO]/path/file.py:12 ...".
    logging.basicConfig(level=getattr(logging, args.loglevel.upper()),
                        format=('%(asctime)s [%(levelname)s] '
                                '%(pathname)s:%(lineno)d %(message)s'))

    # The sample file supplies both the entity types and their colors.
    with open(args.sample, 'rb') as sample_reader:
        colors = EntityColors.from_htmlbytes(sample_reader.read())
    entities = list(colors)

    logging.debug('Current entities %s', entities)
    logging.debug('Current colors %s', colors)

    gate = webstruct.loaders.GateLoader(known_entities=entities)
    tokenizer = webstruct.HtmlTokenizer(tagset=entities)
    with open(args.GATE, 'rb') as reader:
        tree = gate.loadbytes(reader.read())

    # NOTE(review): the tokenizer result is not used below; the call is kept
    # because it is unclear from here whether it is relied upon as a sanity
    # check of the loaded tree -- confirm before removing.
    tokenizer.tokenize_single(tree)

    tree = to_webannotator(tree, entity_colors=colors)
    with open(args.WebAnnotator, 'wb') as writer:
        tree.write(writer, method='html', pretty_print=True)

# Run the converter only when this module is executed as a script.
if __name__ == "__main__":
main()
171 changes: 171 additions & 0 deletions webstruct/annotation_verifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import json
import logging
import argparse

import webstruct.loaders
import webstruct.webannotator

# Entity types used for loading and tokenizing when no --entity option is
# given on the command line.
DEFAULT_ENTITIES = [
'ORG', 'TEL', 'FAX', 'HOURS',
'STREET', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
'EMAIL', 'PER', 'FUNC', 'SUBJ'
]


def _stripped(value):
    """Return ``value`` without surrounding whitespace, '' for None/empty."""
    return value.strip() if value else ''


def nodes_difference(l, r):
    """Compare two element nodes without descending into their children.

    The checks run in this order: tag, attributes (compared pairwise after
    sorting by name), whitespace-stripped text, whitespace-stripped tail and
    number of children. The nodes themselves are not modified.

    Args:
        l: left element; must expose ``tag``, ``attrib``, ``text``, ``tail``
            and ``len()``.
        r: right element with the same interface.

    Returns:
        None when the nodes are equal, otherwise a one-item dict whose key
        names the first differing aspect (``'tag'``, ``'attributes'``,
        ``'text'``, ``'tail'`` or ``'children count'``) and whose value is a
        human-readable description of the mismatch.
    """
    if l.tag != r.tag:
        return {'tag': '"{0}" != "{1}"'.format(l.tag, r.tag)}

    # Attribute names are unique, so sorting the (name, value) pairs orders
    # them by name only.
    l_attrib = sorted(l.attrib.items())
    r_attrib = sorted(r.attrib.items())
    for l_attr, r_attr in zip(l_attrib, r_attrib):
        if l_attr != r_attr:
            return {'attributes': '"{0}" != "{1}"'.format(l_attr, r_attr)}

    # All shared positions matched; report the first surplus attribute.
    common = min(len(l_attrib), len(r_attrib))
    if common < len(l_attrib):
        return {'attributes': "{0} != None".format(l_attrib[common])}
    if common < len(r_attrib):
        return {'attributes': "None != {0}".format(r_attrib[common])}

    # Compare stripped copies; the values are assigned to locals rather than
    # written back to the nodes, so the comparison is real and side-effect
    # free.
    l_text = _stripped(l.text)
    r_text = _stripped(r.text)
    if l_text != r_text:
        return {'text': "{0} != {1}".format(l_text, r_text)}

    l_tail = _stripped(l.tail)
    r_tail = _stripped(r.tail)
    if l_tail != r_tail:
        return {'tail': "{0} != {1}".format(l_tail, r_tail)}

    if len(l) != len(r):
        return {'children count': "{0} != {1}".format(len(l), len(r))}

    return None


def node_path(node):
    """Describe where ``node`` sits in its tree as a root-to-node path.

    Every ancestor, and the node itself, contributes one step of the form
    ``"<index in parent>:<tag>/"``; the root has no parent and uses index 0.

    Args:
        node: element exposing ``getparent()`` and ``tag``, whose parent
            exposes ``index(child)``. ``None`` is accepted.

    Returns:
        str: the concatenated steps, e.g. ``'0:html/1:body/0:p/'``, or an
        empty string when ``node`` is None.
    """
    steps = []
    current = node
    while current is not None:
        parent = current.getparent()
        # Explicit None test: the truth value of an lxml element reflects
        # whether it has children and its use is deprecated.
        idx = parent.index(current) if parent is not None else 0
        steps.append('{0}:{1}/'.format(idx, current.tag))
        current = parent

    # Steps were collected from the node upwards; emit them root first.
    return ''.join(reversed(steps))


def tree_difference(l, r):
    """Find the first difference between two trees, breadth first.

    Nodes are paired by position and compared with ``nodes_difference``;
    the walk stops at the first pair that differs.

    Args:
        l: root element of the left tree.
        r: root element of the right tree.

    Returns:
        None when no pair of nodes differs, otherwise a dict with the keys
        ``"l"`` and ``"r"`` (paths of the differing nodes as produced by
        ``node_path``) and ``"diff"`` (the ``nodes_difference`` result).
    """
    from collections import deque

    # FIFO queue of node pairs still to compare; deque gives O(1) pops from
    # the left while keeping the breadth-first visiting order.
    pending = deque([(l, r)])
    while pending:
        l_node, r_node = pending.popleft()
        diff = nodes_difference(l_node, r_node)

        if diff:
            return {"l": node_path(l_node),
                    "r": node_path(r_node),
                    "diff": diff}

        # nodes_difference has already confirmed that both nodes have the
        # same number of children, so pairing them by position is safe.
        pending.extend(zip(l_node, r_node))

    return None


def main():
    """Check that a GATE file and a WebAnnotator file agree.

    Both files are loaded with their respective loaders and tokenized with
    the same tokenizer; the resulting trees and annotation sequences are
    then compared and every mismatch is logged as an error.

    Command-line options:

    * ``--GATE``: path to the file annotated in GATE format;
    * ``--WebAnnotator``: path to the file annotated in WebAnnotator format;
    * ``--entity``: entity type to verify against, may be repeated
      (defaults to ``DEFAULT_ENTITIES``);
    * ``--loglevel``: name of a ``logging`` level (default ``INFO``).

    Returns:
        bool: True when no difference was found, False otherwise.
    """
    cmdline = argparse.ArgumentParser(description=('utility to verify '
                                                   'annotation conversion '
                                                   'from GATE format '
                                                   'to WebAnnotator format'))
    cmdline.add_argument('--GATE',
                         help='path to file annotated in GATE format',
                         type=str,
                         required=True)
    cmdline.add_argument('--WebAnnotator',
                         help='path to file annotated in WebAnnotator format',
                         type=str,
                         required=True)
    cmdline.add_argument('--entity',
                         help='enitity type to verify against',
                         type=str,
                         action='append',
                         required=False)
    cmdline.add_argument('--loglevel',
                         help='logging level',
                         type=str,
                         default='INFO')
    args = cmdline.parse_args()

    logging.basicConfig(level=getattr(logging, args.loglevel.upper()),
                        format=('%(asctime)s [%(levelname)s] '
                                '%(pathname)s:%(lineno)d %(message)s'))

    entities = args.entity if args.entity else DEFAULT_ENTITIES
    logging.debug('Known entities %s', entities)

    gate = webstruct.loaders.GateLoader(known_entities=entities)
    wa = webstruct.loaders.WebAnnotatorLoader(known_entities=entities)
    tokenizer = webstruct.HtmlTokenizer(tagset=entities)

    with open(args.GATE, 'rb') as reader:
        gate_tree = gate.loadbytes(reader.read())
    _, gate_annotations = tokenizer.tokenize_single(gate_tree)
    gate_annotations = list(gate_annotations)

    with open(args.WebAnnotator, 'rb') as reader:
        wa_tree = wa.loadbytes(reader.read())
    _, wa_annotations = tokenizer.tokenize_single(wa_tree)
    wa_annotations = list(wa_annotations)

    is_diff = False
    tree_diff = tree_difference(gate_tree, wa_tree)
    if tree_diff:
        logging.error('tree differs %s', json.dumps(tree_diff))
        is_diff = True

    # zip() below stops at the shorter sequence, so a difference in length
    # has to be reported explicitly or it would go unnoticed.
    if len(gate_annotations) != len(wa_annotations):
        logging.error('annotation count differs %s',
                      json.dumps({'gate': len(gate_annotations),
                                  'wa': len(wa_annotations)}))
        is_diff = True

    annot_diff = [
        {'idx': idx, 'gate_a': gate_a, 'wa_a': wa_a}
        for idx, (gate_a, wa_a) in enumerate(zip(gate_annotations,
                                                 wa_annotations))
        if gate_a != wa_a
    ]

    if annot_diff:
        logging.error('annotation differs %s', json.dumps(annot_diff))
        is_diff = True

    return not is_diff

if __name__ == "__main__":
    import sys

    # main() returns True when the two files agree; turn that into the
    # process exit status so callers and shell scripts can detect a
    # failed verification (0 = identical, 1 = differences found).
    sys.exit(0 if main() else 1)
35 changes: 35 additions & 0 deletions webstruct/tests/test_html_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import unittest

import lxml

import webstruct.annotation_verifier

# HTML fixtures for the comparison tests: html_1 and html_2 differ only in
# the value of the "style" attribute; html_3 and html_4 add one and two
# child elements respectively. The last fixture gets its own name so that
# it no longer overwrites html_3.
html_1 = '<span style="abc" class="span">aa</span>'
html_2 = '<span style="abd" class="span">aa</span>'
html_3 = '<span style="abd" class="span">aa<p>s</p></span>'
html_4 = '<span style="abd" class="span">aa<p>s</p><p>ss</p></span>'


class SomethingTest(unittest.TestCase):
    """Tests for the node and tree comparison helpers of the verifier."""

    def test_is_node_equal_to_self(self):
        # A node compared with itself yields no difference.
        node = lxml.etree.fromstring(html_1)
        result = webstruct.annotation_verifier.nodes_difference(node, node)
        self.assertIsNone(result)

    def test_is_different_nodes_are_diffirent(self):
        # Nodes parsed from html_1 and html_2 must be reported as different.
        left = lxml.etree.fromstring(html_1)
        right = lxml.etree.fromstring(html_2)
        result = webstruct.annotation_verifier.nodes_difference(left, right)
        self.assertIsNotNone(result)

    def test_is_tree_equal_to_self(self):
        # A tree compared with itself yields no difference.
        root = lxml.etree.fromstring(html_3)
        result = webstruct.annotation_verifier.tree_difference(root, root)
        self.assertIsNone(result)

    def test_is_different_trees_are_diffirent(self):
        # Trees parsed from html_2 and html_3 must be reported as different.
        left = lxml.etree.fromstring(html_2)
        right = lxml.etree.fromstring(html_3)
        result = webstruct.annotation_verifier.tree_difference(left, right)
        self.assertIsNotNone(result)
18 changes: 18 additions & 0 deletions webstruct/tests/test_webannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,24 @@ def assertApplyWaTitle(self, source, result):
webannotator.apply_wa_title(tree)
self.assertHtmlTreeEqual(tree, html_document_fromstring(result))

# The source document carries a class attribute on <wa-title>; the expected
# document has a plain <title> with the same content and no attributes.
def test_wa_title_no_attributes(self):
self.assertApplyWaTitle(
b"""
<html>
<head><title>Foo</title></head>
<body>contents</body>
<wa-title class="classy"><b>hello</b>, world</wa-title>
</html>
""",

b"""
<html>
<head><title><b>hello</b>, world</title></head>
<body>contents</body>
</html>
"""
)

def test_wa_title(self):
self.assertApplyWaTitle(
b"""
Expand Down
4 changes: 3 additions & 1 deletion webstruct/webannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def apply_wa_title(tree):
head.insert(head.index(title), wa_title)
title.drop_tree()
wa_title.tag = 'title'
for attr in wa_title.attrib:
wa_title.attrib.pop(attr)
return


Expand Down Expand Up @@ -254,7 +256,7 @@ def _ensure_head(tree):


def _set_base(tree, baseurl):
"""
"""
Add <base> tag to the tree. If <base> tag already exists do nothing.
"""
if tree.xpath('//base'):
Expand Down
Loading