Skip to content

Commit

Permalink
utf-8 and discontinuous annotation support
Browse files (browse the repository at this point in the history)
  • Loading branch information
spyysalo committed Apr 17, 2013
1 parent 74fae07 commit cb58fa3
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions tools/catann.py
Original file line number | Diff line number | Diff line change
@@ -96,10 +96,10 @@ def main(argv):
assert re.search(r'\.ann$', fn), 'Error: argument %s not a .ann file.' % fn
txtfn = re.sub(r'\.ann$', '.txt', fn)

with open(fn, 'r') as annf:
with codecs.open(fn, 'r', encoding='utf-8') as annf:
anns.append(annf.readlines())

with open(txtfn, 'r') as txtf:
with codecs.open(txtfn, 'r', encoding='utf-8') as txtf:
texts.append(txtf.read())

# process each .ann in turn, keeping track of the "base" offset
@@ -110,14 +110,21 @@ def main(argv):
for j in range(len(anns[i])):
l = anns[i][j]
# see http://brat.nlplab.org/standoff.html for format
m = re.match(r'^(T\d+\t\S+) (\d+) (\d+)(.*\n?)', l)
if not m:
if not l or l[0] != 'T':
continue
begin, startoff, endoff, end = m.groups()
m = re.match(r'^(T\d+\t\S+) (\d+ \d+(?:;\d+ \d+)*)(\t.*\n?)', l)
assert m, 'failed to parse "%s"' % l
begin, offsets, end = m.groups()

startoff = int(startoff) + baseoff
endoff = int(endoff) + baseoff
anns[i][j] = "%s %d %d%s" % (begin, startoff, endoff, end)
new_offsets = []
for offset in offsets.split(';'):
startoff, endoff = offset.split(' ')
startoff = int(startoff) + baseoff
endoff = int(endoff) + baseoff
new_offsets.append('%d %d' % (startoff, endoff))
offsets = ';'.join(new_offsets)

anns[i][j] = "%s %s%s" % (begin, offsets, end)

baseoff += len(texts[i])

@@ -207,7 +214,7 @@ def main(argv):
# output
for i in range(len(anns)):
for l in anns[i]:
sys.stdout.write(l)
sys.stdout.write(l.encode('utf-8'))

if __name__ == "__main__":
sys.exit(main(sys.argv))

0 comments on commit cb58fa3

Please sign in to comment.