Skip to content

Commit

Permalink
Merge pull request mcs07#15 from mcs07/enc
Browse files (browse the repository at this point in the history)
Fix stdin/stdout encoding issues
  • Loading branch information
mcs07 authored Feb 21, 2017
2 parents: 08b37bc + a5ff9f1; commit 1aed4c3
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 43 deletions.
11 changes: 5 additions & 6 deletions chemdataextractor/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

import json
import logging
import sys

import click
import six
Expand All @@ -41,8 +40,8 @@ def cli(ctx, verbose):


@cli.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.argument('input', type=click.File('rb'), default=sys.stdin)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
@click.pass_obj
def extract(ctx, input, output):
"""Run ChemDataExtractor on a document."""
Expand All @@ -55,16 +54,16 @@ def extract(ctx, input, output):


@cli.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.argument('input', type=click.File('rb'), default=sys.stdin)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
@click.pass_obj
def read(ctx, input, output):
"""Output processed document elements."""
log.info('chemdataextractor.read')
log.info('Reading %s' % input.name)
doc = Document.from_file(input)
for element in doc.elements:
output.write('%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))
output.write(u'%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))


from . import cluster, config, data, tokenize, pos, chemdner, cem, dict, evaluate
Expand Down
13 changes: 6 additions & 7 deletions chemdataextractor/cli/chemdner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import sys

import click
import six
Expand Down Expand Up @@ -56,12 +55,12 @@ def prepare_tokens(ctx, input, annotations, tout, lout):
# Process the corpus
for line in input:
pmid, title, abstract = line.strip().split(u'\t')
for t, section, anns in [(Title(title), 'T', anndict.get((pmid, 'T'), [])), (Paragraph(abstract), 'A', anndict.get((pmid, 'A'), []))]:
for t, section, anns in [(Title(title), 'T', anndict.get((pmid, u'T'), [])), (Paragraph(abstract), u'A', anndict.get((pmid, u'A'), []))]:
# Write our tokens with POS and IOB tags
tagged = _prep_tags(t, anns)
for i, sentence in enumerate(tagged):
tout.write(u' '.join(['/'.join([token, tag, label]) for token, tag, label in sentence]))
lout.write(u' '.join(['/'.join([token, label]) for token, tag, label in sentence]))
tout.write(u' '.join([u'/'.join([token, tag, label]) for token, tag, label in sentence]))
lout.write(u' '.join([u'/'.join([token, label]) for token, tag, label in sentence]))
tout.write(u'\n')
lout.write(u'\n')
tout.write(u'\n')
Expand All @@ -85,7 +84,7 @@ def _prep_tags(t, annotations):

@chemdner_cli.command()
@click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.pass_obj
def tag(ctx, corpus, output):
"""Tag chemical entities and write CHEMDNER annotations predictions file."""
Expand All @@ -95,9 +94,9 @@ def tag(ctx, corpus, output):
# print(pmid)
counter = 1
d = Document(Title(title), Paragraph(abstract))
for t, section in [(d.elements[0], 'T'), (d.elements[1], 'A')]:
for t, section in [(d.elements[0], u'T'), (d.elements[1], u'A')]:
for cem in t.cems:
code = u'%s:%s:%s' % (section, cem.start, cem.end)
output.write(u'\t'.join([pmid, code, six.text_type(counter), '1']))
output.write(u'\t'.join([pmid, code, six.text_type(counter), u'1']))
output.write(u'\n')
counter += 1
23 changes: 11 additions & 12 deletions chemdataextractor/cli/dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from __future__ import division
from __future__ import print_function
import re
import sys

import click
from ..nlp.lexicon import ChemLexicon
Expand Down Expand Up @@ -272,8 +271,8 @@ def _make_tokens(name):

@dict_cli.command()
@click.argument('jochem', type=click.File('r', encoding='utf8'))
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=sys.stdout)
@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=sys.stdout)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=click.get_text_stream('stdout'))
@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=click.get_text_stream('stdout'))
@click.pass_obj
def prepare_jochem(ctx, jochem, output, csoutput):
"""Process and filter jochem file to produce list of names for dictionary."""
Expand All @@ -293,16 +292,16 @@ def prepare_jochem(ctx, jochem, output, csoutput):

@dict_cli.command()
@click.argument('include', type=click.File('r', encoding='utf8'))
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.pass_obj
def prepare_include(ctx, include, output):
"""Process and filter include file to produce list of names for dictionary."""
click.echo('chemdataextractor.dict.prepare_include')
for i, line in enumerate(include):
print('IN%s' % i)
for tokens in _make_tokens(line.strip()):
output.write(' '.join(tokens))
output.write('\n')
output.write(u' '.join(tokens))
output.write(u'\n')


@dict_cli.command()
Expand All @@ -327,7 +326,7 @@ def build(ctx, inputs, output, cs):
@click.argument('model', required=True)
@click.option('--cs/--no-cs', default=False)
@click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.pass_obj
def tag(ctx, model, cs, corpus, output):
"""Tag chemical entities and write CHEMDNER annotations predictions file."""
Expand All @@ -337,18 +336,18 @@ def tag(ctx, model, cs, corpus, output):
sentence = []
goldsentence = []
for t in line.split():
token, tag = t.rsplit('/', 1)
token, tag = t.rsplit(u'/', 1)
goldsentence.append((token, tag))
sentence.append(token)
if sentence:
tokentags = tagger.tag(sentence)
for i, tokentag in enumerate(tokentags):
goldtokentag = goldsentence[i]
if goldtokentag[1] not in {'B-CM', 'I-CM'} and tokentag[1] in {'B-CM', 'I-CM'}:
if goldtokentag[1] not in {u'B-CM', u'I-CM'} and tokentag[1] in {u'B-CM', u'I-CM'}:
print(line)
print(tokentag[0])

output.write(' '.join('/'.join(tokentag) for tokentag in tagger.tag(sentence)))
output.write('\n')
output.write(u' '.join(u'/'.join(tokentag) for tokentag in tagger.tag(sentence)))
output.write(u'\n')
else:
output.write('\n')
output.write(u'\n')
19 changes: 9 additions & 10 deletions chemdataextractor/cli/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from __future__ import division
from __future__ import print_function
import logging
import sys

import click

Expand Down Expand Up @@ -232,17 +231,17 @@ def evaluate_perceptron(ctx, model, corpus):
evaluation = wsj_evaluation
sents = list(evaluation.tagged_sents())
for i, wsj_sent in enumerate(sents):
sents[i] = [t for t in wsj_sent if not t[1] == '-NONE-']
sents[i] = [t for t in wsj_sent if not t[1] == u'-NONE-']
elif corpus == 'genia':
evaluation = genia_evaluation
sents = list(evaluation.tagged_sents())
# Translate GENIA bracket tags
for i, genia_sent in enumerate(sents):
for j, (token, tag) in enumerate(genia_sent):
if tag == '(':
sents[i][j] = (token, '-LRB-')
elif tag == ')':
sents[i][j] = (token, '-RRB-')
if tag == u'(':
sents[i][j] = (token, u'-LRB-')
elif tag == u')':
sents[i][j] = (token, u'-RRB-')
else:
raise click.ClickException('Invalid corpus')
tagger = ChemApPosTagger(model=model)
Expand All @@ -251,8 +250,8 @@ def evaluate_perceptron(ctx, model, corpus):


@pos_cli.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.argument('input', type=click.File('rb'), default=sys.stdin)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
@click.pass_obj
def tag(ctx, input, output):
"""Output POS-tagged tokens."""
Expand All @@ -262,5 +261,5 @@ def tag(ctx, input, output):
for element in doc.elements:
if isinstance(element, Text):
for sentence in element.sentences:
output.write(' '.join('/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
output.write('\n')
output.write(u' '.join(u'/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
output.write(u'\n')
15 changes: 7 additions & 8 deletions chemdataextractor/cli/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from __future__ import division
from __future__ import print_function
import logging
import sys

import click

Expand Down Expand Up @@ -63,8 +62,8 @@ def train_punkt(ctx, input, output, abbr, colloc):


@tokenize_cli.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.argument('input', type=click.File('rb'), default=sys.stdin)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
@click.pass_obj
def sentences(ctx, input, output):
"""Read input document, and output sentences."""
Expand All @@ -75,12 +74,12 @@ def sentences(ctx, input, output):
if isinstance(element, Text):
for raw_sentence in element.raw_sentences:
output.write(raw_sentence.strip())
output.write('\n')
output.write(u'\n')


@tokenize_cli.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
@click.argument('input', type=click.File('rb'), default=sys.stdin)
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
@click.pass_obj
def words(ctx, input, output):
"""Read input document, and output words."""
Expand All @@ -90,5 +89,5 @@ def words(ctx, input, output):
for element in doc.elements:
if isinstance(element, Text):
for sentence in element.sentences:
output.write(' '.join(sentence.raw_tokens))
output.write('\n')
output.write(u' '.join(sentence.raw_tokens))
output.write(u'\n')

0 comments on commit 1aed4c3

Please sign in to comment.