From a5ff9f1a9f8473c69acb57ec3e1ec63525706943 Mon Sep 17 00:00:00 2001
From: Matt Swain
Date: Tue, 21 Feb 2017 22:05:26 +0000
Subject: [PATCH] Fix stdin/stdout encoding issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fix was motivated by a `UnicodeEncodeError` being raised when using
the command line interface with `stdout` redirected to a file using `>`.

For all stdout, use click's `get_text_stream` so we can write unicode
text and click handles encoding it to the underlying byte stream.

For all stdin, use click's `get_binary_stream`, so we get a byte stream
as input, similar to opening a file in binary mode.
---
A minimal standalone usage sketch of the new stream defaults follows the diff.

 chemdataextractor/cli/__init__.py | 11 +++++------
 chemdataextractor/cli/chemdner.py | 13 ++++++-------
 chemdataextractor/cli/dict.py     | 23 +++++++++++------------
 chemdataextractor/cli/pos.py      | 19 +++++++++----------
 chemdataextractor/cli/tokenize.py | 15 +++++++--------
 5 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/chemdataextractor/cli/__init__.py b/chemdataextractor/cli/__init__.py
index 771707e..a0e5807 100644
--- a/chemdataextractor/cli/__init__.py
+++ b/chemdataextractor/cli/__init__.py
@@ -15,7 +15,6 @@
 import json
 import logging
-import sys
 
 import click
 import six
 
@@ -41,8 +40,8 @@ def cli(ctx, verbose):
 
 
 @cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def extract(ctx, input, output):
     """Run ChemDataExtractor on a document."""
@@ -55,8 +54,8 @@ def extract(ctx, input, output):
 
 
 @cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def read(ctx, input, output):
     """Output processed document elements."""
@@ -64,7 +63,7 @@ def read(ctx, input, output):
     log.info('Reading %s' % input.name)
     doc = Document.from_file(input)
     for element in doc.elements:
-        output.write('%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))
+        output.write(u'%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))
 
 
 from . import cluster, config, data, tokenize, pos, chemdner, cem, dict, evaluate
diff --git a/chemdataextractor/cli/chemdner.py b/chemdataextractor/cli/chemdner.py
index a9ccca6..080d264 100644
--- a/chemdataextractor/cli/chemdner.py
+++ b/chemdataextractor/cli/chemdner.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 from collections import defaultdict
-import sys
 
 import click
 import six
@@ -56,12 +55,12 @@ def prepare_tokens(ctx, input, annotations, tout, lout):
     # Process the corpus
     for line in input:
         pmid, title, abstract = line.strip().split(u'\t')
-        for t, section, anns in [(Title(title), 'T', anndict.get((pmid, 'T'), [])), (Paragraph(abstract), 'A', anndict.get((pmid, 'A'), []))]:
+        for t, section, anns in [(Title(title), 'T', anndict.get((pmid, u'T'), [])), (Paragraph(abstract), u'A', anndict.get((pmid, u'A'), []))]:
             # Write our tokens with POS and IOB tags
             tagged = _prep_tags(t, anns)
             for i, sentence in enumerate(tagged):
-                tout.write(u' '.join(['/'.join([token, tag, label]) for token, tag, label in sentence]))
-                lout.write(u' '.join(['/'.join([token, label]) for token, tag, label in sentence]))
+                tout.write(u' '.join([u'/'.join([token, tag, label]) for token, tag, label in sentence]))
+                lout.write(u' '.join([u'/'.join([token, label]) for token, tag, label in sentence]))
                 tout.write(u'\n')
                 lout.write(u'\n')
             tout.write(u'\n')
@@ -85,7 +84,7 @@
 
 @chemdner_cli.command()
 @click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def tag(ctx, corpus, output):
     """Tag chemical entities and write CHEMDNER annotations predictions file."""
@@ -95,9 +94,9 @@
         # print(pmid)
         counter = 1
         d = Document(Title(title), Paragraph(abstract))
-        for t, section in [(d.elements[0], 'T'), (d.elements[1], 'A')]:
+        for t, section in [(d.elements[0], u'T'), (d.elements[1], u'A')]:
             for cem in t.cems:
                 code = u'%s:%s:%s' % (section, cem.start, cem.end)
-                output.write(u'\t'.join([pmid, code, six.text_type(counter), '1']))
+                output.write(u'\t'.join([pmid, code, six.text_type(counter), u'1']))
                 output.write(u'\n')
                 counter += 1
diff --git a/chemdataextractor/cli/dict.py b/chemdataextractor/cli/dict.py
index fe07a2d..42edc02 100644
--- a/chemdataextractor/cli/dict.py
+++ b/chemdataextractor/cli/dict.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import re
-import sys
 
 import click
 from ..nlp.lexicon import ChemLexicon
@@ -272,8 +271,8 @@ def _make_tokens(name):
 
 @dict_cli.command()
 @click.argument('jochem', type=click.File('r', encoding='utf8'))
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=sys.stdout)
-@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=click.get_text_stream('stdout'))
+@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def prepare_jochem(ctx, jochem, output, csoutput):
     """Process and filter jochem file to produce list of names for dictionary."""
@@ -293,7 +292,7 @@ def prepare_jochem(ctx, jochem, output, csoutput):
 
 @dict_cli.command()
 @click.argument('include', type=click.File('r', encoding='utf8'))
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def prepare_include(ctx, include, output):
     """Process and filter include file to produce list of names for dictionary."""
@@ -301,8 +300,8 @@ def prepare_include(ctx, include, output):
     for i, line in enumerate(include):
         print('IN%s' % i)
         for tokens in _make_tokens(line.strip()):
-            output.write(' '.join(tokens))
-            output.write('\n')
+            output.write(u' '.join(tokens))
+            output.write(u'\n')
 
 
 @dict_cli.command()
@@ -327,7 +326,7 @@ def build(ctx, inputs, output, cs):
 @click.argument('model', required=True)
 @click.option('--cs/--no-cs', default=False)
 @click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def tag(ctx, model, cs, corpus, output):
     """Tag chemical entities and write CHEMDNER annotations predictions file."""
@@ -337,18 +336,18 @@ def tag(ctx, model, cs, corpus, output):
         sentence = []
         goldsentence = []
         for t in line.split():
-            token, tag = t.rsplit('/', 1)
+            token, tag = t.rsplit(u'/', 1)
             goldsentence.append((token, tag))
             sentence.append(token)
         if sentence:
             tokentags = tagger.tag(sentence)
             for i, tokentag in enumerate(tokentags):
                 goldtokentag = goldsentence[i]
-                if goldtokentag[1] not in {'B-CM', 'I-CM'} and tokentag[1] in {'B-CM', 'I-CM'}:
+                if goldtokentag[1] not in {u'B-CM', u'I-CM'} and tokentag[1] in {u'B-CM', u'I-CM'}:
                     print(line)
                     print(tokentag[0])
-            output.write(' '.join('/'.join(tokentag) for tokentag in tagger.tag(sentence)))
-            output.write('\n')
+            output.write(u' '.join(u'/'.join(tokentag) for tokentag in tagger.tag(sentence)))
+            output.write(u'\n')
         else:
-            output.write('\n')
+            output.write(u'\n')
 
diff --git a/chemdataextractor/cli/pos.py b/chemdataextractor/cli/pos.py
index 989c520..b4e1aab 100644
--- a/chemdataextractor/cli/pos.py
+++ b/chemdataextractor/cli/pos.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import logging
-import sys
 
 import click
 
@@ -232,17 +231,17 @@ def evaluate_perceptron(ctx, model, corpus):
         evaluation = wsj_evaluation
         sents = list(evaluation.tagged_sents())
         for i, wsj_sent in enumerate(sents):
-            sents[i] = [t for t in wsj_sent if not t[1] == '-NONE-']
+            sents[i] = [t for t in wsj_sent if not t[1] == u'-NONE-']
     elif corpus == 'genia':
         evaluation = genia_evaluation
         sents = list(evaluation.tagged_sents())
         # Translate GENIA bracket tags
         for i, genia_sent in enumerate(sents):
             for j, (token, tag) in enumerate(genia_sent):
-                if tag == '(':
-                    sents[i][j] = (token, '-LRB-')
-                elif tag == ')':
-                    sents[i][j] = (token, '-RRB-')
+                if tag == u'(':
+                    sents[i][j] = (token, u'-LRB-')
+                elif tag == u')':
+                    sents[i][j] = (token, u'-RRB-')
     else:
         raise click.ClickException('Invalid corpus')
     tagger = ChemApPosTagger(model=model)
@@ -251,8 +250,8 @@
 
 
 @pos_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def tag(ctx, input, output):
     """Output POS-tagged tokens."""
@@ -262,5 +261,5 @@ def tag(ctx, input, output):
     for element in doc.elements:
         if isinstance(element, Text):
             for sentence in element.sentences:
-                output.write(' '.join('/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
-                output.write('\n')
+                output.write(u' '.join(u'/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
+                output.write(u'\n')
diff --git a/chemdataextractor/cli/tokenize.py b/chemdataextractor/cli/tokenize.py
index d20b834..f047b9b 100644
--- a/chemdataextractor/cli/tokenize.py
+++ b/chemdataextractor/cli/tokenize.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import logging
-import sys
 
 import click
 
@@ -63,8 +62,8 @@ def train_punkt(ctx, input, output, abbr, colloc):
 
 
 @tokenize_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def sentences(ctx, input, output):
     """Read input document, and output sentences."""
@@ -75,12 +74,12 @@ def sentences(ctx, input, output):
         if isinstance(element, Text):
             for raw_sentence in element.raw_sentences:
                 output.write(raw_sentence.strip())
-                output.write('\n')
+                output.write(u'\n')
 
 
 @tokenize_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def words(ctx, input, output):
     """Read input document, and output words."""
@@ -90,5 +89,5 @@ def words(ctx, input, output):
     for element in doc.elements:
         if isinstance(element, Text):
             for sentence in element.sentences:
-                output.write(' '.join(sentence.raw_tokens))
-                output.write('\n')
+                output.write(u' '.join(sentence.raw_tokens))
+                output.write(u'\n')
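For reference, here is a minimal standalone sketch of the pattern the patch applies: a text stream default for output, a binary stream default for input. It is illustrative only and assumes nothing beyond an installed click; the command name `cat_doc` and its body are hypothetical and not part of ChemDataExtractor.

# -*- coding: utf-8 -*-
"""Hypothetical example command, not part of ChemDataExtractor."""
import click


@click.command()
@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.',
              default=click.get_text_stream('stdout'))   # text stream: click encodes unicode text for us
@click.argument('input', type=click.File('rb'),
                default=click.get_binary_stream('stdin'))  # binary stream: raw bytes, like open(path, 'rb')
def cat_doc(input, output):
    """Decode INPUT as UTF-8 and write it to OUTPUT."""
    output.write(input.read().decode('utf8'))


if __name__ == '__main__':
    cat_doc()

Because both defaults are already file-like objects, click.File passes them through unchanged, so writing unicode text behaves the same whether output goes to a terminal or is redirected to a file with `>`.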