Merge pull request mcs07#15 from mcs07/enc

Fix stdin/stdout encoding issues
yqq2022 · Feb 21, 2017 · 1aed4c3
2 parents 08b37bc + a5ff9f1
commit 1aed4c3
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 43 deletions.
diff --git a/chemdataextractor/cli/__init__.py b/chemdataextractor/cli/__init__.py
@@ -15,7 +15,6 @@
 
 import json
 import logging
-import sys
 
 import click
 import six
@@ -41,8 +40,8 @@ def cli(ctx, verbose):
 
 
 @cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def extract(ctx, input, output):
     """Run ChemDataExtractor on a document."""
@@ -55,16 +54,16 @@ def extract(ctx, input, output):
 
 
 @cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def read(ctx, input, output):
     """Output processed document elements."""
     log.info('chemdataextractor.read')
     log.info('Reading %s' % input.name)
     doc = Document.from_file(input)
     for element in doc.elements:
-        output.write('%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))
+        output.write(u'%s : %s\n=====\n' % (element.__class__.__name__, six.text_type(element)))
 
 
 from . import cluster, config, data, tokenize, pos, chemdner, cem, dict, evaluate

diff --git a/chemdataextractor/cli/chemdner.py b/chemdataextractor/cli/chemdner.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 from collections import defaultdict
-import sys
 
 import click
 import six
@@ -56,12 +55,12 @@ def prepare_tokens(ctx, input, annotations, tout, lout):
     # Process the corpus
     for line in input:
         pmid, title, abstract = line.strip().split(u'\t')
-        for t, section, anns in [(Title(title), 'T', anndict.get((pmid, 'T'), [])), (Paragraph(abstract), 'A', anndict.get((pmid, 'A'), []))]:
+        for t, section, anns in [(Title(title), 'T', anndict.get((pmid, u'T'), [])), (Paragraph(abstract), u'A', anndict.get((pmid, u'A'), []))]:
             # Write our tokens with POS and IOB tags
             tagged = _prep_tags(t, anns)
             for i, sentence in enumerate(tagged):
-                tout.write(u' '.join(['/'.join([token, tag, label]) for token, tag, label in sentence]))
-                lout.write(u' '.join(['/'.join([token, label]) for token, tag, label in sentence]))
+                tout.write(u' '.join([u'/'.join([token, tag, label]) for token, tag, label in sentence]))
+                lout.write(u' '.join([u'/'.join([token, label]) for token, tag, label in sentence]))
                 tout.write(u'\n')
                 lout.write(u'\n')
             tout.write(u'\n')
@@ -85,7 +84,7 @@ def _prep_tags(t, annotations):
 
 @chemdner_cli.command()
 @click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def tag(ctx, corpus, output):
     """Tag chemical entities and write CHEMDNER annotations predictions file."""
@@ -95,9 +94,9 @@ def tag(ctx, corpus, output):
         # print(pmid)
         counter = 1
         d = Document(Title(title), Paragraph(abstract))
-        for t, section in [(d.elements[0], 'T'), (d.elements[1], 'A')]:
+        for t, section in [(d.elements[0], u'T'), (d.elements[1], u'A')]:
             for cem in t.cems:
                 code = u'%s:%s:%s' % (section, cem.start, cem.end)
-                output.write(u'\t'.join([pmid, code, six.text_type(counter), '1']))
+                output.write(u'\t'.join([pmid, code, six.text_type(counter), u'1']))
                 output.write(u'\n')
                 counter += 1
diff --git a/chemdataextractor/cli/dict.py b/chemdataextractor/cli/dict.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import re
-import sys
 
 import click
 from ..nlp.lexicon import ChemLexicon
@@ -272,8 +271,8 @@ def _make_tokens(name):
 
 @dict_cli.command()
 @click.argument('jochem', type=click.File('r', encoding='utf8'))
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=sys.stdout)
-@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Dictionary file.', default=click.get_text_stream('stdout'))
+@click.option('--csoutput', '-c', type=click.File('w', encoding='utf8'), help='Case-sensitive dictionary file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def prepare_jochem(ctx, jochem, output, csoutput):
     """Process and filter jochem file to produce list of names for dictionary."""
@@ -293,16 +292,16 @@ def prepare_jochem(ctx, jochem, output, csoutput):
 
 @dict_cli.command()
 @click.argument('include', type=click.File('r', encoding='utf8'))
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def prepare_include(ctx, include, output):
     """Process and filter include file to produce list of names for dictionary."""
     click.echo('chemdataextractor.dict.prepare_include')
     for i, line in enumerate(include):
         print('IN%s' % i)
         for tokens in _make_tokens(line.strip()):
-            output.write(' '.join(tokens))
-            output.write('\n')
+            output.write(u' '.join(tokens))
+            output.write(u'\n')
 
 
 @dict_cli.command()
@@ -327,7 +326,7 @@ def build(ctx, inputs, output, cs):
 @click.argument('model', required=True)
 @click.option('--cs/--no-cs', default=False)
 @click.option('--corpus', '-c', type=click.File('r', encoding='utf8'), required=True)
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
 @click.pass_obj
 def tag(ctx, model, cs, corpus, output):
     """Tag chemical entities and write CHEMDNER annotations predictions file."""
@@ -337,18 +336,18 @@ def tag(ctx, model, cs, corpus, output):
         sentence = []
         goldsentence = []
         for t in line.split():
-            token, tag = t.rsplit('/', 1)
+            token, tag = t.rsplit(u'/', 1)
             goldsentence.append((token, tag))
             sentence.append(token)
         if sentence:
             tokentags = tagger.tag(sentence)
             for i, tokentag in enumerate(tokentags):
                 goldtokentag = goldsentence[i]
-                if goldtokentag[1] not in {'B-CM', 'I-CM'} and tokentag[1] in {'B-CM', 'I-CM'}:
+                if goldtokentag[1] not in {u'B-CM', u'I-CM'} and tokentag[1] in {u'B-CM', u'I-CM'}:
                     print(line)
                     print(tokentag[0])
 
-            output.write(' '.join('/'.join(tokentag) for tokentag in tagger.tag(sentence)))
-            output.write('\n')
+            output.write(u' '.join(u'/'.join(tokentag) for tokentag in tagger.tag(sentence)))
+            output.write(u'\n')
         else:
-            output.write('\n')
+            output.write(u'\n')
diff --git a/chemdataextractor/cli/pos.py b/chemdataextractor/cli/pos.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import logging
-import sys
 
 import click
 
@@ -232,17 +231,17 @@ def evaluate_perceptron(ctx, model, corpus):
         evaluation = wsj_evaluation
         sents = list(evaluation.tagged_sents())
         for i, wsj_sent in enumerate(sents):
-            sents[i] = [t for t in wsj_sent if not t[1] == '-NONE-']
+            sents[i] = [t for t in wsj_sent if not t[1] == u'-NONE-']
     elif corpus == 'genia':
         evaluation = genia_evaluation
         sents = list(evaluation.tagged_sents())
         # Translate GENIA bracket tags
         for i, genia_sent in enumerate(sents):
             for j, (token, tag) in enumerate(genia_sent):
-                if tag == '(':
-                    sents[i][j] = (token, '-LRB-')
-                elif tag == ')':
-                    sents[i][j] = (token, '-RRB-')
+                if tag == u'(':
+                    sents[i][j] = (token, u'-LRB-')
+                elif tag == u')':
+                    sents[i][j] = (token, u'-RRB-')
     else:
         raise click.ClickException('Invalid corpus')
     tagger = ChemApPosTagger(model=model)
@@ -251,8 +250,8 @@ def evaluate_perceptron(ctx, model, corpus):
 
 
 @pos_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def tag(ctx, input, output):
     """Output POS-tagged tokens."""
@@ -262,5 +261,5 @@ def tag(ctx, input, output):
     for element in doc.elements:
         if isinstance(element, Text):
             for sentence in element.sentences:
-                output.write(' '.join('/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
-                output.write('\n')
+                output.write(u' '.join(u'/'.join([token, tag]) for token, tag in sentence.pos_tagged_tokens))
+                output.write(u'\n')
diff --git a/chemdataextractor/cli/tokenize.py b/chemdataextractor/cli/tokenize.py
@@ -11,7 +11,6 @@
 from __future__ import division
 from __future__ import print_function
 import logging
-import sys
 
 import click
 
@@ -63,8 +62,8 @@ def train_punkt(ctx, input, output, abbr, colloc):
 
 
 @tokenize_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def sentences(ctx, input, output):
     """Read input document, and output sentences."""
@@ -75,12 +74,12 @@ def sentences(ctx, input, output):
         if isinstance(element, Text):
             for raw_sentence in element.raw_sentences:
                 output.write(raw_sentence.strip())
-                output.write('\n')
+                output.write(u'\n')
 
 
 @tokenize_cli.command()
-@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=sys.stdout)
-@click.argument('input', type=click.File('rb'), default=sys.stdin)
+@click.option('--output', '-o', type=click.File('w', encoding='utf8'), help='Output file.', default=click.get_text_stream('stdout'))
+@click.argument('input', type=click.File('rb'), default=click.get_binary_stream('stdin'))
 @click.pass_obj
 def words(ctx, input, output):
     """Read input document, and output words."""
@@ -90,5 +89,5 @@ def words(ctx, input, output):
     for element in doc.elements:
         if isinstance(element, Text):
             for sentence in element.sentences:
-                output.write(' '.join(sentence.raw_tokens))
-                output.write('\n')
+                output.write(u' '.join(sentence.raw_tokens))
+                output.write(u'\n')