Skip to content

replace printable for try/except utf-8 #2255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
dist: precise
language: python
sudo: false
env:
Expand Down
13 changes: 12 additions & 1 deletion qiita_db/metadata_template/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,23 @@ def test_load_template_to_dataframe_lowercase(self):
exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True)
assert_frame_equal(obs, exp)

def test_load_template_to_dataframe_non_utf8(self):
def test_load_template_to_dataframe_non_utf8_error(self):
bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
with self.assertRaises(ValueError):
qdb.metadata_template.util.load_template_to_dataframe(
StringIO(bad))

def test_load_template_to_dataframe_non_utf8(self):
replace = EXP_SAMPLE_TEMPLATE.replace(
'Test Sample 2', u'Test Sample\x962')
qdb.metadata_template.util.load_template_to_dataframe(
StringIO(replace))
# setting back
replace = EXP_SAMPLE_TEMPLATE.replace(
u'Test Sample\x962', 'Test Sample 2')
qdb.metadata_template.util.load_template_to_dataframe(
StringIO(replace))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless I am missing something obvious, this test is missing a "test". Is "not erroring" the test? If so, are there any checks that can be done on the returned value from load_template_to_dataframe?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test is that it does not raise an error, i.e., that the template can be loaded; test_load_template_to_dataframe_non_utf8_error has the test for the case where the error is raised.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, thanks! 👍

def test_load_template_to_dataframe_typechecking(self):
obs = qdb.metadata_template.util.load_template_to_dataframe(
StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT))
Expand Down
17 changes: 8 additions & 9 deletions qiita_db/metadata_template/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from __future__ import division
from future.utils import PY3, viewitems
from six import StringIO
from string import printable
from collections import defaultdict

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -103,17 +103,16 @@ def load_template_to_dataframe(fn, index='sample_name'):
# Load in file lines
holdfile = None
with open_file(fn, mode='U') as f:
errors = {}
errors = defaultdict(list)
holdfile = f.readlines()
# here we are checking for non printable chars AKA non UTF-8 chars
# here we are checking for non UTF-8 chars
for row, line in enumerate(holdfile):
for col, block in enumerate(line.split('\t')):
tblock = ''.join([c for c in block if c in printable])
if len(block) != len(tblock):
tblock = ''.join([c if c in printable else '🐾'
for c in block])
if tblock not in errors:
errors[tblock] = []
try:
tblock = block.encode('utf-8')
except UnicodeDecodeError:
tblock = unicode(block, errors='replace')
tblock = tblock.replace(u'\ufffd', '🐾')
errors[tblock].append('(%d, %d)' % (row, col))
if bool(errors):
raise ValueError(
Expand Down