Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
41 changes: 41 additions & 0 deletions qiita_db/metadata_template/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
# -----------------------------------------------------------------------------

from six import StringIO
from inspect import currentframe, getfile
from os.path import dirname, abspath, join
from unittest import TestCase, main
import warnings

Expand Down Expand Up @@ -69,6 +71,31 @@ def test_load_template_to_dataframe(self):
exp.index.name = 'sample_name'
assert_frame_equal(obs, exp)

def test_load_template_to_dataframe_xlsx(self):
mfp = join(dirname(abspath(getfile(currentframe()))), 'support_files')

# test loading a qiimp file
fp = join(mfp, 'a_qiimp_wb.xlsx')
obs = qdb.metadata_template.util.load_template_to_dataframe(fp)
exp = pd.DataFrame.from_dict(EXP_QIIMP, dtype=str)
exp.index.name = 'sample_name'
assert_frame_equal(obs, exp)

# test loading an empty qiimp file
fp = join(mfp, 'empty_qiimp_wb.xlsx')
with self.assertRaises(qdb.exceptions.QiitaDBColumnError) as error:
qdb.metadata_template.util.load_template_to_dataframe(fp)
self.assertEqual(
str(error.exception), "The 'sample_name' column is missing from "
"your template, this file cannot be parsed.")
Copy link
Contributor

@AmandaBirmingham AmandaBirmingham Jun 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't a show-stopper, but I am concerned because I think the auto-generated exception is inaccurate ... an empty QIIMP template will contain a sample_name column, as shown below, and the tsv produced from it will also include that column--actually, the tsv will contain ONLY a row of column names :) I fear that an error message indicating the sample_name column isn't there when it actually is will mislead users/future maintainers when they try to figure out why their upload failed.
screen shot 2018-06-01 at 4 45 27 pm

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AmandaBirmingham, good point. The reason this was happening was all columns are empty so all of them are dropped and the final dataframe was empty, and the first test is to check for validity is if sample_name column is there. Anyway, I added a new tests (empty) and now the error should make more sense.


# test loading non qiimp file
fp = join(mfp, 'not_a_qiimp_wb.xlsx')
obs = qdb.metadata_template.util.load_template_to_dataframe(fp)
exp = pd.DataFrame.from_dict(EXP_NOT_QIIMP, dtype=str)
exp.index.name = 'sample_name'
assert_frame_equal(obs, exp)

def test_load_template_to_dataframe_qiime_map(self):
obs = qdb.metadata_template.util.load_template_to_dataframe(
StringIO(QIIME_TUTORIAL_MAP_SUBSET), index='#SampleID')
Expand Down Expand Up @@ -844,5 +871,19 @@ def test_get_pgsql_reserved_words(self):
'1.SKD8.640184\tCGTAGAGCTCTC\tANL\tTest Project\tNone\tEMP\tBBBB\tAAAA\t'
'GTGCCAGCMGCCGCGGTAA\tILLUMINA\ts_G1_L001_sequences\tValue for sample 2\n')

EXP_QIIMP = {
'asfaewf': {'sample': 'f', 'oijnmk': 'f'},
'pheno': {'sample': 'med', 'oijnmk': 'missing: not provided'},
'bawer': {'sample': 'a', 'oijnmk': 'b'},
'aelrjg': {'sample': 'asfe', 'oijnmk': 'asfs'}
}

EXP_NOT_QIIMP = {
'myownidea': {
'sample5': 'I skipped some',
'sample1': 'sampleoneinfo',
'sample2': 'sampletwoinfo'}
}

if __name__ == '__main__':
main()
31 changes: 31 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
from os.path import join, basename, isdir, exists
from os import walk, remove, listdir, makedirs, rename
from shutil import move, rmtree, copy as shutil_copy
from openpyxl import load_workbook
from tempfile import mkstemp
from csv import writer as csv_writer
from json import dumps
from datetime import datetime
from itertools import chain
Expand Down Expand Up @@ -1730,6 +1733,34 @@ def _get_filehandle(filepath_or, *args, **kwargs):
if _is_string_or_bytes(filepath_or):
if h5py.is_hdf5(filepath_or):
fh, own_fh = h5py.File(filepath_or, *args, **kwargs), True
elif filepath_or.endswith('.xlsx'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to fail when filepath_or is not a string because it won't have the endswith method, right?

I think the XLSX parsing should be moved elsewhere this function seems too generic as it should only open a file path or file handle. Is there any motivation to have this here instead of in the sample/prep information parsing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right! I initially was gonna add the code in load_template_to_dataframe but realized that it really just calls this function (see, https://github.com/biocore/qiita/blob/master/qiita_db/metadata_template/util.py#L100) as it just calls the open_file method. BTW note that the same method is available to open hdf5, text, or iostring. Thus, I believe is fine to fail if not a string as the other code should be able to handle and parse correctly those errors. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough, the use of this function is rather limited (see these search results). It still seems too odd to me that a function that's intended to return a file handle will parse a file write it as a txt file and return that file handle.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the idea of parsing the xmlx is that it will be transformed to txt and then use the regular prep/sample code ... ¯_(ツ)_/¯ ... I don't have a better place to move it but if you do, let me know.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes more sense to have this outside of this function. I see as two very different things, (1) opening a file and (2) parsing a file. And this function opens the file and shouldn't be parsing it. Is there any technical limitation to putting this in load_template_to_dataframe? Perhaps as a helper function somewhere in there?

# due to extension, let's assume Excel file
wb = load_workbook(filename=filepath_or, data_only=True)
sheetnames = wb.sheetnames
# let's check if Qiimp, they must be in same order
first_cell_index = 0
is_qiimp_wb = False
if sheetnames == ["Metadata", "Validation", "Data Dictionary",
"metadata_schema", "metadata_form",
"Instructions"]:
first_cell_index = 1
is_qiimp_wb = True
first_sheet = wb[sheetnames[0]]
cell_range = range(first_cell_index, first_sheet.max_column)
_, fp = mkstemp(suffix='.txt')
with open(fp, 'w') as fh:
cfh = csv_writer(fh, delimiter='\t')
for r in first_sheet.rows:
if is_qiimp_wb:
# check contents of first column; if they are a zero
# (not a valid QIIMP sample_id) or a "No more than
# max samples" message, there are no more valid rows,
# so don't examine any more rows.
fcv = str(r[cell_range[0]].value)
if fcv == "0" or fcv.startswith("No more than"):
break
cfh.writerow([r[x].value for x in cell_range])
fh, own_fh = open(fp, *args, **kwargs), True
else:
fh, own_fh = open(filepath_or, *args, **kwargs), True
else:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
'tornado==3.1.1', 'toredis', 'redis', 'six',
'pyparsing', 'h5py >= 2.3.1', 'biom-format',
'natsort', 'networkx < 2.0', 'humanize',
'scikit-bio == 0.4.2', 'wtforms == 2.0.1',
'scikit-bio == 0.4.2', 'wtforms == 2.0.1', 'openpyxl',
'sphinx-bootstrap-theme', 'Sphinx >= 1.2.2',
'gitpython', 'qiita-files', 'redbiom==0.1.0-dev',
'sphinx_rtd_theme'],
Expand Down