3 binary files not shown (new .xlsx test support files).
39 changes: 39 additions & 0 deletions qiita_db/metadata_template/test/test_util.py
@@ -7,6 +7,8 @@
# -----------------------------------------------------------------------------

from six import StringIO
from inspect import currentframe, getfile
from os.path import dirname, abspath, join
from unittest import TestCase, main
import warnings

@@ -69,6 +71,29 @@ def test_load_template_to_dataframe(self):
        exp.index.name = 'sample_name'
        assert_frame_equal(obs, exp)

    def test_load_template_to_dataframe_xlsx(self):
        mfp = join(dirname(abspath(getfile(currentframe()))), 'support_files')

        # test loading a qiimp file
        fp = join(mfp, 'a_qiimp_wb.xlsx')
        obs = qdb.metadata_template.util.load_template_to_dataframe(fp)
        exp = pd.DataFrame.from_dict(EXP_QIIMP, dtype=str)
        exp.index.name = 'sample_name'
        assert_frame_equal(obs, exp)

        # test loading an empty qiimp file
        fp = join(mfp, 'empty_qiimp_wb.xlsx')
        with self.assertRaises(ValueError) as error:
            qdb.metadata_template.util.load_template_to_dataframe(fp)
        self.assertEqual(str(error.exception), "The template is empty")

        # test loading a non-qiimp file
        fp = join(mfp, 'not_a_qiimp_wb.xlsx')
        obs = qdb.metadata_template.util.load_template_to_dataframe(fp)
        exp = pd.DataFrame.from_dict(EXP_NOT_QIIMP, dtype=str)
        exp.index.name = 'sample_name'
        assert_frame_equal(obs, exp)

    def test_load_template_to_dataframe_qiime_map(self):
        obs = qdb.metadata_template.util.load_template_to_dataframe(
            StringIO(QIIME_TUTORIAL_MAP_SUBSET), index='#SampleID')
@@ -844,5 +869,19 @@ def test_get_pgsql_reserved_words(self):
    '1.SKD8.640184\tCGTAGAGCTCTC\tANL\tTest Project\tNone\tEMP\tBBBB\tAAAA\t'
    'GTGCCAGCMGCCGCGGTAA\tILLUMINA\ts_G1_L001_sequences\tValue for sample 2\n')

EXP_QIIMP = {
    'asfaewf': {'sample': 'f', 'oijnmk': 'f'},
    'pheno': {'sample': 'med', 'oijnmk': 'missing: not provided'},
    'bawer': {'sample': 'a', 'oijnmk': 'b'},
    'aelrjg': {'sample': 'asfe', 'oijnmk': 'asfs'}
}

EXP_NOT_QIIMP = {
    'myownidea': {
        'sample5': 'I skipped some',
        'sample1': 'sampleoneinfo',
        'sample2': 'sampletwoinfo'}
}

if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions qiita_db/metadata_template/util.py
@@ -182,6 +182,8 @@ def load_template_to_dataframe(fn, index='sample_name'):
                     regex=True, inplace=True)
    # removing columns with empty values
    template.dropna(axis='columns', how='all', inplace=True)
    if template.empty:
        raise ValueError("The template is empty")

    initial_columns = set(template.columns)

38 changes: 37 additions & 1 deletion qiita_db/util.py
@@ -51,6 +51,9 @@
from os.path import join, basename, isdir, exists
from os import walk, remove, listdir, makedirs, rename
from shutil import move, rmtree, copy as shutil_copy
from openpyxl import load_workbook
from tempfile import mkstemp
from csv import writer as csv_writer
from json import dumps
from datetime import datetime
from itertools import chain
@@ -1724,12 +1727,45 @@ def _is_string_or_bytes(s):


def _get_filehandle(filepath_or, *args, **kwargs):
    """Open file if `filepath_or` looks like a string/unicode/bytes, else
    """Open file if `filepath_or` looks like a string/unicode/bytes/Excel, else
    pass through.

    Notes
    -----
    If the file is an Excel workbook, the code will write a temporary txt file
    with its contents. It will also check whether the file is a QIIMP workbook
    or a regular Excel file.
    """
    if _is_string_or_bytes(filepath_or):
        if h5py.is_hdf5(filepath_or):
            fh, own_fh = h5py.File(filepath_or, *args, **kwargs), True
        elif filepath_or.endswith('.xlsx'):
Contributor

This is going to fail when filepath_or is not a string because it won't have the endswith method, right?

I think the XLSX parsing should be moved elsewhere; this function seems too generic, as it should only open a file path or file handle. Is there any motivation to have this here instead of in the sample/prep information parsing?
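
A quick illustration of that point (not part of the PR): inputs other than plain strings either have no endswith at all or would need a bytes suffix.

from io import StringIO

fh = StringIO("sample_name\tcol1\n1.S1\tvalue\n")
print(hasattr(fh, "endswith"))              # False: a file handle has no endswith
print(b"template.xlsx".endswith(b".xlsx"))  # True, but only with a bytes suffix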

Member Author

Right! I initially was going to add the code in load_template_to_dataframe, but realized that it really just calls this function through the open_file method (see https://github.com/biocore/qiita/blob/master/qiita_db/metadata_template/util.py#L100). BTW, note that the same method is used to open hdf5, text, or StringIO inputs. Thus, I believe it is fine to fail if the input is not a string, as the calling code should be able to handle and parse those errors correctly. Thoughts?

Contributor

Fair enough, the use of this function is rather limited (see these search results). It still seems odd to me that a function that's intended to return a file handle will parse a file, write it out as a txt file, and return a handle to that file.

Member Author

Well, the idea of parsing the xlsx is that it gets transformed to txt and then goes through the regular prep/sample code ... ¯\_(ツ)_/¯ ... I don't have a better place to move it, but if you do, let me know.

Contributor

I think it makes more sense to have this outside of this function. I see these as two very different things: (1) opening a file and (2) parsing a file. This function opens the file and shouldn't be parsing it. Is there any technical limitation to putting this in load_template_to_dataframe? Perhaps as a helper function somewhere in there?
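
A minimal sketch of what such a helper could look like (hypothetical name _xlsx_to_tsv, not code from this PR): it relocates the same openpyxl-based conversion so load_template_to_dataframe could call it for .xlsx inputs and then hand the resulting txt path to its existing parsing code.

from csv import writer as csv_writer
from tempfile import mkstemp

from openpyxl import load_workbook

# sheet names that, in this exact order, identify a QIIMP workbook
QIIMP_SHEETS = ["Metadata", "Validation", "Data Dictionary",
                "metadata_schema", "metadata_form", "Instructions"]


def _xlsx_to_tsv(filepath):
    """Dump the first worksheet of an Excel workbook to a temporary TSV file.

    Hypothetical helper mirroring the logic added to _get_filehandle below;
    returns the path of the temporary txt file for the caller to parse.
    """
    wb = load_workbook(filename=filepath, data_only=True)
    is_qiimp_wb = wb.sheetnames == QIIMP_SHEETS
    # QIIMP keeps bookkeeping in the first column, so data starts at column 1
    first_cell_index = 1 if is_qiimp_wb else 0

    first_sheet = wb[wb.sheetnames[0]]
    cell_range = range(first_cell_index, first_sheet.max_column)
    _, fp = mkstemp(suffix='.txt')
    with open(fp, 'w') as fh:
        cfh = csv_writer(fh, delimiter='\t')
        for r in first_sheet.rows:
            if is_qiimp_wb:
                # a "0" or "No more than ..." value in the first column marks
                # the end of the valid QIIMP rows
                fcv = str(r[cell_range[0]].value)
                if fcv == "0" or fcv.startswith("No more than"):
                    break
            cfh.writerow([r[x].value for x in cell_range])
    return fp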

            # due to extension, let's assume Excel file
            wb = load_workbook(filename=filepath_or, data_only=True)
            sheetnames = wb.sheetnames
            # let's check if Qiimp, they must be in same order
            first_cell_index = 0
            is_qiimp_wb = False
            if sheetnames == ["Metadata", "Validation", "Data Dictionary",
                              "metadata_schema", "metadata_form",
                              "Instructions"]:
                first_cell_index = 1
                is_qiimp_wb = True
            first_sheet = wb[sheetnames[0]]
            cell_range = range(first_cell_index, first_sheet.max_column)
            _, fp = mkstemp(suffix='.txt')
            with open(fp, 'w') as fh:
                cfh = csv_writer(fh, delimiter='\t')
                for r in first_sheet.rows:
                    if is_qiimp_wb:
                        # check contents of first column; if they are a zero
                        # (not a valid QIIMP sample_id) or a "No more than
                        # max samples" message, there are no more valid rows,
                        # so don't examine any more rows.
                        fcv = str(r[cell_range[0]].value)
                        if fcv == "0" or fcv.startswith("No more than"):
                            break
                    cfh.writerow([r[x].value for x in cell_range])
            fh, own_fh = open(fp, *args, **kwargs), True
        else:
            fh, own_fh = open(filepath_or, *args, **kwargs), True
    else:
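For reference, a minimal usage sketch of the combined change (not from the PR; file paths are illustrative and point at the test support files added above, and the error message is the one asserted in test_load_template_to_dataframe_xlsx):

import qiita_db as qdb

# a QIIMP workbook is converted to a temporary tab-separated file behind the
# scenes and then parsed by the regular sample/prep template code
df = qdb.metadata_template.util.load_template_to_dataframe('a_qiimp_wb.xlsx')
print(df.index.name)  # 'sample_name'

# a workbook whose sheets contain no usable values now fails with a clear error
try:
    qdb.metadata_template.util.load_template_to_dataframe('empty_qiimp_wb.xlsx')
except ValueError as err:
    print(err)  # "The template is empty"
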
2 changes: 1 addition & 1 deletion setup.py
@@ -106,7 +106,7 @@
'tornado==3.1.1', 'toredis', 'redis', 'six',
'pyparsing', 'h5py >= 2.3.1', 'biom-format',
'natsort', 'networkx < 2.0', 'humanize',
'scikit-bio == 0.4.2', 'wtforms == 2.0.1',
'scikit-bio == 0.4.2', 'wtforms == 2.0.1', 'openpyxl',
'sphinx-bootstrap-theme', 'Sphinx >= 1.2.2',
'gitpython', 'qiita-files', 'redbiom==0.1.0-dev',
'sphinx_rtd_theme'],