Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion qiita_db/metadata_template/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,8 @@ def test_get_pgsql_reserved_words(self):
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
"physical_location\trequired_sample_info_status\tsample_type\t"
"str_column\n"
"2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t"
"2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\t"
'"True\t"\t"\nTrue"\t'
"NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
"Value for sample 1\n"
"2.Sample2 \t05/29/2014 12:24:51\t"
Expand Down
49 changes: 27 additions & 22 deletions qiita_db/metadata_template/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,13 @@ def prefix_sample_names_with_id(md_template, study_id):
md_template.index.name = None


def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
def load_template_to_dataframe(fn, index='sample_name'):
"""Load a sample/prep template or a QIIME mapping file into a data frame

Parameters
----------
fn : str or file-like object
filename of the template to load, or an already open template file
strip_whitespace : bool, optional
Defaults to True. Whether or not to strip whitespace from values in the
input file
index : str, optional
Defaults to 'sample_name'. The index to use in the loaded information

Expand Down Expand Up @@ -110,19 +107,6 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
if not holdfile:
raise ValueError('Empty file passed!')

# Strip all values in the cells in the input file, if requested
if strip_whitespace:
for pos, line in enumerate(holdfile):
holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
for d in line.split('\t'))

# get and clean the controlled columns
cols = holdfile[0].split('\t')
controlled_cols = {'sample_name'}
controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
for c in cols)

if index == "#SampleID":
# We're going to parse a QIIME mapping file. We are going to first
# parse it with the QIIME function so we can remove the comments
Expand All @@ -133,11 +117,29 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
# The QIIME parser fixes the index and removes the #
index = 'SampleID'

# Check that we don't have duplicate columns
col_names = [c.lower() for c in holdfile[0].strip().split('\t')]
if len(set(col_names)) != len(col_names):
raise qdb.exceptions.QiitaDBDuplicateHeaderError(
find_duplicates(col_names))
# Strip all values in the cells in the input file
for pos, line in enumerate(holdfile):
cols = line.split('\t')
if pos == 0 and index != 'SampleID':
# get and clean the controlled columns
ccols = {'sample_name'}
ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
newcols = [
c.lower().strip() if c.lower().strip() in ccols
else c.strip()
for c in cols]

# while we are here, let's check for duplicate columns headers
if len(set(newcols)) != len(newcols):
raise qdb.exceptions.QiitaDBDuplicateHeaderError(
find_duplicates(newcols))
else:
# .strip will remove odd chars, newlines, tabs and multiple
# spaces but we need to read a new line at the end of the
# line(+'\n')
newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]

holdfile[pos] = '\t'.join(newcols) + '\n'

# index_col:
# is set as False, otherwise it is cast as a float and we want a string
Expand All @@ -158,6 +160,9 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
index_col=False,
comment='\t',
converters={index: lambda x: str(x).strip()})
# remove newlines and tabs from fields
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
regex=True, inplace=True)
except UnicodeDecodeError:
# Find row number and col number for utf-8 encoding errors
headers = holdfile[0].strip().split('\t')
Expand Down
4 changes: 4 additions & 0 deletions qiita_db/support_files/patches/48.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Jan 20, 2017
-- No schema changes in this patch; the actual data migration (stripping
-- tabs/newlines from metadata values) is performed by the companion
-- python patch, python_patches/48.py. This no-op keeps the patch
-- numbering sequence intact.

SELECT 1;
56 changes: 56 additions & 0 deletions qiita_db/support_files/patches/python_patches/48.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# replacing all \t and \n with spaces as those chars break QIIME

from qiita_db.study import Study
from qiita_db.sql_connection import TRN


def searcher(df):
    """Return the column labels of *df* whose string values contain
    a tab or a newline character.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect; columns are checked via the ``.str``
        accessor, with missing values treated as non-matches.

    Returns
    -------
    list
        Column labels, in dataframe order, with at least one offending value.
    """
    pattern = r"\t|\n"
    flagged = []
    for column in df:
        # na=False so NaN cells never count as a match
        if df[column].str.contains(pattern, na=False, regex=True).any():
            flagged.append(column)
    return flagged


# Gather every study regardless of visibility status, since templates in any
# of them may contain offending characters.
studies = Study.get_by_status('private').union(
    Study.get_by_status('public')).union(Study.get_by_status('sandbox'))

# We start the search using pandas as it is much easier and faster than
# doing it in pgsql. Remember that to_dataframe actually transforms what's
# in the db.
to_fix = []  # list of (template object, [column names needing cleanup])
for s in studies:
    # A study without a sample template has nothing to scrub.
    st = s.sample_template
    if st is None:
        continue
    cols = searcher(st.to_dataframe())
    if cols:
        to_fix.append((st, cols))

    # Prep templates for this study get the same treatment.
    for pt in s.prep_templates():
        if pt is None:
            continue
        cols = searcher(pt.to_dataframe())
        if cols:
            to_fix.append((pt, cols))


# Now fix the database and regenerate the files. Each template is handled in
# its own transaction, so one failing template does not roll back the rest.
# NOTE(review): {0}{1}/{2} are interpolated straight into the SQL; they come
# from the db itself (table prefix, template id, column names found above),
# not from user input — presumably safe, but verify no column name needs
# quoting.
for infofile, cols in to_fix:
    with TRN:
        for col in cols:
            # removing tabs (chr(9) is TAB), replaced with a single space
            sql = """UPDATE qiita.{0}{1}
                        SET {2} = replace({2}, chr(9), ' ')""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

            # removing newlines: \n, \r and the U+2028 line separator,
            # collapsing each run into a single space
            sql = """UPDATE qiita.{0}{1}
                        SET {2} = regexp_replace(
                            {2}, E'[\\n\\r\\u2028]+', ' ', 'g' )""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

        TRN.execute()

        # Rewrite the on-disk template files so they match the cleaned db.
        infofile.generate_files()