Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions labman/db/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,8 +1702,8 @@ def _parse_pico_csv(contents, sep='\t',

Parameters
----------
contents: fp or open filehandle
pico quant file
contents : str
The contents of the pico green plate reader output
sep: str
sep char used in quant file
conc_col_name: str
Expand All @@ -1714,18 +1714,60 @@ def _parse_pico_csv(contents, sep='\t',
pico_df: pandas DataFrame object
DataFrame relating well location and DNA concentration
"""
raw_df = pd.read_csv(contents, sep=sep, skiprows=2, skipfooter=5,
engine='python')

cleaned_contents = QuantificationProcess._rationalize_pico_csv_string(
contents)
contents_io = StringIO(cleaned_contents)

# when reading in concentrations, force them to come in as strings
# so can check for overflow entries using regex
raw_df = pd.read_csv(contents_io, sep=sep, skiprows=2, skipfooter=5,
engine='python',
converters={'[Concentration]': lambda x: str(x)})

pico_df = raw_df[['Well', '[Concentration]']]
pico_df = pico_df.rename(columns={'[Concentration]': conc_col_name})

# any concentrations containing strings of question marks
# (generated when you overflow the sensor; usually due to sample being
# too concentrated) should be replaced with the highest concentration
# found in this file, per wet lab practice. Start by
# getting mask of the concentration rows that hold only question marks;
# regex matches start of string followed by one or more literal
# question marks, followed by end of string
overflow_mask = pico_df[conc_col_name].str.contains(
r'^\?+$', regex=True)

# coerce oddball concentrations to np.nan
pico_df[conc_col_name] = pd.to_numeric(pico_df[conc_col_name],
errors='coerce')

# find the highest concentration in the file and replace all overflow
# concentrations with that value
max_concentration = pico_df[conc_col_name].max()
pico_df.loc[overflow_mask, conc_col_name] = max_concentration

# if there are any NaN concentrations left, there's a problem with the
# parsing, so throw an error
if pico_df[conc_col_name].isnull().any():
raise ValueError("Some concentrations in pico green quantitation "
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this actually reported to user in the GUI? In other words, when you do this via the GUI and leave NaNs do you see this error?

----- As this is not part of this PR, if the answer is no; let's just open an issue.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@antgonza Finally hunted this down :) The error is reported to the user in the web browser, albeit in a totally unpolished way:

screen shot 2018-08-22 at 4 19 45 pm

Would you like me to open an issue to have it reported within the Labman page chrome, like the example below is?

screen shot 2018-08-13 at 11 33 34 am

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Let's open an issue but make it the least priority possible.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do ;)

"file are NaN: {0}".format(pico_df))

return pico_df

@staticmethod
def _rationalize_pico_csv_string(contents):
# Plate reader files end with CR; convert to LF
contents = contents.replace('\r', '\n')

# anything valued as "<X" is converted to just "X"
# e.g., <0.000 becomes 0.000
contents = contents.replace('<', '')

# anything valued as ">X" is converted to just "X"
contents = contents.replace('>', '')
return contents

@staticmethod
def parse(contents, file_format="minipico", rows=8, cols=12):
"""Parses the quantification output
Expand All @@ -1746,13 +1788,12 @@ def parse(contents, file_format="minipico", rows=8, cols=12):
DataFrame
"""
parsers = {'minipico': QuantificationProcess._parse_pico_csv}
contents_io = StringIO(contents)

if file_format not in parsers:
raise ValueError(
'File format %s not recognized. Supported file formats: %s'
% (file_format, ', '.join(parsers)))
df = parsers[file_format](contents_io)
df = parsers[file_format](contents)
array = QuantificationProcess._make_2D_array(df, rows=rows, cols=cols)
return array.astype(float)

Expand Down
94 changes: 71 additions & 23 deletions labman/db/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_create(self):
notes = 'test note'
obs = GDNAExtractionProcess.create(
user, plate, kf_robot, ep_robot, tool, kit, 10,
'gdna - Test plate 1',
'gdna - Test plate 1',
extraction_date=test_date, notes=notes)
self.assertEqual(obs.date, test_date)
self.assertEqual(obs.personnel, user)
Expand Down Expand Up @@ -708,35 +708,64 @@ def test_make_2D_array(self):
cols=4).astype(float)
np.testing.assert_allclose(obs, exp2_cp_array)

def test_rationalize_pico_csv_string(self):
    """Check CR line endings become LF and '<'/'>' prefixes are removed."""
    # Raw rows as emitted by the plate reader (CR-terminated, with
    # '<'/'>' prefixes on out-of-range readings) paired with the
    # cleaned rows we expect back.
    raw_rows = [
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t<0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t>15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????',
    ]
    cleaned_rows = [
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????',
    ]

    pico_csv = '\r'.join(raw_rows)
    expected_output = '\n'.join(cleaned_rows)

    output = QuantificationProcess._rationalize_pico_csv_string(pico_csv)
    self.assertEqual(output, expected_output)

def test_parse_pico_csv(self):
# Test a normal sheet
pico_csv = '''Results
pico_csv1 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t4039.000\t2.644
SPL4\tB2\t4039.000\t2.644

Curve2 Fitting Results

Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
exp_pico_df1 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
'Sample DNA Concentration':
[3.432, 3.239, 10.016, 2.644]})
pico_csv_f = StringIO(pico_csv)
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
[3.432, 3.239, 10.016, 2.644]})
obs_pico_df1 = QuantificationProcess._parse_pico_csv(pico_csv1)
pd.testing.assert_frame_equal(obs_pico_df1, exp_pico_df1,
check_like=True)

# Test a sheet that has some ???? zero values
pico_csv = '''Results
# Test a sheet that has some ????, <, and > values
pico_csv2 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL1\tA1\t5243.000\t>3.432
SPL2\tA2\t4949.000\t<0.000
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t\t?????

Expand All @@ -745,28 +774,47 @@ def test_parse_pico_csv(self):
Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
exp_pico_df2 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
'Sample DNA Concentration':
[3.432, 3.239, 10.016, np.nan]})
pico_csv_f = StringIO(pico_csv)
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
[3.432, 0.000, 10.016, 10.016]})
obs_pico_df2 = QuantificationProcess._parse_pico_csv(pico_csv2)
pd.testing.assert_frame_equal(obs_pico_df2, exp_pico_df2,
check_like=True)

def test_parse(self):
pico_csv = '''Results
# Test a sheet that has unexpected value that can't be converted to #
pico_csv3 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t4039.000\t2.644
SPL4\tB2\t\tfail

Curve2 Fitting Results

Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
with self.assertRaises(ValueError):
QuantificationProcess._parse_pico_csv(pico_csv3)

def test_parse(self):
# Test a normal sheet
# Note that the pico output file appears to have \r (NOT \r\n)
# line endings
pico_csv = ('Results \r'
' \r'
'Well ID\tWell\t[Blanked-RFU]\t[Concentration] \r'
'SPL1\tA1\t5243.000\t3.432 \r'
'SPL2\tA2\t4949.000\t3.239 \r'
'SPL3\tB1\t15302.000\t10.016 \r'
'SPL4\tB2\t4039.000\t2.644 \r'
' \r'
'Curve2 Fitting Results \r'
' \r'
'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\r'
'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')

obs = QuantificationProcess.parse(pico_csv)
exp = np.asarray(
[[3.432, 3.239, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
Expand Down Expand Up @@ -796,11 +844,11 @@ def test_attributes(self):
self.assertEqual(tester.notes,None)
obs = tester.concentrations
self.assertEqual(len(obs), 95)
self.assertEqual(obs[0],
self.assertEqual(obs[0],
(LibraryPrep16SComposition(1), 20.0, 60.606))
self.assertEqual(obs[36],
self.assertEqual(obs[36],
(LibraryPrep16SComposition(37), 20.0, 60.606))
self.assertEqual(obs[94],
self.assertEqual(obs[94],
(LibraryPrep16SComposition(95), 1.0, 3.0303))

tester = QuantificationProcess(4)
Expand Down