Skip to content

Commit fbff62e

Browse files
AmandaBirmingham authored and antgonza committed
fixes #291 : Fix parsing of quantitation files (#317)
* Fixes #315 and makes the command in the README that creates the server cert/key a little easier to read
* Requested by Antonio: "can you also add a call to labman in .travis.yml so we can catch this error? Something like this to the end of the tests:
      labman &
      LABMAN_PID=$!
      sleep 5 # or 10?
      kill $LABMAN_PID"
* fixes part of #291: failure of quantitation file parsing due to CR newlines
* fixes #291:
  - Prevent parsing exception caused by CR newlines
  - Interpret any values listed as "<0.00" as 0.00
  - Interpret any values listed as ">(some numeric value)" as "(some numeric value)"
  - Interpret any values listed as a bunch of question marks (produced when the sensor overflows, usually because the sample is too concentrated) as the highest quantification value in that particular file
  - Raise an error if, after these fixes, there are any concentration values that are still NaN
1 parent aa48290 commit fbff62e

File tree

2 files changed

+118
-29
lines changed

2 files changed

+118
-29
lines changed

labman/db/process.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,8 +1702,8 @@ def _parse_pico_csv(contents, sep='\t',
17021702
17031703
Parameters
17041704
----------
1705-
contents: fp or open filehandle
1706-
pico quant file
1705+
contents : str
1706+
The contents of the pico green plate reader output
17071707
sep: str
17081708
sep char used in quant file
17091709
conc_col_name: str
@@ -1714,18 +1714,60 @@ def _parse_pico_csv(contents, sep='\t',
17141714
pico_df: pandas DataFrame object
17151715
DataFrame relating well location and DNA concentration
17161716
"""
1717-
raw_df = pd.read_csv(contents, sep=sep, skiprows=2, skipfooter=5,
1718-
engine='python')
1717+
1718+
cleaned_contents = QuantificationProcess._rationalize_pico_csv_string(
1719+
contents)
1720+
contents_io = StringIO(cleaned_contents)
1721+
1722+
# when reading in concentrations, force them to come in as strings
1723+
# so can check for overflow entries using regex
1724+
raw_df = pd.read_csv(contents_io, sep=sep, skiprows=2, skipfooter=5,
1725+
engine='python',
1726+
converters={'[Concentration]': lambda x: str(x)})
17191727

17201728
pico_df = raw_df[['Well', '[Concentration]']]
17211729
pico_df = pico_df.rename(columns={'[Concentration]': conc_col_name})
17221730

1731+
# any concentrations containing strings of question marks
1732+
# (generated when you overflow the sensor; usually due to sample being
1733+
# too concentrated) should be replaced with the highest concentration
1734+
# found in this file, per wet lab practice. Start by
1735+
# getting mask of the concentration rows that hold only question marks;
1736+
# regex matches start of string followed by one or more literal
1737+
# question marks, followed by end of string
1738+
overflow_mask = pico_df[conc_col_name].str.contains(
1739+
r'^\?+$', regex=True)
1740+
17231741
# coerce oddball concentrations to np.nan
17241742
pico_df[conc_col_name] = pd.to_numeric(pico_df[conc_col_name],
17251743
errors='coerce')
17261744

1745+
# find the highest concentration in the file and replace all overflow
1746+
# concentrations with that value
1747+
max_concentration = pico_df[conc_col_name].max()
1748+
pico_df.loc[overflow_mask, conc_col_name] = max_concentration
1749+
1750+
# if there are any NaN concentrations left, there's a problem with the
1751+
# parsing, so throw an error
1752+
if pico_df[conc_col_name].isnull().any():
1753+
raise ValueError("Some concentrations in pico green quantitation "
1754+
"file are NaN: {0}".format(pico_df))
1755+
17271756
return pico_df
17281757

1758+
@staticmethod
def _rationalize_pico_csv_string(contents):
    """Clean raw pico green plate reader text so it can be parsed as CSV

    Parameters
    ----------
    contents : str
        The contents of the pico green plate reader output

    Returns
    -------
    str
        The contents with every line ending normalised to LF and every
        '<' and '>' character removed
    """
    # Plate reader files end lines with a bare CR; convert those to LF.
    # CRLF pairs are collapsed first: replacing only '\r' would turn each
    # CRLF into two LFs and inject blank lines that shift the row offsets
    # the caller relies on when skipping header and footer rows.
    contents = contents.replace('\r\n', '\n').replace('\r', '\n')

    # anything valued as "<X" is converted to just "X"
    # e.g., <0.000 becomes 0.000
    contents = contents.replace('<', '')

    # anything valued as ">X" is converted to just "X"
    contents = contents.replace('>', '')
    return contents
1770+
17291771
@staticmethod
17301772
def parse(contents, file_format="minipico", rows=8, cols=12):
17311773
"""Parses the quantification output
@@ -1746,13 +1788,12 @@ def parse(contents, file_format="minipico", rows=8, cols=12):
17461788
DataFrame
17471789
"""
17481790
parsers = {'minipico': QuantificationProcess._parse_pico_csv}
1749-
contents_io = StringIO(contents)
17501791

17511792
if file_format not in parsers:
17521793
raise ValueError(
17531794
'File format %s not recognized. Supported file formats: %s'
17541795
% (file_format, ', '.join(parsers)))
1755-
df = parsers[file_format](contents_io)
1796+
df = parsers[file_format](contents)
17561797
array = QuantificationProcess._make_2D_array(df, rows=rows, cols=cols)
17571798
return array.astype(float)
17581799

labman/db/tests/test_process.py

Lines changed: 71 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def test_create(self):
294294
notes = 'test note'
295295
obs = GDNAExtractionProcess.create(
296296
user, plate, kf_robot, ep_robot, tool, kit, 10,
297-
'gdna - Test plate 1',
297+
'gdna - Test plate 1',
298298
extraction_date=test_date, notes=notes)
299299
self.assertEqual(obs.date, test_date)
300300
self.assertEqual(obs.personnel, user)
@@ -708,35 +708,64 @@ def test_make_2D_array(self):
708708
cols=4).astype(float)
709709
np.testing.assert_allclose(obs, exp2_cp_array)
710710

711+
def test_rationalize_pico_csv_string(self):
    # Raw plate reader text: lines are separated by a bare CR, and the
    # [Blanked-RFU] column holds one '<'-qualified and one '>'-qualified
    # reading
    pico_csv = '\r'.join([
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t<0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t>15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????'])

    # Same text after cleaning: LF separators, qualifiers stripped
    expected_output = '\n'.join([
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????'])

    self.assertEqual(
        QuantificationProcess._rationalize_pico_csv_string(pico_csv),
        expected_output)
740+
711741
def test_parse_pico_csv(self):
712742
# Test a normal sheet
713-
pico_csv = '''Results
743+
pico_csv1 = '''Results
714744
715745
Well ID\tWell\t[Blanked-RFU]\t[Concentration]
716746
SPL1\tA1\t5243.000\t3.432
717747
SPL2\tA2\t4949.000\t3.239
718748
SPL3\tB1\t15302.000\t10.016
719-
SPL4\tB2\t4039.000\t2.644
749+
SPL4\tB2\t4039.000\t2.644
720750
721751
Curve2 Fitting Results
722752
723753
Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
724754
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
725755
'''
726-
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
756+
exp_pico_df1 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
727757
'Sample DNA Concentration':
728-
[3.432, 3.239, 10.016, 2.644]})
729-
pico_csv_f = StringIO(pico_csv)
730-
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
731-
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
758+
[3.432, 3.239, 10.016, 2.644]})
759+
obs_pico_df1 = QuantificationProcess._parse_pico_csv(pico_csv1)
760+
pd.testing.assert_frame_equal(obs_pico_df1, exp_pico_df1,
732761
check_like=True)
733762

734-
# Test a sheet that has some ???? zero values
735-
pico_csv = '''Results
763+
# Test a sheet that has some ????, <, and > values
764+
pico_csv2 = '''Results
736765
737766
Well ID\tWell\t[Blanked-RFU]\t[Concentration]
738-
SPL1\tA1\t5243.000\t3.432
739-
SPL2\tA2\t4949.000\t3.239
767+
SPL1\tA1\t5243.000\t>3.432
768+
SPL2\tA2\t4949.000\t<0.000
740769
SPL3\tB1\t15302.000\t10.016
741770
SPL4\tB2\t\t?????
742771
@@ -745,28 +774,47 @@ def test_parse_pico_csv(self):
745774
Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
746775
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
747776
'''
748-
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
777+
exp_pico_df2 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
749778
'Sample DNA Concentration':
750-
[3.432, 3.239, 10.016, np.nan]})
751-
pico_csv_f = StringIO(pico_csv)
752-
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
753-
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
779+
[3.432, 0.000, 10.016, 10.016]})
780+
obs_pico_df2 = QuantificationProcess._parse_pico_csv(pico_csv2)
781+
pd.testing.assert_frame_equal(obs_pico_df2, exp_pico_df2,
754782
check_like=True)
755783

756-
def test_parse(self):
757-
pico_csv = '''Results
784+
# Test a sheet that has unexpected value that can't be converted to #
785+
pico_csv3 = '''Results
758786
759787
Well ID\tWell\t[Blanked-RFU]\t[Concentration]
760788
SPL1\tA1\t5243.000\t3.432
761789
SPL2\tA2\t4949.000\t3.239
762790
SPL3\tB1\t15302.000\t10.016
763-
SPL4\tB2\t4039.000\t2.644
791+
SPL4\tB2\t\tfail
764792
765793
Curve2 Fitting Results
766794
767795
Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
768796
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
769797
'''
798+
with self.assertRaises(ValueError):
799+
QuantificationProcess._parse_pico_csv(pico_csv3)
800+
801+
def test_parse(self):
802+
# Test a normal sheet
803+
# Note that the pico output file appears to have \r (NOT \r\n)
804+
# line endings
805+
pico_csv = ('Results \r'
806+
' \r'
807+
'Well ID\tWell\t[Blanked-RFU]\t[Concentration] \r'
808+
'SPL1\tA1\t5243.000\t3.432 \r'
809+
'SPL2\tA2\t4949.000\t3.239 \r'
810+
'SPL3\tB1\t15302.000\t10.016 \r'
811+
'SPL4\tB2\t4039.000\t2.644 \r'
812+
' \r'
813+
'Curve2 Fitting Results \r'
814+
' \r'
815+
'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\r'
816+
'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')
817+
770818
obs = QuantificationProcess.parse(pico_csv)
771819
exp = np.asarray(
772820
[[3.432, 3.239, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
@@ -796,11 +844,11 @@ def test_attributes(self):
796844
self.assertEqual(tester.notes,None)
797845
obs = tester.concentrations
798846
self.assertEqual(len(obs), 95)
799-
self.assertEqual(obs[0],
847+
self.assertEqual(obs[0],
800848
(LibraryPrep16SComposition(1), 20.0, 60.606))
801-
self.assertEqual(obs[36],
849+
self.assertEqual(obs[36],
802850
(LibraryPrep16SComposition(37), 20.0, 60.606))
803-
self.assertEqual(obs[94],
851+
self.assertEqual(obs[94],
804852
(LibraryPrep16SComposition(95), 1.0, 3.0303))
805853

806854
tester = QuantificationProcess(4)

0 commit comments

Comments
 (0)