fixes #291 : Fix parsing of quantitation files (#317)

AmandaBirmingham · antgonza · commit fbff62ed6c9b · 2018-08-09T21:35:27.000-04:00
* Fixes #315 and makes the command in the README that creates the server cert/key a little easier to read
* Requested by Antonio: "can you also add a call to labman in .travis.yml so we can catch this error? Something like this to the end of the tests:
    labman &
    LABMAN_PID=$!
    sleep 5 # or 10?
    kill $LABMAN_PID
  "
* Fixes part of #291: failure of quantitation file parsing due to CR newlines
* Fixes #291:
  - Prevent the parsing exception caused by CR newlines
  - Interpret any values listed as "<0.00" as 0.00
  - Interpret any values listed as ">(some numeric value)" as "(some numeric value)"
  - Interpret any values listed as a string of question marks (produced when the sensor overflows, usually because the sample is too concentrated) as the highest quantification value in that particular file
  - Raise an error if, after these fixes, any concentration values are still NaN
diff --git a/labman/db/process.py b/labman/db/process.py
@@ -1702,8 +1702,8 @@ def _parse_pico_csv(contents, sep='\t',
 
         Parameters
         ----------
-        contents: fp or open filehandle
-            pico quant file
+        contents : str
+            The contents of the pico green plate reader output
         sep: str
             sep char used in quant file
         conc_col_name: str
@@ -1714,18 +1714,60 @@ def _parse_pico_csv(contents, sep='\t',
         pico_df: pandas DataFrame object
             DataFrame relating well location and DNA concentration
         """
-        raw_df = pd.read_csv(contents, sep=sep, skiprows=2, skipfooter=5,
-                             engine='python')
+
+        cleaned_contents = QuantificationProcess._rationalize_pico_csv_string(
+            contents)
+        contents_io = StringIO(cleaned_contents)
+
+        # when reading in concentrations, force them to come in as strings
+        # so can check for overflow entries using regex
+        raw_df = pd.read_csv(contents_io, sep=sep, skiprows=2, skipfooter=5,
+                             engine='python',
+                             converters={'[Concentration]': lambda x: str(x)})
 
         pico_df = raw_df[['Well', '[Concentration]']]
         pico_df = pico_df.rename(columns={'[Concentration]': conc_col_name})
 
+        # any concentrations containing strings of question marks
+        # (generated when you overflow the sensor; usually due to sample being
+        # too concentrated) should be replaced with the highest concentration
+        # found in this file, per wet lab practice. Start by
+        # getting mask of the concentration rows that hold only question marks;
+        # regex matches start of string followed by one or more literal
+        # question marks, followed by end of string
+        overflow_mask = pico_df[conc_col_name].str.contains(
+            r'^\?+$', regex=True)
+
         # coerce oddball concentrations to np.nan
         pico_df[conc_col_name] = pd.to_numeric(pico_df[conc_col_name],
                                                errors='coerce')
 
+        # find the highest concentration in the file and replace all overflow
+        # concentrations with that value
+        max_concentration = pico_df[conc_col_name].max()
+        pico_df.loc[overflow_mask, conc_col_name] = max_concentration
+
+        # if there are any NaN concentrations left, there's a problem with the
+        # parsing, so throw an error
+        if pico_df[conc_col_name].isnull().any():
+            raise ValueError("Some concentrations in pico green quantitation "
+                             "file are NaN: {0}".format(pico_df))
+
         return pico_df
 
+    @staticmethod
+    def _rationalize_pico_csv_string(contents):
+        # Plate reader files end with CR; convert to LF
+        contents = contents.replace('\r', '\n')
+
+        # anything valued as "<X" is converted to just "X"
+        # e.g., <0.000 becomes 0.000
+        contents = contents.replace('<', '')
+
+        # anything valued as ">X" is converted to just "X"
+        contents = contents.replace('>', '')
+        return contents
+
     @staticmethod
     def parse(contents, file_format="minipico", rows=8, cols=12):
         """Parses the quantification output
@@ -1746,13 +1788,12 @@ def parse(contents, file_format="minipico", rows=8, cols=12):
         DataFrame
         """
         parsers = {'minipico': QuantificationProcess._parse_pico_csv}
-        contents_io = StringIO(contents)
 
         if file_format not in parsers:
             raise ValueError(
                 'File format %s not recognized. Supported file formats: %s'
                 % (file_format, ', '.join(parsers)))
-        df = parsers[file_format](contents_io)
+        df = parsers[file_format](contents)
         array = QuantificationProcess._make_2D_array(df, rows=rows, cols=cols)
         return array.astype(float)
 
diff --git a/labman/db/tests/test_process.py b/labman/db/tests/test_process.py
@@ -294,7 +294,7 @@ def test_create(self):
         notes = 'test note'
         obs = GDNAExtractionProcess.create(
             user, plate, kf_robot, ep_robot, tool, kit, 10,
-            'gdna - Test plate 1', 
+            'gdna - Test plate 1',
             extraction_date=test_date, notes=notes)
         self.assertEqual(obs.date, test_date)
         self.assertEqual(obs.personnel, user)
@@ -708,35 +708,64 @@ def test_make_2D_array(self):
             cols=4).astype(float)
         np.testing.assert_allclose(obs, exp2_cp_array)
 
+    def test_rationalize_pico_csv_string(self):
+        pico_csv = ('Results					\r'
+                    '					\r'
+                    'Well ID\tWell\t[Blanked-RFU]\t[Concentration]		\r'
+                    'SPL1\tA1\t<0.000\t3.432		\r'
+                    'SPL2\tA2\t4949.000\t3.239		\r'
+                    'SPL3\tB1\t>15302.000\t10.016		\r'
+                    'SPL4\tB2\t4039.000\t2.644		\r'
+                    '					\r'
+                    'Curve2 Fitting Results					\r'
+                    '					\r'
+                    'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\r'
+                    'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')
+
+        expected_output = (
+            'Results					\n'
+            '					\n'
+            'Well ID\tWell\t[Blanked-RFU]\t[Concentration]		\n'
+            'SPL1\tA1\t0.000\t3.432		\n'
+            'SPL2\tA2\t4949.000\t3.239		\n'
+            'SPL3\tB1\t15302.000\t10.016		\n'
+            'SPL4\tB2\t4039.000\t2.644		\n'
+            '					\n'
+            'Curve2 Fitting Results					\n'
+            '					\n'
+            'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\n'
+            'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')
+        output = QuantificationProcess._rationalize_pico_csv_string(pico_csv)
+        self.assertEqual(output, expected_output)
+
     def test_parse_pico_csv(self):
         # Test a normal sheet
-        pico_csv = '''Results
+        pico_csv1 = '''Results
 
         Well ID\tWell\t[Blanked-RFU]\t[Concentration]
         SPL1\tA1\t5243.000\t3.432
         SPL2\tA2\t4949.000\t3.239
         SPL3\tB1\t15302.000\t10.016
-        SPL4\tB2\t4039.000\t2.644
+        SPL4\tB2\t4039.000\t2.644 
 
         Curve2 Fitting Results
 
         Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
         Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
         '''
-        exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
+        exp_pico_df1 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
                                     'Sample DNA Concentration':
-                                    [3.432, 3.239, 10.016, 2.644]})
-        pico_csv_f = StringIO(pico_csv)
-        obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
-        pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
+                                        [3.432, 3.239, 10.016, 2.644]})
+        obs_pico_df1 = QuantificationProcess._parse_pico_csv(pico_csv1)
+        pd.testing.assert_frame_equal(obs_pico_df1, exp_pico_df1,
                                       check_like=True)
 
-        # Test a sheet that has some ???? zero values
-        pico_csv = '''Results
+        # Test a sheet that has some ????, <, and > values
+        pico_csv2 = '''Results
 
         Well ID\tWell\t[Blanked-RFU]\t[Concentration]
-        SPL1\tA1\t5243.000\t3.432
-        SPL2\tA2\t4949.000\t3.239
+        SPL1\tA1\t5243.000\t>3.432
+        SPL2\tA2\t4949.000\t<0.000
         SPL3\tB1\t15302.000\t10.016
         SPL4\tB2\t\t?????
 
@@ -745,28 +774,47 @@ def test_parse_pico_csv(self):
         Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
         Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
         '''
-        exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
+        exp_pico_df2 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
                                     'Sample DNA Concentration':
-                                    [3.432, 3.239, 10.016, np.nan]})
-        pico_csv_f = StringIO(pico_csv)
-        obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
-        pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
+                                        [3.432, 0.000, 10.016, 10.016]})
+        obs_pico_df2 = QuantificationProcess._parse_pico_csv(pico_csv2)
+        pd.testing.assert_frame_equal(obs_pico_df2, exp_pico_df2,
                                       check_like=True)
 
-    def test_parse(self):
-        pico_csv = '''Results
+        # Test a sheet that has unexpected value that can't be converted to #
+        pico_csv3 = '''Results
 
         Well ID\tWell\t[Blanked-RFU]\t[Concentration]
         SPL1\tA1\t5243.000\t3.432
         SPL2\tA2\t4949.000\t3.239
         SPL3\tB1\t15302.000\t10.016
-        SPL4\tB2\t4039.000\t2.644
+        SPL4\tB2\t\tfail
 
         Curve2 Fitting Results
 
         Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
         Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
         '''
+        with self.assertRaises(ValueError):
+            QuantificationProcess._parse_pico_csv(pico_csv3)
+
+    def test_parse(self):
+        # Test a normal sheet
+        # Note that the pico output file appears to have \r (NOT \r\n)
+        # line endings
+        pico_csv = ('Results					\r'
+                    '					\r'
+                    'Well ID\tWell\t[Blanked-RFU]\t[Concentration]		\r'
+                    'SPL1\tA1\t5243.000\t3.432		\r'
+                    'SPL2\tA2\t4949.000\t3.239		\r'
+                    'SPL3\tB1\t15302.000\t10.016		\r'
+                    'SPL4\tB2\t4039.000\t2.644		\r'
+                    '					\r'
+                    'Curve2 Fitting Results					\r'
+                    '					\r'
+                    'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\r'
+                    'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')
+
         obs = QuantificationProcess.parse(pico_csv)
         exp = np.asarray(
             [[3.432, 3.239, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
@@ -796,11 +844,11 @@ def test_attributes(self):
         self.assertEqual(tester.notes,None)
         obs = tester.concentrations
         self.assertEqual(len(obs), 95)
-        self.assertEqual(obs[0], 
+        self.assertEqual(obs[0],
                          (LibraryPrep16SComposition(1), 20.0, 60.606))
-        self.assertEqual(obs[36], 
+        self.assertEqual(obs[36],
                          (LibraryPrep16SComposition(37), 20.0, 60.606))
-        self.assertEqual(obs[94], 
+        self.assertEqual(obs[94],
                          (LibraryPrep16SComposition(95), 1.0, 3.0303))
 
         tester = QuantificationProcess(4)