Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions labman/db/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,8 +1702,8 @@ def _parse_pico_csv(contents, sep='\t',

Parameters
----------
contents: fp or open filehandle
pico quant file
contents : str
The contents of the pico green plate reader output
sep: str
sep char used in quant file
conc_col_name: str
Expand All @@ -1714,18 +1714,60 @@ def _parse_pico_csv(contents, sep='\t',
pico_df: pandas DataFrame object
DataFrame relating well location and DNA concentration
"""
raw_df = pd.read_csv(contents, sep=sep, skiprows=2, skipfooter=5,
engine='python')

cleaned_contents = QuantificationProcess._rationalize_pico_csv_string(
contents)
contents_io = StringIO(cleaned_contents)

# when reading in concentrations, force them to come in as strings
# so can check for overflow entries using regex
raw_df = pd.read_csv(contents_io, sep=sep, skiprows=2, skipfooter=5,
engine='python',
converters={'[Concentration]': lambda x: str(x)})

pico_df = raw_df[['Well', '[Concentration]']]
pico_df = pico_df.rename(columns={'[Concentration]': conc_col_name})

# any concentrations containing strings of question marks
# (generated when you overflow the sensor; usually due to sample being
# too concentrated) should be replaced with the highest concentration
# found in this file, per wet lab practice. Start by
# getting mask of the concentration rows that hold only question marks;
# regex matches start of string followed by one or more literal
# question marks, followed by end of string
overflow_mask = pico_df[conc_col_name].str.contains(
r'^\?+$', regex=True)

# coerce oddball concentrations to np.nan
pico_df[conc_col_name] = pd.to_numeric(pico_df[conc_col_name],
errors='coerce')

# find the highest concentration in the file and replace all overflow
# concentrations with that value
max_concentration = pico_df[conc_col_name].max()
pico_df.loc[overflow_mask, conc_col_name] = max_concentration

# if there are any NaN concentrations left, there's a problem with the
# parsing, so throw an error
if pico_df[conc_col_name].isnull().any():
raise ValueError("Some concentrations in pico green quantitation "
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this actually reported to user in the GUI? In other words, when you do this via the GUI and leave NaNs do you see this error?

----- As this is not part of this PR, if the answer is no; let's just open an issue.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@antgonza Finally hunted this down :) The error is reported to the user in the web browser, albeit in a totally unpolished way:

screen shot 2018-08-22 at 4 19 45 pm

Would you like me to open an issue to have it reported within the Labman page chrome, like the example below is?

screen shot 2018-08-13 at 11 33 34 am

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Let's open an issue but make it the least priority possible.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do ;)

"file are NaN: {0}".format(pico_df))

return pico_df

@staticmethod
def _rationalize_pico_csv_string(contents):
# Plate reader files end with CR; convert to LF
contents = contents.replace('\r', '\n')

# anything valued as "<X" is converted to just "X"
# e.g., <0.000 becomes 0.000
contents = contents.replace('<', '')

# anything valued as ">X" is converted to just "X"
contents = contents.replace('>', '')
return contents

@staticmethod
def parse(contents, file_format="minipico", rows=8, cols=12):
"""Parses the quantification output
Expand All @@ -1746,13 +1788,12 @@ def parse(contents, file_format="minipico", rows=8, cols=12):
DataFrame
"""
parsers = {'minipico': QuantificationProcess._parse_pico_csv}
contents_io = StringIO(contents)

if file_format not in parsers:
raise ValueError(
'File format %s not recognized. Supported file formats: %s'
% (file_format, ', '.join(parsers)))
df = parsers[file_format](contents_io)
df = parsers[file_format](contents)
array = QuantificationProcess._make_2D_array(df, rows=rows, cols=cols)
return array.astype(float)

Expand Down
94 changes: 71 additions & 23 deletions labman/db/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_create(self):
notes = 'test note'
obs = GDNAExtractionProcess.create(
user, plate, kf_robot, ep_robot, tool, kit, 10,
'gdna - Test plate 1',
'gdna - Test plate 1',
extraction_date=test_date, notes=notes)
self.assertEqual(obs.date, test_date)
self.assertEqual(obs.personnel, user)
Expand Down Expand Up @@ -708,35 +708,64 @@ def test_make_2D_array(self):
cols=4).astype(float)
np.testing.assert_allclose(obs, exp2_cp_array)

def test_rationalize_pico_csv_string(self):
    """Check CR line endings become LF and '<'/'>' prefixes are removed."""
    # Raw rows as emitted by the plate reader (CR-terminated, with
    # '<'/'>' prefixes on out-of-range readings) paired with the
    # cleaned rows we expect back.
    raw_rows = [
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t<0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t>15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????',
    ]
    cleaned_rows = [
        'Results ',
        ' ',
        'Well ID\tWell\t[Blanked-RFU]\t[Concentration] ',
        'SPL1\tA1\t0.000\t3.432 ',
        'SPL2\tA2\t4949.000\t3.239 ',
        'SPL3\tB1\t15302.000\t10.016 ',
        'SPL4\tB2\t4039.000\t2.644 ',
        ' ',
        'Curve2 Fitting Results ',
        ' ',
        'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob',
        'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????',
    ]

    pico_csv = '\r'.join(raw_rows)
    expected_output = '\n'.join(cleaned_rows)

    output = QuantificationProcess._rationalize_pico_csv_string(pico_csv)
    self.assertEqual(output, expected_output)

def test_parse_pico_csv(self):
# Test a normal sheet
pico_csv = '''Results
pico_csv1 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t4039.000\t2.644
SPL4\tB2\t4039.000\t2.644

Curve2 Fitting Results

Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
exp_pico_df1 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
'Sample DNA Concentration':
[3.432, 3.239, 10.016, 2.644]})
pico_csv_f = StringIO(pico_csv)
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
[3.432, 3.239, 10.016, 2.644]})
obs_pico_df1 = QuantificationProcess._parse_pico_csv(pico_csv1)
pd.testing.assert_frame_equal(obs_pico_df1, exp_pico_df1,
check_like=True)

# Test a sheet that has some ???? zero values
pico_csv = '''Results
# Test a sheet that has some ????, <, and > values
pico_csv2 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL1\tA1\t5243.000\t>3.432
SPL2\tA2\t4949.000\t<0.000
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t\t?????

Expand All @@ -745,28 +774,47 @@ def test_parse_pico_csv(self):
Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
exp_pico_df = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
exp_pico_df2 = pd.DataFrame({'Well': ['A1', 'A2', 'B1', 'B2'],
'Sample DNA Concentration':
[3.432, 3.239, 10.016, np.nan]})
pico_csv_f = StringIO(pico_csv)
obs_pico_df = QuantificationProcess._parse_pico_csv(pico_csv_f)
pd.testing.assert_frame_equal(obs_pico_df, exp_pico_df,
[3.432, 0.000, 10.016, 10.016]})
obs_pico_df2 = QuantificationProcess._parse_pico_csv(pico_csv2)
pd.testing.assert_frame_equal(obs_pico_df2, exp_pico_df2,
check_like=True)

def test_parse(self):
pico_csv = '''Results
# Test a sheet that has unexpected value that can't be converted to #
pico_csv3 = '''Results

Well ID\tWell\t[Blanked-RFU]\t[Concentration]
SPL1\tA1\t5243.000\t3.432
SPL2\tA2\t4949.000\t3.239
SPL3\tB1\t15302.000\t10.016
SPL4\tB2\t4039.000\t2.644
SPL4\tB2\t\tfail

Curve2 Fitting Results

Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob
Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????
'''
with self.assertRaises(ValueError):
QuantificationProcess._parse_pico_csv(pico_csv3)

def test_parse(self):
# Test a normal sheet
# Note that the pico output file appears to have \r (NOT \r\n)
# line endings
pico_csv = ('Results \r'
' \r'
'Well ID\tWell\t[Blanked-RFU]\t[Concentration] \r'
'SPL1\tA1\t5243.000\t3.432 \r'
'SPL2\tA2\t4949.000\t3.239 \r'
'SPL3\tB1\t15302.000\t10.016 \r'
'SPL4\tB2\t4039.000\t2.644 \r'
' \r'
'Curve2 Fitting Results \r'
' \r'
'Curve Name\tCurve Formula\tA\tB\tR2\tFit F Prob\r'
'Curve2\tY=A*X+B\t1.53E+003\t0\t0.995\t?????')

obs = QuantificationProcess.parse(pico_csv)
exp = np.asarray(
[[3.432, 3.239, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
Expand Down Expand Up @@ -796,11 +844,11 @@ def test_attributes(self):
self.assertEqual(tester.notes,None)
obs = tester.concentrations
self.assertEqual(len(obs), 95)
self.assertEqual(obs[0],
self.assertEqual(obs[0],
(LibraryPrep16SComposition(1), 20.0, 60.606))
self.assertEqual(obs[36],
self.assertEqual(obs[36],
(LibraryPrep16SComposition(37), 20.0, 60.606))
self.assertEqual(obs[94],
self.assertEqual(obs[94],
(LibraryPrep16SComposition(95), 1.0, 3.0303))

tester = QuantificationProcess(4)
Expand Down