Merge pull request #461 from kinverarity1/reshape-in-data-reader
Allow different data types per curve in data section reader
kinverarity1 authored Apr 26, 2021
2 parents d9d90f5 + fbd631a commit 5eb1854
Showing 4 changed files with 183 additions and 69 deletions.
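The upshot for users: lasio.read() gains a dtypes keyword accepting "auto" (the default), a dict, a list, or False. A minimal usage sketch based on the diff and tests below, assuming a hypothetical file name:

import lasio

# "auto" (the default): convert each curve to float where possible,
# leave it as str otherwise.
las = lasio.read("example.las")

# dict: convert only the named mnemonics; any curve not listed defaults to float.
las = lasio.read("example.las", dtypes={"DT_STR": str, "NPHI_FLOAT": float})

# list: one data type per curve, in the order the curves are defined.
las = lasio.read("example.las", dtypes=[float, str, int, float])

# False: no conversion at all; every curve comes back as str.
las = lasio.read("example.las", dtypes=False)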
100 changes: 43 additions & 57 deletions lasio/las.py
@@ -10,6 +10,7 @@
import logging
import re
import sys
import traceback

# get basestring in py3

@@ -89,6 +90,7 @@ def read(
ignore_data_comments="#",
mnemonic_case="upper",
index_unit=None,
dtypes="auto",
**kwargs
):
"""Read a LAS file.
@@ -112,6 +114,13 @@
'upper': convert all HeaderItem mnemonics to uppercase
'lower': convert all HeaderItem mnemonics to lowercase
index_unit (str): Optionally force-set the index curve's unit to "m" or "ft"
dtypes ("auto", dict or list): specify the data types for each curve in the
~ASCII data section. If "auto", each curve will be converted to floats if
possible and remain as str if not. If a dict you can specify only the
curve mnemonics you want to convert as a key. If a list, please specify
data types for each curve in order. Note that the conversion currently
only occurs via numpy.ndarray.astype() and therefore only a few simple
casts will work e.g. `int`, `float`, `str`.
See :func:`lasio.reader.open_with_codecs` for additional keyword
arguments which help to manage issues related to character encodings.
@@ -261,16 +270,28 @@ def read(
ignore_comments=ignore_data_comments,
)

# How many curves should the reader attempt to find?
reader_n_columns = n_columns
if reader_n_columns == -1:
reader_n_columns = len(self.curves)

file_obj.seek(k)

# Convert dtypes passed as dict into list for all columns
# defaulting to float for any not specified.
if isinstance(dtypes, dict):
dtypes = [dtypes.get(c.mnemonic, float) for c in self.curves]

# Note: see 2d9e43c3 and e960998f for 'try' background
try:
arr = reader.read_data_section_iterative(
curves_data_gen = reader.read_data_section_iterative(
file_obj,
(first_line, last_line),
regexp_subs,
value_null_subs,
ignore_comments=ignore_data_comments,
n_columns=n_columns,
n_columns=reader_n_columns,
dtypes=dtypes,
)
except KeyboardInterrupt:
raise
@@ -279,68 +300,33 @@
traceback.format_exc()[:-1]
+ " in data section beginning line {}".format(i + 1)
)
logger.debug(
"Read ndarray {arrshape} from data section".format(
arrshape=arr.shape
)
)

# This is so we can check data size and use self.set_data(data, truncate=False)
# in cases where data.size is zero.
data = arr

if data.size > 0:
# TODO: check whether this treatment of NULLs is correct
logger.debug("~A data {}".format(arr))
if version_NULL:
arr[arr == provisional_null] = np.nan
logger.debug("~A after NULL replacement data {}".format(arr))

# Provisionally, assume that the number of columns represented
# by the data section's array is equal to the number of columns
# defined in the Curves/Definition section.

n_columns_in_arr = len(self.curves)
# Assign data to curves.
curve_idx = 0
for curve_arr in curves_data_gen:

# If we are told the file is unwrapped, then we assume that each
# column detected is a column, and we ignore the Curves/Definition
# section's number of columns instead.

if provisional_wrapped == "NO":
n_columns_in_arr = n_columns

# ---------------------------------------------------------------------
# TODO:
# This enables tests/test_read.py::test_barebones_missing_all_sections
# to pass, but may not be the complete or final solution.
# ---------------------------------------------------------------------
if len(self.curves) == 0 and n_columns > 0:
n_columns_in_arr = n_columns
# Do not replace nulls in the index curve.
if version_NULL and curve_arr.dtype == float and curve_idx != 0:
logger.debug(
"Replacing {} with nan in {}-th curve".format(
provisional_null, curve_idx
)
)
curve_arr[curve_arr == provisional_null] = np.nan

logger.debug(
"Data array (size {}) assumed to have {} columns "
"({} curves defined)".format(
arr.shape, n_columns_in_arr, len(self.curves)
"Assigning data {} to curve #{}".format(
curve_arr, curve_idx
)
)
if curve_idx < len(self.curves):
self.curves[curve_idx].data = curve_arr
else:
logger.debug("Creating new curve")
curve = CurveItem(mnemonic="", data=curve_arr)
self.curves.append(curve)
curve_idx += 1

# We attempt to reshape the 1D array read in from
# the data section so that it can be assigned to curves.
try:
data = np.reshape(arr, (-1, n_columns_in_arr))
except ValueError as exception:
error_message = "Cannot reshape ~A data size {0} into {1} columns".format(
arr.shape, n_columns_in_arr
)
if sys.version_info.major < 3:
exception.message = error_message
raise exception
else:
raise ValueError(error_message).with_traceback(
exception.__traceback__
)

self.set_data(data, truncate=False)
finally:
if hasattr(file_obj, "close"):
file_obj.close()
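The key structural change in las.py: instead of reshaping the entire data section into a 2D array and assigning it in one call to set_data(), read() now consumes a generator that yields one column at a time, so per-curve work (dtype conversion, NULL replacement) stays local to each column. A simplified, self-contained sketch of that pattern, with illustrative names rather than the actual lasio internals:

import numpy as np

def iter_columns(arr2d):
    # Yield each column of a 2D array as its own 1D array.
    for col_idx in range(arr2d.shape[1]):
        yield arr2d[:, col_idx]

data = np.arange(12.0).reshape(4, 3)
curves = []
for curve_idx, curve_arr in enumerate(iter_columns(data)):
    curves.append(curve_arr)  # in lasio, this becomes a CurveItem's data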
92 changes: 80 additions & 12 deletions lasio/reader.py
@@ -362,13 +362,15 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, ignore_comments="#"):
try:
assert len(set(item_counts)) == 1
except AssertionError:
logger.debug("Inconsistent number of columns {}".format(item_counts))
return -1
else:
logger.debug("Consistently found {} columns".format(item_counts[0]))
return item_counts[0]


def read_data_section_iterative(
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns, dtypes
):
"""Read data section into memory.
@@ -381,14 +383,20 @@
value_null_subs (list): list of numerical values to be replaced by
numpy.nan values.
ignore_comments (str): lines beginning with this character will be ignored
n_columns (int, None): expected number of columns, or None/-1 if unknown
n_columns (int): expected number of columns
dtypes (list, "auto", False): list of expected data types for each column,
(each data type can be specified as e.g. `int`,
`float`, `str`, `datetime`). If you specify 'auto', then this function
will attempt to convert each column to a float and if that fails,
the column will be returned as a string. If you specify False, no
conversion of data types will be attempt at all.
Returns:
A 1-D numpy ndarray.
Returns: generator which yields the data one column at a time, each as a 1D ndarray.
"""
if n_columns == -1:
n_columns = None
logger.debug(
"Attempting to read {} columns between lines {}".format(n_columns, line_nos)
)

title = file_obj.readline()

@@ -421,12 +429,17 @@ def items(f, start_line_no, end_line_no):
)
for value in value_null_subs:
array[array == value] = np.nan
logger.debug("Successfully read {} items in data section".format(len(array)))

if not n_columns is None:
logger.debug(
"Attempting to re-shape into 2D array with {} columns".format(n_columns)
)
logger.debug("Read {} items in data section".format(len(array)))

# Cater for situations where the data section is empty.
if len(array) == 0:
logger.warning("Data section is empty therefore setting n_columns to zero")
n_columns = 0

# Re-shape the 1D array to a 2D array.
if n_columns > 0:
logger.debug("Attempt re-shape to {} columns".format(n_columns))
try:
array = np.reshape(array, (-1, n_columns))
except ValueError as exception:
@@ -439,7 +452,62 @@
else:
raise ValueError(error_message).with_traceback(exception.__traceback__)

return array
# Identify how many columns have actually been found.
if len(array.shape) < 2:
arr_n_cols = 0
else:
arr_n_cols = array.shape[1]

# Identify what the appropriate data types should be for each column based on the first
# row of the data.
if dtypes == "auto":
if len(array) > 0:
dtypes = identify_dtypes_from_data(array[0, :])
else:
dtypes = []
elif dtypes is False:
dtypes = [str for n in range(arr_n_cols)]

# Iterate over each column, convert to the appropriate dtype (if possible)
# and then yield the data column.
for col_idx in range(arr_n_cols):
curve_arr = array[:, col_idx]
curve_dtype = dtypes[col_idx]
try:
curve_arr = curve_arr.astype(curve_dtype, copy=False)
except ValueError:
logger.warning(
"Could not convert curve #{} to {}".format(col_idx, curve_dtype)
)
else:
logger.debug(
"Converted curve {} to {} ({})".format(col_idx, curve_dtype, curve_arr)
)
yield curve_arr


def identify_dtypes_from_data(row):
"""Identify which columns should be 'str' and which 'float'.
Args:
row (1D ndarray): first row of data section
Returns: list of dtypes, e.g. [float, float, str, ...]
"""
logger.debug("Creating auto dtype spec from first line of data array")
dtypes_list = []
for i, value in enumerate(row):
try:
value_converted = float(value)
except (ValueError, TypeError):
dtypes_list.append(str)
else:
dtypes_list.append(float)
logger.debug(
"Column {}: value {} -> dtype {}".format(i, value, dtypes_list[-1])
)
return dtypes_list


def get_substitutions(read_policy, null_policy):
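What "auto" decides is driven entirely by the first row of data. A runnable standalone equivalent of identify_dtypes_from_data above (detect_dtypes is an illustrative name, not part of lasio):

def detect_dtypes(row):
    # float where the value parses as a number, str otherwise.
    dtypes_list = []
    for value in row:
        try:
            float(value)
        except (ValueError, TypeError):
            dtypes_list.append(str)
        else:
            dtypes_list.append(float)
    return dtypes_list

print(detect_dtypes(["1670.000", "A9-16", "2550.000", "0.450"]))
# [<class 'float'>, <class 'str'>, <class 'float'>, <class 'float'>]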
27 changes: 27 additions & 0 deletions tests/examples/sample_str_in_data.las
@@ -0,0 +1,27 @@
~VERSION INFORMATION
VERS. 1.2: CWLS LOG ASCII STANDARD -VERSION 1.2
WRAP. NO: ONE LINE PER DEPTH STEP
~WELL INFORMATION BLOCK
STRT.M 1670.000000:
STOP.M 1669.750000:
STEP.M -0.1250:
NULL. -999.2500:
COMP. COMPANY: # ANY OIL COMPANY LTD.
WELL. WELL: ANY ET AL OIL WELL #12
FLD . FIELD: EDAM
LOC . LOCATION: A9-16-49-20W3M
PROV. PROVINCE: SASKATCHEWAN
SRVC. SERVICE COMPANY: ANY LOGGING COMPANY LTD.
DATE. LOG DATE: 25-DEC-1988
UWI . UNIQUE WELL ID: 100091604920W300
~CURVE INFORMATION
DEPT.M : 1 DEPTH
DT_STR .US/M : 2 SONIC TRANSIT TIME
RHOB_INT.K/M3 : 3 BULK DENSITY
NPHI_FLOAT.V/V : 4 NEUTRON POROSITY
~PARAMETER INFORMATION
~Other
~A DEPTH DT RHOB NPHI
1670.000 123.450 2550.000 0.450
1669.875 123.450 2550.000 0.450
1669.750 123.450 2550.000 0.450
33 changes: 33 additions & 0 deletions tests/test_read.py
@@ -11,6 +11,7 @@
from numbers import Number

import lasio
import lasio.examples

test_dir = os.path.dirname(__file__)

@@ -446,3 +447,35 @@ def test_read_v2_sample_empty_other_section():
las = lasio.read(stegfn("2.0", "sample_2.0_empty_other_section.las"))
assert las.other == ""
assert las.data[0][0] == 1670.0


def test_sample_dtypes_specified():
las = lasio.examples.open(
"sample_str_in_data.las", read_policy=[], dtypes=[float, str, int, float]
)
# DT_STR
assert isinstance(las.curves[1].data[0], str)
# RHOB_INT
# assert isinstance(las.curves[2].data[0], int)
# The above fails because conversion via ndarray.astype() does not
# yield Python ints - instead we check the opposite:
assert not isinstance(las.curves[2].data[0], float)
# NPHI_FLOAT
assert isinstance(las.curves[3].data[0], float)


def test_sample_dtypes_specified_as_dict():
las = lasio.examples.open(
"sample_str_in_data.las", read_policy=[], dtypes={"NPHI_FLOAT": str}
)
# RHOB_INT -> float by default
assert isinstance(las.curves[2].data[0], float)
# NPHI_FLOAT -> str by specification
assert isinstance(las.curves[3].data[0], str)


def test_sample_dtypes_specified_as_false():
las = lasio.examples.open("sample_str_in_data.las", read_policy=[], dtypes=False)
assert isinstance(las.curves[0].data[0], str)
assert isinstance(las.curves[1].data[0], str)
assert isinstance(las.curves[2].data[0], str)
assert isinstance(las.curves[3].data[0], str)
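A note on why these isinstance checks behave the way they do: numpy's float64 and str_ scalar types subclass the corresponding Python built-ins, but its integer scalars do not subclass int. A quick demonstration:

import numpy as np

print(isinstance(np.float64(0.45), float))  # True - np.float64 subclasses float
print(isinstance(np.str_("ABC"), str))      # True - np.str_ subclasses str
print(isinstance(np.int64(2550), int))      # False - numpy ints are not Python ints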
