Merge pull request #461 from kinverarity1/reshape-in-data-reader
Allow different data types per curve in data section reader
kinverarity1 authored Apr 26, 2021
2 parents d9d90f5 + fbd631a commit 5eb1854
Showing 4 changed files with 183 additions and 69 deletions.
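The upshot for users: lasio.read() gains a dtypes keyword accepting "auto" (the default), a dict, a list, or False. A minimal usage sketch based on the diff and tests below, assuming a hypothetical file name:

import lasio

# "auto" (the default): convert each curve to float where possible,
# leave it as str otherwise.
las = lasio.read("example.las")

# dict: convert only the named mnemonics; any curve not listed defaults to float.
las = lasio.read("example.las", dtypes={"DT_STR": str, "NPHI_FLOAT": float})

# list: one data type per curve, in the order the curves are defined.
las = lasio.read("example.las", dtypes=[float, str, int, float])

# False: no conversion at all; every curve comes back as str.
las = lasio.read("example.las", dtypes=False)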
100 changes: 43 additions & 57 deletions lasio/las.py
@@ -10,6 +10,7 @@
import logging
import re
import sys
import traceback

# get basestring in py3

@@ -89,6 +90,7 @@ def read(
ignore_data_comments="#",
mnemonic_case="upper",
index_unit=None,
dtypes="auto",
**kwargs
):
"""Read a LAS file.
@@ -112,6 +114,13 @@
'upper': convert all HeaderItem mnemonics to uppercase
'lower': convert all HeaderItem mnemonics to lowercase
index_unit (str): Optionally force-set the index curve's unit to "m" or "ft"
dtypes ("auto", dict or list): specify the data types for each curve in the
~ASCII data section. If "auto", each curve will be converted to floats if
possible and remain as str if not. If a dict you can specify only the
curve mnemonics you want to convert as a key. If a list, please specify
data types for each curve in order. Note that the conversion currently
only occurs via numpy.ndarray.astype() and therefore only a few simple
casts will work e.g. `int`, `float`, `str`.
See :func:`lasio.reader.open_with_codecs` for additional keyword
arguments which help to manage issues related to character encodings.
@@ -261,16 +270,28 @@ def read(
ignore_comments=ignore_data_comments,
)

# How many curves should the reader attempt to find?
reader_n_columns = n_columns
if reader_n_columns == -1:
reader_n_columns = len(self.curves)

file_obj.seek(k)

# Convert dtypes passed as dict into list for all columns
# defaulting to float for any not specified.
if isinstance(dtypes, dict):
dtypes = [dtypes.get(c.mnemonic, float) for c in self.curves]

# Note: see 2d9e43c3 and e960998f for 'try' background
try:
arr = reader.read_data_section_iterative(
curves_data_gen = reader.read_data_section_iterative(
file_obj,
(first_line, last_line),
regexp_subs,
value_null_subs,
ignore_comments=ignore_data_comments,
n_columns=n_columns,
n_columns=reader_n_columns,
dtypes=dtypes,
)
except KeyboardInterrupt:
raise
@@ -279,68 +300,33 @@
traceback.format_exc()[:-1]
+ " in data section beginning line {}".format(i + 1)
)
logger.debug(
"Read ndarray {arrshape} from data section".format(
arrshape=arr.shape
)
)

# This is so we can check data size and use self.set_data(data, truncate=False)
# in cases where data.size is zero.
data = arr

if data.size > 0:
# TODO: check whether this treatment of NULLs is correct
logger.debug("~A data {}".format(arr))
if version_NULL:
arr[arr == provisional_null] = np.nan
logger.debug("~A after NULL replacement data {}".format(arr))

# Provisionally, assume that the number of columns represented
# by the data section's array is equal to the number of columns
# defined in the Curves/Definition section.

n_columns_in_arr = len(self.curves)
# Assign data to curves.
curve_idx = 0
for curve_arr in curves_data_gen:

# If we are told the file is unwrapped, then we assume that each
# column detected is a column, and we ignore the Curves/Definition
# section's number of columns instead.

if provisional_wrapped == "NO":
n_columns_in_arr = n_columns

# ---------------------------------------------------------------------
# TODO:
# This enables tests/test_read.py::test_barebones_missing_all_sections
# to pass, but may not be the complete or final solution.
# ---------------------------------------------------------------------
if len(self.curves) == 0 and n_columns > 0:
n_columns_in_arr = n_columns
# Do not replace nulls in the index curve.
if version_NULL and curve_arr.dtype == float and curve_idx != 0:
logger.debug(
"Replacing {} with nan in {}-th curve".format(
provisional_null, curve_idx
)
)
curve_arr[curve_arr == provisional_null] = np.nan

logger.debug(
"Data array (size {}) assumed to have {} columns "
"({} curves defined)".format(
arr.shape, n_columns_in_arr, len(self.curves)
"Assigning data {} to curve #{}".format(
curve_arr, curve_idx
)
)
if curve_idx < len(self.curves):
self.curves[curve_idx].data = curve_arr
else:
logger.debug("Creating new curve")
curve = CurveItem(mnemonic="", data=curve_arr)
self.curves.append(curve)
curve_idx += 1

# We attempt to reshape the 1D array read in from
# the data section so that it can be assigned to curves.
try:
data = np.reshape(arr, (-1, n_columns_in_arr))
except ValueError as exception:
error_message = "Cannot reshape ~A data size {0} into {1} columns".format(
arr.shape, n_columns_in_arr
)
if sys.version_info.major < 3:
exception.message = error_message
raise exception
else:
raise ValueError(error_message).with_traceback(
exception.__traceback__
)

self.set_data(data, truncate=False)
finally:
if hasattr(file_obj, "close"):
file_obj.close()
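The key structural change in las.py: instead of reshaping the entire data section into a 2D array and assigning it in one call to set_data(), read() now consumes a generator that yields one column at a time, so per-curve work (dtype conversion, NULL replacement) stays local to each column. A simplified, self-contained sketch of that pattern, with illustrative names rather than the actual lasio internals:

import numpy as np

def iter_columns(arr2d):
    # Yield each column of a 2D array as its own 1D array.
    for col_idx in range(arr2d.shape[1]):
        yield arr2d[:, col_idx]

data = np.arange(12.0).reshape(4, 3)
curves = []
for curve_idx, curve_arr in enumerate(iter_columns(data)):
    curves.append(curve_arr)  # in lasio, this becomes a CurveItem's data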
92 changes: 80 additions & 12 deletions lasio/reader.py
@@ -362,13 +362,15 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, ignore_comments="#"):
try:
assert len(set(item_counts)) == 1
except AssertionError:
logger.debug("Inconsistent number of columns {}".format(item_counts))
return -1
else:
logger.debug("Consistently found {} columns".format(item_counts[0]))
return item_counts[0]


def read_data_section_iterative(
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns, dtypes
):
"""Read data section into memory.
@@ -381,14 +383,20 @@
value_null_subs (list): list of numerical values to be replaced by
numpy.nan values.
ignore_comments (str): lines beginning with this character will be ignored
n_columns (int, None): expected number of columns, or None/-1 if unknown
n_columns (int): expected number of columns
dtypes (list, "auto", False): list of expected data types for each column,
(each data type can be specified as e.g. `int`,
`float`, `str`, `datetime`). If you specify 'auto', then this function
will attempt to convert each column to a float and if that fails,
the column will be returned as a string. If you specify False, no
conversion of data types will be attempt at all.
Returns:
A 1-D numpy ndarray.
Returns: generator which yields the data one column at a time, each as a 1D ndarray.
"""
if n_columns == -1:
n_columns = None
logger.debug(
"Attempting to read {} columns between lines {}".format(n_columns, line_nos)
)

title = file_obj.readline()

@@ -421,12 +429,17 @@ def items(f, start_line_no, end_line_no):
)
for value in value_null_subs:
array[array == value] = np.nan
logger.debug("Successfully read {} items in data section".format(len(array)))

if not n_columns is None:
logger.debug(
"Attempting to re-shape into 2D array with {} columns".format(n_columns)
)
logger.debug("Read {} items in data section".format(len(array)))

# Cater for situations where the data section is empty.
if len(array) == 0:
logger.warning("Data section is empty therefore setting n_columns to zero")
n_columns = 0

# Re-shape the 1D array to a 2D array.
if n_columns > 0:
logger.debug("Attempt re-shape to {} columns".format(n_columns))
try:
array = np.reshape(array, (-1, n_columns))
except ValueError as exception:
@@ -439,7 +452,62 @@
else:
raise ValueError(error_message).with_traceback(exception.__traceback__)

return array
# Identify how many columns have actually been found.
if len(array.shape) < 2:
arr_n_cols = 0
else:
arr_n_cols = array.shape[1]

# Identify what the appropriate data types should be for each column based on the first
# row of the data.
if dtypes == "auto":
if len(array) > 0:
dtypes = identify_dtypes_from_data(array[0, :])
else:
dtypes = []
elif dtypes is False:
dtypes = [str for n in range(arr_n_cols)]

# Iterate over each column, convert to the appropriate dtype (if possible)
# and then yield the data column.
for col_idx in range(arr_n_cols):
curve_arr = array[:, col_idx]
curve_dtype = dtypes[col_idx]
try:
curve_arr = curve_arr.astype(curve_dtype, copy=False)
except ValueError:
logger.warning(
"Could not convert curve #{} to {}".format(col_idx, curve_dtype)
)
else:
logger.debug(
"Converted curve {} to {} ({})".format(col_idx, curve_dtype, curve_arr)
)
yield curve_arr


def identify_dtypes_from_data(row):
"""Identify which columns should be 'str' and which 'float'.
Args:
row (1D ndarray): first row of data section
Returns: list of dtypes, e.g. [float, float, str, ...]
"""
logger.debug("Creating auto dtype spec from first line of data array")
dtypes_list = []
for i, value in enumerate(row):
try:
value_converted = float(value)
except (ValueError, TypeError):
dtypes_list.append(str)
else:
dtypes_list.append(float)
logger.debug(
"Column {}: value {} -> dtype {}".format(i, value, dtypes_list[-1])
)
return dtypes_list


def get_substitutions(read_policy, null_policy):
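What "auto" decides is driven entirely by the first row of data. A runnable standalone equivalent of identify_dtypes_from_data above (detect_dtypes is an illustrative name, not part of lasio):

def detect_dtypes(row):
    # float where the value parses as a number, str otherwise.
    dtypes_list = []
    for value in row:
        try:
            float(value)
        except (ValueError, TypeError):
            dtypes_list.append(str)
        else:
            dtypes_list.append(float)
    return dtypes_list

print(detect_dtypes(["1670.000", "A9-16", "2550.000", "0.450"]))
# [<class 'float'>, <class 'str'>, <class 'float'>, <class 'float'>]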
27 changes: 27 additions & 0 deletions tests/examples/sample_str_in_data.las
@@ -0,0 +1,27 @@
~VERSION INFORMATION
VERS. 1.2: CWLS LOG ASCII STANDARD -VERSION 1.2
WRAP. NO: ONE LINE PER DEPTH STEP
~WELL INFORMATION BLOCK
STRT.M 1670.000000:
STOP.M 1669.750000:
STEP.M -0.1250:
NULL. -999.2500:
COMP. COMPANY: # ANY OIL COMPANY LTD.
WELL. WELL: ANY ET AL OIL WELL #12
FLD . FIELD: EDAM
LOC . LOCATION: A9-16-49-20W3M
PROV. PROVINCE: SASKATCHEWAN
SRVC. SERVICE COMPANY: ANY LOGGING COMPANY LTD.
DATE. LOG DATE: 25-DEC-1988
UWI . UNIQUE WELL ID: 100091604920W300
~CURVE INFORMATION
DEPT.M : 1 DEPTH
DT_STR .US/M : 2 SONIC TRANSIT TIME
RHOB_INT.K/M3 : 3 BULK DENSITY
NPHI_FLOAT.V/V : 4 NEUTRON POROSITY
~PARAMETER INFORMATION
~Other
~A DEPTH DT RHOB NPHI
1670.000 123.450 2550.000 0.450
1669.875 123.450 2550.000 0.450
1669.750 123.450 2550.000 0.450
33 changes: 33 additions & 0 deletions tests/test_read.py
@@ -11,6 +11,7 @@
from numbers import Number

import lasio
import lasio.examples

test_dir = os.path.dirname(__file__)

@@ -446,3 +447,35 @@ def test_read_v2_sample_empty_other_section():
las = lasio.read(stegfn("2.0", "sample_2.0_empty_other_section.las"))
assert las.other == ""
assert las.data[0][0] == 1670.0


def test_sample_dtypes_specified():
las = lasio.examples.open(
"sample_str_in_data.las", read_policy=[], dtypes=[float, str, int, float]
)
# DT_STR
assert isinstance(las.curves[1].data[0], str)
# RHOB_INT
# assert isinstance(las.curves[2].data[0], int)
# The above fails because conversion via ndarray.astype() does not
# yield Python ints - instead we check the opposite:
assert not isinstance(las.curves[2].data[0], float)
# NPHI_FLOAT
assert isinstance(las.curves[3].data[0], float)


def test_sample_dtypes_specified_as_dict():
las = lasio.examples.open(
"sample_str_in_data.las", read_policy=[], dtypes={"NPHI_FLOAT": str}
)
# RHOB_INT -> float by default
assert isinstance(las.curves[2].data[0], float)
# NPHI_FLOAT -> str by specification
assert isinstance(las.curves[3].data[0], str)


def test_sample_dtypes_specified_as_false():
las = lasio.examples.open("sample_str_in_data.las", read_policy=[], dtypes=False)
assert isinstance(las.curves[0].data[0], str)
assert isinstance(las.curves[1].data[0], str)
assert isinstance(las.curves[2].data[0], str)
assert isinstance(las.curves[3].data[0], str)
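A note on why these isinstance checks behave the way they do: numpy's float64 and str_ scalar types subclass the corresponding Python built-ins, but its integer scalars do not subclass int. A quick demonstration:

import numpy as np

print(isinstance(np.float64(0.45), float))  # True - np.float64 subclasses float
print(isinstance(np.str_("ABC"), str))      # True - np.str_ subclasses str
print(isinstance(np.int64(2550), int))      # False - numpy ints are not Python ints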
