Skip to content

properly handle forecast with empty catalogs #111

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 65 additions & 33 deletions csep/core/catalogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,10 +905,10 @@ def __init__(self, **kwargs):

@classmethod
def load_ascii_catalogs(cls, filename, **kwargs):
""" Loads multiple CSEP catalogs in ASCII format.
""" Loads multiple catalogs in csep-ascii format.

This function can load multiple catalogs stored in a single file or directories. This typically called to
load a catalog-based forecast.
        This function can load multiple catalogs stored in a single file. This is typically called to
        load a catalog-based forecast, but it can also load a collection of catalogs stored in the same file.

Args:
filename (str): filepath or directory of catalog files
Expand All @@ -926,15 +926,46 @@ def parse_filename(filename):
start_time = strptime_to_utc_datetime(split_fname[1], format="%Y-%m-%dT%H-%M-%S-%f")
return (name, start_time)

def read_float(val):
"""Returns val as float or None if unable"""
try:
val = float(val)
except:
val = None
return val

def is_header_line(line):
if line[0] == 'lon':
if line[0].lower() == 'lon':
return True
else:
return False

name_from_file, start_time = parse_filename(filename)
def read_catalog_line(line):
# convert to correct types
lon = read_float(line[0])
lat = read_float(line[1])
magnitude = read_float(line[2])
# maybe fractional seconds are not included
origin_time = line[3]
if origin_time:
try:
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
except ValueError:
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
depth = read_float(line[4])
catalog_id = int(line[5])
event_id = line[6]
# temporary event
temp_event = (event_id, origin_time, lat, lon, depth, magnitude)
return temp_event, catalog_id

# overwrite filename, if user specifies
kwargs.setdefault('name', name_from_file)
try:
name_from_file, start_time = parse_filename(filename)
kwargs.setdefault('name', name_from_file)
except:
pass

# handle all catalogs in single file
if os.path.isfile(filename):
with open(filename, 'r', newline='') as input_file:
Expand All @@ -948,52 +979,53 @@ def is_header_line(line):
if prev_id is None:
if is_header_line(line):
continue
# convert to correct types
lon = float(line[0])
lat = float(line[1])
magnitude = float(line[2])
# maybe fractional seconds are not included
try:
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
except ValueError:
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
depth = float(line[4])
catalog_id = int(line[5])
event_id = line[6]
# read line and return catalog id
temp_event, catalog_id = read_catalog_line(line)
empty = False
# OK if event_id is empty
if all([val in (None, '') for val in temp_event[1:]]):
empty = True
# first event is when prev_id is none, catalog_id should always start at zero
if prev_id is None:
prev_id = 0
# if the first catalog doesn't start at zero
if catalog_id != prev_id:
prev_id = catalog_id
# store this event for next time
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
if not empty:
events = [temp_event]
else:
events = []
for id in range(catalog_id):
yield cls(data=[], catalog_id=id, **kwargs)
# deal with cases of events
prev_id = catalog_id
continue
# accumulate event if catalog_id is the same as previous event
if catalog_id == prev_id:
if not all([val in (None, '') for val in temp_event]):
events.append(temp_event)
prev_id = catalog_id
events.append((event_id, origin_time, lat, lon, depth, magnitude))
# create and yield class if the events are from different catalogs
elif catalog_id == prev_id + 1:
catalog = cls(data=events, catalog_id=prev_id, **kwargs)
yield cls(data=events, catalog_id=prev_id, **kwargs)
# add event to new event list
if not empty:
events = [temp_event]
else:
events = []
prev_id = catalog_id
# add first event to new event list
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
yield catalog
# this implies there are empty catalogs, because they are not listed in the ascii file
elif catalog_id > prev_id + 1:
catalog = cls(data=events, catalog_id=prev_id, **kwargs)
# add event to new event list
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
yield cls(data=events, catalog_id=prev_id, **kwargs)
# if prev_id = 0 and catalog_id = 2, then we skipped one catalog. thus, we skip catalog_id - prev_id - 1 catalogs
num_empty_catalogs = catalog_id - prev_id - 1
# create empty catalog classes
# first yield empty catalog classes
for id in range(num_empty_catalogs):
yield cls(data=[], catalog_id=catalog_id - num_empty_catalogs + id, **kwargs)
# finally we want to yield the buffered catalog to preserve order
prev_id = catalog_id
yield catalog
# add event to new event list
if not empty:
events = [temp_event]
else:
events = []
else:
raise ValueError(
"catalog_id should be monotonically increasing and events should be ordered by catalog_id")
Expand Down
2 changes: 0 additions & 2 deletions docs/concepts/catalogs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ PyCSEP data model.
Going between a DataFrame and CSEPCatalog is a lossy transformation. It essentially only retains the essential event
attributes that are defined by the ``dtype`` of the class.



****************
Loading catalogs
****************
Expand Down
10 changes: 5 additions & 5 deletions docs/concepts/evaluations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,11 @@ Consistency tests
Publication reference
=====================

1. Number test (:ref:`Savran et al., 2020`<savran-2020>`)
2. Spatial test (:ref:`Savran et al., 2020`<savran-2020>`)
3. Magnitude test (:ref:`Savran et al., 2020`<savran-2020>`)
4. Pseudolikelihood test (:ref:`Savran et al., 2020`<savran-2020>`)
5. Calibration test (:ref:`Savran et al., 2020`<savran-2020>`)
1. Number test (:ref:`Savran et al., 2020<savran-2020>`)
2. Spatial test (:ref:`Savran et al., 2020<savran-2020>`)
3. Magnitude test (:ref:`Savran et al., 2020<savran-2020>`)
4. Pseudolikelihood test (:ref:`Savran et al., 2020<savran-2020>`)
5. Calibration test (:ref:`Savran et al., 2020<savran-2020>`)

****************************
Preparing evaluation catalog
Expand Down
62 changes: 59 additions & 3 deletions docs/concepts/forecasts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,70 @@ they are trying to model. A grid-based forecast can be easily computed from a ca
space-magnitude region and counting events within each bin from each catalog in the forecast. There can be issues with
under sampling, especially for larger magnitude events.


Working with catalog-based forecasts
####################################

.. autosummary:: csep.core.forecasts.CatalogForecast

Please visit :ref:`this<catalog-forecast-evaluation>` example for an end-to-end tutorial on how to evaluate a catalog-based
earthquake forecast. An example of a catalog-based forecast stored in the default PyCSEP format can be found
`here<https://github.com/SCECcode/csep2/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>_`.
`here<https://github.com/SCECcode/pycsep/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>`_.


The standard format for catalog-based forecasts is a comma-separated value ASCII format. This format was chosen to be
human-readable and easy to implement in all programming languages. Information about the format is shown below.

.. note::
Custom formats can be supported by writing a custom function or sub-classing the
:ref:`AbstractBaseCatalog<csep.core.forecasts.AbstractBaseCatalog>`.

The event format matches the following specification: ::

LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 0, 0

Each row in the catalog corresponds to an event. The catalogs are expected to be placed into the same file and are
differentiated through their `catalog_id`. Catalogs with no events can be handled in a couple different ways intended to
save storage.

The events within a catalog should be sorted in time, and the *catalog_id* should be increasing sequentially. Breaks in
the *catalog_id* are interpreted as missing catalogs.

The following two examples show how you represent a forecast with 5 catalogs each containing zero events.

**1. Including all events (verbose)** ::

LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
,,,,,0,
,,,,,1,
,,,,,2,
,,,,,3,
,,,,,4,

**2. Short-hand** ::

LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
,,,,,4,

The following two examples show how you could represent a forecast with 5 catalogs. Four of the catalogs contain zero events
and one catalog contains one event.

**3. Including all events (verbose)** ::

LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
,,,,,0,
,,,,,1,
,,,,,2,
,,,,,3,
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0

**4. Short-hand** ::

LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0

The simplest way to arrange the file is to follow (3) in the case where some catalogs contain zero events. The zero-indexed
*catalog_id* should be assigned to correspond with the total number of catalogs in the forecast. In the case where every catalog
contains zero forecasted events, you would specify the forecast using (2). The *catalog_id* should be assigned to
correspond with the total number of catalogs in the forecast.

We will be adding more to these documentation pages, so stay tuned for updates.
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@
html_show_sphinx = False

# The short X.Y version
version = 'v0.2'
version = 'v0.4'
# The full version, including alpha/beta/rc tags
release = 'v0.2.0'
release = 'v0.4.0'


# -- General configuration ---------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions tests/artifacts/test_ascii_catalogs/all_empty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
,,,,,9,
10 changes: 10 additions & 0 deletions tests/artifacts/test_ascii_catalogs/all_empty_verbose.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
,,,,,0,
,,,,,1,
,,,,,2,
,,,,,3,
,,,,,4,
,,,,,5,
,,,,,6,
,,,,,7,
,,,,,8,
,,,,,9,
10 changes: 10 additions & 0 deletions tests/artifacts/test_ascii_catalogs/all_present.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1,1,1,1992-01-01T0:0:0.0,1,0,1
1,1,1,1992-01-01T0:0:0.0,1,1,1
1,1,1,1992-01-01T0:0:0.0,1,2,1
1,1,1,1992-01-01T0:0:0.0,1,3,1
1,1,1,1992-01-01T0:0:0.0,1,4,1
1,1,1,1992-01-01T0:0:0.0,1,5,1
1,1,1,1992-01-01T0:0:0.0,1,6,1
1,1,1,1992-01-01T0:0:0.0,1,7,1
1,1,1,1992-01-01T0:0:0.0,1,8,1
1,1,1,1992-01-01T0:0:0.0,1,9,1
2 changes: 2 additions & 0 deletions tests/artifacts/test_ascii_catalogs/last_empty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1,1,1,1992-01-01T0:0:0.0,1,0,1
,,,,,9,
4 changes: 4 additions & 0 deletions tests/artifacts/test_ascii_catalogs/some_empty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1,1,1,1992-01-01T0:0:0.0,1,3,1
,,,,,5,
1,1,1,1992-01-01T0:0:0.0,1,7,1
,,,,,9,
10 changes: 10 additions & 0 deletions tests/artifacts/test_ascii_catalogs/some_missing_verbose.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
,,,,,0,
,,,,,1,
,,,,,2,
1,1,1,1992-01-01T0:0:0.0,1,3,1
,,,,,4,
,,,,,5,
,,,,,6,
1,1,1,1992-01-01T0:0:0.0,1,7,1
,,,,,8,
,,,,,9,
1 change: 0 additions & 1 deletion tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ def test_filter_spatial(self):
filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid)
numpy.testing.assert_equal(numpy.array([b'2', b'3'], dtype='S256').T, filtered_test_cat.get_event_ids())


def test_filter_spatial_in_place_return(self):
test_cat = copy.deepcopy(self.test_cat1)
filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid, in_place=False)
Expand Down
58 changes: 58 additions & 0 deletions tests/test_forecast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os, unittest
import numpy
from csep import load_catalog_forecast

def get_test_catalog_root():
    """Return the absolute path of the directory containing the ascii catalog fixtures."""
    here = os.path.abspath(__file__)
    return os.path.join(os.path.dirname(here), 'artifacts', 'test_ascii_catalogs')

class TestCatalogForecastCreation(unittest.TestCase):
    """Checks that csep-ascii forecast files load into the expected catalogs.

    Each fixture describes a 10-catalog forecast; catalogs absent from the
    file (or represented only by placeholder rows) must be materialized as
    empty catalogs, and the catalog_ids must form the sequence 0..n_cat-1.
    """

    def _assert_forecast(self, fname, expected_total_events, expected_n_cat=10):
        """Load fixture *fname* and verify event totals, catalog count, and ids."""
        path = os.path.join(get_test_catalog_root(), fname)
        test_fore = load_catalog_forecast(path)
        # iterate the forecast first: catalogs (and n_cat) are populated lazily
        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
        self.assertEqual(expected_total_events, total_event_count)
        self.assertEqual(expected_n_cat, test_fore.n_cat)
        numpy.testing.assert_array_equal(
            [cat.catalog_id for cat in test_fore],
            numpy.arange(expected_n_cat))

    def test_ascii_load_all_empty(self):
        # short-hand file: a single placeholder row with catalog_id=9 implies 10 empty catalogs
        self._assert_forecast('all_empty.csv', 0)

    def test_ascii_load_all_empty_verbose(self):
        # verbose file: one placeholder row per empty catalog (ids 0-9)
        self._assert_forecast('all_empty_verbose.csv', 0)

    def test_ascii_load_last_empty(self):
        # one event in catalog 0; placeholder row marks trailing empties through id 9
        self._assert_forecast('last_empty.csv', 1)

    def test_ascii_some_missing(self):
        # events in catalogs 3 and 7; placeholders at 5 and 9; the remaining ids are implied empties
        self._assert_forecast('some_empty.csv', 2)

    def test_ascii_some_missing_verbose(self):
        # fix: this test previously loaded 'all_present.csv'; use the matching verbose fixture,
        # which contains two events (catalogs 3 and 7) and explicit empty rows elsewhere
        self._assert_forecast('some_missing_verbose.csv', 2)

    def test_ascii_all_present(self):
        # fix: previously an empty 'pass' stub; every catalog contains exactly one event
        self._assert_forecast('all_present.csv', 10)

# Allow running this test module directly: `python tests/test_forecast.py`
if __name__ == '__main__':
    unittest.main()