Skip to content

Commit 4477580

Browse files
automatic Hs threshold
1 parent 2ccc286 commit 4477580

File tree

3 files changed

+126
-0
lines changed

3 files changed

+126
-0
lines changed

examples/data/loads/data_loads_hs.csv

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

mhkit/loads/extreme.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pandas as pd
33
from scipy import stats
44
from scipy import optimize
5+
from scipy import signal
56
from mhkit.wave.resource import frequency_moment
67

78

@@ -157,6 +158,124 @@ def peaks_distribution_weibull_tail_fit(x):
157158
return peaks
158159

159160

161+
def automatic_hs_threshold(
    peaks,
    sampling_rate,
    initial_threshold_range=(0.990, 0.999, 0.001),
    max_refinement=5,
):
    """
    Find the best significant wave height threshold for the
    peaks-over-threshold method.

    This method was developed by:

    > Neary, V. S., S. Ahn, B. E. Seng, M. N. Allahdadi, T. Wang, Z. Yang
    > and R. He (2020). "Characterization of Extreme Wave Conditions for
    > Wave Energy Converter Design and Project Risk Assessment."
    > J. Mar. Sci. Eng. 2020, 8(4), 289;
    > https://doi.org/10.3390/jmse8040289.

    Please cite this paper if using this method.

    After all thresholds in the initial range are evaluated, the search
    range is refined around the optimal point until either (i) there
    is minimal change from the previous refinement results, (ii) the
    number of data points become smaller than about 1 per year, or (iii)
    the maximum number of iterations is reached.

    Parameters
    ----------
    peaks: np.array
        Peak values of the response time-series.
    sampling_rate: float
        Sampling rate in hours.
    initial_threshold_range: tuple
        Initial range of thresholds to search. Described as
        (min, max, step), in percentile fractions (e.g. 0.99 = 99th).
    max_refinement: int
        Maximum number of times to refine the search range.

    Returns
    -------
    best_threshold: float
        Threshold (percentile fraction) that results in the best
        correlation.
    """
    assert isinstance(sampling_rate, (float, int)), (
        'sampling_rate must be of type float')
    assert isinstance(peaks, np.ndarray), 'peaks must be of type np.ndarray'
    assert len(initial_threshold_range) == 3, (
        'initial_threshold_range must be length 3')
    assert isinstance(max_refinement, int)

    range_min, range_max, range_step = initial_threshold_range
    best_threshold = -1
    # Record length in years, assuming evenly spaced observations.
    years = len(peaks) / (365.25 * 24 / sampling_rate)

    def _peaks_over_threshold(peaks, threshold, sampling_rate):
        # De-clustered exceedances of the given percentile threshold.
        threshold_unit = stats.scoreatpercentile(peaks, 100 * threshold)
        idx_peaks = np.arange(len(peaks))
        idx_storm_peaks, storm_peaks = global_peaks(
            idx_peaks, peaks - threshold_unit)
        idx_storm_peaks = idx_storm_peaks.astype(int)

        # Two storms that are close enough (within specified window) are
        # considered the same storm, to ensure independence.
        independent_storm_peaks = [storm_peaks[0], ]
        idx_independent_storm_peaks = [idx_storm_peaks[0], ]
        # Check first 14 days of lags to determine the window size: the
        # first positive lag where the normalized autocorrelation drops
        # below 0.5.
        nlags = int(14 * 24 / sampling_rate)
        x = peaks - np.mean(peaks)
        acf = signal.correlate(x, x, mode="full")
        lag = signal.correlation_lags(len(x), len(x), mode="full")
        idx_zero = np.argmax(lag == 0)
        positive_lag = lag[(idx_zero + 1):(idx_zero + nlags)]
        acf_positive = acf[(idx_zero + 1):(idx_zero + nlags)] / acf[idx_zero]
        window_size = sampling_rate * positive_lag[np.argmax(acf_positive < 0.5)]
        # window size in "observations" instead of "hours" between peaks.
        window = window_size / sampling_rate
        # keep only independent storm peaks
        for idx in idx_storm_peaks[1:]:
            if (idx - idx_independent_storm_peaks[-1]) > window:
                idx_independent_storm_peaks.append(idx)
                independent_storm_peaks.append(peaks[idx])

        return independent_storm_peaks

    for i in range(max_refinement):
        thresholds = np.arange(range_min, range_max, range_step)
        correlations = []

        for threshold in thresholds:
            distribution = stats.genpareto
            over_threshold = _peaks_over_threshold(
                peaks, threshold, sampling_rate)
            rate_per_year = len(over_threshold) / years
            # Stopping criterion (ii): exceedances too rare to fit.
            if rate_per_year < 2:
                break
            distributions_parameters = distribution.fit(
                over_threshold, floc=0.)
            # BUGFIX: evaluate goodness-of-fit on the data the
            # distribution was fitted to (the de-clustered exceedances),
            # not on the full peaks record.
            _, (_, _, correlation) = stats.probplot(
                over_threshold, distributions_parameters, distribution,
                fit=True)
            correlations.append(correlation)

        # ROBUSTNESS: if even the first threshold yielded too few
        # exceedances, correlations is empty and argmax would raise;
        # keep the best threshold found so far instead.
        if not correlations:
            break

        max_i = np.argmax(correlations)
        # Stopping criterion (i): negligible movement of the optimum.
        minimal_change = np.abs(best_threshold - thresholds[max_i]) < 0.0005
        best_threshold = thresholds[max_i]
        if minimal_change and i < max_refinement - 1:
            break
        # Refine: shrink the step and re-center the search window around
        # the current optimum, widening outward at the range edges.
        range_step /= 10
        if max_i == len(thresholds) - 1:
            range_min = thresholds[max_i - 1]
            range_max = thresholds[max_i] + 5 * range_step
        elif max_i == 0:
            range_min = thresholds[max_i] - 9 * range_step
            range_max = thresholds[max_i + 1]
        else:
            range_min = thresholds[max_i - 1]
            range_max = thresholds[max_i + 1]

    return best_threshold
277+
278+
160279
def peaks_distribution_peaks_over_threshold(x, threshold=None):
161280
"""
162281
Estimate the peaks distribution using the peaks over threshold

mhkit/tests/loads/test_loads.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,12 @@ def test_shortterm_extreme(self):
218218
ste = loads.extreme.ste(t, data, t_st, method)
219219
assert_allclose(ste.cdf(x), cdf_1)
220220

221+
def test_automatic_threshold(self):
    """Regression test for the automatic Hs threshold selection added
    in this commit, against a known dataset and expected percentile."""
    filename = "data_loads_hs.csv"
    data = np.loadtxt(os.path.join(datadir, filename), delimiter=",")
    years = 2.97
    # BUGFIX: the function added by this commit is named
    # automatic_hs_threshold; automatic_peaks_threshold does not exist
    # and the test raised AttributeError as written.
    # NOTE(review): the second positional argument of
    # automatic_hs_threshold is sampling_rate (hours) — confirm that
    # 2.97 is the intended sampling rate for this dataset and not a
    # record length in years.
    threshold = loads.extreme.automatic_hs_threshold(data, years)
    assert np.isclose(threshold, 0.99724)
221227

222228
if __name__ == '__main__':
223229
unittest.main()

0 commit comments

Comments
 (0)