stumpy-dev · NimaSarajpoor · Aug 2, 2022 · Aug 2, 2022 · Aug 2, 2022 · Sep 2, 2022
@@ -8,9 +8,11 @@
 STUMPY_MEAN_STD_NUM_CHUNKS = 1
 STUMPY_MEAN_STD_MAX_ITER = 10
 STUMPY_DENOM_THRESHOLD = 1e-14
-STUMPY_STDDEV_THRESHOLD = 1e-7
+STUMPY_STDDEV_THRESHOLD = 1e-100  # 1e-7
+STUMPY_VAR_THRESHOLD = 1e-5
 STUMPY_P_NORM_THRESHOLD = 1e-14
 STUMPY_TEST_PRECISION = 5
 STUMPY_MAX_P_NORM_DISTANCE = np.finfo(np.float64).max
 STUMPY_MAX_DISTANCE = np.sqrt(STUMPY_MAX_P_NORM_DISTANCE)
 STUMPY_EXCL_ZONE_DENOM = 4
+STUMPY_CORRELATION_THRESHOLD = 0.99999999  # 1 - 1e-08
@@ -577,6 +577,8 @@ def _welford_nanvar(a, w, a_subseq_isfinite):
                 * (a[last_idx] - curr_mean + a[prev_start_idx] - prev_mean)
                 / w
             )
+            if curr_var < config.STUMPY_VAR_THRESHOLD:
+                curr_var = np.nanvar(a[start_idx:stop_idx])
 
         all_variances[start_idx] = curr_var
 
@@ -1753,8 +1755,10 @@ def preprocess_diagonal(T, m):
     """
     T, T_subseq_isfinite = preprocess_non_normalized(T, m)
     M_T, Σ_T = compute_mean_std(T, m)
-    T_subseq_isconstant = Σ_T < config.STUMPY_STDDEV_THRESHOLD
-    Σ_T[T_subseq_isconstant] = 1.0  # Avoid divide by zero in next inversion step
+    T_sliding_min = _rolling_nanmin_1d(T, m)
+    T_subseq_isconstant = T_sliding_min == M_T
+    Σ_T[T_subseq_isfinite & T_subseq_isconstant] = 0.0
+    Σ_T[Σ_T <= config.STUMPY_STDDEV_THRESHOLD] = 1.0
     Σ_T_inverse = 1.0 / Σ_T
     M_T_m_1, _ = compute_mean_std(T, m - 1)
 

@@ -182,12 +182,27 @@ def _compute_diagonal(
 
             if T_B_subseq_isfinite[i + k] and T_A_subseq_isfinite[i]:
                 # Neither subsequence contains NaNs
-                if T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]:
+                if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]:
+                    pearson = 1.0
+                elif T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]:
                     pearson = 0.5
                 else:
                     pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i]
-
-                if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]:
+                    if config.STUMPY_CORRELATION_THRESHOLD <= pearson < 1.0:
+                        if pearson > ρ[thread_idx, i, 0] or (
+                            ignore_trivial and pearson > ρ[thread_idx, i + k, 0]
+                        ):
+                            cov = (
+                                np.dot(
+                                    (T_B[i + k : i + k + m] - M_T[i + k]),
+                                    (T_A[i : i + m] - μ_Q[i]),
+                                )
+                                * m_inverse
+                            )
+
+                            pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i]
+
+                if pearson > 1.0:
                     pearson = 1.0
 
                 if pearson > ρ[thread_idx, i, 0]:

@@ -6,7 +6,7 @@
 from stumpy import core, config
 
 
-def z_norm(a, axis=0, threshold=1e-7):
+def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD):
     std = np.std(a, axis, keepdims=True)
     std[np.less(std, threshold, where=~np.isnan(std))] = 1.0
 

@@ -240,3 +240,111 @@ def test_stump_nan_zero_mean_self_join():
     naive.replace_inf(ref_mp)
     naive.replace_inf(comp_mp)
     npt.assert_almost_equal(ref_mp, comp_mp)
+
+
+def test_stump_identical_subsequence_self_join_rare_cases():
+    # This test function is designed to capture the errors that migtht be raised
+    # due the imprecision in the calculation of pearson values in the edge case
+    # where two subsequences are identical.
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    seed_values = [27343, 84451]
+    for seed in seed_values:
+        np.random.seed(seed)
+
+        identical = np.random.rand(8)
+        T_A = np.random.rand(20)
+        T_A[1 : 1 + identical.shape[0]] = identical
+        T_A[11 : 11 + identical.shape[0]] = identical
+
+        ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
+        comp_mp = stump(T_A, m, ignore_trivial=True)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+        comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+
+def test_stump_identical_subsequence_self_join_rare_cases_2():
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    seed_values = [27343, 84451]
+    for seed in seed_values:
+        np.random.seed(seed)
+
+        identical = np.random.rand(8)
+        T_A = np.random.rand(20)
+        T_A[1 : 1 + identical.shape[0]] = identical * 0.001
+        T_A[11 : 11 + identical.shape[0]] = identical * 1000
+
+        ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
+        comp_mp = stump(T_A, m, ignore_trivial=True)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+        comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+
+def test_stump_identical_subsequence_self_join_rare_cases_3():
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    seed_values = [27343, 84451]
+    for seed in seed_values:
+        np.random.seed(seed)
+
+        identical = np.random.rand(8)
+        T_A = np.random.rand(20)
+        T_A[1 : 1 + identical.shape[0]] = identical * 0.00001
+        T_A[11 : 11 + identical.shape[0]] = identical * 100000
+
+        ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
+        comp_mp = stump(T_A, m, ignore_trivial=True)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+        comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices
+
+
+def test_stump_volatile():
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    seed_values = [1]
+    for seed in seed_values:
+        np.random.seed(seed)
+        T = np.random.rand(64)
+        scale = np.random.choice(np.array([0.001, 0, 1000]), len(T), replace=True)
+        T[:] = T * scale
+
+        ref_mp = naive.stump(T, m, exclusion_zone=zone, row_wise=True)
+        comp_mp = stump(T, m, ignore_trivial=True)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+
+        npt.assert_almost_equal(
+            ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
+        )  # ignore indices