Skip to content

Fixed #610 Imprecision in identical subsequence case #657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 46 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
67ec77f
Add test functions for rare cases
NimaSarajpoor Aug 2, 2022
e1c9785
Add config variable to resolve imprecision
NimaSarajpoor Aug 2, 2022
1d0a22c
Refine value to handle imprecision
NimaSarajpoor Aug 2, 2022
9eb0f0e
refine std if it is very small
NimaSarajpoor Sep 2, 2022
aeb87a1
Recalculate nearest distance if it is small
NimaSarajpoor Sep 2, 2022
d3bbd31
Add new test function for identical case
NimaSarajpoor Sep 2, 2022
cc128ed
refine pearson right after its calculation
NimaSarajpoor Sep 2, 2022
d972f74
Comment config variable
NimaSarajpoor Sep 2, 2022
73061c9
replace stumpy_perfect_correation with threshold
NimaSarajpoor Sep 2, 2022
eee51c2
refine pearson only when we have to
NimaSarajpoor Sep 2, 2022
f433014
correct format
NimaSarajpoor Sep 2, 2022
94ceb32
increase pearson correlation threshold
NimaSarajpoor Sep 4, 2022
a8b0481
increase pearson threshold
NimaSarajpoor Sep 4, 2022
e6fd6bd
add new config variable for variance
NimaSarajpoor Sep 4, 2022
f60d624
replace hard-coded value with config variable
NimaSarajpoor Sep 4, 2022
c913183
add new test function
NimaSarajpoor Sep 4, 2022
ea98582
increase config variable threshold
NimaSarajpoor Sep 4, 2022
7ff2981
change test function
NimaSarajpoor Sep 4, 2022
109c7d3
Add new test function
NimaSarajpoor Sep 4, 2022
c4b1954
fix test function
NimaSarajpoor Sep 4, 2022
94330f6
New test function fails
NimaSarajpoor Sep 4, 2022
621f555
set default to 0.0 to avoid miscalculation when std is small
NimaSarajpoor Sep 4, 2022
ab4faeb
find constant subseqs with help of rolling min
NimaSarajpoor Sep 4, 2022
805f294
change threshold of variance
NimaSarajpoor Sep 4, 2022
9ed54b1
set threshold to a very small non-zero value
NimaSarajpoor Sep 4, 2022
c78c90e
change the scale of values in time series
NimaSarajpoor Sep 4, 2022
e2702b4
change config variable pearson threshold
NimaSarajpoor Sep 4, 2022
766f98d
modify thresholds in config
NimaSarajpoor Sep 4, 2022
ee95e70
use rolling min and max to find constant sequences
NimaSarajpoor Sep 4, 2022
83b5364
polish test functions
NimaSarajpoor Sep 4, 2022
07141ef
replace hard-coded value with config variable
NimaSarajpoor Sep 4, 2022
9cc81f5
clean the if statement
NimaSarajpoor Sep 4, 2022
2d93b2b
avoid numpy linalg.norm to speed up computation
NimaSarajpoor Sep 4, 2022
855ab63
find constant subseqs with rolling min and rolling mean
NimaSarajpoor Sep 4, 2022
d2eb056
refine precision of pearson value just for main matrix profile
NimaSarajpoor Sep 4, 2022
95bd5a6
change structure of if-block
NimaSarajpoor Sep 4, 2022
05a0245
re-design the refinement of pearson
NimaSarajpoor Sep 4, 2022
183f861
use numpy dot to calculated square of norm of an array
NimaSarajpoor Sep 4, 2022
7fcb69a
correct format
NimaSarajpoor Sep 4, 2022
c57f662
refine pearson by recalculating it using cov
NimaSarajpoor Sep 4, 2022
086cd2a
Add new seed value
NimaSarajpoor Sep 5, 2022
4ee651c
update cov and pearson
NimaSarajpoor Sep 5, 2022
d035d06
remove comment
NimaSarajpoor Sep 5, 2022
a11ca36
reduce pearson threshold
NimaSarajpoor Sep 5, 2022
566638a
Temporarily remvoe seed 0 in test function
NimaSarajpoor Sep 5, 2022
f080306
increase pearson correlation threshold
NimaSarajpoor Sep 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion stumpy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
STUMPY_MEAN_STD_NUM_CHUNKS = 1
STUMPY_MEAN_STD_MAX_ITER = 10
STUMPY_DENOM_THRESHOLD = 1e-14
STUMPY_STDDEV_THRESHOLD = 1e-7
STUMPY_STDDEV_THRESHOLD = 1e-100 # 1e-7
STUMPY_VAR_THRESHOLD = 1e-5
STUMPY_P_NORM_THRESHOLD = 1e-14
STUMPY_TEST_PRECISION = 5
STUMPY_MAX_P_NORM_DISTANCE = np.finfo(np.float64).max
STUMPY_MAX_DISTANCE = np.sqrt(STUMPY_MAX_P_NORM_DISTANCE)
STUMPY_EXCL_ZONE_DENOM = 4
STUMPY_CORRELATION_THRESHOLD = 0.99999999 # 1 - 1e-08
8 changes: 6 additions & 2 deletions stumpy/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,8 @@ def _welford_nanvar(a, w, a_subseq_isfinite):
* (a[last_idx] - curr_mean + a[prev_start_idx] - prev_mean)
/ w
)
if curr_var < config.STUMPY_VAR_THRESHOLD:
curr_var = np.nanvar(a[start_idx:stop_idx])

all_variances[start_idx] = curr_var

Expand Down Expand Up @@ -1753,8 +1755,10 @@ def preprocess_diagonal(T, m):
"""
T, T_subseq_isfinite = preprocess_non_normalized(T, m)
M_T, Σ_T = compute_mean_std(T, m)
T_subseq_isconstant = Σ_T < config.STUMPY_STDDEV_THRESHOLD
Σ_T[T_subseq_isconstant] = 1.0 # Avoid divide by zero in next inversion step
T_sliding_min = _rolling_nanmin_1d(T, m)
T_subseq_isconstant = T_sliding_min == M_T
Σ_T[T_subseq_isfinite & T_subseq_isconstant] = 0.0
Σ_T[Σ_T <= config.STUMPY_STDDEV_THRESHOLD] = 1.0
Σ_T_inverse = 1.0 / Σ_T
M_T_m_1, _ = compute_mean_std(T, m - 1)

Expand Down
21 changes: 18 additions & 3 deletions stumpy/stump.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,27 @@ def _compute_diagonal(

if T_B_subseq_isfinite[i + k] and T_A_subseq_isfinite[i]:
# Neither subsequence contains NaNs
if T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]:
if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]:
pearson = 1.0
elif T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]:
pearson = 0.5
else:
pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i]

if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]:
if config.STUMPY_CORRELATION_THRESHOLD <= pearson < 1.0:
if pearson > ρ[thread_idx, i, 0] or (
ignore_trivial and pearson > ρ[thread_idx, i + k, 0]
):
cov = (
np.dot(
(T_B[i + k : i + k + m] - M_T[i + k]),
(T_A[i : i + m] - μ_Q[i]),
)
* m_inverse
)

pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i]

if pearson > 1.0:
pearson = 1.0

if pearson > ρ[thread_idx, i, 0]:
Expand Down
2 changes: 1 addition & 1 deletion tests/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from stumpy import core, config


def z_norm(a, axis=0, threshold=1e-7):
def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD):
std = np.std(a, axis, keepdims=True)
std[np.less(std, threshold, where=~np.isnan(std))] = 1.0

Expand Down
108 changes: 108 additions & 0 deletions tests/test_stump.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,111 @@ def test_stump_nan_zero_mean_self_join():
naive.replace_inf(ref_mp)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)


def test_stump_identical_subsequence_self_join_rare_cases():
# This test function is designed to capture the errors that migtht be raised
# due the imprecision in the calculation of pearson values in the edge case
# where two subsequences are identical.
m = 3
zone = int(np.ceil(m / 4))

seed_values = [27343, 84451]
for seed in seed_values:
np.random.seed(seed)

identical = np.random.rand(8)
T_A = np.random.rand(20)
T_A[1 : 1 + identical.shape[0]] = identical
T_A[11 : 11 + identical.shape[0]] = identical

ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
comp_mp = stump(T_A, m, ignore_trivial=True)
naive.replace_inf(ref_mp)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices

comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices


def test_stump_identical_subsequence_self_join_rare_cases_2():
m = 3
zone = int(np.ceil(m / 4))

seed_values = [27343, 84451]
for seed in seed_values:
np.random.seed(seed)

identical = np.random.rand(8)
T_A = np.random.rand(20)
T_A[1 : 1 + identical.shape[0]] = identical * 0.001
T_A[11 : 11 + identical.shape[0]] = identical * 1000

ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
comp_mp = stump(T_A, m, ignore_trivial=True)
naive.replace_inf(ref_mp)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices

comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices


def test_stump_identical_subsequence_self_join_rare_cases_3():
m = 3
zone = int(np.ceil(m / 4))

seed_values = [27343, 84451]
for seed in seed_values:
np.random.seed(seed)

identical = np.random.rand(8)
T_A = np.random.rand(20)
T_A[1 : 1 + identical.shape[0]] = identical * 0.00001
T_A[11 : 11 + identical.shape[0]] = identical * 100000

ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True)
comp_mp = stump(T_A, m, ignore_trivial=True)
naive.replace_inf(ref_mp)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices

comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices


def test_stump_volatile():
m = 3
zone = int(np.ceil(m / 4))

seed_values = [1]
for seed in seed_values:
np.random.seed(seed)
T = np.random.rand(64)
scale = np.random.choice(np.array([0.001, 0, 1000]), len(T), replace=True)
T[:] = T * scale

ref_mp = naive.stump(T, m, exclusion_zone=zone, row_wise=True)
comp_mp = stump(T, m, ignore_trivial=True)
naive.replace_inf(ref_mp)
naive.replace_inf(comp_mp)

npt.assert_almost_equal(
ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION
) # ignore indices