🔵 Statistical Validation & Robustness Testing #11

@iAmGiG

Description

Overview

Ensure discovered patterns are statistically significant and robust, not spurious correlations or data mining artifacts.

Tasks

  • Implement permutation testing for pattern significance
  • Calculate false discovery rate (FDR) corrections
  • Test pattern stability across different time periods
  • Check for data mining bias and multiple testing issues
  • Validate pattern robustness across market regimes
  • Implement Monte Carlo simulations for confidence intervals
  • Test sensitivity to parameter choices
  • Cross-validate results across different data splits

Permutation Testing Framework

import numpy as np

class PermutationTester:
    def __init__(self, n_permutations=10000):
        self.n_permutations = n_permutations

    def test_pattern_significance(self, pattern, sequences):
        """
        Test whether the pattern's performance beats random chance.
        Assumes calculate_performance() and shuffle_outcomes() are provided
        by the pattern-discovery code.
        """
        observed_performance = self.calculate_performance(pattern, sequences)
        
        # Generate null distribution
        null_distribution = []
        for i in range(self.n_permutations):
            shuffled_sequences = self.shuffle_outcomes(sequences)
            null_performance = self.calculate_performance(pattern, shuffled_sequences)
            null_distribution.append(null_performance)
        
        # Calculate p-value (add-one correction so it is never exactly zero)
        null_distribution = np.array(null_distribution)
        p_value = (np.sum(null_distribution >= observed_performance) + 1) / (self.n_permutations + 1)
        
        return {
            'observed_performance': observed_performance,
            'null_mean': np.mean(null_distribution),
            'p_value': p_value,
            'is_significant': p_value < 0.05,
            'effect_size': (observed_performance - np.mean(null_distribution)) / np.std(null_distribution)
        }
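
The helper methods calculate_performance and shuffle_outcomes are left to the pattern-discovery code. As a minimal usage sketch (not the project's real implementation), a toy subclass with synthetic "gap up" sequences might look like this; the ToyPermutationTester, its helpers, and the data are hypothetical:

import numpy as np

class ToyPermutationTester(PermutationTester):
    def calculate_performance(self, pattern, sequences):
        # Toy metric: mean outcome over sequences where the pattern predicate fires
        hits = [s['outcome'] for s in sequences if pattern(s)]
        return np.mean(hits) if hits else 0.0

    def shuffle_outcomes(self, sequences):
        # Break the feature/outcome link while preserving both marginal distributions
        outcomes = np.random.permutation([s['outcome'] for s in sequences])
        return [{**s, 'outcome': o} for s, o in zip(sequences, outcomes)]

rng = np.random.default_rng(0)
sequences = [{'gap_up': bool(rng.integers(2)), 'outcome': rng.normal(0.001, 0.02)}
             for _ in range(500)]

tester = ToyPermutationTester(n_permutations=2000)
result = tester.test_pattern_significance(lambda s: s['gap_up'], sequences)
print(result['p_value'], result['effect_size'])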

Multiple Testing Correction

def apply_fdr_correction(pattern_results, alpha=0.05):
    """
    Apply Benjamini-Hochberg FDR correction for multiple pattern testing
    """
    p_values = [result['p_value'] for result in pattern_results]
    n_tests = len(p_values)
    
    # Sort p-values and get critical values
    sorted_indices = np.argsort(p_values)
    sorted_p_values = np.array(p_values)[sorted_indices]
    
    # Calculate critical values: (i/n) * alpha
    critical_values = np.arange(1, n_tests + 1) / n_tests * alpha
    
    # Find largest i such that P(i) <= (i/n) * alpha
    significant_indices = np.where(sorted_p_values <= critical_values)[0]
    
    if len(significant_indices) > 0:
        # Largest p-value that passes the step-up test becomes the threshold
        threshold_p_value = sorted_p_values[significant_indices[-1]]
        for result in pattern_results:
            result['fdr_significant'] = result['p_value'] <= threshold_p_value
            result['fdr_threshold'] = threshold_p_value
    else:
        # No pattern survives the correction
        for result in pattern_results:
            result['fdr_significant'] = False
            result['fdr_threshold'] = None

    return pattern_results
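
For a cross-check, the same Benjamini-Hochberg step-up procedure is available in statsmodels (assuming the package is installed); a minimal sketch:

from statsmodels.stats.multitest import multipletests

def apply_fdr_correction_statsmodels(pattern_results, alpha=0.05):
    """Benjamini-Hochberg via statsmodels, useful for validating the hand-rolled version."""
    p_values = [result['p_value'] for result in pattern_results]
    reject, p_adjusted, _, _ = multipletests(p_values, alpha=alpha, method='fdr_bh')
    for result, rej, p_adj in zip(pattern_results, reject, p_adjusted):
        result['fdr_significant'] = bool(rej)
        result['p_value_fdr_adjusted'] = float(p_adj)
    return pattern_results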

Temporal Stability Testing

def test_pattern_stability(pattern, data, window_size=252):
    """
    Test if pattern performance is stable across different time periods
    """
    stability_results = []
    
    # Rolling windows of window_size days, advanced in ~quarterly steps (63 trading days)
    for start in range(0, len(data) - window_size, 63):
        window_data = data[start:start + window_size]
        window_performance = calculate_performance(pattern, window_data)
        
        stability_results.append({
            'start_date': window_data.index[0],
            'end_date': window_data.index[-1],
            'performance': window_performance,
            'sample_size': len(window_data)
        })
    
    # Stability score: 1 minus the coefficient of variation (higher = more stable)
    performances = [r['performance'] for r in stability_results]
    mean_perf = np.mean(performances)
    stability_score = 1 - (np.std(performances) / abs(mean_perf)) if mean_perf != 0 else 0.0
    
    return {
        'temporal_results': stability_results,
        'stability_score': stability_score,
        'performance_variance': np.var(performances),
        'consistent_performance': np.mean(performances) > 0 and min(performances) > -0.05
    }
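
A hypothetical usage sketch, with a toy calculate_performance and synthetic daily data standing in for the real pattern and price history:

import numpy as np
import pandas as pd

def calculate_performance(pattern, window_data):
    # Toy metric: mean next-day return on rows where the pattern predicate fires
    signal = window_data.apply(pattern, axis=1)
    return window_data.loc[signal, 'next_day_return'].mean()

dates = pd.bdate_range('2020-01-02', periods=1000)
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'gap_pct': rng.normal(0, 1, len(dates)),
    'next_day_return': rng.normal(0, 0.01, len(dates)),
}, index=dates)

report = test_pattern_stability(lambda row: row['gap_pct'] > 1.0, data)
print(report['stability_score'], len(report['temporal_results']))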

Market Regime Analysis

MARKET_REGIMES = {
    'covid_crash': ('2020-02-01', '2020-05-01'),
    'recovery_bull': ('2020-06-01', '2021-12-31'),
    'rate_hike_bear': ('2022-01-01', '2022-12-31'), 
    'normalization': ('2023-01-01', '2024-12-31')
}

def test_regime_robustness(pattern, data):
    """
    Test pattern performance across different market regimes
    """
    regime_results = {}
    
    for regime_name, (start_date, end_date) in MARKET_REGIMES.items():
        regime_data = data[start_date:end_date]
        
        if len(regime_data) > 50:  # Minimum sample size
            performance = calculate_performance(pattern, regime_data)
            significance = permutation_test(pattern, regime_data)
            
            regime_results[regime_name] = {
                'performance': performance,
                'sample_size': len(regime_data),
                'p_value': significance['p_value'],
                'is_significant': significance['is_significant']
            }
    
    # Pattern is robust if significant in at least 3/4 regimes
    significant_regimes = sum(1 for r in regime_results.values() if r['is_significant'])
    is_robust = significant_regimes >= 3
    
    return {
        'regime_results': regime_results,
        'significant_regimes': significant_regimes,
        'is_regime_robust': is_robust,
        'worst_regime_performance': min(r['performance'] for r in regime_results.values())
    }

Data Mining Bias Detection

def detect_data_mining_bias(all_patterns, data):
    """
    Detect if results are due to excessive pattern searching
    """
    # Calculate the probability of finding N significant patterns by chance
    n_patterns_tested = len(all_patterns)
    n_significant = sum(1 for p in all_patterns if p['p_value'] < 0.05)
    expected_false_positives = n_patterns_tested * 0.05
    
    # Binomial test for excess significance
    from scipy.stats import binom
    p_value_bias = 1 - binom.cdf(n_significant - 1, n_patterns_tested, 0.05)
    
    return {
        'n_patterns_tested': n_patterns_tested,
        'n_significant': n_significant, 
        'expected_false_positives': expected_false_positives,
        'excess_significance_p': p_value_bias,
        'likely_data_mining_bias': p_value_bias < 0.05,
        'recommendation': 'Apply stricter significance threshold' if p_value_bias < 0.05 else 'Acceptable significance rate'
    }
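
For intuition, a quick hypothetical illustration: if 120 candidate patterns were tested and 14 come back with p < 0.05, only about 6 false positives are expected by chance, and the binomial tail probability of seeing 14 or more is small, which flags likely selection effects:

from scipy.stats import binom

n_tested, n_hits = 120, 14                    # hypothetical numbers, for illustration only
print(n_tested * 0.05)                        # expected false positives under the null (~6)
print(binom.sf(n_hits - 1, n_tested, 0.05))   # P(X >= 14); same tail as 1 - binom.cdf(...)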

Monte Carlo Confidence Intervals

def monte_carlo_confidence_intervals(pattern, data, n_simulations=1000):
    """
    Generate confidence intervals for pattern performance using bootstrap
    """
    bootstrap_performances = []
    
    for _ in range(n_simulations):
        # Bootstrap resample the data
        bootstrap_indices = np.random.choice(len(data), len(data), replace=True)
        bootstrap_data = data.iloc[bootstrap_indices]
        
        # Calculate performance on bootstrap sample
        performance = calculate_performance(pattern, bootstrap_data)
        bootstrap_performances.append(performance)
    
    # Calculate confidence intervals
    ci_lower = np.percentile(bootstrap_performances, 2.5)
    ci_upper = np.percentile(bootstrap_performances, 97.5)
    
    return {
        'mean_performance': np.mean(bootstrap_performances),
        'std_performance': np.std(bootstrap_performances),
        'ci_95_lower': ci_lower,
        'ci_95_upper': ci_upper,
        'ci_contains_zero': ci_lower <= 0 <= ci_upper
    }
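
Parameter Sensitivity Analysis

The tasks also call for testing sensitivity to parameter choices; a minimal sketch of one possible approach, assuming a pattern generator that accepts a threshold-style parameter and the project's calculate_performance (the helper names and grid are illustrative):

def test_parameter_sensitivity(make_pattern, data, param_grid):
    """
    Re-run the performance calculation across a grid of parameter values and
    report how much the result moves; large swings suggest an overfit setting.
    """
    results = {}
    for value in param_grid:
        pattern = make_pattern(value)          # build the pattern with this parameter value
        results[value] = calculate_performance(pattern, data)

    performances = list(results.values())
    return {
        'per_parameter_performance': results,
        'performance_range': max(performances) - min(performances),
        'sign_stable': all(p > 0 for p in performances) or all(p < 0 for p in performances)
    }

Cross-Validation Across Data Splits

Similarly, a sketch for cross-validating results across contiguous data splits (contiguous folds avoid leaking future information into the past, unlike shuffled K-fold):

import numpy as np

def cross_validate_pattern(pattern, data, n_splits=5):
    """
    Split the history into contiguous folds and check that performance holds
    up in each one rather than being driven by a single stretch of data.
    """
    fold_performances = []
    fold_edges = np.linspace(0, len(data), n_splits + 1, dtype=int)
    for start, end in zip(fold_edges[:-1], fold_edges[1:]):
        fold_performances.append(calculate_performance(pattern, data[int(start):int(end)]))

    return {
        'fold_performances': fold_performances,
        'mean_performance': float(np.mean(fold_performances)),
        'n_positive_folds': sum(1 for p in fold_performances if p > 0)
    }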

Acceptance Criteria

  • Permutation testing implemented with configurable iterations (10,000+)
  • FDR correction applied to control false discovery rate
  • Temporal stability testing across rolling windows
  • Market regime robustness analysis (4+ different regimes)
  • Data mining bias detection and warnings
  • Monte Carlo confidence intervals for all performance metrics
  • Parameter sensitivity analysis for key algorithm settings
  • Cross-validation results across different data splits
  • Statistical significance thresholds appropriate for financial data
  • Comprehensive reporting of all validation results

Output Format

{
    "pattern_id": "P001",
    "validation_summary": {
        "permutation_p_value": 0.003,
        "fdr_significant": true,
        "fdr_threshold": 0.0125,
        "temporal_stability_score": 0.82,
        "regime_robustness": "3/4 regimes significant",
        "data_mining_bias_risk": "low",
        "confidence_interval_95": [0.015, 0.089],
        "overall_validation_score": "PASS"
    },
    "detailed_results": {
        "permutation_test": {...},
        "stability_analysis": {...},
        "regime_analysis": {...},
        "monte_carlo_results": {...}
    },
    "recommendations": [
        "Pattern shows robust performance across market regimes",
        "Consider higher position sizing due to stable returns",
        "Monitor performance in new market conditions"
    ]
}
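
A minimal sketch of how the summary block might be assembled from the checks above (field names follow the format shown; the helper and the pass/fail rule are illustrative, not a fixed API):

def build_validation_summary(perm, fdr, stability, regimes, mc, bias):
    """Collapse the individual test outputs into the reporting format above."""
    passed = (fdr['fdr_significant']
              and regimes['is_regime_robust']
              and not bias['likely_data_mining_bias']
              and not mc['ci_contains_zero'])
    return {
        'permutation_p_value': perm['p_value'],
        'fdr_significant': fdr['fdr_significant'],
        'fdr_threshold': fdr['fdr_threshold'],
        'temporal_stability_score': stability['stability_score'],
        'regime_robustness': f"{regimes['significant_regimes']}/{len(regimes['regime_results'])} regimes significant",
        'data_mining_bias_risk': 'high' if bias['likely_data_mining_bias'] else 'low',
        'confidence_interval_95': [mc['ci_95_lower'], mc['ci_95_upper']],
        'overall_validation_score': 'PASS' if passed else 'FAIL'
    }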

Implementation Notes

Research Context

This is the critical final step: it ensures that discovered patterns represent genuine market inefficiencies rather than statistical artifacts, and it provides confidence in the research conclusions.
