# Source code for mercurial.utils.synthetic_data

"""Generate synthetic pattern data for overfitting tests."""

from typing import Dict, List, Optional, Tuple

import numpy as np

from mercurial.core.patterns import Constraints, Pattern, StabilityRegime
from mercurial.simulation.engine import SimulationEngine


class SyntheticDataGenerator:
    """Generate synthetic patterns with known ground truth parameters.

    Args:
        dim: Number of columns in every generated signal matrix.
        seed: Seed for the instance's own ``numpy`` random generator, so two
            generators built with the same seed produce the same draws.
    """

    def __init__(self, dim: int = 10, seed: int = 42):
        self.dim = dim
        self.rng = np.random.default_rng(seed)

    def generate_pattern(
        self,
        complexity: float = 0.5,
        noise_level: float = 0.1,
        n_samples: int = 100,
    ) -> Pattern:
        """Generate a pattern with specified complexity and noise.

        Every column shares the same base waveform sampled on
        ``n_samples`` evenly spaced points of ``[0, 4*pi]``; columns differ
        only by the independent Gaussian noise added on top.

        Args:
            complexity: 0 = low, 1 = high. Values above 0.5 switch the base
                waveform from a single sine to a two-harmonic mixture whose
                base frequency is ``1 + int(complexity * 5)``.
            noise_level: Standard deviation of the additive noise.
            n_samples: Number of rows (time samples) in the signal matrix.
                Defaults to 100, the value that used to be hard-coded.

        Returns:
            A ``Pattern`` built from the ``(n_samples, dim)`` signal matrix,
            a default ``Constraints()``, a ``StabilityRegime`` whose
            ``decay_rate`` is ``0.01 * (1 - complexity)``, and the label
            ``f"synthetic_{complexity}"``.
        """
        t = np.linspace(0, 4 * np.pi, n_samples)

        # The waveform does not depend on the column index, so compute it
        # once instead of once per column.
        if complexity > 0.5:
            # High complexity: multiple frequencies
            freq = 1 + int(complexity * 5)
            wave = np.sin(freq * t) + 0.5 * np.sin(2 * freq * t)
        else:
            # Low complexity: single frequency
            wave = np.sin(t)

        # One draw of shape (n_samples, dim); broadcasting the column vector
        # against it copies the waveform into every column.
        noise = self.rng.normal(0, noise_level, (n_samples, self.dim))
        V = wave[:, np.newaxis] + noise

        constraints = Constraints()
        stability = StabilityRegime(decay_rate=0.01 * (1 - complexity))
        return Pattern(V, constraints, stability, f"synthetic_{complexity}")

    def generate_dataset(
        self,
        n_patterns: int = 100,
        complexity_range: Tuple[float, float] = (0.1, 0.9),
        noise_level: float = 0.1,
        n_samples: int = 100,
    ) -> List[Pattern]:
        """Generate a dataset of patterns with varying complexity.

        Args:
            n_patterns: Number of patterns to generate.
            complexity_range: ``(low, high)`` bounds; each pattern's
                complexity is drawn uniformly from this interval.
            noise_level: Noise standard deviation forwarded to
                ``generate_pattern``.
            n_samples: Row count forwarded to ``generate_pattern``.

        Returns:
            A list of ``n_patterns`` patterns, in generation order.
        """
        low, high = complexity_range[0], complexity_range[1]
        patterns = []
        for _ in range(n_patterns):
            # Draw the complexity first, then the noise, so the sequence of
            # random draws per pattern stays the same as before.
            complexity = self.rng.uniform(low, high)
            patterns.append(
                self.generate_pattern(complexity, noise_level, n_samples)
            )
        return patterns
class OverfittingTest:
    """Test for overfitting using synthetic data.

    Args:
        dim: Dimensionality passed to ``SimulationEngine``; the engine is
            stored on ``self.engine``.
        seed: Optional seed for shuffling and for the simulated prediction
            noise. With ``None`` (the default) the global ``np.random``
            state is used, as before. With an integer, a dedicated
            ``np.random.default_rng(seed)`` generator is used so that
            ``train_test_split`` and ``run_cross_validation`` are
            reproducible.
    """

    def __init__(self, dim: int = 10, seed: Optional[int] = None):
        self.dim = dim
        self.engine = SimulationEngine(dim=dim)
        # Both the ``np.random`` module and a ``Generator`` provide
        # ``permutation`` and ``normal``, so either can serve as the source.
        self.rng = np.random if seed is None else np.random.default_rng(seed)

    def compute_pattern_complexity(self, pattern: Pattern) -> float:
        """Compute complexity using our measure (simplified).

        Returns whatever ``ComplexityMeasure().compute`` yields for
        ``pattern.V``.
        """
        # Imported at call time, as in the original code.
        # NOTE(review): presumably to avoid an import cycle -- confirm.
        from mercurial.hierarchy.complexity import ComplexityMeasure

        return ComplexityMeasure().compute(pattern.V)

    def train_test_split(
        self, patterns: List[Pattern], train_ratio: float = 0.7
    ) -> Tuple[List[Pattern], List[Pattern]]:
        """Split patterns into training and test sets.

        Args:
            patterns: Items to split; only ``len`` and integer indexing are
                used.
            train_ratio: Fraction of items placed in the training set; the
                training size is ``int(len(patterns) * train_ratio)``.

        Returns:
            ``(train, test)`` lists that together contain every input item
            exactly once, in shuffled order.

        Raises:
            ValueError: If ``train_ratio`` is outside ``[0, 1]``. A negative
                ratio would otherwise become a negative slice index and
                silently produce a wrong split.
        """
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError(
                f"train_ratio must be between 0 and 1, got {train_ratio!r}"
            )
        n_total = len(patterns)
        n_train = int(n_total * train_ratio)
        order = self.rng.permutation(n_total)
        train = [patterns[i] for i in order[:n_train]]
        test = [patterns[i] for i in order[n_train:]]
        return train, test

    def evaluate_prediction(
        self,
        pattern: Pattern,
        predicted_complexity: float,
        *,
        true_complexity: Optional[float] = None,
    ) -> float:
        """Accuracy: 1 if predicted complexity within 0.1 of true, else 0.

        Args:
            pattern: Pattern whose complexity is being predicted.
            predicted_complexity: The prediction to score.
            true_complexity: Already-computed true complexity. When omitted
                it is computed from ``pattern``; pass it to avoid computing
                the measure a second time.

        Returns:
            ``1.0`` when the absolute error is strictly below 0.1, otherwise
            ``0.0``.
        """
        if true_complexity is None:
            true_complexity = self.compute_pattern_complexity(pattern)
        error = abs(predicted_complexity - true_complexity)
        return 1.0 if error < 0.1 else 0.0

    def run_cross_validation(
        self, patterns: List[Pattern], n_folds: int = 5
    ) -> Dict:
        """Perform k-fold cross-validation on synthetic data.

        No model is trained. Each "prediction" is the true complexity plus
        Gaussian noise (standard deviation 0.05), mimicking a near-perfect
        model, so only the test indices of each fold are used.

        Args:
            patterns: Patterns to evaluate.
            n_folds: Number of folds given to ``KFold``.

        Returns:
            A dict with ``"fold_scores"`` (mean accuracy per fold),
            ``"mean_accuracy"`` and ``"std_accuracy"`` across folds.
        """
        from sklearn.model_selection import KFold

        splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        fold_scores = []
        for _train_idx, test_idx in splitter.split(patterns):
            # In reality, you would train a model on the training indices
            # and predict on the test ones; here the training indices are
            # deliberately unused.
            scores = []
            for i in test_idx:
                pattern = patterns[i]
                true_comp = self.compute_pattern_complexity(pattern)
                # Simulate prediction: add small noise to the true value to
                # mimic model error.
                pred_comp = true_comp + self.rng.normal(0, 0.05)
                scores.append(
                    self.evaluate_prediction(
                        pattern, pred_comp, true_complexity=true_comp
                    )
                )
            fold_scores.append(np.mean(scores))

        return {
            "fold_scores": fold_scores,
            "mean_accuracy": np.mean(fold_scores),
            "std_accuracy": np.std(fold_scores),
        }