"""Generate synthetic pattern data for overfitting tests."""
from typing import Dict, List, Optional, Tuple

import numpy as np

from mercurial.core.patterns import Constraints, Pattern, StabilityRegime
from mercurial.simulation.engine import SimulationEngine
class SyntheticDataGenerator:
    """Generate synthetic patterns with known ground truth parameters.

    Attributes:
        dim: Number of columns in every generated sample matrix.
        rng: NumPy ``Generator`` created from ``seed``. All random draws made
            by this class go through it, so a fixed seed and a fixed call
            sequence reproduce the same data.
    """

    def __init__(self, dim: int = 10, seed: int = 42):
        """Store the column count and create the seeded random generator."""
        self.dim = dim
        self.rng = np.random.default_rng(seed)

    @staticmethod
    def _base_signal(complexity: float, n_samples: int) -> np.ndarray:
        """Return the noise-free 1-D waveform sampled on ``[0, 4*pi]``.

        For ``complexity > 0.5`` the waveform is a fundamental at
        ``freq = 1 + int(complexity * 5)`` plus a half-amplitude component at
        ``2 * freq``; otherwise it is a plain ``sin(t)``.
        """
        t = np.linspace(0, 4 * np.pi, n_samples)
        if complexity > 0.5:
            freq = 1 + int(complexity * 5)
            return np.sin(freq * t) + 0.5 * np.sin(2 * freq * t)
        return np.sin(t)

    def _sample_matrix(
        self, complexity: float, noise_level: float, n_samples: int
    ) -> np.ndarray:
        """Return an ``(n_samples, dim)`` matrix: shared waveform plus noise."""
        # Every column carries the same waveform, so compute it once and
        # broadcast it instead of re-evaluating the sines per column.
        base = self._base_signal(complexity, n_samples)
        V = np.repeat(base[:, np.newaxis], self.dim, axis=1)
        # Independent Gaussian noise per entry, drawn from the instance RNG.
        V += self.rng.normal(0, noise_level, V.shape)
        return V

    def generate_pattern(
        self,
        complexity: float = 0.5,
        noise_level: float = 0.1,
        n_samples: int = 100,
    ) -> Pattern:
        """Generate a pattern with specified complexity and noise.

        Args:
            complexity: 0 = low, 1 = high. Values above 0.5 select the
                two-frequency waveform, anything else a single sine. It also
                sets the stability decay rate to ``0.01 * (1 - complexity)``.
            noise_level: Standard deviation of the additive Gaussian noise.
            n_samples: Number of rows (time samples) in the matrix.

        Returns:
            A ``Pattern`` built from the sample matrix, a default
            ``Constraints()``, the ``StabilityRegime`` described above, and
            the name ``f"synthetic_{complexity}"``.
        """
        V = self._sample_matrix(complexity, noise_level, n_samples)
        constraints = Constraints()
        stability = StabilityRegime(decay_rate=0.01 * (1 - complexity))
        return Pattern(V, constraints, stability, f"synthetic_{complexity}")

    def generate_dataset(
        self,
        n_patterns: int = 100,
        complexity_range: Tuple[float, float] = (0.1, 0.9),
        noise_level: float = 0.1,
        n_samples: int = 100,
    ) -> List[Pattern]:
        """Generate a dataset of patterns with varying complexity.

        Args:
            n_patterns: Number of patterns to generate.
            complexity_range: ``(low, high)`` bounds passed to the uniform
                draw of each pattern's complexity.
            noise_level: Noise standard deviation forwarded to
                ``generate_pattern``.
            n_samples: Rows per pattern, forwarded to ``generate_pattern``.

        Returns:
            A list of ``n_patterns`` patterns.
        """
        patterns = []
        for _ in range(n_patterns):
            # Complexity and noise draws are interleaved per pattern on
            # purpose: batching the uniform draws would change the RNG stream
            # and therefore the data produced for a given seed.
            complexity = self.rng.uniform(complexity_range[0], complexity_range[1])
            patterns.append(
                self.generate_pattern(complexity, noise_level, n_samples=n_samples)
            )
        return patterns
class OverfittingTest:
    """Test for overfitting using synthetic data.

    Attributes:
        dim: Dimensionality handed to the simulation engine.
        engine: ``SimulationEngine`` constructed with ``dim``.
    """

    def __init__(self, dim: int = 10):
        """Store ``dim`` and build the simulation engine for it."""
        self.dim = dim
        self.engine = SimulationEngine(dim=dim)

    def compute_pattern_complexity(self, pattern: Pattern) -> float:
        """Compute complexity using our measure (simplified).

        Returns whatever ``ComplexityMeasure().compute`` yields for
        ``pattern.V``.
        """
        # Function-scope import, kept exactly as the original code had it.
        from mercurial.hierarchy.complexity import ComplexityMeasure

        measure = ComplexityMeasure()
        return measure.compute(pattern.V)

    def train_test_split(
        self,
        patterns: List[Pattern],
        train_ratio: float = 0.7,
        rng: Optional[np.random.Generator] = None,
    ) -> Tuple[List[Pattern], List[Pattern]]:
        """Split patterns into training and test sets.

        Args:
            patterns: Items to split; each item lands in exactly one subset.
            train_ratio: Fraction of items for the training set. The training
                size is ``int(len(patterns) * train_ratio)``.
            rng: Optional NumPy ``Generator`` for a reproducible shuffle.
                When omitted, the global ``np.random`` state is used, which
                is the original behaviour.

        Returns:
            ``(train, test)`` lists in shuffled order.

        Raises:
            ValueError: If ``train_ratio`` is outside ``[0, 1]``. A negative
                ratio would otherwise silently slice from the end.
        """
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio!r}")
        n = len(patterns)
        n_train = int(n * train_ratio)
        # The np.random module and Generator both expose ``permutation``.
        source = np.random if rng is None else rng
        indices = source.permutation(n)
        train = [patterns[i] for i in indices[:n_train]]
        test = [patterns[i] for i in indices[n_train:]]
        return train, test

    def evaluate_prediction(
        self, pattern: Pattern, predicted_complexity: float, tolerance: float = 0.1
    ) -> float:
        """Accuracy: 1 if predicted complexity within ``tolerance`` of true, else 0.

        Args:
            pattern: Pattern whose true complexity is computed via
                ``compute_pattern_complexity``.
            predicted_complexity: Value to compare against the true one.
            tolerance: Strict upper bound on the absolute error for a hit.

        Returns:
            ``1.0`` on a hit, ``0.0`` otherwise.
        """
        true_complexity = self.compute_pattern_complexity(pattern)
        hit = abs(predicted_complexity - true_complexity) < tolerance
        return 1.0 if hit else 0.0

    def run_cross_validation(
        self,
        patterns: List[Pattern],
        n_folds: int = 5,
        rng: Optional[np.random.Generator] = None,
    ) -> Dict:
        """Perform k-fold cross-validation on synthetic data.

        No model is trained. Each test pattern's "prediction" is its true
        complexity plus Gaussian noise (std 0.05), which mimics model error.
        The training indices of each fold are therefore unused.

        Args:
            patterns: Patterns to split into folds.
            n_folds: Number of folds for ``KFold`` (shuffled, fixed
                ``random_state=42``).
            rng: Optional NumPy ``Generator`` for the simulated prediction
                noise. When omitted, the global ``np.random`` state is used,
                which is the original behaviour.

        Returns:
            Dict with ``"fold_scores"`` (mean accuracy per fold),
            ``"mean_accuracy"`` and ``"std_accuracy"`` across folds.
        """
        from sklearn.model_selection import KFold

        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        noise_source = np.random if rng is None else rng
        fold_scores = []
        for _, test_idx in kf.split(patterns):
            test_patterns = [patterns[i] for i in test_idx]
            scores = []
            for p in test_patterns:
                true_comp = self.compute_pattern_complexity(p)
                # Simulate prediction: add small noise to true value to mimic model error
                pred_comp = true_comp + noise_source.normal(0, 0.05)
                # NOTE(review): evaluate_prediction recomputes the true
                # complexity; kept because scoring goes through that method.
                scores.append(self.evaluate_prediction(p, pred_comp))
            fold_scores.append(np.mean(scores))
        return {
            "fold_scores": fold_scores,
            "mean_accuracy": np.mean(fold_scores),
            "std_accuracy": np.std(fold_scores),
        }