Skip to content

Drift Detection

Automatically detect when model quality degrades and take action before users notice.

Types of Drift

Score Drift

Eval scores trending downward over time.

Week 1: 94% → Week 2: 92% → Week 3: 88% → Week 4: 85%
                            ↑ Alert triggered at 90% threshold

Distribution Drift

Response characteristics changing (length, format, tone).

Sudden Degradation

Abrupt quality drop (often from provider changes).

Hour 1: 94% → Hour 2: 94% → Hour 3: 72%
                            ↑ Immediate alert

Configuration

Basic Setup

python
from gateflow import EvalClient

client = EvalClient(api_key="gf-...")

client.configure_drift_detection(
    enabled=True,
    models=["gpt-4o", "claude-opus-4-5"],
    suites=["quality-general", "safety-core"],
    alert_threshold=90,  # Alert if score drops below 90%
    alert_channels=["slack", "email"]
)

Advanced Configuration

python
client.configure_drift_detection(
    enabled=True,

    # Models to monitor
    models=["gpt-4o", "claude-opus-4-5", "gemini-2.5-pro"],

    # Eval suites for drift calculation
    suites=["quality-general", "safety-core"],
    suite_weights={"quality-general": 0.6, "safety-core": 0.4},

    # Detection parameters
    detection={
        "window": "7d",              # Baseline window
        "min_samples": 500,          # Minimum samples for detection
        "sensitivity": "medium",     # low, medium, high
        "methods": ["trend", "sudden", "distribution"]
    },

    # Alert thresholds
    alerts={
        "score_threshold": 90,       # Absolute threshold
        "trend_threshold": -5,       # Alert if 5% decline
        "sudden_threshold": -10,     # Alert if 10% drop in 1h
        "channels": ["slack", "pagerduty", "email"]
    },

    # Automatic actions
    actions={
        "on_drift": "reduce_traffic",  # or "alert_only", "block"
        "traffic_reduction": 0.5,       # Cut traffic by 50%
        "recovery_threshold": 92        # Restore when score > 92%
    }
)

Detection Methods

Trend Detection

Identifies gradual degradation using regression:

python
detection = {
    "methods": ["trend"],
    "trend_config": {
        "window": "14d",           # Look back 14 days
        "min_slope": -0.5,         # Alert if losing 0.5%/day
        "confidence": 0.95         # Statistical confidence
    }
}

Sudden Change Detection

Catches abrupt drops using statistical tests:

python
detection = {
    "methods": ["sudden"],
    "sudden_config": {
        "baseline_window": "24h",
        "comparison_window": "1h",
        "threshold_stddev": 3,     # Alert if 3σ deviation
        "min_samples": 50
    }
}

Distribution Drift

Monitors response characteristics:

python
detection = {
    "methods": ["distribution"],
    "distribution_config": {
        "metrics": [
            "response_length",
            "token_count",
            "sentiment_score",
            "readability_score"
        ],
        "method": "ks_test",       # Kolmogorov-Smirnov test
        "p_value_threshold": 0.01
    }
}

Alert Configuration

Slack Integration

python
client.configure_alerts(
    channel="slack",
    config={
        "webhook_url": "https://hooks.slack.com/...",
        "channel": "#ai-ops",
        "mention": ["@oncall"],
        "severity_emoji": {
            "critical": "🚨",
            "warning": "⚠️",
            "info": "ℹ️"
        }
    }
)

PagerDuty Integration

python
client.configure_alerts(
    channel="pagerduty",
    config={
        "routing_key": "your-routing-key",
        "severity_mapping": {
            "critical": "critical",
            "warning": "warning"
        }
    }
)

Email Alerts

python
client.configure_alerts(
    channel="email",
    config={
        "recipients": ["ai-team@company.com"],
        "severity_filter": ["critical", "warning"]
    }
)

Webhook (Custom)

python
client.configure_alerts(
    channel="webhook",
    config={
        "url": "https://your-app.com/webhooks/drift",
        "headers": {"Authorization": "Bearer ..."},
        "payload_template": {
            "model": "{{model}}",
            "score": "{{current_score}}",
            "baseline": "{{baseline_score}}",
            "drift_percent": "{{drift_percent}}"
        }
    }
)

Viewing Drift Status

Dashboard

Navigate to Eval → Drift to see:

  • Current drift status per model
  • Score trends over time
  • Alert history
  • Automatic actions taken

API

python
# Get current drift status
status = client.get_drift_status()

for model in status.models:
    print(f"{model.name}:")
    print(f"  Current score: {model.current_score}")
    print(f"  Baseline score: {model.baseline_score}")
    print(f"  Drift: {model.drift_percent:+.1f}%")
    print(f"  Status: {model.status}")  # stable, drifting, critical
    print(f"  Trend: {model.trend}")    # improving, declining, stable

Historical Analysis

python
# Get drift history
history = client.get_drift_history(
    model="gpt-4o",
    time_range="30d"
)

for event in history.events:
    print(f"{event.timestamp}: {event.type}")
    print(f"  Score: {event.score}")
    print(f"  Action: {event.action_taken}")

Automatic Recovery

Recovery Detection

python
actions = {
    "on_drift": "reduce_traffic",
    "traffic_reduction": 0.5,

    # Recovery settings
    "recovery": {
        "threshold": 92,           # Recover when score > 92%
        "sustained_window": "4h",  # Must sustain for 4 hours
        "gradual_restore": True,   # Gradually increase traffic
        "restore_steps": 4         # In 4 steps over 4 hours
    }
}

Manual Recovery

python
# Acknowledge drift and suppress alerts
client.acknowledge_drift(
    model="gpt-4o",
    reason="Known issue, fix in progress",
    suppress_alerts="2h"
)

# Force recovery (override automatic detection)
client.force_recovery(
    model="gpt-4o",
    reason="Fix deployed, verified manually"
)

Drift Analysis

Root Cause Analysis

python
# Analyze what's causing drift
analysis = client.analyze_drift(model="gpt-4o")

print(analysis.summary)
# Drift detected: -5.2% over 7 days
#
# Contributing factors:
# - Reasoning tasks: -8.3% (high impact)
# - Instruction following: -4.1% (medium impact)
# - General quality: -2.0% (low impact)
#
# Failure patterns:
# - 67% of failures involve multi-step reasoning
# - Increased refusals on edge cases
# - Response length decreased 15%

print(analysis.recommendations)
# 1. Review recent prompt changes
# 2. Check for provider model updates
# 3. Consider adding reasoning-specific evals

Comparison with Baseline

python
# Compare current behavior with baseline period
comparison = client.compare_periods(
    model="gpt-4o",
    baseline="2024-01-01:2024-01-07",
    current="2024-01-08:2024-01-14"
)

print(comparison.dimension_changes)
# {
#   "accuracy": -3.2%,
#   "helpfulness": -1.8%,
#   "safety": +0.5%,  # Actually improved
#   "coherence": -4.1%
# }

Best Practices

  1. Set realistic baselines - Use stable periods for baseline calculation
  2. Tune sensitivity - Start with medium, adjust based on alert volume
  3. Layer detection methods - Use both trend and sudden detection
  4. Include distribution monitoring - Catches issues eval scores miss
  5. Configure recovery carefully - Avoid oscillation with sustained windows

Next Steps

Built with reliability in mind.