ignitionstack.pro implements enterprise-grade resilience for AI operations. When a provider fails, the system automatically detects issues, prevents cascading failures, and routes requests to healthy alternatives.
The circuit breaker prevents repeated calls to failing services:
```
               ┌─────────────────────────────────────────┐
               │             Circuit Breaker             │
               │                                         │
Request ──────►│  ┌────────┐     ┌────────┐    ┌─────┐   │
               │  │ CLOSED │────►│  OPEN  │───►│HALF │   │
               │  │(normal)│     │(reject)│    │OPEN │   │
               │  └────────┘     └────────┘    └─────┘   │
               │      ▲                           │      │
               │      └───────────────────────────┘      │
               │               (success)                 │
               └─────────────────────────────────────────┘
```

| State | Behavior | Transition |
|---|---|---|
| CLOSED | Normal operation, requests pass through | Opens after N failures |
| OPEN | Requests immediately fail, no provider calls | Transitions to HALF_OPEN after timeout |
| HALF_OPEN | Limited requests allowed to test recovery | Closes on success, opens on failure |
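
The table above corresponds to a small state machine; the following simplified sketch illustrates the transitions (it uses the documented defaults, but the class and method names are hypothetical, not ignitionstack.pro's actual implementation):

```typescript
// Minimal single-circuit state machine, for illustration only.
type CircuitState = 'closed' | 'open' | 'half_open'

class SimpleCircuit {
  private state: CircuitState = 'closed'
  private failures = 0
  private successes = 0
  private openedAt = 0

  constructor(
    private failureThreshold = 5,        // consecutive failures before opening
    private successThreshold = 2,        // half-open successes before closing
    private halfOpenDuration = 300_000,  // ms to wait before probing again
  ) {}

  // Called before each request: decide whether to let it through.
  allowRequest(now = Date.now()): boolean {
    if (this.state === 'open' && now - this.openedAt >= this.halfOpenDuration) {
      this.state = 'half_open'   // cool-down elapsed, allow limited probes
      this.successes = 0
    }
    return this.state !== 'open'
  }

  recordSuccess(): void {
    if (this.state === 'half_open') {
      if (++this.successes >= this.successThreshold) {
        this.state = 'closed'    // provider recovered
        this.failures = 0
      }
    } else {
      this.failures = 0          // a success resets the consecutive-failure count
    }
  }

  recordFailure(now = Date.now()): void {
    this.failures++
    if (this.state === 'half_open' || this.failures >= this.failureThreshold) {
      this.state = 'open'        // stop sending traffic to this provider
      this.openedAt = now
    }
  }
}
```
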
```typescript
// src/app/lib/ai/circuit-breaker/breaker.ts
import { CircuitBreaker } from '@/lib/ai/circuit-breaker/breaker'
const breaker = new CircuitBreaker({
failureThreshold: 5, // Failures before opening
successThreshold: 2, // Successes to close from half-open
timeout: 30000, // Request timeout (ms)
halfOpenDuration: 300000, // Time before testing (5 min)
errorRateThreshold: 0.5, // 50% error rate triggers open
})
// Execute with circuit breaker protection
const result = await breaker.execute(
'openai',
async () => provider.chat(messages, options)
)
```

```typescript
interface CircuitBreakerConfig {
// Failure detection
failureThreshold: number // Consecutive failures to trip (default: 5)
errorRateThreshold: number // Error rate to trip (default: 0.5)
windowSize: number // Sliding window for rate calculation (default: 10)
// Recovery
successThreshold: number // Successes to reset (default: 2)
halfOpenDuration: number // Time before retry (default: 5 min)
halfOpenMaxRequests: number // Requests allowed in half-open (default: 3)
// Timeouts
timeout: number // Per-request timeout (default: 30s)
slowCallThreshold: number // Slow call detection (default: 10s)
slowCallRateThreshold: number // Slow call rate to trip (default: 0.8)
}
```
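
One plausible reading of `windowSize` and `errorRateThreshold` is a sliding window over the most recent outcomes. The sketch below is an assumption about those semantics, shown only to make the two options concrete:

```typescript
// Sliding-window error rate: trip once the recent error rate crosses the threshold.
class ErrorRateWindow {
  private outcomes: boolean[] = []   // true = success, false = failure

  constructor(
    private windowSize = 10,
    private errorRateThreshold = 0.5,
  ) {}

  record(success: boolean): void {
    this.outcomes.push(success)
    if (this.outcomes.length > this.windowSize) this.outcomes.shift()
  }

  errorRate(): number {
    if (this.outcomes.length === 0) return 0
    const failures = this.outcomes.filter(ok => !ok).length
    return failures / this.outcomes.length
  }

  // Only meaningful once the window is full, so a single early failure cannot trip it.
  shouldTrip(): boolean {
    return this.outcomes.length >= this.windowSize &&
      this.errorRate() >= this.errorRateThreshold
  }
}
```
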
```typescript
// Database-backed health tracking
interface ProviderHealthStats {
provider: string
model: string
state: 'closed' | 'open' | 'half_open'
failureCount: number
successCount: number
lastFailure: Date | null
lastSuccess: Date | null
errorRate: number
latencyP50: number
latencyP95: number
latencyP99: number
}
```

```sql
-- Provider status tracking
CREATE TABLE ai_provider_status (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
provider TEXT NOT NULL,
model TEXT,
state TEXT DEFAULT 'closed',
failure_count INTEGER DEFAULT 0,
success_count INTEGER DEFAULT 0,
last_failure TIMESTAMPTZ,
last_success TIMESTAMPTZ,
error_rate FLOAT DEFAULT 0,
latency_p50 INTEGER,
latency_p95 INTEGER,
latency_p99 INTEGER,
manually_disabled BOOLEAN DEFAULT false,
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(provider, model)
);
-- Health check history
CREATE TABLE ai_health_checks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
provider TEXT NOT NULL,
model TEXT,
success BOOLEAN NOT NULL,
latency_ms INTEGER,
error_message TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
```

```typescript
// Record health metrics after each request
await breaker.recordMetrics({
provider: 'openai',
model: 'gpt-4o',
success: true,
latencyMs: 1250,
tokenCount: 500,
})
// Get current health status
const health = await breaker.getHealth('openai', 'gpt-4o')
// Returns: { state: 'closed', errorRate: 0.02, latencyP95: 2100 }
```

```typescript
// src/app/lib/ai/router/strategy-router.ts
const router = new StrategyRouter({
circuitBreaker: breaker,
fallbackOrder: ['openai', 'gemini', 'ollama'],
})
// Router checks circuit state before routing
const decision = await router.route({
preferredProvider: 'openai',
task: 'chat',
})
// If OpenAI is OPEN, returns next healthy provider
// decision: { provider: 'gemini', model: 'gemini-1.5-pro', reason: 'failover' }
```

```
Request for OpenAI
          │
          ▼
┌───────────────────┐
│  Check Circuit    │
│ State for OpenAI  │
└─────────┬─────────┘
          │
     ┌────┴────┐
     │ State?  │
     └────┬────┘
          │
     ┌────┴─────────┐
  CLOSED    OPEN / HALF_OPEN
     │              │
     ▼              ▼
┌─────────┐  ┌──────────────┐
│ Execute │  │ Try Fallback │
│ Request │  │   Provider   │
└────┬────┘  └──────┬───────┘
     │              │
     ▼              ▼
  Success?  Gemini Available?
  │      │        │        │
 Yes     No      Yes       No
  │      │        │        │
  ▼      ▼        ▼        ▼
Return Record  Execute Try Ollama
Result Failure Request  (local)
```

```typescript
const router = new StrategyRouter({
fallbackRules: {
// By task type
code: ['openai', 'anthropic', 'ollama'],
creative: ['openai', 'gemini', 'ollama'],
analysis: ['gemini', 'openai', 'anthropic'],
// By plan tier
enterprise: ['anthropic', 'openai', 'gemini'],
pro: ['openai', 'gemini', 'ollama'],
free: ['ollama', 'gemini'],
},
})
```

```typescript
// Manually disable a provider
await breaker.disable('openai', {
reason: 'Scheduled maintenance',
duration: 3600000, // 1 hour
notifyUsers: true,
})
// Re-enable provider
await breaker.enable('openai')
// Force circuit state
await breaker.forceState('openai', 'closed')
```

```sql
-- View all provider states
SELECT
provider,
model,
state,
error_rate,
latency_p95,
manually_disabled,
last_failure
FROM ai_provider_status
ORDER BY error_rate DESC;
-- Recent failures
SELECT
provider,
model,
error_message,
created_at
FROM ai_health_checks
WHERE success = false
ORDER BY created_at DESC
LIMIT 50;
```

Not all errors should trip the circuit:

```typescript
// Transient errors - count towards failure
const transientErrors = [
'rate_limit_exceeded',
'server_error',
'timeout',
'connection_refused',
]
// Permanent errors - don't count, fail immediately
const permanentErrors = [
'invalid_api_key',
'insufficient_quota',
'model_not_found',
]
// Content errors - don't count, return gracefully
const contentErrors = [
'content_filter',
'context_length_exceeded',
]
```

```typescript
async chat(messages, options) {
try {
return await this.client.chat(messages, options)
} catch (error) {
// Classify error
if (this.isPermanentError(error)) {
throw new PermanentError(error) // Skip circuit breaker
}
if (this.isContentError(error)) {
throw new ContentError(error) // User-actionable
}
// Transient error - let circuit breaker handle
throw new TransientError(error)
}
}
```
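
The adapter above only classifies errors; how the breaker consumes those classes is not shown here. A plausible sketch, assuming hypothetical `recordSuccess`/`recordFailure` hooks, is to count only transient errors toward the trip conditions:

```typescript
// Illustrative only: the counting behavior is an assumption, not documented API.
// TransientError mirrors the class thrown by the provider adapter above.
class TransientError extends Error {}

interface FailureCounter {
  recordSuccess(key: string): void
  recordFailure(key: string): void
}

async function executeClassified<T>(
  counter: FailureCounter,
  key: string,
  fn: () => Promise<T>,
): Promise<T> {
  try {
    const result = await fn()
    counter.recordSuccess(key)
    return result
  } catch (error) {
    // Only transient errors move the circuit toward OPEN; permanent and content
    // errors are rethrown without touching the failure counters.
    if (error instanceof TransientError) {
      counter.recordFailure(key)
    }
    throw error
  }
}
```
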
```typescript
const breaker = new CircuitBreaker({
slowCallThreshold: 10000, // 10 seconds
slowCallRateThreshold: 0.8, // 80% slow calls
})
// If 80% of requests exceed 10s, circuit opens
```

```typescript
// Latency percentiles are tracked automatically
const stats = await breaker.getLatencyStats('openai', 'gpt-4o')
console.log({
p50: stats.latencyP50, // 1200ms - median
p95: stats.latencyP95, // 3500ms - 95th percentile
p99: stats.latencyP99, // 8000ms - 99th percentile
})
```
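
How those percentiles are computed is not shown; one simple approach (an illustrative sketch, not necessarily what the breaker does internally) is nearest-rank selection over a window of recorded latency samples:

```typescript
// Nearest-rank percentile over recorded latency samples (illustrative).
function percentile(samples: number[], p: number): number {
  if (samples.length === 0) return 0
  const sorted = [...samples].sort((a, b) => a - b)
  const rank = Math.ceil((p / 100) * sorted.length) - 1
  return sorted[Math.min(sorted.length - 1, Math.max(0, rank))]
}

const latencies = [900, 1100, 1200, 1250, 1300, 1400, 1600, 2100, 3500, 8000]
console.log({
  p50: percentile(latencies, 50),   // 1300
  p95: percentile(latencies, 95),   // 8000
  p99: percentile(latencies, 99),   // 8000
})
```
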
```typescript
describe('CircuitBreaker', () => {
it('should open after failure threshold', async () => {
const breaker = new CircuitBreaker({ failureThreshold: 3 })
// Simulate failures
for (let i = 0; i < 3; i++) {
await breaker.execute('test', async () => {
throw new Error('fail')
}).catch(() => {})
}
expect(breaker.getState('test')).toBe('open')
})
it('should transition to half-open after timeout', async () => {
const breaker = new CircuitBreaker({
failureThreshold: 1,
halfOpenDuration: 100, // Fast for testing
})
await breaker.execute('test', async () => {
throw new Error('fail')
}).catch(() => {})
await new Promise((resolve) => setTimeout(resolve, 150)) // wait past halfOpenDuration
expect(breaker.getState('test')).toBe('half_open')
})
})
```

```typescript
// Inject random failures for testing
const chaosBreaker = new CircuitBreaker({
chaos: {
enabled: process.env.CHAOS_TESTING === 'true',
failureRate: 0.1, // 10% random failures
latencyMs: 5000, // Add 5s latency
},
})
```

```typescript
// GET /api/health/ai
export async function GET() {
const providers = ['openai', 'gemini', 'ollama']
const health = await Promise.all(
providers.map(async (p) => ({
provider: p,
status: await breaker.getHealth(p),
}))
)
const allHealthy = health.every(h => h.status.state === 'closed')
return Response.json({
status: allHealthy ? 'healthy' : 'degraded',
providers: health,
}, {
status: allHealthy ? 200 : 503,
})
}
```

```typescript
// Set up alerts for circuit state changes
breaker.on('stateChange', async (event) => {
if (event.newState === 'open') {
await sendAlert({
severity: 'high',
message: `Circuit opened for ${event.provider}`,
errorRate: event.errorRate,
lastError: event.lastError,
})
}
})
```

- **Tune Thresholds:** Start conservative and adjust based on observed traffic patterns.
- **Monitor Latency:** Slow calls often precede outright failures.
- **Test Failover:** Regularly verify that fallback providers work.
- **Document Incidents:** Log every circuit open for post-mortems.
- **Gradual Recovery:** Use the half-open state to prevent a thundering herd.
- **Per-Model Circuits:** Track each model separately for granular control (see the sketch below).
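
To illustrate the last point, a hypothetical keying scheme (reusing the `SimpleCircuit` sketch from earlier; this is not the documented API) tracks one circuit per provider:model pair so that a single misbehaving model cannot take the whole provider offline:

```typescript
// Hypothetical sketch: one circuit per provider:model key, so a gpt-4o incident
// does not open the circuit for other OpenAI models.
const circuits = new Map<string, SimpleCircuit>()

function circuitFor(provider: string, model: string): SimpleCircuit {
  const key = `${provider}:${model}`
  let circuit = circuits.get(key)
  if (!circuit) {
    circuit = new SimpleCircuit()
    circuits.set(key, circuit)
  }
  return circuit
}

// Each request consults only the circuit for its own provider/model pair.
const circuit = circuitFor('openai', 'gpt-4o')
if (circuit.allowRequest()) {
  // ...call the provider, then circuit.recordSuccess() or circuit.recordFailure()
}
```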