slo-tracker.js
/**
 * SLO (Service-Level Objective) Tracker
 *
 * Monitors pipeline stage performance against defined SLOs.
 * Reports a violation when the measured duration percentile of a stage
 * transition exceeds that SLO's target duration.
 */

import { getAll, getOne } from '../../utils/db.js';
import Logger from '../../utils/logger.js';

const logger = new Logger('SLOTracker');

/**
 * Service-Level Objectives for pipeline stages
 *
 * Each SLO defines:
 * - stage_name: Name of the pipeline stage transition (key into the stage mapping below)
 * - description: Human-readable label used in violation messages
 * - target_percentile: What % of sites should meet the target (e.g., 95%)
 * - target_duration_minutes: Maximum duration in minutes
 * - lookback_hours: How far back to analyze
 */
export const SLO_DEFINITIONS = [
  {
    stage_name: 'serps_to_assets',
    description: 'SERP scraping to asset capture',
    target_percentile: 95, // 95% of sites
    target_duration_minutes: 60, // within 1 hour
    lookback_hours: 24,
  },
  {
    stage_name: 'assets_to_scored',
    description: 'Asset capture to initial scoring',
    target_percentile: 95, // 95% of sites
    target_duration_minutes: 30, // within 30 minutes
    lookback_hours: 24,
  },
  {
    stage_name: 'scored_to_rescored',
    description: 'Initial scoring to rescoring (for B- and below)',
    target_percentile: 90, // 90% of sites
    target_duration_minutes: 45, // within 45 minutes
    lookback_hours: 24,
  },
];

/**
 * SLO stage_name -> [fromStatus, toStatus] pair used to query site_status.
 * A Map (rather than a plain object) so that inherited keys such as
 * 'constructor' can never be mistaken for a configured transition.
 */
const STAGE_TRANSITIONS = new Map([
  ['serps_to_assets', ['found', 'assets_captured']],
  ['assets_to_scored', ['assets_captured', 'prog_scored']],
  ['scored_to_rescored', ['prog_scored', 'semantic_scored']],
]);

/**
 * Calculate actual performance for a stage transition
 *
 * Rows whose duration cannot be parsed into a finite number are ignored so
 * that a single bad value cannot turn every percentile into NaN.
 *
 * @param {string} fromStage - Starting stage (e.g., 'found', 'assets_captured')
 * @param {string} toStage - Ending stage (e.g., 'assets_captured', 'prog_scored')
 * @param {number} lookbackHours - How far back to analyze
 * @returns {Promise<Object>} - { totalSites, durations: [minutes, ascending], p50, p95, p99 };
 *   the percentiles are null when no usable rows were found
 */
export async function calculateStagePerformance(fromStage, toStage, lookbackHours = 24) {
  // Query site transitions using site_status history table.
  // NOTE(review): the self-join yields one row per (from-row, to-row) pair, so a
  // site with repeated status rows contributes several durations and
  // totalSites counts pairs rather than distinct sites - confirm against the schema.
  const rows = await getAll(
    `SELECT
       s1.site_id,
       EXTRACT(EPOCH FROM (s2.created_at - s1.created_at)) / 60 AS duration_minutes
     FROM site_status s1
     JOIN site_status s2 ON s1.site_id = s2.site_id
     WHERE s1.status = $1
       AND s2.status = $2
       AND s2.created_at > s1.created_at
       AND s1.created_at > NOW() - ($3 || ' hours')::interval
     ORDER BY s1.created_at DESC`,
    [fromStage, toStage, lookbackHours]
  );

  // Parse, drop unusable values, then sort ascending (percentile() requires sorted input)
  const durations = rows
    .map((row) => parseFloat(row.duration_minutes))
    .filter((minutes) => Number.isFinite(minutes))
    .sort((a, b) => a - b);

  if (durations.length === 0) {
    return {
      totalSites: 0,
      durations: [],
      p50: null,
      p95: null,
      p99: null,
    };
  }

  return {
    totalSites: durations.length,
    durations,
    p50: percentile(durations, 50),
    p95: percentile(durations, 95),
    p99: percentile(durations, 99),
  };
}

/**
 * Calculate percentile from sorted array, linearly interpolating between the
 * two nearest elements.
 *
 * @param {number[]} sortedArray - Array of numbers sorted ascending
 * @param {number} p - Percentile (0-100); values outside that range are clamped
 * @returns {number|null} - Percentile value or null if empty
 */
function percentile(sortedArray, p) {
  if (sortedArray.length === 0) {
    return null;
  }

  // Clamp so an out-of-range p maps to the first/last element instead of
  // indexing outside the array (which would yield NaN).
  const boundedP = Math.min(100, Math.max(0, p));

  const index = (boundedP / 100) * (sortedArray.length - 1);
  const lower = Math.floor(index);
  const upper = Math.ceil(index);
  const weight = index - lower;

  return sortedArray[lower] * (1 - weight) + sortedArray[upper] * weight;
}

/**
 * Check all SLOs for compliance violations
 *
 * For every entry in SLO_DEFINITIONS the duration at the SLO's own
 * target_percentile is compared with target_duration_minutes. SLOs with an
 * unknown stage_name are logged and skipped; SLOs with no data in the
 * lookback window are skipped silently.
 *
 * @returns {Promise<Object[]>} - Array of violations
 *   { slo, actual, violation_severity, violation_description }
 */
export async function checkSLOCompliance() {
  const violations = [];

  // SLOs are evaluated one after another, so at most one query is in flight.
  for (const slo of SLO_DEFINITIONS) {
    // Parse stage transition from stage_name
    const [fromStage, toStage] = parseStageTransition(slo.stage_name);

    if (!fromStage || !toStage) {
      logger.warn(`Invalid SLO stage_name: ${slo.stage_name}`);
      continue;
    }

    // Calculate actual performance
    const actual = await calculateStagePerformance(fromStage, toStage, slo.lookback_hours);

    // Skip if no data
    if (actual.totalSites === 0) {
      continue;
    }

    // Evaluate the percentile this SLO actually declares (e.g. P90 for a 90%
    // target) rather than falling back to the median for anything that is not 95.
    const observedMinutes = percentile(actual.durations, slo.target_percentile);

    if (observedMinutes > slo.target_duration_minutes) {
      const severity = calculateViolationSeverity(observedMinutes, slo.target_duration_minutes);

      violations.push({
        slo: {
          stage_name: slo.stage_name,
          description: slo.description,
          target_percentile: slo.target_percentile,
          target_duration_minutes: slo.target_duration_minutes,
        },
        actual: {
          totalSites: actual.totalSites,
          p50: Math.round(actual.p50),
          p95: Math.round(actual.p95),
          p99: Math.round(actual.p99),
        },
        violation_severity: severity,
        violation_description: `${slo.description}: P${slo.target_percentile} is ${Math.round(observedMinutes)} minutes (target: ${slo.target_duration_minutes} minutes)`,
      });
    }
  }

  return violations;
}

/**
 * Parse stage transition from SLO stage name
 *
 * @param {string} stageName - e.g., 'serps_to_assets', 'assets_to_scored'
 * @returns {[string, string]} - [fromStage, toStage] or [null, null] if invalid
 */
function parseStageTransition(stageName) {
  const transition = STAGE_TRANSITIONS.get(stageName);

  // Return a copy so callers cannot mutate the shared mapping.
  return transition ? [...transition] : [null, null];
}

/**
 * Calculate violation severity based on how far actual exceeds target
 *
 * @param {number} actual - Actual duration in minutes
 * @param {number} target - Target duration in minutes
 * @returns {string} - 'low', 'medium', 'high', or 'critical'
 */
function calculateViolationSeverity(actual, target) {
  const ratio = actual / target;

  if (ratio >= 3) {
    return 'critical'; // 3x or more over target
  }
  if (ratio >= 2) {
    return 'high'; // 2-3x over target
  }
  if (ratio >= 1.5) {
    return 'medium'; // 1.5-2x over target
  }
  return 'low'; // below 1.5x of target
}

/**
 * Get SLO compliance summary (for dashboard/logging)
 *
 * compliance_rate is the percentage of defined SLOs without a reported
 * violation; SLOs skipped for lack of data count as compliant.
 *
 * @returns {Promise<Object>} - { total_slos, violations, compliance_rate, violations_detail }
 */
export async function getSLOSummary() {
  const violations = await checkSLOCompliance();
  const totalSlos = SLO_DEFINITIONS.length;

  return {
    total_slos: totalSlos,
    violations: violations.length,
    compliance_rate:
      totalSlos > 0
        ? ((totalSlos - violations.length) / totalSlos) * 100
        : 100,
    violations_detail: violations,
  };
}