/ src / agents / utils / slo-tracker.js
slo-tracker.js
  1  /**
  2   * SLO (Service-Level Objective) Tracker
  3   *
  4   * Monitors pipeline stage performance against defined SLOs.
  5   * Creates alerts when actual performance falls below targets.
  6   */
  7  
  8  import { getAll, getOne } from '../../utils/db.js';
  9  import Logger from '../../utils/logger.js';
 10  
 11  const logger = new Logger('SLOTracker');
 12  
 13  /**
 14   * Service-Level Objectives for pipeline stages
 15   *
 16   * Each SLO defines:
 17   * - stage_name: Name of the pipeline stage
 18   * - target_percentile: What % of sites should meet the target (e.g., 95%)
 19   * - target_duration_minutes: Maximum duration in minutes
 20   * - lookback_hours: How far back to analyze (default: 24 hours)
 21   */
 22  export const SLO_DEFINITIONS = [
 23    {
 24      stage_name: 'serps_to_assets',
 25      description: 'SERP scraping to asset capture',
 26      target_percentile: 95, // 95% of sites
 27      target_duration_minutes: 60, // within 1 hour
 28      lookback_hours: 24,
 29    },
 30    {
 31      stage_name: 'assets_to_scored',
 32      description: 'Asset capture to initial scoring',
 33      target_percentile: 95, // 95% of sites
 34      target_duration_minutes: 30, // within 30 minutes
 35      lookback_hours: 24,
 36    },
 37    {
 38      stage_name: 'scored_to_rescored',
 39      description: 'Initial scoring to rescoring (for B- and below)',
 40      target_percentile: 90, // 90% of sites
 41      target_duration_minutes: 45, // within 45 minutes
 42      lookback_hours: 24,
 43    },
 44  ];
 45  
 46  /**
 47   * Calculate actual performance for a stage transition
 48   *
 49   * @param {string} fromStage - Starting stage (e.g., 'found', 'assets_captured')
 50   * @param {string} toStage - Ending stage (e.g., 'assets_captured', 'prog_scored')
 51   * @param {number} lookbackHours - How far back to analyze
 52   * @returns {Promise<Object>} - { totalSites, durations: [minutes], p50, p95, p99 }
 53   */
 54  export async function calculateStagePerformance(fromStage, toStage, lookbackHours = 24) {
 55    // Query site transitions using site_status history table
 56    const rows = await getAll(
 57      `SELECT
 58         s1.site_id,
 59         EXTRACT(EPOCH FROM (s2.created_at - s1.created_at)) / 60 AS duration_minutes
 60       FROM site_status s1
 61       JOIN site_status s2 ON s1.site_id = s2.site_id
 62       WHERE s1.status = $1
 63         AND s2.status = $2
 64         AND s2.created_at > s1.created_at
 65         AND s1.created_at > NOW() - ($3 || ' hours')::interval
 66       ORDER BY s1.created_at DESC`,
 67      [fromStage, toStage, lookbackHours]
 68    );
 69  
 70    if (rows.length === 0) {
 71      return {
 72        totalSites: 0,
 73        durations: [],
 74        p50: null,
 75        p95: null,
 76        p99: null,
 77      };
 78    }
 79  
 80    // Sort durations
 81    const durations = rows.map(r => parseFloat(r.duration_minutes)).sort((a, b) => a - b);
 82  
 83    return {
 84      totalSites: durations.length,
 85      durations,
 86      p50: percentile(durations, 50),
 87      p95: percentile(durations, 95),
 88      p99: percentile(durations, 99),
 89    };
 90  }
 91  
 92  /**
 93   * Calculate percentile from sorted array
 94   *
 95   * @param {number[]} sortedArray - Sorted array of numbers
 96   * @param {number} p - Percentile (0-100)
 97   * @returns {number|null} - Percentile value or null if empty
 98   */
 99  function percentile(sortedArray, p) {
100    if (sortedArray.length === 0) return null;
101  
102    const index = (p / 100) * (sortedArray.length - 1);
103    const lower = Math.floor(index);
104    const upper = Math.ceil(index);
105    const weight = index - lower;
106  
107    if (upper >= sortedArray.length) return sortedArray[sortedArray.length - 1];
108  
109    return sortedArray[lower] * (1 - weight) + sortedArray[upper] * weight;
110  }
111  
/**
 * Check all SLOs for compliance violations
 *
 * For each entry in SLO_DEFINITIONS the observed duration at the SLO's own
 * target percentile is compared against target_duration_minutes. SLOs whose
 * stage_name cannot be mapped to a transition are logged and skipped; SLOs
 * with no measured transitions in the lookback window are skipped silently.
 *
 * @returns {Promise<Object[]>} - Array of violations
 *   { slo, actual, violation_severity, violation_description }
 */
export async function checkSLOCompliance() {
  const violations = [];

  for (const slo of SLO_DEFINITIONS) {
    // Parse stage transition from stage_name
    const [fromStage, toStage] = parseStageTransition(slo.stage_name);

    if (!fromStage || !toStage) {
      logger.warn(`Invalid SLO stage_name: ${slo.stage_name}`);
      continue;
    }

    // Calculate actual performance
    const actual = await calculateStagePerformance(fromStage, toStage, slo.lookback_hours);

    // Skip if no data
    if (actual.totalSites === 0) {
      continue;
    }

    // Evaluate the percentile this SLO actually declares (e.g. P90), rather
    // than falling back to the median for anything that is not P95.
    const observedMinutes = percentile(actual.durations, slo.target_percentile);

    if (observedMinutes > slo.target_duration_minutes) {
      const severity = calculateViolationSeverity(observedMinutes, slo.target_duration_minutes);

      violations.push({
        slo: {
          stage_name: slo.stage_name,
          description: slo.description,
          target_percentile: slo.target_percentile,
          target_duration_minutes: slo.target_duration_minutes,
        },
        actual: {
          totalSites: actual.totalSites,
          p50: Math.round(actual.p50),
          p95: Math.round(actual.p95),
          p99: Math.round(actual.p99),
        },
        violation_severity: severity,
        violation_description: `${slo.description}: P${slo.target_percentile} is ${Math.round(observedMinutes)} minutes (target: ${slo.target_duration_minutes} minutes)`,
      });
    }
  }

  return violations;
}
164  
/**
 * Parse stage transition from SLO stage name
 *
 * Only the stage names listed in the mapping below are recognised. Keys that
 * exist solely on Object.prototype (e.g. 'toString', '__proto__') are treated
 * as unknown, so callers can always destructure the result as a pair.
 *
 * @param {string} stageName - e.g., 'serps_to_assets', 'assets_to_scored'
 * @returns {[string, string]} - [fromStage, toStage] or [null, null] if invalid
 */
function parseStageTransition(stageName) {
  const mapping = {
    serps_to_assets: ['found', 'assets_captured'],
    assets_to_scored: ['assets_captured', 'prog_scored'],
    scored_to_rescored: ['prog_scored', 'semantic_scored'],
  };

  // Own-property check: a bare mapping[stageName] would also resolve inherited
  // members and hand back a function or the prototype object.
  if (!Object.prototype.hasOwnProperty.call(mapping, stageName)) {
    return [null, null];
  }

  return mapping[stageName];
}
180  
/**
 * Grade an SLO violation by the ratio of actual to target duration.
 *
 * @param {number} actual - Actual duration in minutes
 * @param {number} target - Target duration in minutes
 * @returns {string} - 'low', 'medium', 'high', or 'critical'
 */
function calculateViolationSeverity(actual, target) {
  const overshoot = actual / target;

  // Tiers ordered from most to least severe; the first threshold reached wins.
  const tiers = [
    [3, 'critical'], // 3x or more over target
    [2, 'high'], // 2-3x over target
    [1.5, 'medium'], // 1.5-2x over target
  ];

  for (const [threshold, label] of tiers) {
    if (overshoot >= threshold) {
      return label;
    }
  }

  // Anything below 1.5x the target.
  return 'low';
}
196  
/**
 * Summarise SLO compliance (for dashboard/logging).
 *
 * The compliance rate is the percentage of entries in SLO_DEFINITIONS that
 * did not produce a violation; it is 100 when no SLOs are defined.
 *
 * @returns {Promise<Object>} - { total_slos, violations, compliance_rate, violations_detail }
 */
export async function getSLOSummary() {
  const violationsDetail = await checkSLOCompliance();

  const sloCount = SLO_DEFINITIONS.length;
  const violationCount = violationsDetail.length;

  // With no SLOs defined there is nothing to violate, so report full compliance.
  let complianceRate = 100;
  if (sloCount > 0) {
    complianceRate = ((sloCount - violationCount) / sloCount) * 100;
  }

  return {
    total_slos: sloCount,
    violations: violationCount,
    compliance_rate: complianceRate,
    violations_detail: violationsDetail,
  };
}