# app/incident_samples.py
  1  from __future__ import annotations
  2  
  3  from app.schemas import IncidentRequest
  4  
  5  
# Canned sample incidents, keyed by a short scenario name.
#
# The keys are the names accepted by get_incident() and returned (sorted) by
# scenario_names(). Every value is built once at import time.
#
# NOTE(review): get_incident() hands back these exact instances without
# copying, so any caller-side mutation would persist for the life of the
# process -- confirm IncidentRequest is treated as read-only by callers.
SCENARIOS: dict[str, IncidentRequest] = {
    # Industrial/OT sample: edge gateways with a failing certificate renewal,
    # packet loss towards the PLC segment, and historian ingestion lag.
    "industrial": IncidentRequest(
        incident_id="INC-EDGE-2048",
        site="Rotterdam Edge Fabric - Plant 4",
        business_unit="Industrial Automation",
        severity="critical",
        alerts=[
            "TLS certificate expiry < 36h on edge-gw-17 and edge-gw-18",
            "Southbound OPC-UA packet loss sustained at 18-24% for 11 minutes",
            "Historian ingestion lag breached 4 minute SLO",
            "PLC heartbeat jitter exceeded baseline by 320%",
        ],
        # Raw sample log lines; the hosts they mention also appear in known_assets.
        logs=[
            "2026-03-21T19:02:11Z edge-gw-17 envoy[883]: upstream connect error or disconnect/reset before headers. reset reason: tls cert expired soon",
            "2026-03-21T19:03:08Z edge-gw-18 cert-agent: renewal attempt failed: ACME directory timeout via proxy egress-fw-2",
            "2026-03-21T19:04:41Z plant-switch-3 netmon: packet_loss=22.8% vlan=442 path=edge-gw-17->plc-segment-a",
            "2026-03-21T19:05:12Z historian-sync: backlog=78412 writes delayed due to intermittent gateway acknowledgements",
        ],
        topology_summary=(
            "Two redundant edge gateways front 84 PLCs and forward telemetry to the regional "
            "historian over a constrained MPLS path through egress-fw-2."
        ),
        known_assets=[
            "edge-gw-17",
            "edge-gw-18",
            "egress-fw-2",
            "plant-switch-3",
            "regional-historian-eu-west",
            "PLC segment A (84 controllers)",
        ],
        # The instructions below tell the consumer not to go beyond this list.
        allowed_actions=[
            "Fail over traffic between redundant gateways",
            "Restart local certificate renewal sidecar",
            "Open network incident with WAN provider",
            "Pause non-essential telemetry replication",
            "Escalate to OT platform and network engineering",
        ],
        output_schema_instructions=(
            "Return concise executive-safe JSON for OT leadership. Do not invent actions "
            "outside the allowed_actions list."
        ),
    ),
    # Security-operations sample: suspicious use of a privileged service
    # principal, lateral movement across jump hosts, and an MFA fatigue report.
    "soc": IncidentRequest(
        incident_id="INC-SOC-9917",
        site="US-East Corporate Cloud",
        business_unit="Security Operations",
        severity="critical",
        alerts=[
            "Privileged IAM token used from impossible-travel source within 6 minutes",
            "EDR lateral movement analytic triggered across 3 Windows jump hosts",
            "Kerberos service ticket request volume spiked 14x above baseline",
            "MFA fatigue report opened by finance administrator",
        ],
        logs=[
            "2026-03-21T18:47:06Z iam-audit: principal=svc-fin-admin action=GenerateAccessToken src_ip=185.91.214.33 geo=Warsaw",
            "2026-03-21T18:50:44Z ad-dc-02 security: EventID=4769 unusually high TGS requests account=svc-fin-admin client=jump-us-east-3",
            "2026-03-21T18:51:19Z edr jump-us-east-2: remote service creation detected parent=psexec.exe target=jump-us-east-3",
            "2026-03-21T18:53:02Z mfa-portal: user=jane.holt approved=0 denied=9 report='repeated push prompts'",
        ],
        topology_summary=(
            "Finance admin tooling is segmented behind three Windows jump hosts, federated IAM, "
            "and a hybrid Active Directory trust."
        ),
        # Only two of the three jump hosts named in the topology are listed here.
        known_assets=[
            "svc-fin-admin",
            "jump-us-east-2",
            "jump-us-east-3",
            "ad-dc-02",
            "finance-admin-portal",
            "iam-federation-prod",
        ],
        allowed_actions=[
            "Disable compromised service principal",
            "Isolate affected jump hosts from the network",
            "Force credential rotation",
            "Invalidate active sessions and tokens",
            "Escalate to SOC incident response and identity engineering",
        ],
        output_schema_instructions=(
            "Assume this may be an active intrusion. Keep language board-safe and avoid "
            "speculative attribution."
        ),
    ),
    # Customer-support sample: premium-tenant SLA burn caused by delayed case
    # synchronization and a growing platinum-tier queue backlog.
    "support": IncidentRequest(
        incident_id="INC-SUP-4473",
        site="Global SaaS Control Plane",
        business_unit="Premium Customer Support",
        # The only sample here that is not "critical".
        severity="high",
        alerts=[
            "Premium tenant SLA burn rate exceeded 3.4x threshold",
            "Telemetry anomaly detected in workflow orchestration success rate",
            "Customer escalation opened for delayed case synchronization",
            "Support queue backlog for platinum tier crossed 210 open items",
        ],
        logs=[
            "2026-03-21T17:15:10Z workflow-api: tenant=blueharbor status=202 sync_job lag=00:18:14",
            "2026-03-21T17:16:28Z telemetry-analyzer: anomaly score=0.93 service=orchestration-router metric=completion_rate",
            "2026-03-21T17:18:09Z support-escalation: account=BlueHarbor contact='VP Operations' issue='case sync delay causing missed callbacks'",
            "2026-03-21T17:20:55Z queue-manager: platinum_backlog=217 regional_skew=us-east",
        ],
        topology_summary=(
            "Premium support case synchronization depends on the orchestration router, event bus, "
            "and CRM sync workers in us-east with follow-on replication to regional queues."
        ),
        known_assets=[
            "workflow-api",
            "orchestration-router",
            "crm-sync-worker-us-east",
            "event-bus-premium",
            "support-queue-us-east",
            "BlueHarbor premium tenant",
        ],
        allowed_actions=[
            "Throttle low-priority sync jobs",
            "Scale CRM sync workers",
            "Reroute premium queue processing",
            "Issue customer communications update",
            "Escalate to support engineering and SRE",
        ],
        output_schema_instructions=(
            "Focus on SLA recovery and customer impact containment. Keep the summary concise."
        ),
    ),
}
130  
131  
def get_incident(name: str) -> IncidentRequest:
    """Look up the sample incident registered under ``name``.

    Args:
        name: A key of ``SCENARIOS``.

    Returns:
        The ``IncidentRequest`` stored under ``name``. This is the stored
        instance itself, not a copy.

    Raises:
        ValueError: If ``name`` is not a key of ``SCENARIOS``. The message
            lists the valid names, sorted and comma-separated, and the
            underlying ``KeyError`` is chained as the cause.
    """
    try:
        incident = SCENARIOS[name]
    except KeyError as lookup_error:
        # Show the caller every accepted name so a typo is easy to spot.
        valid = ", ".join(sorted(SCENARIOS))
        message = f"Unknown scenario '{name}'. Expected one of: {valid}"
        raise ValueError(message) from lookup_error
    else:
        return incident
138  
139  
def scenario_names() -> list[str]:
    """Return the keys of ``SCENARIOS`` as a new, ascending-sorted list."""
    names = list(SCENARIOS)
    names.sort()
    return names