/ services / teamMemorySync / secretScanner.ts
secretScanner.ts
  1  /**
  2   * Client-side secret scanner for team memory (PSR M22174).
  3   *
  4   * Scans content for credentials before upload so secrets never leave the
  5   * user's machine. Uses a curated subset of high-confidence rules from
  6   * gitleaks (https://github.com/gitleaks/gitleaks, MIT license) — only
  7   * rules with distinctive prefixes that have near-zero false-positive
  8   * rates are included. Generic keyword-context rules are omitted.
  9   *
 10   * Rule IDs and regexes sourced directly from the public gitleaks config:
 11   * https://github.com/gitleaks/gitleaks/blob/master/config/gitleaks.toml
 12   *
 13   * JS regex notes:
 14   *   - gitleaks uses Go regex; inline (?i) and mode groups (?-i:...) are
 15   *     not portable to JS. Affected rules are rewritten with explicit
 16   *     character classes ([a-zA-Z0-9] instead of (?i)[a-z0-9]).
 17   *   - Trailing boundary alternations like (?:[\x60'"\s;]|\\[nr]|$) from
 18   *     Go regex are kept (JS $ matches end-of-string in default mode).
 19   */
 20  
 21  import { capitalize } from '../../utils/stringUtils.js'
 22  
/**
 * One curated detection rule: a gitleaks rule ID paired with the regex
 * source (and optional flags) that recognizes that credential format.
 * Entries are stored as strings and only turned into RegExp objects when a
 * scan or redaction first needs them.
 */
type SecretRule = {
  /** Gitleaks rule ID (kebab-case), used in labels and analytics */
  id: string
  /** Regex source string handed to `new RegExp`; compiled lazily on first use */
  source: string
  /** Optional JS regex flags (most rules are case-sensitive by default) */
  flags?: string
}
 31  
/**
 * Result entry returned by scanForSecrets: identifies WHICH rule fired.
 * It deliberately carries no field for the matched text, so a secret value
 * cannot be logged or displayed through this type.
 */
export type SecretMatch = {
  /** Gitleaks rule ID that matched (e.g., "github-pat", "aws-access-token") */
  ruleId: string
  /** Human-readable label derived from the rule ID */
  label: string
}
 38  
 39  // ─── Curated rules ──────────────────────────────────────────────
 40  // High-confidence patterns from gitleaks with distinctive prefixes.
 41  // Ordered roughly by likelihood of appearing in dev-team content.
 42  
// Anthropic API key prefix, assembled at runtime so the literal byte
// sequence isn't present in the external bundle (excluded-strings check).
// join() is not constant-folded by the minifier, so keep this as a join of
// separate fragments — do not "simplify" it into a single string literal.
// The value is interpolated into the anthropic-api-key rule's regex source.
const ANT_KEY_PFX = ['sk', 'ant', 'api'].join('-')
 47  
/**
 * The curated rule table. Each entry is kept as a regex source string and is
 * compiled on demand by the scanner and by the redactor.
 *
 * Two pattern shapes appear below:
 *   - Patterns with a capture group wrap the credential itself in group 1 and
 *     keep boundary characters (quote, whitespace, `;`, escaped \n / \r, or
 *     end of input) outside the group.
 *   - Patterns without a capture group match the credential as the whole
 *     match.
 */
const SECRET_RULES: SecretRule[] = [
  // — Cloud providers —
  {
    id: 'aws-access-token',
    source: '\\b((?:A3T[A-Z0-9]|AKIA|ASIA|ABIA|ACCA)[A-Z2-7]{16})\\b',
  },
  {
    id: 'gcp-api-key',
    source: '\\b(AIza[\\w-]{35})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'azure-ad-client-secret',
    // Unlike the \b-anchored rules, this pattern CONSUMES one boundary
    // character on each side of the captured secret (or anchors at ^ / $).
    source:
      '(?:^|[\\\\\'"\\x60\\s>=:(,)])([a-zA-Z0-9_~.]{3}\\dQ~[a-zA-Z0-9_~.-]{31,34})(?:$|[\\\\\'"\\x60\\s<),])',
  },
  {
    id: 'digitalocean-pat',
    source: '\\b(dop_v1_[a-f0-9]{64})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'digitalocean-access-token',
    source: '\\b(doo_v1_[a-f0-9]{64})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },

  // — AI APIs —
  {
    id: 'anthropic-api-key',
    // Template literal: the key prefix is spliced in from ANT_KEY_PFX rather
    // than written out inline.
    source: `\\b(${ANT_KEY_PFX}03-[a-zA-Z0-9_\\-]{93}AA)(?:[\\x60'"\\s;]|\\\\[nr]|$)`,
  },
  {
    id: 'anthropic-admin-api-key',
    source:
      '\\b(sk-ant-admin01-[a-zA-Z0-9_\\-]{93}AA)(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'openai-api-key',
    source:
      '\\b(sk-(?:proj|svcacct|admin)-(?:[A-Za-z0-9_-]{74}|[A-Za-z0-9_-]{58})T3BlbkFJ(?:[A-Za-z0-9_-]{74}|[A-Za-z0-9_-]{58})\\b|sk-[a-zA-Z0-9]{20}T3BlbkFJ[a-zA-Z0-9]{20})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'huggingface-access-token',
    // gitleaks: hf_(?i:[a-z]{34}) → JS: hf_[a-zA-Z]{34}
    source: '\\b(hf_[a-zA-Z]{34})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },

  // — Version control —
  {
    id: 'github-pat',
    source: 'ghp_[0-9a-zA-Z]{36}',
  },
  {
    id: 'github-fine-grained-pat',
    source: 'github_pat_\\w{82}',
  },
  {
    id: 'github-app-token',
    source: '(?:ghu|ghs)_[0-9a-zA-Z]{36}',
  },
  {
    id: 'github-oauth',
    source: 'gho_[0-9a-zA-Z]{36}',
  },
  {
    id: 'github-refresh-token',
    source: 'ghr_[0-9a-zA-Z]{36}',
  },
  {
    id: 'gitlab-pat',
    source: 'glpat-[\\w-]{20}',
  },
  {
    id: 'gitlab-deploy-token',
    source: 'gldt-[0-9a-zA-Z_\\-]{20}',
  },

  // — Communication —
  {
    id: 'slack-bot-token',
    source: 'xoxb-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9-]*',
  },
  {
    id: 'slack-user-token',
    source: 'xox[pe](?:-[0-9]{10,13}){3}-[a-zA-Z0-9-]{28,34}',
  },
  {
    id: 'slack-app-token',
    source: 'xapp-\\d-[A-Z0-9]+-\\d+-[a-z0-9]+',
    // Whole pattern is case-insensitive, so the `i` flag is used directly.
    flags: 'i',
  },
  {
    id: 'twilio-api-key',
    source: 'SK[0-9a-fA-F]{32}',
  },
  {
    id: 'sendgrid-api-token',
    // gitleaks: SG\.(?i)[a-z0-9=_\-\.]{66} → JS: explicit mixed-case class
    // [a-zA-Z0-9=_\-.] (no `i` flag, so the "SG." prefix stays case-sensitive)
    source: '\\b(SG\\.[a-zA-Z0-9=_\\-.]{66})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },

  // — Dev tooling —
  {
    id: 'npm-access-token',
    source: '\\b(npm_[a-zA-Z0-9]{36})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'pypi-upload-token',
    source: 'pypi-AgEIcHlwaS5vcmc[\\w-]{50,1000}',
  },
  {
    id: 'databricks-api-token',
    source: '\\b(dapi[a-f0-9]{32}(?:-\\d)?)(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'hashicorp-tf-api-token',
    // gitleaks: (?i)[a-z0-9]{14}\.(?-i:atlasv1)\.[a-z0-9\-_=]{60,70}
    // → JS: mixed-case alphanumeric prefix, case-sensitive literal "atlasv1",
    //   mixed-case suffix class
    source: '[a-zA-Z0-9]{14}\\.atlasv1\\.[a-zA-Z0-9\\-_=]{60,70}',
  },
  {
    id: 'pulumi-api-token',
    source: '\\b(pul-[a-f0-9]{40})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'postman-api-token',
    // gitleaks: PMAK-(?i)[a-f0-9]{24}\-[a-f0-9]{34} → JS: use [a-fA-F0-9]
    source:
      '\\b(PMAK-[a-fA-F0-9]{24}-[a-fA-F0-9]{34})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },

  // — Observability —
  {
    id: 'grafana-api-key',
    source:
      '\\b(eyJrIjoi[A-Za-z0-9+/]{70,400}={0,3})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'grafana-cloud-api-token',
    source: '\\b(glc_[A-Za-z0-9+/]{32,400}={0,3})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'grafana-service-account-token',
    source:
      '\\b(glsa_[A-Za-z0-9]{32}_[A-Fa-f0-9]{8})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'sentry-user-token',
    source: '\\b(sntryu_[a-f0-9]{64})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'sentry-org-token',
    source:
      '\\bsntrys_eyJpYXQiO[a-zA-Z0-9+/]{10,200}(?:LCJyZWdpb25fdXJs|InJlZ2lvbl91cmwi|cmVnaW9uX3VybCI6)[a-zA-Z0-9+/]{10,200}={0,2}_[a-zA-Z0-9+/]{43}',
  },

  // — Payment / commerce —
  {
    id: 'stripe-access-token',
    source:
      '\\b((?:sk|rk)_(?:test|live|prod)_[a-zA-Z0-9]{10,99})(?:[\\x60\'"\\s;]|\\\\[nr]|$)',
  },
  {
    id: 'shopify-access-token',
    source: 'shpat_[a-fA-F0-9]{32}',
  },
  {
    id: 'shopify-shared-secret',
    source: 'shpss_[a-fA-F0-9]{32}',
  },

  // — Crypto —
  {
    id: 'private-key',
    // Matches a whole PEM-style block, from the BEGIN line through the END line.
    source:
      '-----BEGIN[ A-Z0-9_-]{0,100}PRIVATE KEY(?: BLOCK)?-----[\\s\\S-]{64,}?-----END[ A-Z0-9_-]{0,100}PRIVATE KEY(?: BLOCK)?-----',
    flags: 'i',
  },
]
225  
// Cache of compiled detection regexes. Stays null until the first caller
// asks for the rules, then holds the compiled list for the process lifetime.
let compiledRules: Array<{ id: string; re: RegExp }> | null = null

/**
 * Return every SECRET_RULES entry as an `{ id, re }` pair with the regex
 * already compiled. The list is built on the first call and the same array
 * is handed back on every later call.
 */
function getCompiledRules(): Array<{ id: string; re: RegExp }> {
  if (compiledRules !== null) {
    return compiledRules
  }

  const built: Array<{ id: string; re: RegExp }> = []
  for (const { id, source, flags } of SECRET_RULES) {
    built.push({ id, re: new RegExp(source, flags) })
  }

  // Publish only after every pattern compiled, so a throwing pattern leaves
  // the cache empty rather than half-filled.
  compiledRules = built
  return built
}
238  
/**
 * Words whose canonical capitalization differs from plain title case.
 *
 * Kept in a Map (not an object literal) and hoisted to module scope:
 *   - a Map only answers for keys that were explicitly inserted, so a rule-ID
 *     part such as "constructor" or "__proto__" is never mistaken for an
 *     override inherited from Object.prototype;
 *   - the table is built once instead of on every call.
 */
const LABEL_OVERRIDES: ReadonlyMap<string, string> = new Map([
  ['aws', 'AWS'],
  ['gcp', 'GCP'],
  ['api', 'API'],
  ['pat', 'PAT'],
  ['ad', 'AD'],
  ['tf', 'TF'],
  ['oauth', 'OAuth'],
  ['npm', 'NPM'],
  ['pypi', 'PyPI'],
  ['jwt', 'JWT'],
  ['github', 'GitHub'],
  ['gitlab', 'GitLab'],
  ['openai', 'OpenAI'],
  ['digitalocean', 'DigitalOcean'],
  ['huggingface', 'HuggingFace'],
  ['hashicorp', 'HashiCorp'],
  ['sendgrid', 'SendGrid'],
])

/**
 * Convert a gitleaks rule ID (kebab-case) to a human-readable label.
 * e.g., "github-pat" → "GitHub PAT", "aws-access-token" → "AWS Access Token"
 *
 * @param ruleId - Hyphen-separated rule identifier; any string is accepted.
 * @returns The parts joined with single spaces, each part replaced by its
 *   canonical spelling when one is registered, otherwise passed through
 *   `capitalize`.
 */
function ruleIdToLabel(ruleId: string): string {
  return ruleId
    .split('-')
    .map(part => LABEL_OVERRIDES.get(part) ?? capitalize(part))
    .join(' ')
}
269  
/**
 * Check a string against every compiled secret rule.
 *
 * The result lists each rule that fired exactly once, in rule-table order.
 * Only the rule ID and its label are reported — the matched text itself is
 * deliberately left out so secret values are never logged or displayed.
 *
 * @param content - Text to inspect.
 * @returns One SecretMatch per distinct rule ID that matched; empty when
 *   nothing matched.
 */
export function scanForSecrets(content: string): SecretMatch[] {
  // A Set iterates in insertion order, so it doubles as the dedupe guard and
  // as the ordered list of rules that fired.
  const firedIds = new Set<string>()

  for (const { id, re } of getCompiledRules()) {
    if (!firedIds.has(id) && re.test(content)) {
      firedIds.add(id)
    }
  }

  return Array.from(firedIds, ruleId => ({
    ruleId,
    label: ruleIdToLabel(ruleId),
  }))
}
296  
/**
 * Get a human-readable label for a gitleaks rule ID.
 *
 * Exported entry point that delegates to ruleIdToLabel, so the result is the
 * same label scanForSecrets attaches to its matches. IDs containing words
 * with no registered special spelling fall back to kebab-to-Title
 * conversion, which means unknown IDs are accepted too.
 *
 * @param ruleId - Kebab-case rule identifier.
 * @returns The space-separated label.
 */
export function getSecretLabel(ruleId: string): string {
  return ruleIdToLabel(ruleId)
}
304  
// Global-flag variants of the rule regexes, compiled on the first redaction.
// Kept separate from the scanner's regexes, which are compiled without `g`.
let redactRules: RegExp[] | null = null

/**
 * Redact any matched secrets with [REDACTED].
 * Unlike scanForSecrets, this returns the content with the secret spans
 * replaced, so the surrounding text can still be written to disk safely.
 *
 * @param content - Text that may contain credentials.
 * @returns The text with every span matched by a secret rule replaced by
 *   the literal `[REDACTED]`; unchanged when no rule matches.
 */
export function redactSecrets(content: string): string {
  redactRules ??= SECRET_RULES.map(
    // Drop any `g` the rule already carries before appending one, so the
    // flag string never contains a duplicate (which RegExp rejects).
    r => new RegExp(r.source, (r.flags ?? '').replace('g', '') + 'g'),
  )

  let redacted = content
  for (const re of redactRules) {
    // Some patterns consume a boundary character on BOTH sides of the secret.
    // When two secrets share a single delimiter, the first match swallows it
    // and a single global pass skips the second secret. Re-run the rule until
    // the text stops changing. Every replacement is shorter than the secret
    // it replaces, so the text shrinks on each changing pass and the loop
    // terminates.
    let previous: string
    do {
      previous = redacted
      // Replace only the captured group, not the full match — patterns include
      // boundary chars (space, quote, ;) outside the group that must survive.
      // For rules without a capture group the second callback argument is the
      // numeric match offset, in which case the whole match is the secret.
      redacted = redacted.replace(re, (match, g1) =>
        typeof g1 === 'string' ? match.replace(g1, '[REDACTED]') : '[REDACTED]',
      )
    } while (redacted !== previous)
  }
  return redacted
}