name-extractor.js
/**
 * Name Extractor
 *
 * Decides whether a contact label contains a real human first name.
 * Obvious cases are resolved by a regex fast-path; only ambiguous labels are
 * sent to the Haiku model.
 *
 * Examples:
 *   "Jim Walsh"   → "Jim"   (regex: two capitalised words)
 *   "Office"      → null    (regex: known role word)
 *   "Büro"        → null    (Haiku fallback)
 *   "Reception"   → null    (regex: known role word)
 *   "Marie Sales" → "Marie" (regex: two capitalised words)
 *   "nick"        → "Nick"  (regex: single lowercase word → capitalise)
 */

import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { callLLM } from './llm-provider.js';
import { stripInjectionMarkers } from './llm-sanitizer.js';
import Logger from './logger.js';

// ES modules have no built-in __dirname; derive it from this module's URL.
const moduleFile = fileURLToPath(import.meta.url);
const __dirname = dirname(moduleFile);

// The project root is two directory levels above this file.
const projectRoot = join(__dirname, '../..');

const logger = new Logger('NameExtractor');

// Model used for the fallback call; the environment variable takes precedence
// when it is set to a non-empty value.
const HAIKU_MODEL = process.env.CLAUDE_HAIKU_MODEL || 'anthropic/claude-haiku-4-5';

// The prompt template is read synchronously, once, when this module is loaded.
const promptFile = join(projectRoot, 'prompts/NAME-EXTRACTOR.md');
const NAME_EXTRACTOR_PROMPT = readFileSync(promptFile, 'utf-8');

// ─── Fast-path regex rules ──────────────────────────────────────────────────

// Common non-person role/department/generic labels — exact match (case-insensitive).
// Extend this list rather than relying on Haiku for obvious cases.
// Entries are lower-case. They are compared against the lower-cased,
// whitespace-collapsed label, and against the first word of two-word labels.
const ROLE_WORDS = new Set([
  'office',
  'reception',
  'receptionist',
  'admin',
  'administration',
  'info',
  'information',
  'enquiries',
  'enquiry',
  'inquiry',
  'inquiries',
  'contact',
  'support',
  'help',
  'sales',
  'service',
  'services',
  'team',
  'staff',
  'crew',
  'management',
  'manager',
  'hello',
  'hi',
  'hey',
  'general',
  'main',
  'fax',
  'phone',
  'mobile',
  'web',
  'website',
  'accounts',
  'billing',
  'accounts receivable',
  'accounts payable',
  'hr',
  'marketing',
  'operations',
  'owner',
  'owners',
  'director',
  'directors',
  'principal',
  'ceo',
  'cfo',
  'cto',
  'noreply',
  'no-reply',
  'donotreply',
  'do-not-reply',
  // Additional generic labels found during proofreading (2026-03-06)
  'customer',
  'customerservice',
  'customers',
  'user',
  'users',
  'booking',
  'bookings',
  'estimate',
  'estimates',
  'quote',
  'quotes',
  'residential',
  'commercial',
  'connect',
  'contactus',
  'mymail',
  'business',
  'hire',
  'jobs',
  'account',
  'twitter',
]);

// A word that looks like a Western first name: either capitalised (one
// uppercase letter followed by lowercase letters) or entirely lowercase,
// 2-20 letters in total, no digits or punctuation.
// Only A-Z / a-z plus the ranges À-Ö / à-ö are accepted; letters outside them
// (for example "ü") do not match, so such words are left for the LLM.
const WESTERN_NAME_RE = /^[A-ZÀ-Ö][a-zà-ö]{1,19}$|^[a-zà-ö]{2,20}$/;

/**
 * Upper-case the first character of a word and lower-case the rest.
 *
 * @param {string} word
 * @returns {string}
 */
function capitalise(word) {
  return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
}

/**
 * Try to extract a first name without calling an LLM.
 *
 * @param {string} label Raw contact label.
 * @returns {string|undefined} The capitalised first name, the string sentinel
 *   'null' (definitely not a person — skip), or undefined (ambiguous → call LLM).
 */
function fastExtract(label) {
  const trimmed = label.trim();
  if (!trimmed) return 'null';

  // All-digits or contains @ — not a name
  if (/^\d+$/.test(trimmed) || trimmed.includes('@')) return 'null';

  const words = trimmed.split(/\s+/);

  // Check the whole label (inner whitespace collapsed) so that multi-word
  // entries such as "accounts receivable" are matched as well as single words.
  if (ROLE_WORDS.has(words.join(' ').toLowerCase())) return 'null';

  // Single word: accept it if it has the shape of a name, otherwise defer.
  if (words.length === 1) {
    return WESTERN_NAME_RE.test(trimmed) ? capitalise(trimmed) : undefined;
  }

  // Two words: "Firstname Lastname" pattern
  if (words.length === 2) {
    const [first, second] = words;
    if (WESTERN_NAME_RE.test(first) && WESTERN_NAME_RE.test(second)) {
      // Reject when the first word is a role (e.g. "Sales Manager"). A role in
      // second position is allowed: "Marie Sales" → "Marie".
      if (ROLE_WORDS.has(first.toLowerCase())) return 'null';
      return capitalise(first);
    }
  }

  // Three+ words, or words that do not match the name pattern — let Haiku decide
  return undefined;
}

// ─── In-process cache ────────────────────────────────────────────────────────
// Avoids redundant Haiku calls for the same label within the same process lifetime.
// Keys are trimmed labels; values are the extracted first name or null.
const _cache = new Map();

/**
 * Extract a real human first name from a contact label, or return null.
 *
 * Resolution order: in-process cache → regex fast-path → Haiku call.
 * Fast-path results and usable/rejected model responses are cached; a failed
 * model call is logged and NOT cached, so a later call retries it.
 *
 * @param {string|null} label
 * @returns {Promise<string|null>} First name string, or null
 */
export async function extractFirstname(label) {
  if (!label || typeof label !== 'string') return null;

  const trimmed = label.trim();
  if (!trimmed) return null;

  // Check in-process cache first
  if (_cache.has(trimmed)) {
    return _cache.get(trimmed);
  }

  // Try fast-path regex
  const fast = fastExtract(trimmed);
  if (fast !== undefined) {
    const result = fast === 'null' ? null : fast;
    _cache.set(trimmed, result);
    return result;
  }

  // Ambiguous — call Haiku
  try {
    const safeLabel = stripInjectionMarkers(trimmed);
    // Use a replacer function: with a plain string replacement, "$&", "$`",
    // "$'" and "$$" inside the label would be expanded as special patterns
    // and could splice other parts of the prompt into the label slot.
    const prompt = NAME_EXTRACTOR_PROMPT.replace('{label}', () => safeLabel);

    const { content } = await callLLM({
      model: HAIKU_MODEL,
      temperature: 0,
      max_tokens: 20,
      messages: [
        {
          role: 'user',
          content: prompt,
        },
      ],
      stage: 'name-extraction',
    });

    const raw = content?.trim();
    if (!raw || raw.toLowerCase() === 'null') {
      _cache.set(trimmed, null);
      return null;
    }

    // Anything multi-line or longer than 30 characters is treated as an
    // unusable response rather than a first name.
    if (raw.includes('\n') || raw.length > 30) {
      logger.warn(`Unexpected name-extractor response for "${trimmed}": ${raw}`);
      _cache.set(trimmed, null);
      return null;
    }

    _cache.set(trimmed, raw);
    return raw;
  } catch (err) {
    // Best-effort: report and fall back to "no name". `err` may not be an Error.
    logger.warn(`name-extractor failed for "${trimmed}": ${err?.message ?? err}`);
    return null;
  }
}

export default { extractFirstname };