/ src / utils / name-extractor.js
name-extractor.js
  1  /**
  2   * Name Extractor
  3   * Determines if a contact label contains a real human first name.
  4   * Uses a regex fast-path for obvious cases; falls back to Haiku only when ambiguous.
  5   *
  6   * Examples:
  7   *   "Jim Walsh"   → "Jim"   (regex: two capitalised words)
  8   *   "Office"      → null    (regex: known role word)
  9   *   "Büro"        → null    (Haiku fallback)
 10   *   "Reception"   → null    (regex: known role word)
 11   *   "Marie Sales" → "Marie" (regex: two capitalised words)
 12   *   "nick"        → "Nick"  (regex: single lowercase word → capitalise)
 13   */
 14  
 15  import { readFileSync } from 'fs';
 16  import { join, dirname } from 'path';
 17  import { fileURLToPath } from 'url';
 18  import { callLLM } from './llm-provider.js';
 19  import { stripInjectionMarkers } from './llm-sanitizer.js';
 20  import Logger from './logger.js';
 21  
 22  const __dirname = dirname(fileURLToPath(import.meta.url));
 23  const projectRoot = join(__dirname, '../..');
 24  
 25  const logger = new Logger('NameExtractor');
 26  
 27  const HAIKU_MODEL = process.env.CLAUDE_HAIKU_MODEL || 'anthropic/claude-haiku-4-5';
 28  const NAME_EXTRACTOR_PROMPT = readFileSync(join(projectRoot, 'prompts/NAME-EXTRACTOR.md'), 'utf-8');
 29  
 30  // ─── Fast-path regex rules ──────────────────────────────────────────────────
 31  
 32  // Common non-person role/department/generic labels — exact match (case-insensitive).
 33  // Extend this list rather than relying on Haiku for obvious cases.
 34  const ROLE_WORDS = new Set([
 35    'office',
 36    'reception',
 37    'receptionist',
 38    'admin',
 39    'administration',
 40    'info',
 41    'information',
 42    'enquiries',
 43    'enquiry',
 44    'inquiry',
 45    'inquiries',
 46    'contact',
 47    'support',
 48    'help',
 49    'sales',
 50    'service',
 51    'services',
 52    'team',
 53    'staff',
 54    'crew',
 55    'management',
 56    'manager',
 57    'hello',
 58    'hi',
 59    'hey',
 60    'general',
 61    'main',
 62    'fax',
 63    'phone',
 64    'mobile',
 65    'web',
 66    'website',
 67    'accounts',
 68    'billing',
 69    'accounts receivable',
 70    'accounts payable',
 71    'hr',
 72    'marketing',
 73    'operations',
 74    'owner',
 75    'owners',
 76    'director',
 77    'directors',
 78    'principal',
 79    'ceo',
 80    'cfo',
 81    'cto',
 82    'noreply',
 83    'no-reply',
 84    'donotreply',
 85    'do-not-reply',
 86    // Additional generic labels found during proofreading (2026-03-06)
 87    'customer',
 88    'customerservice',
 89    'customers',
 90    'user',
 91    'users',
 92    'booking',
 93    'bookings',
 94    'estimate',
 95    'estimates',
 96    'quote',
 97    'quotes',
 98    'residential',
 99    'commercial',
100    'connect',
101    'contactus',
102    'mymail',
103    'business',
104    'hire',
105    'jobs',
106    'account',
107    'twitter',
108  ]);
109  
// A word that looks like a Western first name: letters only (A–Z plus the Latin-1
// ranges À–Ö / à–ö), no digits or punctuation. Two shapes are accepted:
//   - capitalised: one uppercase letter followed by 1–19 lowercase letters (2–20 chars)
//   - all-lowercase: 2–19 lowercase letters
// Letters outside those ranges (e.g. "ü", "ø") do not match, so labels containing
// them are treated as ambiguous and left to the LLM.
const WESTERN_NAME_RE = /^[A-ZÀ-Ö][a-zà-ö]{1,19}$|^[a-zà-ö]{2,19}$/;

// Courtesy titles written without a trailing dot satisfy WESTERN_NAME_RE
// ("Mr Smith", "Dr Jones") but are not first names. Labels that start with one
// are deferred to the LLM rather than rejected, so a misfire costs only one call.
const TITLE_WORDS = new Set([
  'mr', 'mrs', 'ms', 'miss', 'mx', 'dr', 'prof', 'sir', 'madam', 'rev',
]);

/**
 * Upper-case the first character of a word and lower-case the rest.
 *
 * @param {string} word
 * @returns {string}
 */
function toNameCase(word) {
  return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
}

/**
 * Try to extract a first name without calling an LLM.
 *
 * Decision table:
 *   - empty, all digits, or contains "@"             → 'null'
 *   - one word, known role word                      → 'null'
 *   - one word, courtesy title                       → undefined
 *   - one word matching WESTERN_NAME_RE              → that word, name-cased
 *   - two words both matching WESTERN_NAME_RE:
 *       first is a role word ("Sales Manager")       → 'null'
 *       first is a courtesy title ("Mr Smith")       → undefined
 *       otherwise ("Jim Walsh", "Marie Sales")       → first word, name-cased
 *   - anything else (3+ words, other scripts, punctuation) → undefined
 *
 * @param {string} label - Contact label; surrounding whitespace is ignored.
 * @returns {string|undefined} The first name, the string sentinel 'null'
 *   (definitely not a person — skip), or undefined (ambiguous — ask the LLM).
 */
function fastExtract(label) {
  const trimmed = label.trim();
  if (!trimmed) return 'null';

  // Purely numeric labels and anything that looks like an e-mail address are never names.
  if (/^\d+$/.test(trimmed) || trimmed.includes('@')) return 'null';

  const words = trimmed.split(/\s+/);

  if (words.length === 1) {
    const lower = trimmed.toLowerCase();
    if (ROLE_WORDS.has(lower)) return 'null';
    // A lone title is not a name; let the LLM make the final call.
    if (TITLE_WORDS.has(lower)) return undefined;
    // Non-matching single words (other scripts, punctuation, ALL CAPS) are ambiguous.
    return WESTERN_NAME_RE.test(trimmed) ? toNameCase(trimmed) : undefined;
  }

  if (words.length === 2) {
    const [first, second] = words;
    // Only the "Firstname Lastname" shape is decided here.
    if (WESTERN_NAME_RE.test(first) && WESTERN_NAME_RE.test(second)) {
      const firstLower = first.toLowerCase();
      // A leading role word means the label describes a position, e.g. "Sales Manager".
      if (ROLE_WORDS.has(firstLower)) return 'null';
      // "Mr Smith" must not yield "Mr" — the real first name is unknown here.
      if (TITLE_WORDS.has(firstLower)) return undefined;
      return toNameCase(first);
    }
  }

  // Three or more words, or words that do not look like Western names.
  return undefined;
}
153  
// ─── In-process cache ────────────────────────────────────────────────────────
// Maps a trimmed label to its extracted first name (or null) for the lifetime of
// the process, so a repeated label never triggers a second Haiku call.
// Labels are usually short strings like "Jim" or "Reception" — low memory overhead.
// Failed LLM calls are not stored, which lets a later call retry.
const _cache = new Map();

/**
 * Extract a real human first name from a contact label, or return null.
 *
 * Resolution order:
 *   1. the in-process cache,
 *   2. the regex fast path (`fastExtract`),
 *   3. a single Haiku call for labels the fast path reports as ambiguous.
 *
 * The LLM answer is accepted only when it is a single line of at most 30
 * characters and is not the literal "null" (any casing); anything else yields
 * null. Errors from the LLM call are logged and yield null without being cached.
 *
 * @param {string|null} label - Raw contact label; non-strings and blank strings yield null.
 * @returns {Promise<string|null>} First name string, or null
 */
export async function extractFirstname(label) {
  if (!label || typeof label !== 'string') return null;

  const trimmed = label.trim();
  if (!trimmed) return null;

  // Check in-process cache first (a cached null is a valid answer, hence has()).
  if (_cache.has(trimmed)) {
    return _cache.get(trimmed);
  }

  // Try fast-path regex; 'null' is its "definitely not a person" sentinel.
  const fast = fastExtract(trimmed);
  if (fast !== undefined) {
    const result = fast === 'null' ? null : fast;
    _cache.set(trimmed, result);
    return result;
  }

  // Ambiguous — call Haiku
  try {
    const safeLabel = stripInjectionMarkers(trimmed);
    // Use a replacer function: with a plain string replacement, sequences such as
    // "$&", "$'" or "$$" inside the label would be expanded by String#replace and
    // corrupt the prompt. A function's return value is inserted verbatim.
    const prompt = NAME_EXTRACTOR_PROMPT.replace('{label}', () => safeLabel);

    const { content } = await callLLM({
      model: HAIKU_MODEL,
      temperature: 0,
      max_tokens: 20,
      messages: [
        {
          role: 'user',
          content: prompt,
        },
      ],
      stage: 'name-extraction',
    });

    const raw = content?.trim();
    if (!raw || raw.toLowerCase() === 'null') {
      _cache.set(trimmed, null);
      return null;
    }

    // A first name is one short line; anything longer is treated as an off-script reply.
    if (raw.includes('\n') || raw.length > 30) {
      logger.warn(`Unexpected name-extractor response for "${trimmed}": ${raw}`);
      _cache.set(trimmed, null);
      return null;
    }

    _cache.set(trimmed, raw);
    return raw;
  } catch (err) {
    // Best effort: never let a provider failure propagate to the caller.
    // `err` may not be an Error instance, so do not assume `.message` exists.
    logger.warn(`name-extractor failed for "${trimmed}": ${err?.message ?? err}`);
    return null;
  }
}

export default { extractFirstname };