// contact-repair.js
/**
 * Contact Repair
 * Validates contacts_json returned by LLMs and retries with targeted error context
 * when invalid contacts are found.
 *
 * Source attribution strategy:
 * - If contact has source = image filename → retry with just that image + error
 * - If contact has source = XPath → retry with html_dom + xpath + error
 * - If no source → retry with full context (html + screenshots)
 */

import { callLLM } from './llm-provider.js';
import { validatePhone, validateEmail } from './contact-validator.js';
import { safeJsonParse } from './error-handler.js';
import { wrapUntrusted } from './llm-sanitizer.js';
import Logger from './logger.js';

const logger = new Logger('ContactRepair');

// Use same model as enrichment for repair calls
const REPAIR_MODEL =
  process.env.ENRICHMENT_MODEL || process.env.CLAUDE_SONNET_MODEL || 'anthropic/claude-sonnet-4-6';

// Upper bound on the amount of HTML forwarded to the repair model.
const MAX_HTML_CHARS = 30000;

// Format-only email check (MX validation is done separately in bulk).
const EMAIL_FORMAT = /^[^\s@]+@[^\s@]+\.[^\s@]{2,}$/;

// Base64 prefixes of the magic bytes of the image formats classifySource() accepts.
const IMAGE_SIGNATURES = [
  ['/9j/', 'image/jpeg'],
  ['iVBORw0KGgo', 'image/png'],
  ['R0lGOD', 'image/gif'],
  ['UklGR', 'image/webp'],
];

// Used when the payload does not match any known signature.
const DEFAULT_IMAGE_MIME = 'image/png';

/**
 * Detect whether a source string is an image filename or an XPath
 * @param {string} source
 * @returns {'image'|'xpath'|null} null for empty, non-string or unrecognised sources
 */
function classifySource(source) {
  // LLM output is not guaranteed to be a string; anything else is "unknown".
  if (typeof source !== 'string' || source === '') return null;
  if (/\.(jpg|jpeg|png|gif|webp)$/i.test(source)) return 'image';
  if (source.startsWith('//') || source.startsWith('/html') || source.startsWith('(//')) {
    return 'xpath';
  }
  return null;
}

/**
 * Find a screenshot by filename in the assets object
 * @param {string} filename - e.g. "above_fold.jpg"
 * @param {Object} screenshots - map of filename → base64 string
 * @returns {string|null} base64 string or null
 */
function findScreenshot(filename, screenshots = {}) {
  // Own-property check: the filename originates from LLM output and must not
  // resolve to inherited members; also tolerates an explicit null map.
  if (!screenshots || !Object.prototype.hasOwnProperty.call(screenshots, filename)) {
    return null;
  }
  // eslint-disable-next-line security/detect-object-injection
  return screenshots[filename] || null;
}

/**
 * Build a data URL for a screenshot, labelling it with the MIME type that
 * matches its actual bytes instead of assuming PNG.
 * @param {string} b64 - base64 payload (or an already complete data URL)
 * @returns {string} data URL
 */
function toImageDataUrl(b64) {
  const payload = String(b64);
  if (payload.startsWith('data:')) return payload;
  const signature = IMAGE_SIGNATURES.find(([prefix]) => payload.startsWith(prefix));
  const mime = signature ? signature[1] : DEFAULT_IMAGE_MIME;
  return `data:${mime};base64,${payload}`;
}

/**
 * Build the message parts for one screenshot: a caption followed by the image.
 * @param {string} caption
 * @param {string} b64
 * @returns {Object[]} message content parts
 */
function imageParts(caption, b64) {
  return [
    { type: 'text', text: caption },
    { type: 'image_url', image_url: { url: toImageDataUrl(b64) } },
  ];
}

/**
 * Select the assets attached to a repair call based on the source type.
 * Falls back to the full context (HTML + every screenshot) when the targeted
 * asset is unavailable, so the model is never asked to fix values blind.
 * @returns {Object[]} message content parts
 */
function buildAssetParts({ errors, html, screenshots, sourceType }) {
  if (sourceType === 'image') {
    // Several errors may point at the same image; attach each image once.
    const sources = [...new Set(errors.map(e => e.source))];
    const parts = sources.flatMap(source => {
      const b64 = findScreenshot(source, screenshots);
      return b64 ? imageParts(`Source image (${source}):`, b64) : [];
    });
    if (parts.length > 0) return parts;
  } else if (sourceType === 'xpath' && html) {
    const xpaths = [...new Set(errors.map(e => e.source).filter(Boolean))];
    return [
      {
        type: 'text',
        text: `${wrapUntrusted(html.substring(0, MAX_HTML_CHARS), 'website_html')}\n\nXPaths to check:\n${xpaths.join('\n')}`,
      },
    ];
  }

  // No usable source — full context
  const parts = [];
  if (html) {
    parts.push({
      type: 'text',
      // Website HTML is untrusted input and is wrapped like in the XPath branch.
      text: `HTML DOM (first 30000 chars):\n${wrapUntrusted(html.substring(0, MAX_HTML_CHARS), 'website_html')}`,
    });
  }
  for (const [filename, b64] of Object.entries(screenshots || {})) {
    if (b64) parts.push(...imageParts(`Screenshot: ${filename}`, b64));
  }
  return parts;
}

/**
 * Build a repair LLM call for a group of errors sharing the same source type
 * @param {Object} args
 * @param {Object[]} args.errors - validation errors ({ field, value, source, reason })
 * @param {string} [args.html] - page HTML
 * @param {Object} [args.screenshots] - map of filename → base64 string
 * @param {'image'|'xpath'|'unknown'} args.sourceType
 * @returns {Promise<Object|null>} Repaired contacts partial object, or null
 */
async function callRepairLLM({ errors, html, screenshots, sourceType }) {
  const errorList = errors
    .map(e => `- ${e.field}: "${e.value}" (source: ${e.source || 'unknown'}) — ${e.reason}`)
    .join('\n');

  const systemPrompt = `You are correcting invalid contact information extracted from a business website.
You will be given a list of invalid contacts and their errors. Fix ONLY the listed contacts.
Return ONLY a JSON object with the corrected contacts in the same schema. Do not include contacts that were not listed as errors.
If you cannot find a valid replacement, omit that contact entirely.`;

  const userContent = [
    {
      type: 'text',
      text: `The following extracted contacts failed validation:\n\n${wrapUntrusted(errorList, 'validation_errors')}\n\nPlease provide corrected values.`,
    },
    ...buildAssetParts({ errors, html, screenshots, sourceType }),
  ];

  try {
    const response = await callLLM({
      model: REPAIR_MODEL,
      temperature: 0.1,
      max_tokens: 1000,
      json_mode: true,
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userContent },
      ],
      stage: 'enrichment',
    });

    const parsed = safeJsonParse(response.content);
    // Anything that is not an object cannot be a contacts patch.
    return parsed && typeof parsed === 'object' ? parsed : null;
  } catch (err) {
    logger.warn(`Repair LLM call failed: ${err.message}`);
    return null;
  }
}

/**
 * @param {*} value
 * @returns {Array} value itself when it is an array, otherwise an empty array
 */
function toArray(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Format-only email check.
 * @param {*} addr
 * @returns {boolean}
 */
function isValidEmailFormat(addr) {
  return typeof addr === 'string' && EMAIL_FORMAT.test(addr.trim());
}

/**
 * Validate every phone number and email address in a contacts object.
 * Each error keeps a reference to the offending entry so it can later be
 * removed by identity, even when its value is missing.
 * @param {Object} contacts
 * @param {string} [countryCode]
 * @returns {Object[]} errors as { field, value, source, reason, entry }
 */
function collectContactErrors(contacts, countryCode) {
  const errors = [];

  for (const phone of toArray(contacts.phone_numbers)) {
    const { valid, reason } = validatePhone(phone?.number, countryCode);
    if (!valid) {
      errors.push({
        field: 'phone_numbers',
        value: phone?.number,
        source: phone?.source || null,
        reason,
        entry: phone,
      });
    }
  }

  // Format only — MX check done separately in bulk
  for (const emailEntry of toArray(contacts.email_addresses)) {
    const addr = emailEntry?.email;
    if (!isValidEmailFormat(addr)) {
      errors.push({
        field: 'email_addresses',
        value: addr ?? '(empty)',
        source: emailEntry?.source || null,
        reason: `"${addr}" is not a valid email format`,
        entry: emailEntry,
      });
    }
  }

  return errors;
}

/**
 * Remove the invalid entries from a contact list and append the replacements
 * that pass validation and are not already present.
 * @returns {{ merged: Object[], acceptedCount: number }}
 */
function mergeRepairedEntries({ current, invalidEntries, candidates, getValue, isValid }) {
  const kept = toArray(current).filter(entry => !invalidEntries.has(entry));
  const seen = new Set(kept.map(getValue));
  const accepted = [];

  for (const candidate of toArray(candidates)) {
    if (!candidate || typeof candidate !== 'object') continue;
    const value = getValue(candidate);
    if (seen.has(value) || !isValid(candidate)) continue;
    seen.add(value);
    accepted.push(candidate);
  }

  return { merged: [...kept, ...accepted], acceptedCount: accepted.length };
}

/**
 * Log the outcome of repairing one field of one error group.
 */
function logRepairOutcome(label, invalidCount, acceptedCount) {
  if (acceptedCount > 0) {
    logger.info(
      `Repair replaced ${invalidCount} invalid ${label}(s) with ${acceptedCount} validated replacement(s)`
    );
  } else {
    logger.warn(`Dropped ${invalidCount} invalid ${label}(s) that could not be repaired`);
  }
}

/**
 * Validate contacts_json and repair invalid entries via targeted LLM retry.
 *
 * Invalid entries never survive: each one is either replaced by a value from
 * the repair call that passes validation itself, or dropped. The input object
 * is not mutated.
 *
 * @param {Object} contacts - contacts_json object
 * @param {Object} assets - { countryCode, html, screenshots: { filename: base64 } }
 * @returns {Promise<Object>} Repaired contacts_json
 */
export async function validateAndRepairContacts(
  contacts,
  { countryCode, html, screenshots = {} } = {}
) {
  if (!contacts || typeof contacts !== 'object') return contacts;

  const errors = collectContactErrors(contacts, countryCode);
  if (errors.length === 0) {
    return contacts; // All good, no repair needed
  }

  logger.info(`Found ${errors.length} invalid contact(s) — attempting repair`);

  // Group errors by source type for targeted repair calls
  const imageErrors = [];
  const xpathErrors = [];
  const unknownErrors = [];
  for (const error of errors) {
    const kind = classifySource(error.source);
    if (kind === 'image') imageErrors.push(error);
    else if (kind === 'xpath') xpathErrors.push(error);
    else unknownErrors.push(error);
  }

  const repairGroups = [
    { errors: imageErrors, sourceType: 'image' },
    { errors: xpathErrors, sourceType: 'xpath' },
    { errors: unknownErrors, sourceType: 'unknown' },
  ].filter(g => g.errors.length > 0);

  const getPhoneValue = entry => entry?.number;
  const getEmailValue = entry => entry?.email;
  const isValidPhone = entry => Boolean(validatePhone(entry?.number, countryCode)?.valid);
  const isValidEmail = entry => isValidEmailFormat(entry?.email);

  const repairedContacts = { ...contacts };

  for (const group of repairGroups) {
    // A null patch (failed call / unparsable reply) simply yields no replacements.
    const patch = await callRepairLLM({ ...group, html, screenshots });

    const invalidEntries = new Set(group.errors.map(e => e.entry));
    const phoneErrorCount = group.errors.filter(e => e.field === 'phone_numbers').length;
    const emailErrorCount = group.errors.filter(e => e.field === 'email_addresses').length;

    // Only fields that actually had errors in this group are touched; the
    // invalid entries are removed whether or not the patch covers the field.
    if (phoneErrorCount > 0) {
      const { merged, acceptedCount } = mergeRepairedEntries({
        current: repairedContacts.phone_numbers,
        invalidEntries,
        candidates: patch?.phone_numbers,
        getValue: getPhoneValue,
        isValid: isValidPhone,
      });
      repairedContacts.phone_numbers = merged;
      logRepairOutcome('phone', phoneErrorCount, acceptedCount);
    }

    if (emailErrorCount > 0) {
      const { merged, acceptedCount } = mergeRepairedEntries({
        current: repairedContacts.email_addresses,
        invalidEntries,
        candidates: patch?.email_addresses,
        getValue: getEmailValue,
        isValid: isValidEmail,
      });
      repairedContacts.email_addresses = merged;
      logRepairOutcome('email', emailErrorCount, acceptedCount);
    }
  }

  return repairedContacts;
}

export default { validateAndRepairContacts };