Cradicle Explorer

/ src / utils / contact-repair.js
contact-repair.js
  1  /**
  2   * Contact Repair
  3   * Validates contacts_json returned by LLMs and retries with targeted error context
  4   * when invalid contacts are found.
  5   *
  6   * Source attribution strategy:
  7   * - If contact has source = image filename → retry with just that image + error
  8   * - If contact has source = XPath           → retry with html_dom + xpath + error
  9   * - If no source                             → retry with full context (html + screenshots)
 10   */
 11  
 12  import { callLLM } from './llm-provider.js';
 13  import { validatePhone, validateEmail } from './contact-validator.js';
 14  import { safeJsonParse } from './error-handler.js';
 15  import { wrapUntrusted } from './llm-sanitizer.js';
 16  import Logger from './logger.js';
 17  
 18  const logger = new Logger('ContactRepair');
 19  
 20  // Use same model as enrichment for repair calls
 21  const REPAIR_MODEL =
 22    process.env.ENRICHMENT_MODEL || process.env.CLAUDE_SONNET_MODEL || 'anthropic/claude-sonnet-4-6';
 23  
 24  /**
 25   * Detect whether a source string is an image filename or an XPath
 26   * @param {string} source
 27   * @returns {'image'|'xpath'|null}
 28   */
 29  function classifySource(source) {
 30    if (!source) return null;
 31    if (/\.(jpg|jpeg|png|gif|webp)$/i.test(source)) return 'image';
 32    if (source.startsWith('//') || source.startsWith('/html') || source.startsWith('(//'))
 33      return 'xpath';
 34    return null;
 35  }
 36  
 37  /**
 38   * Find a screenshot buffer by filename from the assets object
 39   * @param {string} filename - e.g. "above_fold.jpg"
 40   * @param {Object} screenshots - map of filename → base64 string
 41   * @returns {string|null} base64 string or null
 42   */
 43  function findScreenshot(filename, screenshots = {}) {
 44    // eslint-disable-next-line security/detect-object-injection
 45    return screenshots[filename] || null;
 46  }
 47  
 48  /**
 49   * Build a repair LLM call for a group of errors sharing the same source type
 50   * @returns {Promise<Object|null>} Repaired contacts partial object, or null
 51   */
 52  async function callRepairLLM({ errors, html, screenshots, sourceType }) {
 53    const errorList = errors
 54      .map(e => `- ${e.field}: "${e.value}" (source: ${e.source || 'unknown'}) — ${e.reason}`)
 55      .join('\n');
 56  
 57    const systemPrompt = `You are correcting invalid contact information extracted from a business website.
 58  You will be given a list of invalid contacts and their errors. Fix ONLY the listed contacts.
 59  Return ONLY a JSON object with the corrected contacts in the same schema. Do not include contacts that were not listed as errors.
 60  If you cannot find a valid replacement, omit that contact entirely.`;
 61  
 62    const userContent = [];
 63  
 64    // Add error list as text
 65    userContent.push({
 66      type: 'text',
 67      text: `The following extracted contacts failed validation:\n\n${wrapUntrusted(errorList, 'validation_errors')}\n\nPlease provide corrected values.`,
 68    });
 69  
 70    // Attach relevant assets based on source type
 71    if (sourceType === 'image') {
 72      for (const err of errors) {
 73        const b64 = findScreenshot(err.source, screenshots);
 74        if (b64) {
 75          userContent.push({
 76            type: 'text',
 77            text: `Source image (${err.source}):`,
 78          });
 79          userContent.push({
 80            type: 'image_url',
 81            image_url: { url: `data:image/png;base64,${b64}` },
 82          });
 83        }
 84      }
 85    } else if (sourceType === 'xpath' && html) {
 86      const xpaths = [...new Set(errors.map(e => e.source).filter(Boolean))];
 87      userContent.push({
 88        type: 'text',
 89        text: `${wrapUntrusted(html.substring(0, 30000), 'website_html')}\n\nXPaths to check:\n${xpaths.join('\n')}`,
 90      });
 91    } else {
 92      // No source — full context
 93      if (html) {
 94        userContent.push({
 95          type: 'text',
 96          text: `HTML DOM (first 30000 chars):\n${html.substring(0, 30000)}`,
 97        });
 98      }
 99      // Attach all screenshots
100      for (const [filename, b64] of Object.entries(screenshots || {})) {
101        if (b64) {
102          userContent.push({ type: 'text', text: `Screenshot: ${filename}` });
103          userContent.push({
104            type: 'image_url',
105            image_url: { url: `data:image/png;base64,${b64}` },
106          });
107        }
108      }
109    }
110  
111    try {
112      const response = await callLLM({
113        model: REPAIR_MODEL,
114        temperature: 0.1,
115        max_tokens: 1000,
116        json_mode: true,
117        messages: [
118          { role: 'system', content: systemPrompt },
119          { role: 'user', content: userContent },
120        ],
121        stage: 'enrichment',
122      });
123  
124      return safeJsonParse(response.content);
125    } catch (err) {
126      logger.warn(`Repair LLM call failed: ${err.message}`);
127      return null;
128    }
129  }
130  
/**
 * Format-only email check (MX checks are done separately in bulk).
 * Non-string values are invalid rather than a crash on `.trim()`.
 * @param {*} addr
 * @returns {boolean}
 */
function hasValidEmailFormat(addr) {
  return typeof addr === 'string' && /^[^\s@]+@[^\s@]+\.[^\s@]{2,}$/.test(addr.trim());
}

/**
 * Collect validation errors for the phone and email entries of a contacts object.
 * Each error keeps a reference to the offending entry (`entry`) so it can later be
 * removed by identity, even when its value is missing.
 * @param {Object} contacts - contacts_json object
 * @param {string} [countryCode] - passed through to validatePhone
 * @returns {Array<Object>} Errors with field, value, source, reason and entry
 */
function collectContactErrors(contacts, countryCode) {
  // Only string sources are usable for source classification.
  const sourceOf = entry =>
    typeof entry?.source === 'string' && entry.source !== '' ? entry.source : null;
  const errors = [];

  const phones = Array.isArray(contacts.phone_numbers) ? contacts.phone_numbers : [];
  for (const entry of phones) {
    const { valid, reason } = validatePhone(entry?.number, countryCode);
    if (!valid) {
      errors.push({
        field: 'phone_numbers',
        value: entry?.number,
        source: sourceOf(entry),
        reason,
        entry,
      });
    }
  }

  const emails = Array.isArray(contacts.email_addresses) ? contacts.email_addresses : [];
  for (const entry of emails) {
    const addr = entry?.email;
    if (!hasValidEmailFormat(addr)) {
      errors.push({
        field: 'email_addresses',
        value: addr ?? '(empty)',
        source: sourceOf(entry),
        reason: `"${addr}" is not a valid email format`,
        entry,
      });
    }
  }

  return errors;
}

/**
 * Validate contacts_json and repair invalid entries via targeted LLM retry.
 *
 * Invalid entries are always removed from the result. They are replaced only by
 * patch entries that themselves pass validation; if the repair call fails, or
 * returns nothing usable for a field, the invalid entries are simply dropped.
 * The input object is not mutated.
 *
 * @param {Object} contacts - contacts_json object
 * @param {Object} assets - { countryCode, html, screenshots: { filename: base64 } }
 * @returns {Promise<Object>} The original object when nothing is invalid (or the
 *   input is not an object), otherwise a repaired shallow copy
 */
export async function validateAndRepairContacts(
  contacts,
  { countryCode, html, screenshots = {} } = {}
) {
  if (!contacts || typeof contacts !== 'object') return contacts;

  const errors = collectContactErrors(contacts, countryCode);
  if (errors.length === 0) {
    return contacts; // All good, no repair needed
  }

  logger.info(`Found ${errors.length} invalid contact(s) — attempting repair`);

  // Group errors by source type (classified once each) for targeted repair calls.
  const buckets = new Map([
    ['image', []],
    ['xpath', []],
    ['unknown', []],
  ]);
  for (const error of errors) {
    buckets.get(classifySource(error.source) ?? 'unknown').push(error);
  }
  const repairGroups = [...buckets]
    .map(([sourceType, groupErrors]) => ({ errors: groupErrors, sourceType }))
    .filter(group => group.errors.length > 0);

  const isValidPhone = p => Boolean(validatePhone(p?.number, countryCode)?.valid);
  const isValidEmail = e => hasValidEmailFormat(e?.email);
  const entriesFor = (group, field) =>
    new Set(group.errors.filter(e => e.field === field).map(e => e.entry));
  const without = (list, rejected) =>
    (Array.isArray(list) ? list : []).filter(item => !rejected.has(item));

  const repairedContacts = { ...contacts };

  // Groups are repaired one at a time, in image → xpath → unknown order.
  for (const group of repairGroups) {
    const rawPatch = await callRepairLLM({ ...group, html, screenshots });
    // Anything other than a plain object is treated the same as a failed repair.
    const patch =
      rawPatch && typeof rawPatch === 'object' && !Array.isArray(rawPatch) ? rawPatch : null;

    const badPhones = entriesFor(group, 'phone_numbers');
    const badEmails = entriesFor(group, 'email_addresses');

    // Replacements are re-validated so the LLM cannot reintroduce invalid data.
    const fixedPhones = Array.isArray(patch?.phone_numbers)
      ? patch.phone_numbers.filter(isValidPhone)
      : [];
    const fixedEmails = Array.isArray(patch?.email_addresses)
      ? patch.email_addresses.filter(isValidEmail)
      : [];

    if (badPhones.size > 0 || fixedPhones.length > 0) {
      repairedContacts.phone_numbers = [
        ...without(repairedContacts.phone_numbers, badPhones),
        ...fixedPhones,
      ];
    }
    if (badEmails.size > 0 || fixedEmails.length > 0) {
      repairedContacts.email_addresses = [
        ...without(repairedContacts.email_addresses, badEmails),
        ...fixedEmails,
      ];
    }

    if (patch) {
      logger.info(`Repair applied ${Object.keys(patch).length} field(s)`);
    }
    // No valid replacement came back — the invalid contacts were dropped rather than sent on.
    if (badPhones.size > 0 && fixedPhones.length === 0) {
      logger.warn(`Dropped ${badPhones.size} invalid phone(s) that could not be repaired`);
    }
    if (badEmails.size > 0 && fixedEmails.length === 0) {
      logger.warn(`Dropped ${badEmails.size} invalid email(s) that could not be repaired`);
    }
  }

  return repairedContacts;
}

export default { validateAndRepairContacts };