/ src / utils / fix-checker.js
fix-checker.js
  1  /**
  2   * Fix Checker — Detects whether a prospect applied the free fix we suggested.
  3   *
  4   * Used by the follow-up generator at touches 2-7 to produce adaptive messaging:
  5   * "Nice work, you applied the fix!" vs "Still available if you want it."
  6   *
  7   * Also exports selectFreeFix() for use during touch 1 generation to populate
  8   * free_fix_before_state on the site record.
  9   */
 10  
 11  import Logger from './logger.js';
 12  
 13  const logger = new Logger('FixChecker');
 14  
 // Upper bound, in milliseconds, that a single page fetch may take before it is aborted.
 const FETCH_TIMEOUT_MS = 10 * 1000;

 // Desktop Chrome user-agent string sent with every outgoing request.
 const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';

 // Lower-case fragments that mark a meta description as a CMS default rather than
 // text the site owner actually wrote.
 const GENERIC_META_DESCRIPTIONS = [
   'just another wordpress site',
   'just another site',
   'sample page',
   'my blog',
   'my website',
   'welcome to wordpress',
   'proudly powered by wordpress',
 ];
 29  
 30  // ─── Internal helpers ─────────────────────────────────────────────────────────
 31  
 /**
  * Validate that a URL is safe to fetch (SSRF guard).
  *
  * Rejects anything that is not http(s), and any host that is:
  *   - `localhost`, `localhost.` or a `*.localhost` name
  *   - an IPv4 literal in 0.0.0.0/8, 10/8, 127/8, 169.254/16, 172.16/12 or 192.168/16
  *   - an IPv6 literal that is unspecified (::), loopback (::1), unique-local
  *     (fc00::/7), link-local (fe80::/10), or an IPv4-mapped form of a blocked
  *     IPv4 address
  *
  * The check is purely lexical: it does not resolve DNS, so a public hostname
  * that resolves to an internal address is not detected here.
  *
  * @param {string} url
  * @returns {boolean} true when the URL may be fetched
  */
 function isSafeUrl(url) {
   let parsed;
   try {
     parsed = new URL(url);
   } catch {
     return false;
   }

   if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

   let host = parsed.hostname.toLowerCase();
   // "localhost." (fully-qualified form) names the same host as "localhost".
   if (host.endsWith('.')) host = host.slice(0, -1);
   if (host === '') return false;

   if (host === 'localhost' || host.endsWith('.localhost')) return false;

   // URL.hostname keeps the square brackets around IPv6 literals ("[::1]"),
   // so they must be stripped before the address can be inspected.
   if (host.startsWith('[') && host.endsWith(']')) {
     return !isBlockedIPv6(host.slice(1, -1));
   }

   // The URL parser normalises every IPv4 spelling (decimal, hex, short forms)
   // to dotted-decimal, so only that shape needs to be handled.
   const parts = host.split('.');
   if (parts.length === 4 && parts.every((p) => /^\d+$/.test(p))) {
     const [a, b] = parts.map(Number);
     return !isBlockedIPv4(a, b);
   }

   return true;
 }

 /**
  * Return true when the first two octets place an IPv4 address in a
  * "this network", loopback, private or link-local range.
  *
  * @param {number} a - first octet
  * @param {number} b - second octet
  * @returns {boolean}
  */
 function isBlockedIPv4(a, b) {
   if (a === 0 || a === 10 || a === 127) return true;
   if (a === 169 && b === 254) return true;
   if (a === 172 && b >= 16 && b <= 31) return true;
   if (a === 192 && b === 168) return true;
   return false;
 }

 /**
  * Return true when a bracket-less, lower-case IPv6 literal (as serialised by
  * the URL parser) is unspecified, loopback, unique-local, link-local, or an
  * IPv4-mapped address that points into a blocked IPv4 range.
  *
  * @param {string} addr
  * @returns {boolean}
  */
 function isBlockedIPv6(addr) {
   if (addr === '::' || addr === '::1') return true;

   // IPv4-mapped addresses are serialised as ::ffff:HHHH:HHHH.
   const mapped = addr.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/);
   if (mapped) {
     const high = Number.parseInt(mapped[1], 16);
     return isBlockedIPv4(high >> 8, high & 0xff);
   }

   const firstGroup = Number.parseInt(addr.split(':')[0], 16);
   if (Number.isNaN(firstGroup)) return false;
   if ((firstGroup & 0xfe00) === 0xfc00) return true; // fc00::/7 unique-local
   if ((firstGroup & 0xffc0) === 0xfe80) return true; // fe80::/10 link-local
   return false;
 }
 58  
 /**
  * Fetch a URL with a timeout and realistic browser headers.
  *
  * Redirects are followed manually (at most 10 hops) so that every hop can be
  * re-validated with isSafeUrl(); with the default `redirect: 'follow'` a public
  * URL could redirect to an internal address and bypass the SSRF guard.
  * The timeout covers the whole chain, including reading the final body.
  *
  * Never throws: any failure (unsafe URL, unsafe redirect target, too many
  * redirects, network error, timeout) is logged and reported as
  * `{ ok: false, status: 0, text: null }`.
  *
  * NOTE(review): relies on the runtime exposing the real 3xx status and
  * Location header for `redirect: 'manual'` (Node's fetch does; browsers return
  * an opaque response instead) — confirm this module only runs server-side.
  *
  * @param {string} url
  * @returns {Promise<{ok: boolean, status: number, text: string|null}>}
  */
 async function fetchPage(url) {
   const MAX_REDIRECTS = 10;
   const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
   const failed = () => ({ ok: false, status: 0, text: null });

   if (!isSafeUrl(url)) {
     logger.warn(`Rejected unsafe URL: ${url}`);
     return failed();
   }

   const controller = new AbortController();
   const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

   try {
     let currentUrl = url;

     for (let hop = 0; hop <= MAX_REDIRECTS; hop += 1) {
       const response = await fetch(currentUrl, {
         signal: controller.signal,
         redirect: 'manual',
         headers: {
           'User-Agent': USER_AGENT,
           Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'en-US,en;q=0.5',
           'Cache-Control': 'no-cache',
         },
       });

       const location = response.headers.get('location');
       if (!REDIRECT_STATUSES.has(response.status) || !location) {
         // Final response (a 3xx without a Location header is returned as-is).
         const text = await response.text();
         return { ok: response.ok, status: response.status, text };
       }

       // Discard the redirect body so the connection can be released.
       await response.body?.cancel();

       // Location may be relative; resolve it against the URL that produced it.
       const nextUrl = new URL(location, currentUrl).href;
       if (!isSafeUrl(nextUrl)) {
         logger.warn(`Rejected unsafe redirect target: ${nextUrl} (from ${url})`);
         return failed();
       }
       currentUrl = nextUrl;
     }

     logger.warn(`Too many redirects (more than ${MAX_REDIRECTS}) for ${url}`);
     return failed();
   } catch (err) {
     if (err.name === 'AbortError') {
       logger.warn(`Fetch timed out after ${FETCH_TIMEOUT_MS}ms: ${url}`);
     } else {
       logger.warn(`Fetch failed for ${url}: ${err.message}`);
     }
     return failed();
   } finally {
     clearTimeout(timer);
   }
 }
100  
/**
 * Extract the content attribute of <meta name="description">.
 *
 * Handles both attribute orderings:
 *   <meta name="description" content="...">
 *   <meta content="..." name="description">
 *
 * The content value is read up to the quote character that opened it, so a
 * double-quoted value may contain apostrophes (and vice versa) without being
 * cut short. HTML entities are not decoded.
 *
 * @param {string} html
 * @returns {string|null} trimmed content value ('' when the attribute is
 *   empty), or null when no matching tag is found or html is not a string
 */
function extractMetaDescription(html) {
  if (typeof html !== 'string') return null;

  // name before content — group 1 is a double-quoted value, group 2 a single-quoted one.
  const fwd = html.match(
    /<meta[^>]+name\s*=\s*["']description["'][^>]+content\s*=\s*(?:"([^"]*)"|'([^']*)')/i
  );
  if (fwd) return (fwd[1] ?? fwd[2]).trim();

  // content before name
  const rev = html.match(
    /<meta[^>]+content\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]+name\s*=\s*["']description["']/i
  );
  if (rev) return (rev[1] ?? rev[2]).trim();

  return null;
}
127  
/**
 * Decide whether a meta description is just CMS boilerplate.
 *
 * The value is lower-cased and trimmed, then checked for any of the
 * GENERIC_META_DESCRIPTIONS fragments as a substring.
 *
 * @param {string} value
 * @returns {boolean} false for empty/missing values or when no fragment occurs
 */
function isGenericMetaDescription(value) {
  if (!value) {
    return false;
  }

  const normalised = value.toLowerCase().trim();
  for (const placeholder of GENERIC_META_DESCRIPTIONS) {
    if (normalised.includes(placeholder)) {
      return true;
    }
  }
  return false;
}
139  
/**
 * Report whether the markup contains a <link> element whose rel attribute is
 * the quoted value "canonical" (case-insensitive, single or double quotes).
 *
 * @param {string} html
 * @returns {boolean}
 */
function hasCanonicalTag(html) {
  const canonicalLinkPattern = /<link[^>]+rel\s*=\s*["']canonical["'][^>]*>/i;
  return canonicalLinkPattern.exec(html) !== null;
}
149  
/**
 * Determine whether the first <img> near the top of the body has a non-empty
 * alt attribute.
 *
 * Only the first 2000 characters starting at the <body> tag (or at the start
 * of the document when there is no <body>) are searched, so the result
 * reflects above-fold / hero images.
 *
 * Only a real `alt` attribute counts: look-alikes such as `data-alt` are
 * ignored, and the value is read up to the quote character that opened it so
 * an apostrophe inside a double-quoted value does not truncate it.
 *
 * @param {string} html
 * @returns {{ hasAlt: boolean, altValue: string|null }} altValue is null when
 *   there is no image or no alt attribute, otherwise the trimmed alt text
 *   (possibly ''); hasAlt is true only for non-empty alt text
 */
function getHeroImageAlt(html) {
  if (typeof html !== 'string') return { hasAlt: false, altValue: null };

  // Isolate body — take everything from <body> on if present, else the full HTML.
  const bodyStart = html.search(/<body[^>]*>/i);
  const bodyHtml = bodyStart >= 0 ? html.slice(bodyStart) : html;
  const searchArea = bodyHtml.slice(0, 2000);

  // Find the first <img> tag in the search area.
  const imgMatch = searchArea.match(/<img\s[^>]*>/i);
  if (!imgMatch) return { hasAlt: false, altValue: null };

  // The lookbehind rejects attributes that merely end in "alt" (data-alt, x-alt, …).
  // Group 1 is a double-quoted value, group 2 a single-quoted one.
  const altMatch = imgMatch[0].match(/(?<![\w-])alt\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
  if (!altMatch) return { hasAlt: false, altValue: null };

  const altValue = (altMatch[1] ?? altMatch[2]).trim();
  return { hasAlt: altValue.length > 0, altValue };
}
177  
178  // ─── Exported functions ───────────────────────────────────────────────────────
179  
/**
 * Check if a prospect applied the free fix we suggested.
 *
 * Reads `site.free_fix_before_state` (an object or its JSON string) with the
 * fields `type`, `url`, and optionally `element`, `beforeValue`, `brokenUrl`,
 * then re-fetches the relevant page and compares it with the recorded state:
 *
 *   - broken_link:      `brokenUrl` (or `url`) now answers with HTTP 200
 *   - meta_description: a non-empty, non-placeholder description that differs
 *                       (case-insensitively) from `beforeValue`
 *   - canonical:        a canonical <link> is now present
 *   - alt_text:         the hero image now has non-empty alt text
 *
 * Never throws for bad input: a missing, unparsable or non-object before-state,
 * a missing type/url, a failed fetch, or an unknown type all resolve to
 * `applied: false`.
 *
 * @param {Object} site - Site record from DB with free_fix_before_state
 * @returns {Promise<{applied: boolean, element: string, beforeValue: string, currentValue: string|null}>}
 */
export async function checkFixApplied(site) {
  const fallback = { applied: false, element: 'unknown', beforeValue: '', currentValue: null };

  // Parse before-state
  if (!site?.free_fix_before_state) {
    return fallback;
  }

  let state;
  try {
    state =
      typeof site.free_fix_before_state === 'string'
        ? JSON.parse(site.free_fix_before_state)
        : site.free_fix_before_state;
  } catch (err) {
    logger.warn(`Failed to parse free_fix_before_state for site ${site.id}: ${err.message}`);
    return fallback;
  }

  // JSON such as "null" or "42" parses successfully but has no fields to read.
  if (state === null || typeof state !== 'object') {
    logger.warn(`free_fix_before_state is not an object for site ${site.id}`);
    return fallback;
  }

  const { type, url } = state;
  // Destructuring defaults only cover undefined; a stored null must be normalised too.
  const element = state.element ?? 'unknown';
  const beforeValue = state.beforeValue == null ? '' : String(state.beforeValue);
  const notApplied = () => ({ applied: false, element, beforeValue, currentValue: null });

  if (!type || !url) {
    logger.warn(`free_fix_before_state missing type or url for site ${site.id}`);
    return notApplied();
  }

  // ── broken_link: fetch the specific URL that was 404, check if it's now 200 ──
  if (type === 'broken_link') {
    const brokenUrl = state.brokenUrl || url;
    logger.debug(`Checking broken_link fix: ${brokenUrl}`);
    const { ok, status } = await fetchPage(brokenUrl);
    return {
      applied: ok && status === 200,
      element,
      beforeValue,
      currentValue: String(status),
    };
  }

  // All other types need the homepage HTML.
  logger.debug(`Fetching homepage for fix check (${type}): ${url}`);
  const { ok, text: html } = await fetchPage(url);

  if (!ok || !html) {
    logger.warn(`Could not fetch homepage for fix check on site ${site.id} (${url})`);
    return notApplied();
  }

  // ── meta_description ──────────────────────────────────────────────────────
  if (type === 'meta_description') {
    const currentValue = extractMetaDescription(html);

    if (currentValue === null) {
      // Still missing — not applied.
      return notApplied();
    }

    // Applied only if there is now real text that differs from the recorded
    // value and is not another generic placeholder. An emptied description
    // (content="") is a change, but not a fix.
    const hasText = currentValue.length > 0;
    const changed = currentValue.toLowerCase() !== beforeValue.toLowerCase();
    const notGeneric = !isGenericMetaDescription(currentValue);
    return { applied: hasText && changed && notGeneric, element, beforeValue, currentValue };
  }

  // ── canonical ─────────────────────────────────────────────────────────────
  if (type === 'canonical') {
    // Before-state: canonical was absent. Applied = now present.
    const present = hasCanonicalTag(html);

    // Grab the whole canonical <link> first, then read href from it, so the
    // value is found regardless of whether href comes before or after rel.
    const canonicalTag = html.match(/<link[^>]+rel\s*=\s*["']canonical["'][^>]*>/i)?.[0];
    const hrefMatch = canonicalTag?.match(/(?<![\w-])href\s*=\s*["']([^"']*)["']/i);
    const currentValue = hrefMatch ? hrefMatch[1] : null;
    return { applied: present, element, beforeValue, currentValue };
  }

  // ── alt_text ──────────────────────────────────────────────────────────────
  if (type === 'alt_text') {
    const { hasAlt, altValue } = getHeroImageAlt(html);
    return { applied: hasAlt, element, beforeValue, currentValue: altValue };
  }

  // Unknown type — log and return not-applied.
  logger.warn(`Unknown fix type "${type}" for site ${site.id} — skipping check`);
  return notApplied();
}
269  
270  // ─── Fix Selection ────────────────────────────────────────────────────────────
271  
/**
 * Determine the best free fix to suggest for a site based on its score data
 * and the raw homepage HTML already fetched during the assets stage.
 *
 * Called during touch 1 generation to populate free_fix_before_state.
 *
 * Priority order:
 *   1. Meta description that is missing, shorter than 50 characters, longer
 *      than 160 characters, or a CMS placeholder
 *   2. Missing canonical tag
 *   3. Missing alt text on the hero image — only when an <img> is actually
 *      found near the top of the body, so we never suggest fixing an image
 *      we could not see
 *
 * NOTE(review): the returned object has no `url` field, while checkFixApplied()
 * requires `url` in the stored state — presumably the caller adds it before
 * persisting; verify against the caller.
 *
 * @param {Object} scoreData - Parsed score_json
 * @param {string} html - Raw homepage HTML (already fetched during assets stage)
 * @returns {{type: string, element: string, beforeValue: string, suggestedFix: string}|null}
 *   null when the HTML is missing/too short (< 100 chars) or nothing fixable was found
 */
export function selectFreeFix(scoreData, html) {
  if (typeof html !== 'string' || html.length < 100) return null;

  // Extract contextual signals for a better suggested meta description.
  const industry = scoreData?.factor_scores?.contextual_appropriateness?.industry_context
    || scoreData?.industry_classification
    || null;

  const city =
    scoreData?.overall_calculation?.city ||
    scoreData?.contact_details?.city ||
    null;

  const countryCode =
    scoreData?.overall_calculation?.country_code ||
    scoreData?.contact_details?.country_code ||
    null;

  // ── Priority 1: meta description ─────────────────────────────────────────
  const currentMeta = extractMetaDescription(html);

  const metaMissing = currentMeta === null;
  const metaShort = currentMeta !== null && currentMeta.length < 50;
  const metaGeneric = currentMeta !== null && isGenericMetaDescription(currentMeta);
  const metaTruncated = currentMeta !== null && currentMeta.length > 160;

  if (metaMissing || metaShort || metaGeneric || metaTruncated) {
    const beforeValue = currentMeta ?? '';
    const suggestedFix = buildSuggestedMetaDescription({ industry, city, countryCode });
    return {
      type: 'meta_description',
      element: 'meta description',
      beforeValue,
      suggestedFix,
    };
  }

  // ── Priority 2: canonical tag ─────────────────────────────────────────────
  if (!hasCanonicalTag(html)) {
    return {
      type: 'canonical',
      element: 'canonical tag',
      beforeValue: '',
      suggestedFix: '<link rel="canonical" href="[your-homepage-url]" />',
    };
  }

  // ── Priority 3: alt text on hero image ────────────────────────────────────
  const { hasAlt } = getHeroImageAlt(html);
  if (!hasAlt) {
    // Locate the same first image (first 2000 chars from <body>) so we can be specific.
    const bodyStart = html.search(/<body[^>]*>/i);
    const bodyHtml = bodyStart >= 0 ? html.slice(bodyStart) : html;
    const imgMatch = bodyHtml.slice(0, 2000).match(/<img\s[^>]*>/i);

    // "No alt" because there is no image at all is not something the prospect can fix.
    if (!imgMatch) return null;

    const srcMatch = imgMatch[0].match(/\bsrc\s*=\s*["']([^"']*)["']/i);
    const src = srcMatch ? srcMatch[1].trim() : '';
    // Inline data: URIs have no file name, and query strings / fragments are noise.
    const fileName = /^data:/i.test(src) ? '' : src.split(/[?#]/)[0].split('/').pop();
    const srcHint = fileName ? ` (${fileName})` : '';

    return {
      type: 'alt_text',
      element: `hero image alt text${srcHint}`,
      beforeValue: '',
      suggestedFix: buildSuggestedAltText({ industry }),
    };
  }

  // No fixable issue found.
  return null;
}
355  
356  // ─── Private helpers for fix generation ─────────────────────────────────────
357  
/**
 * Build a suggested meta description using available context signals.
 *
 * The result is at most 160 characters. When the template would exceed that,
 * it is shortened at the last word boundary that fits rather than cut
 * mid-word. The text does not claim specifics we cannot verify.
 *
 * Non-string or blank `industry` / `city` values are treated as absent.
 *
 * @param {Object} opts
 * @param {string|null} opts.industry
 * @param {string|null} opts.city
 * @param {string|null} opts.countryCode - accepted for call-site compatibility; not used by the template
 * @returns {string}
 */
function buildSuggestedMetaDescription({ industry, city }) {
  const MAX_LENGTH = 160;

  // Shorten to MAX_LENGTH without splitting a word.
  const clamp = (text) => {
    if (text.length <= MAX_LENGTH) return text;
    const hardCut = text.slice(0, MAX_LENGTH);
    // The cut already sits between two words.
    if (/\s/.test(text.charAt(MAX_LENGTH))) return hardCut.trimEnd();
    const lastSpace = hardCut.lastIndexOf(' ');
    return lastSpace > 0 ? hardCut.slice(0, lastSpace).trimEnd() : hardCut;
  };

  const industryText = typeof industry === 'string' ? industry.trim() : '';
  const cityText = typeof city === 'string' ? city.trim() : '';
  const locationPart = cityText ? ` in ${cityText}` : '';

  // Use a generic but high-quality template based on whatever context we have.
  if (industryText) {
    const industryLower = industryText.toLowerCase();
    return clamp(
      `Professional ${industryLower} services${locationPart}. Get a free quote today and see why local customers trust us for quality results.`
    );
  }

  // Bare minimum when we have no industry.
  return clamp(
    `Local business${locationPart}. Contact us today for a free consultation and quality service you can rely on.`
  );
}
382  
/**
 * Produce the alt-text suggestion for the hero image: an industry-specific
 * phrase when an industry is known, otherwise a generic one.
 *
 * @param {Object} opts
 * @param {string|null} opts.industry
 * @returns {string}
 */
function buildSuggestedAltText({ industry }) {
  return industry
    ? `${industry} team at work — professional service you can trust`
    : 'Our team providing professional services — quality you can count on';
}