/ src / utils / programmatic-scorer.js
programmatic-scorer.js
   1  /**
   2   * Programmatic Scorer — Rule-based website scoring (no LLM required).
   3   *
   4   * Replaces GPT-4o-mini scoring with HTML/DOM analysis.
   5   * Uses the same factor weights and grade thresholds as score.js.
   6   *
   7   * Each factor is scored 0-10 using regex/DOM pattern matching:
   8   *   0 = absent, 3-4 = weak, 5-6 = adequate, 7-8 = strong, 9-10 = exceptional
   9   *
  10   * Non-English support: French (FR/BE/CH), Polish (PL), Spanish (ES/MX/AR),
  11   * German (DE/AT/CH), Italian (IT). Language detected from HTML lang attribute.
  12   */
  13  
  14  import { computeGrade, FACTOR_WEIGHTS } from '../score.js';
  15  import { extractContactsFromHtml } from './html-contact-extractor.js';
  16  import { detectCountryFromTLD } from './tld-detector.js';
  17  
  18  // ─── Vision-Aware Weight Redistribution ──────────────────────────────────────
  19  
  20  /**
  21   * When ENABLE_VISION is false, imagery_design and hook_engagement can only be
  22   * partially assessed from HTML (no screenshots for layout/visual hierarchy).
  23   * Redistribute their weight to factors the programmatic scorer CAN assess.
  24   *
  25   * Default weights (vision ON):
  26   *   headline_quality: 0.15, value_proposition: 0.14, unique_selling_proposition: 0.13,
  27   *   call_to_action: 0.13, urgency_messaging: 0.10, hook_engagement: 0.09,
  28   *   trust_signals: 0.11, imagery_design: 0.08, offer_clarity: 0.04, contextual: 0.03
  29   *
  30   * No-vision weights (vision OFF):
  31   *   imagery_design: 0.03 (down from 0.08 — can still detect img count, alt text, responsive)
  32   *   hook_engagement: 0.04 (down from 0.09 — can detect video embeds, hero images in markup)
  33   *   Freed weight (0.10) redistributed to semantic + structural factors the scorer handles well.
  34   */
  35  const NO_VISION_WEIGHTS = {
  36    headline_quality: 0.17, // +0.02 (detectable from h1)
  37    value_proposition: 0.16, // +0.02 (detectable from text)
  38    unique_selling_proposition: 0.14, // +0.01 (detectable from text)
  39    call_to_action: 0.15, // +0.02 (well-detected from HTML)
  40    urgency_messaging: 0.1, // unchanged
  41    hook_engagement: 0.04, // -0.05 (can't assess visual impact)
  42    trust_signals: 0.13, // +0.02 (well-detected from HTML)
  43    imagery_design: 0.03, // -0.05 (can't assess design quality)
  44    offer_clarity: 0.05, // +0.01 (well-detected from HTML)
  45    contextual_appropriateness: 0.03, // unchanged
  46  };
  47  
  48  /**
  49   * Compute weighted total score using vision-aware weights.
  50   * @param {Object} factorScores - Factor scores (each has .score 0-10)
  51   * @param {boolean} visionEnabled - Whether vision/screenshots are available
  52   * @returns {number} Score 0-100
  53   */
  54  export function computeWeightedScore(factorScores, visionEnabled = true) {
  55    if (!factorScores || typeof factorScores !== 'object') return null;
  56    const weights = visionEnabled ? FACTOR_WEIGHTS : NO_VISION_WEIGHTS;
  57    let total = 0;
  58    for (const [factor, weight] of Object.entries(weights)) {
  59      const score = factorScores[factor]?.score ?? 0;
  60      total += score * weight;
  61    }
  62    return Math.round(total * 10 * 10) / 10;
  63  }
  64  
  65  // ─── Text Extraction for Hybrid Scoring ──────────────────────────────────────
  66  
  67  /**
  68   * Extract key text sections from HTML for LLM semantic scoring.
  69   * Returns a compact text representation (~500-1500 tokens) that Haiku
  70   * can use to evaluate headline quality, value proposition, and USP.
  71   *
  72   * @param {string} html - Full rendered DOM HTML
  73   * @returns {Object} Extracted text sections
  74   */
  75  export function extractScoringText(html) {
  76    if (!html || html.length < 100) return null;
  77  
  78    // H1 headline
  79    const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  80    const h1 = h1Match ? h1Match[1].replace(/<[^>]+>/g, '').trim() : null;
  81  
  82    // H2 subheadings (first 5)
  83    const h2Matches = [...(html.matchAll(/<h2[^>]*>([\s\S]*?)<\/h2>/gi) || [])];
  84    const h2s = h2Matches
  85      .slice(0, 5)
  86      .map(m => m[1].replace(/<[^>]+>/g, '').trim())
  87      .filter(Boolean);
  88  
  89    // Above-fold text (first ~3000 chars of HTML, stripped)
  90    const aboveFold = stripHtmlForExtraction(html.slice(0, 4000)).slice(0, 800);
  91  
  92    // Full page text (stripped, capped)
  93    const fullText = stripHtmlForExtraction(html).slice(0, 2000);
  94  
  95    // Testimonial/review sections
  96    const testimonials = extractTestimonials(html);
  97  
  98    // CTA text (button/link text)
  99    const ctaTexts = extractCTATexts(html);
 100  
 101    // Meta description
 102    const metaMatch = html.match(/<meta[^>]*name=["']description["'][^>]*content=["']([^"']+)["']/i);
 103    const metaDescription = metaMatch ? metaMatch[1].trim() : null;
 104  
 105    // Title tag
 106    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
 107    const title = titleMatch ? titleMatch[1].replace(/<[^>]+>/g, '').trim() : null;
 108  
 109    return {
 110      title,
 111      meta_description: metaDescription,
 112      h1,
 113      h2s,
 114      above_fold_text: aboveFold,
 115      body_text: fullText,
 116      testimonial_snippets: testimonials,
 117      cta_texts: ctaTexts,
 118    };
 119  }
 120  
 121  function stripHtmlForExtraction(html) {
 122    return (html || '')
 123      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
 124      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
 125      .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
 126      .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
 127      .replace(/<[^>]+>/g, ' ')
 128      .replace(/&[a-z]+;/gi, ' ')
 129      .replace(/\s+/g, ' ')
 130      .trim();
 131  }
 132  
 133  function extractTestimonials(html) {
 134    const snippets = [];
 135    // Look for blockquote, testimonial divs, review sections
 136    const blockquotes = html.match(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi) || [];
 137    for (const bq of blockquotes.slice(0, 3)) {
 138      const text = bq
 139        .replace(/<[^>]+>/g, '')
 140        .trim()
 141        .slice(0, 200);
 142      if (text.length > 20) snippets.push(text);
 143    }
 144    // testimonial/review class divs
 145    const testimonialDivs =
 146      html.match(/<div[^>]*class[^>]*(testimonial|review|feedback)[^>]*>([\s\S]*?)<\/div>/gi) || [];
 147    for (const div of testimonialDivs.slice(0, 3)) {
 148      const text = div
 149        .replace(/<[^>]+>/g, '')
 150        .trim()
 151        .slice(0, 200);
 152      if (text.length > 20 && !snippets.includes(text)) snippets.push(text);
 153    }
 154    return snippets.slice(0, 5);
 155  }
 156  
 157  function extractCTATexts(html) {
 158    const ctas = [];
 159    const buttons =
 160      html.match(/<(button|a)[^>]*class[^>]*(btn|button|cta)[^>]*>([\s\S]*?)<\/\1>/gi) || [];
 161    for (const btn of buttons.slice(0, 5)) {
 162      const text = btn.replace(/<[^>]+>/g, '').trim();
 163      if (text.length >= 2 && text.length <= 50) ctas.push(text);
 164    }
 165    return [...new Set(ctas)].slice(0, 5);
 166  }
 167  
 168  // ─── Language-Specific Keyword Sets ─────────────────────────────────────────
 169  
 170  /**
 171   * CTA action keywords per language.
 172   * Used in scoreCTA() to detect call-to-action text in links/buttons.
 173   */
 174  const CTA_KEYWORDS = {
 175    en: /\b(get started|book now|call now|contact us|get a quote|free quote|schedule|request|order now|buy now|sign up|learn more|find out|get in touch|start now|claim|download|try free|shop now)\b/i,
 176    fr: /\b(appeler|appelez|contactez|contact|devis|devis gratuit|réserver|réservez|commander|demander|commencer|s'inscrire|en savoir plus|prendre rendez-vous|obtenir un devis|demandez un devis)\b/i,
 177    pl: /\b(zadzwoń|zadzwon|kontakt|wycena|zamów|zamow|zarezerwuj|zapisz się|zapisz sie|dowiedz się więcej|dowiedz sie wiecej|bezpłatna wycena|darmowa wycena|skontaktuj się|skontaktuj sie)\b/i,
 178    es: /\b(llamar|llame|contactar|contáctenos|contactenos|cotizar|cotización|cotizacion|reservar|solicitar|comenzar|suscribirse|más información|mas informacion|obtenga presupuesto|presupuesto gratis)\b/i,
 179    de: /\b(anrufen|rufen sie|kontakt|angebot|angebot anfordern|kostenloses angebot|buchen|reservieren|bestellen|anfragen|jetzt starten|mehr erfahren|termin vereinbaren|jetzt anrufen)\b/i,
 180    it: /\b(chiama|chiami|contatta|contattateci|preventivo|preventivo gratuito|prenota|prenotate|ordina|richiedi|inizia|iscriviti|scopri di più|scopri di piu|richiedi un preventivo)\b/i,
 181  };
 182  
 183  /**
 184   * Urgency / time-pressure keywords per language.
 185   */
 186  const URGENCY_KEYWORDS = {
 187    en: /\b(limited time|hurry|act now|don't miss|expires?|deadline|last chance|ending soon|today only|this week|offer ends|while supplies last|before it's too late)\b/i,
 188    fr: /\b(offre limitée|offre limitee|dépêchez|depechez|agissez maintenant|ne ratez pas|expire|date limite|dernière chance|derniere chance|se termine bientôt|se termine bientot|aujourd'hui seulement|cette semaine)\b/i,
 189    pl: /\b(ograniczona oferta|pośpiesz się|pospieszcie sie|działaj teraz|dzialaj teraz|nie przegap|wygasa|termin|ostatnia szansa|kończy się|konczy sie|tylko dziś|tylko dzis|w tym tygodniu)\b/i,
 190    es: /\b(tiempo limitado|apúrese|apurese|actúe ahora|actue ahora|no se pierda|expira|fecha límite|fecha limite|última oportunidad|ultima oportunidad|termina pronto|solo hoy|esta semana)\b/i,
 191    de: /\b(begrenzte zeit|beeilen sie sich|jetzt handeln|verpassen sie nicht|läuft ab|lauft ab|frist|letzte chance|endet bald|nur heute|diese woche|angebot endet)\b/i,
 192    it: /\b(tempo limitato|affrettatevi|agite ora|non perdete|scade|scadenza|ultima possibilità|ultima possibilita|termina presto|solo oggi|questa settimana|offerta termina)\b/i,
 193  };
 194  
 195  /**
 196   * Scarcity keywords per language.
 197   */
 198  const SCARCITY_KEYWORDS = {
 199    en: /\b(only \d+|limited (spots?|seats?|availability)|few remaining|almost gone|selling fast|limited stock)\b/i,
 200    fr: /\b(seulement \d+|places limitées|places limitees|disponibilité limitée|disponibilite limitee|presque épuisé|presque epuise|stock limité|stock limite)\b/i,
 201    pl: /\b(tylko \d+|ograniczona liczba miejsc|ograniczona dostępność|ograniczona dostepnosc|prawie wyprzedane|ograniczony stan)\b/i,
 202    es: /\b(solo \d+|plazas limitadas|disponibilidad limitada|casi agotado|stock limitado|pocas unidades)\b/i,
 203    de: /\b(nur \d+|begrenzte plätze|begrenzte platze|begrenzte verfügbarkeit|begrenzte verfugbarkeit|fast ausverkauft|begrenzter bestand)\b/i,
 204    it: /\b(solo \d+|posti limitati|disponibilità limitata|disponibilita limitata|quasi esaurito|scorte limitate)\b/i,
 205  };
 206  
 207  /**
 208   * Trust signal keywords per language.
 209   */
 210  const TRUST_SECTION_KEYWORDS = {
 211    en: /\b(testimonial|review|what (our |people |customers? |clients? )?say|feedback|rating)\b/i,
 212    fr: /\b(témoignage|temoignage|avis|avis clients?|ce que (nos |les )?clients? disent|commentaire|note|évaluation|evaluation)\b/i,
 213    pl: /\b(opinia|opinie|recenzja|recenzje|co mówią|co mowia|komentarz|komentarze|ocena|oceny|nasi klienci)\b/i,
 214    es: /\b(testimonio|testimonios|reseña|reseñas|resena|resenas|lo que (nuestros |los )?clientes? (dicen|opinan)|comentario|calificación|calificacion|opinión|opinion)\b/i,
 215    de: /\b(erfahrungsbericht|erfahrungsberichte|bewertung|bewertungen|kundenmeinung|kundenmeinungen|was (unsere |die )?kunden sagen|rezension|bewertung)\b/i,
 216    it: /\b(testimonianza|testimonianze|recensione|recensioni|cosa (i nostri |i )?clienti (dicono|pensano)|commento|valutazione|opinione)\b/i,
 217  };
 218  
 219  /**
 220   * Certification / trust badge keywords per language.
 221   */
 222  const CERT_KEYWORDS = {
 223    en: /\b(certified|accredited|licensed|insured|bonded|registered|approved|verified|member of)\b/i,
 224    fr: /\b(certifié|certifie|accrédité|accredite|agréé|agree|assuré|assure|enregistré|enregistre|approuvé|approuve|vérifié|verifie|membre de)\b/i,
 225    pl: /\b(certyfikowany|certyfikat|akredytowany|licencjonowany|ubezpieczony|zarejestrowany|zatwierdzony|zweryfikowany|członek)\b/i,
 226    es: /\b(certificado|acreditado|licenciado|asegurado|registrado|aprobado|verificado|miembro de)\b/i,
 227    de: /\b(zertifiziert|akkreditiert|lizenziert|versichert|eingetragen|zugelassen|verifiziert|mitglied (von|der|des))\b/i,
 228    it: /\b(certificato|accreditato|autorizzato|assicurato|registrato|approvato|verificato|membro di)\b/i,
 229  };
 230  
 231  /**
 232   * Guarantee / risk-reversal keywords per language.
 233   */
 234  const GUARANTEE_KEYWORDS = {
 235    en: /\b(guarantee|money.back|satisfaction|warranty|no.risk)\b/i,
 236    fr: /\b(garantie|remboursement|satisfait ou remboursé|satisfait ou rembourse|sans risque)\b/i,
 237    pl: /\b(gwarancja|zwrot pieniędzy|zwrot pieniedzy|satysfakcja|bez ryzyka)\b/i,
 238    es: /\b(garantía|garantia|devolución|devolucion|reembolso|satisfacción|satisfaccion|sin riesgo)\b/i,
 239    de: /\b(garantie|geld-zurück|geld zurück|geld zuruck|zufriedenheit|risikofrei)\b/i,
 240    it: /\b(garanzia|rimborso|soddisfazione|senza rischio)\b/i,
 241  };
 242  
 243  /**
 244   * Benefit / value proposition keywords per language.
 245   */
 246  const BENEFIT_KEYWORDS = {
 247    en: /\b(save|reduce|increase|improve|grow|protect|maximize|minimize|eliminate|prevent|achieve|guarantee|ensure|deliver)\b/gi,
 248    fr: /\b(économisez|economisez|réduisez|reduisez|augmentez|améliorez|ameliorez|développez|developpez|protégez|protegez|maximisez|éliminez|eliminez|garantissez|assurez|livrez)\b/gi,
 249    pl: /\b(oszczędź|oszczedz|zmniejsz|zwiększ|zwieksz|popraw|rozwijaj|chroń|chron|maksymalizuj|eliminuj|gwarantuj|zapewnij|dostarcz)\b/gi,
 250    es: /\b(ahorre|reduzca|aumente|mejore|crezca|proteja|maximice|minimice|elimine|prevenga|logre|garantice|asegure|entregue)\b/gi,
 251    de: /\b(sparen|reduzieren|steigern|verbessern|wachsen|schützen|schutzen|maximieren|minimieren|beseitigen|verhindern|erreichen|garantieren|sicherstellen|liefern)\b/gi,
 252    it: /\b(risparmia|riduci|aumenta|migliora|cresci|proteggi|massimizza|minimizza|elimina|previeni|raggiungi|garantisci|assicura|consegna)\b/gi,
 253  };
 254  
 255  /**
 256   * Headline benefit words per language (single word check).
 257   */
 258  const HEADLINE_BENEFIT_WORDS = {
 259    en: /\b(save|grow|boost|increase|improve|transform|get|start|discover|free|best|fast|easy|simple|trusted|guaranteed|proven|results?|solution|affordable|professional|expert|quality)\b/i,
 260    fr: /\b(économisez|economisez|développez|developpez|améliorez|ameliorez|gratuit|meilleur|rapide|simple|fiable|garanti|résultats|resultats|solution|abordable|professionnel|expert|qualité|qualite)\b/i,
 261    pl: /\b(oszczędź|oszczedz|rozwijaj|popraw|bezpłatny|bezplatny|darmowy|najlepszy|szybki|prosty|zaufany|gwarantowany|wyniki|rozwiązanie|rozwiazanie|przystępny|przystepny|profesjonalny|ekspert|jakość|jakosc)\b/i,
 262    es: /\b(ahorre|desarrolle|mejore|gratis|mejor|rápido|rapido|simple|confiable|garantizado|resultados|solución|solucion|asequible|profesional|experto|calidad)\b/i,
 263    de: /\b(sparen|entwickeln|verbessern|kostenlos|beste|schnell|einfach|zuverlässig|zuverlassig|garantiert|ergebnisse|lösung|losung|erschwinglich|professionell|experte|qualität|qualitat)\b/i,
 264    it: /\b(risparmia|sviluppa|migliora|gratis|gratuito|migliore|veloce|semplice|affidabile|garantito|risultati|soluzione|conveniente|professionale|esperto|qualità|qualita)\b/i,
 265  };
 266  
 267  /**
 268   * USP / differentiation keywords per language.
 269   */
 270  const USP_KEYWORDS = {
 271    en: /\b(only|unique|exclusive|unlike|first|pioneering|original|proprietary|patented|award[- ]winning|leading|#1|number one|best in)\b/gi,
 272    fr: /\b(unique|exclusif|exclusifs|contrairement|premier|pionnier|original|propriétaire|breveté|brevete|primé|prime|leader|n°1|numéro un|numero un|meilleur de)\b/gi,
 273    pl: /\b(jedyny|unikalny|unikalny|wyjątkowy|wyjatkowy|ekskluzywny|w przeciwieństwie|w przeciwienstwie|pierwszy|pionierski|oryginalny|opatentowany|nagrodzony|wiodący|wiodacy|nr 1|numer jeden|najlepszy)\b/gi,
 274    es: /\b(único|unico|exclusivo|a diferencia|primero|pionero|original|patentado|premiado|líder|lider|número uno|numero uno|el mejor)\b/gi,
 275    de: /\b(einzigartig|exklusiv|im gegensatz|erste|erstmals|pionier|original|patentiert|preisgekrönt|preisgekront|führend|fuhrend|nr\.? ?1|nummer eins|bestes)\b/gi,
 276    it: /\b(unico|esclusivo|a differenza|primo|pioniere|originale|brevettato|premiato|leader|n\. ?1|numero uno|il migliore)\b/gi,
 277  };
 278  
 279  /**
 280   * Value proposition "you-focus" pronouns per language.
 281   */
 282  const YOU_PRONOUNS = {
 283    en: /\b(you|your|you're|you'll)\b/gi,
 284    fr: /\b(vous|votre|vos|tu|ton|ta|tes)\b/gi,
 285    pl: /\b(ty|twój|twoja|twoje|twoi|wasz|wasza|wasze|wasi|pana|pani)\b/gi,
 286    es: /\b(usted|su|sus|tú|tu|tus|vosotros|vuestro|vuestra|vuestros|vuestras)\b/gi,
 287    de: /\b(sie|ihr|ihre|ihrem|ihren|ihres|du|dein|deine|deinem|deinen|deines)\b/gi,
 288    it: /\b(lei|suo|sua|suoi|sue|tu|tuo|tua|tuoi|tue|voi|vostro|vostra)\b/gi,
 289  };
 290  
 291  /**
 292   * "We-focus" pronouns per language.
 293   */
 294  const WE_PRONOUNS = {
 295    en: /\b(we|our|we're|we'll)\b/gi,
 296    fr: /\b(nous|notre|nos)\b/gi,
 297    pl: /\b(my|nasz|nasza|nasze|nasi)\b/gi,
 298    es: /\b(nosotros|nuestro|nuestra|nuestros|nuestras)\b/gi,
 299    de: /\b(wir|unser|unsere|unserem|unseren|unseres)\b/gi,
 300    it: /\b(noi|nostro|nostra|nostri|nostre)\b/gi,
 301  };
 302  
 303  /**
 304   * "Specific outcomes" phrases per language.
 305   */
 306  const OUTCOME_PHRASES = {
 307    en: /\b(up to|within|in just|only takes|as fast as|guaranteed)\b/i,
 308    fr: /\b(jusqu'à|jusqu'a|en seulement|en moins de|aussi vite que|garanti|en \d+ (jours?|heures?|minutes?))\b/i,
 309    pl: /\b(do|w ciągu|w ciagu|w zaledwie|tak szybko jak|gwarantowany|w \d+ (dniach?|godzinach?|minutach?))\b/i,
 310    es: /\b(hasta|en solo|en menos de|tan rápido como|tan rapido como|garantizado|en \d+ (días?|dias?|horas?|minutos?))\b/i,
 311    de: /\b(bis zu|innerhalb von|in nur|so schnell wie|garantiert|in \d+ (tagen?|stunden?|minuten?))\b/i,
 312    it: /\b(fino a|entro|in soli|così velocemente come|cosi velocemente come|garantito|in \d+ (giorni?|ore|minuti?))\b/i,
 313  };
 314  
 315  /**
 316   * Service list keywords per language.
 317   */
 318  const SERVICE_KEYWORDS = {
 319    en: /\b(services?|what we (do|offer)|our (services?|work))\b/i,
 320    fr: /\b(services?|prestations?|ce que nous (faisons|proposons|offrons)|nos (services?|prestations?))\b/i,
 321    pl: /\b(usługi|uslugi|oferta|co (robimy|oferujemy)|nasze (usługi|uslugi))\b/i,
 322    es: /\b(servicios?|lo que (hacemos|ofrecemos)|nuestros (servicios?|trabajos?))\b/i,
 323    de: /\b(leistungen?|dienstleistungen?|was wir (tun|anbieten)|unsere (leistungen?|dienstleistungen?))\b/i,
 324    it: /\b(servizi?|cosa (facciamo|offriamo)|i nostri (servizi?|lavori?))\b/i,
 325  };
 326  
 327  /**
 328   * Business hours keywords per language.
 329   */
 330  const HOURS_KEYWORDS = {
 331    en: /\b(hours?|open|serving|areas? served|locations?|coverage)\b/i,
 332    fr: /\b(heures?|ouvert|horaires?|zones? desservies?|emplacements?|couverture)\b/i,
 333    pl: /\b(godziny|otwarty|otwarte|obsługiwany|obsługiwane|obszary|lokalizacje|zasięg)\b/i,
 334    es: /\b(horas?|abierto|horarios?|áreas? de servicio|areas? de servicio|ubicaciones?|cobertura)\b/i,
 335    de: /\b(stunden?|öffnungszeiten|geöffnet|geöffnet|servicegebiete|standorte?|versorgungsgebiet)\b/i,
 336    it: /\b(ore|aperto|orari?|aree servite|posizioni?|copertura)\b/i,
 337  };
 338  
 339  /**
 340   * Process / how-it-works keywords per language.
 341   */
 342  const PROCESS_KEYWORDS = {
 343    en: /\b(how it works|our process|step \d|getting started)\b/i,
 344    fr: /\b(comment ça marche|comment cela fonctionne|notre processus|étape \d|etape \d|pour commencer)\b/i,
 345    pl: /\b(jak to działa|jak to dziala|nasz proces|krok \d|jak zacząć|jak zaczac)\b/i,
 346    es: /\b(cómo funciona|como funciona|nuestro proceso|paso \d|como empezar)\b/i,
 347    de: /\b(wie es funktioniert|unser prozess|schritt \d|so geht's|so geht es)\b/i,
 348    it: /\b(come funziona|il nostro processo|passo \d|fase \d|come iniziare)\b/i,
 349  };
 350  
 351  /**
 352   * Discount keywords per language (urgency-adjacent).
 353   */
 354  const DISCOUNT_KEYWORDS = {
 355    en: /\b(\d+%\s*off|save\s*\$?\d+|discount|special offer|deal)\b/i,
 356    fr: /\b(\d+%\s*(de réduction|de reduction)|économisez|economisez|réduction|reduction|offre spéciale|offre speciale|promotion)\b/i,
 357    pl: /\b(\d+%\s*(zniżki|znizki|taniej)|oszczędź|oszczedz|zniżka|znizka|oferta specjalna|promocja)\b/i,
 358    es: /\b(\d+%\s*(de descuento|menos)|ahorre|descuento|oferta especial|promoción|promocion)\b/i,
 359    de: /\b(\d+%\s*rabatt|\d+%\s*günstiger|gunstiger|sparen|rabatt|sonderangebot|aktion)\b/i,
 360    it: /\b(\d+%\s*(di sconto|meno)|risparmia|sconto|offerta speciale|promozione)\b/i,
 361  };
 362  
 363  /**
 364   * Local business keywords per language.
 365   */
 366  const LOCAL_KEYWORDS = {
 367    en: /\b(local|nearby|serving|area|community|neighborhood|suburb)\b/i,
 368    fr: /\b(local|locale|à proximité|a proximite|servant|zone|quartier|ville|région|region)\b/i,
 369    pl: /\b(lokalny|lokalna|w pobliżu|w poblizu|obsługujemy|obszar|dzielnica|miasto|region)\b/i,
 370    es: /\b(local|cercano|cercana|sirviendo|zona|barrio|vecindario|ciudad|región|region)\b/i,
 371    de: /\b(lokal|in der nähe|in der nahe|vor ort|region|bezirk|stadtteil|gemeinde)\b/i,
 372    it: /\b(locale|nelle vicinanze|che serve|zona|quartiere|città|citta|regione)\b/i,
 373  };
 374  
 375  // ─── Helper: Detect Language ─────────────────────────────────────────────────
 376  
 377  /**
 378   * Detect the primary language from HTML lang attribute.
 379   * Returns a short language code: 'en', 'fr', 'pl', 'es', 'de', 'it', or null.
 380   */
 381  function detectLang(html) {
 382    const rawLang = ((html.match(/<html[^>]*lang=["']([^"']+)["']/i) || [])[1] || '').toLowerCase();
 383    if (!rawLang) return null;
 384    // Normalize: 'fr-CA' → 'fr', 'de-AT' → 'de', etc.
 385    const base = rawLang.split(/[-_]/)[0];
 386    if (base === 'en') return 'en';
 387    if (base === 'fr') return 'fr';
 388    if (base === 'pl') return 'pl';
 389    if (base === 'es') return 'es';
 390    if (base === 'de') return 'de';
 391    if (base === 'it') return 'it';
 392    // Unknown non-English language
 393    return base;
 394  }
 395  
 396  /**
 397   * Return the keyword set for a given language, falling back to English.
 398   */
 399  function langKey(map, lang) {
 400    if (lang && map[lang]) return map[lang];
 401    return map['en'];
 402  }
 403  
 404  // ─── Factor Scoring Functions ───────────────────────────────────────────────
 405  
 406  /**
 407   * Factor 1: Headline Quality (weight: 15%)
 408   * Checks h1 presence, word count, benefit/action language
 409   */
 410  export function scoreHeadlineQuality(html, lang = 'en') {
 411    const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
 412    if (!h1Match)
 413      return { score: 0, reasoning: 'No h1 headline found', evidence: 'Missing <h1> tag' };
 414  
 415    const h1Text = h1Match[1].replace(/<[^>]+>/g, '').trim();
 416    if (!h1Text) return { score: 1, reasoning: 'Empty h1 tag', evidence: '<h1> exists but empty' };
 417  
 418    const words = h1Text.split(/\s+/).length;
 419    let score = 5; // Base: h1 with meaningful content exists (LLM avg for basic headlines is ~5-6)
 420  
 421    // Word count quality (3-12 words is ideal)
 422    if (words >= 3 && words <= 12) score += 1;
 423    else if (words >= 2) score += 1;
 424    // Very short (1 word) or very long (>12) headlines = no bonus
 425  
 426    // Benefit/action keywords (language-aware)
 427    const benefitWords = langKey(HEADLINE_BENEFIT_WORDS, lang);
 428    if (benefitWords.test(h1Text)) score += 2;
 429  
 430    // Specific numbers or quantified claims
 431    if (/\d+/.test(h1Text)) score += 1;
 432  
 433    // Questions or direct address (language-aware)
 434    const youPronouns = langKey(YOU_PRONOUNS, lang);
 435    if (youPronouns.test(h1Text) || h1Text.includes('?')) score += 1;
 436  
 437    return {
 438      score: Math.min(score, 10),
 439      reasoning: `H1 found with ${words} words${benefitWords.test(h1Text) ? ', includes benefit language' : ''}`,
 440      evidence: h1Text.slice(0, 100),
 441    };
 442  }
 443  
 444  /**
 445   * Factor 2: Value Proposition (weight: 14%)
 446   * Looks for quantified claims, benefit statements, outcome language
 447   */
 448  export function scoreValueProposition(html, lang = 'en') {
 449    const text = stripHtml(html);
 450    let score = 4; // Base: page has content (HTML alone can't assess visual quality)
 451  
 452    // Quantified claims (percentages, dollar amounts, time frames — mostly language-neutral)
 453    const quantified = text.match(/\d+\s*(%|percent|dollar|year|month|day|hour|minute|save|off)/gi);
 454    if (quantified && quantified.length >= 2) score += 3;
 455    else if (quantified) score += 2;
 456  
 457    // Benefit language (language-aware)
 458    const benefits = langKey(BENEFIT_KEYWORDS, lang);
 459    const benefitCount = (text.match(benefits) || []).length;
 460    if (benefitCount >= 3) score += 2;
 461    else if (benefitCount >= 1) score += 1;
 462  
 463    // Specific outcomes (language-aware)
 464    const outcomePhrases = langKey(OUTCOME_PHRASES, lang);
 465    if (outcomePhrases.test(text)) score += 1;
 466  
 467    // "We" vs "You" focus — customer-centric language scores higher (language-aware)
 468    const youCount = (text.match(langKey(YOU_PRONOUNS, lang)) || []).length;
 469    const weCount = (text.match(langKey(WE_PRONOUNS, lang)) || []).length;
 470    if (youCount > weCount) score += 1;
 471  
 472    return {
 473      score: Math.min(score, 10),
 474      reasoning: `${quantified?.length || 0} quantified claims, ${benefitCount} benefit keywords`,
 475      evidence: `You/Your: ${youCount}, We/Our: ${weCount}`,
 476    };
 477  }
 478  
 479  /**
 480   * Factor 3: USP / Differentiation (weight: 13%)
 481   * Looks for "only", "unique", "exclusive", comparative language
 482   */
 483  export function scoreUSP(html, lang = 'en') {
 484    const text = stripHtml(html);
 485    let score = 4; // Base: real business exists (can't assess differentiation from HTML alone)
 486  
 487    const uspKeywords = langKey(USP_KEYWORDS, lang);
 488    const uspCount = (text.match(uspKeywords) || []).length;
 489    if (uspCount >= 3) score += 3;
 490    else if (uspCount >= 1) score += 2;
 491  
 492    // Comparative language (mostly language-neutral patterns + language-specific)
 493    const comparativeEn =
 494      /\b(better than|compared to|versus|vs\.?|more than|faster than|cheaper than)\b/i;
 495    const comparativeFr =
 496      /\b(mieux que|par rapport à|par rapport a|versus|vs\.?|plus que|plus rapide que|moins cher que)\b/i;
 497    const comparativePl =
 498      /\b(lepszy niż|lepszy niz|w porównaniu do|versus|vs\.?|więcej niż|wiecej niz|szybszy niż|szybszy niz|tańszy niż|tanszy niz)\b/i;
 499    const comparativeEs =
 500      /\b(mejor que|comparado con|versus|vs\.?|más que|mas que|más rápido que|mas rapido que|más barato que|mas barato que)\b/i;
 501    const comparativeDe =
 502      /\b(besser als|verglichen mit|im vergleich zu|versus|vs\.?|mehr als|schneller als|günstiger als|gunstiger als)\b/i;
 503    const comparativeIt =
 504      /\b(meglio di|rispetto a|versus|vs\.?|più di|piu di|più veloce di|piu veloce di|più economico di|piu economico di)\b/i;
 505  
 506    const comparativeMap = {
 507      en: comparativeEn,
 508      fr: comparativeFr,
 509      pl: comparativePl,
 510      es: comparativeEs,
 511      de: comparativeDe,
 512      it: comparativeIt,
 513    };
 514    if (langKey(comparativeMap, lang).test(text)) score += 1;
 515  
 516    // Specific differentiators (years of experience, number of customers — numbers are language-neutral)
 517    if (
 518      /\b(\d+\+?\s*(years?|customers?|clients?|projects?|locations?|lat|ans?|kunden|clienti|clientes?|jahre?))\b/i.test(
 519        text
 520      )
 521    )
 522      score += 2;
 523  
 524    // "Why choose us" or similar sections (language-aware)
 525    const whyChooseEn =
 526      /\b(why choose|what makes us|what sets us apart|our difference|our advantage)\b/i;
 527    const whyChooseFr =
 528      /\b(pourquoi nous choisir|pourquoi choisir|ce qui nous différencie|ce qui nous differencie|notre avantage)\b/i;
 529    const whyChoosePl =
 530      /\b(dlaczego my|dlaczego warto|co nas wyróżnia|co nas wyroznia|nasza przewaga)\b/i;
 531    const whyChooseEs =
 532      /\b(por qué elegirnos|por que elegirnos|qué nos diferencia|que nos diferencia|nuestra ventaja)\b/i;
 533    const whyChooseDe =
 534      /\b(warum uns wählen|warum uns wahlen|was uns auszeichnet|unser vorteil|warum wir)\b/i;
 535    const whyChooseIt =
 536      /\b(perché sceglierci|perche sceglierci|cosa ci distingue|il nostro vantaggio)\b/i;
 537  
 538    const whyChooseMap = {
 539      en: whyChooseEn,
 540      fr: whyChooseFr,
 541      pl: whyChoosePl,
 542      es: whyChooseEs,
 543      de: whyChooseDe,
 544      it: whyChooseIt,
 545    };
 546    if (langKey(whyChooseMap, lang).test(text)) score += 1;
 547  
 548    return {
 549      score: Math.min(score, 10),
 550      reasoning: `${uspCount} differentiation keywords found`,
 551      evidence: (text.match(uspKeywords) || []).slice(0, 3).join(', ') || 'None',
 552    };
 553  }
 554  
 555  /**
 556   * Factor 4: CTA Design (weight: 13%)
 557   * Checks for button elements, CTA text quality, action verbs
 558   */
 559  export function scoreCTA(html, lang = 'en') {
 560    let score = 2; // Base: most sites have some navigation/contact links
 561  
 562    // Button or CTA-like elements
 563    const buttons = html.match(/<(button|a)[^>]*class[^>]*(btn|button|cta)[^>]*>([\s\S]*?)<\/\1>/gi);
 564    const linkButtons = html.match(/<a[^>]*>([\s\S]*?)<\/a>/gi) || [];
 565  
 566    // Strong CTA text patterns (language-aware)
 567    const ctaPattern = langKey(CTA_KEYWORDS, lang);
 568  
 569    let ctaElements = 0;
 570    for (const link of linkButtons) {
 571      const linkText = link.replace(/<[^>]+>/g, '').trim();
 572      if (ctaPattern.test(linkText)) ctaElements++;
 573    }
 574  
 575    if (buttons && buttons.length > 0) score += 2;
 576    else if (ctaElements > 0) score += 1;
 577  
 578    // Multiple CTAs (good for conversion)
 579    if (ctaElements >= 3) score += 2;
 580    else if (ctaElements >= 1) score += 1;
 581  
 582    // Phone number as CTA (tel: links) — universal
 583    if (/<a[^>]*href\s*=\s*["']tel:/i.test(html)) score += 1;
 584  
 585    // Form presence (another conversion path) — universal
 586    if (/<form/i.test(html)) score += 1;
 587  
 588    // Email link (mailto:) — universal
 589    if (/<a[^>]*href\s*=\s*["']mailto:/i.test(html)) score += 1;
 590  
 591    return {
 592      score: Math.min(score, 10),
 593      reasoning: `${buttons?.length || 0} button elements, ${ctaElements} CTA patterns, ${/<a[^>]*href\s*=\s*["']tel:/i.test(html) ? 'tel link found' : 'no tel link'}`,
 594      evidence: `Buttons: ${buttons?.length || 0}, CTA links: ${ctaElements}`,
 595    };
 596  }
 597  
 598  /**
 599   * Factor 5: Urgency/Scarcity (weight: 10%)
 600   * Date/deadline patterns, quantity limits, urgency language
 601   */
 602  export function scoreUrgency(html, lang = 'en') {
 603    const text = stripHtml(html);
 604    let score = 1; // Base: LLM typically gives 2-3 even without explicit urgency
 605  
 606    // Time-bound urgency (language-aware)
 607    const timeUrgency = langKey(URGENCY_KEYWORDS, lang);
 608    if (timeUrgency.test(text)) score += 4;
 609  
 610    // Quantity scarcity (language-aware)
 611    const quantityScarcity = langKey(SCARCITY_KEYWORDS, lang);
 612    if (quantityScarcity.test(text)) score += 3;
 613  
 614    // Seasonal/dated offers — month names are mostly recognizable across languages
 615    if (
 616      /\b(spring|summer|fall|autumn|winter|holiday|christmas|new year|black friday|printemps|été|automne|hiver|noël|nouvel an|wiosna|lato|jesień|zima|święta|primavera|verano|otoño|invierno|navidad|frühling|herbst|weihnachten|neujahr|estate|autunno|natale|capodanno)\b/i.test(
 617        text
 618      ) &&
 619      /\b(sale|offer|special|deal|discount|soldes|offre|promo|vente|oferta|especial|angebot|aktion|offerta|saldi)\b/i.test(
 620        text
 621      )
 622    )
 623      score += 2;
 624  
 625    // Discount language (language-aware)
 626    const discountKw = langKey(DISCOUNT_KEYWORDS, lang);
 627    if (discountKw.test(text)) score += 2;
 628  
 629    // Most local business sites lack urgency — that's normal. Score 0-2 is typical.
 630    return {
 631      score: Math.min(score, 10),
 632      reasoning:
 633        score === 0 ? 'No urgency or scarcity messaging present' : 'Urgency/scarcity elements found',
 634      evidence: (text.match(timeUrgency) || ['None'])[0],
 635    };
 636  }
 637  
 638  /**
 639   * Factor 6: Hook/Engagement (weight: 9%)
 640   * Hero imagery, video, compelling above-fold content
 641   */
 642  export function scoreHook(html) {
 643    let score = 3; // Base: page exists and loads
 644  
 645    // Hero image or background image
 646    if (/<img[^>]*(hero|banner|header|main|feature)/i.test(html)) score += 2;
 647    else if (/<img/i.test(html)) score += 1;
 648  
 649    // Video embed
 650    if (/<video|youtube\.com|vimeo\.com|wistia\.com/i.test(html)) score += 3;
 651  
 652    // Background image in CSS
 653    if (/background(-image)?\s*:\s*url/i.test(html)) score += 1;
 654  
 655    // Compelling above-fold text (checking first 2000 chars)
 656    const aboveFold = stripHtml(html.slice(0, 2000));
 657    if (aboveFold.length > 50) score += 1;
 658  
 659    // Interactive elements
 660    if (
 661      /<(slider|carousel|swiper|slideshow)/i.test(html) ||
 662      /class\s*=\s*["'][^"']*\b(slider|carousel|swiper)\b/i.test(html)
 663    )
 664      score += 1;
 665  
 666    return {
 667      score: Math.min(score, 10),
 668      reasoning: `${/<video|youtube|vimeo|wistia/i.test(html) ? 'Video present' : 'No video'}, ${/<img/i.test(html) ? 'images present' : 'no images'}`,
 669      evidence: `Images: ${(html.match(/<img/gi) || []).length}`,
 670    };
 671  }
 672  
 673  /**
 674   * Factor 7: Trust Signals (weight: 11%)
 675   * Testimonials, ratings, badges, certifications, "since YYYY"
 676   */
 677  export function scoreTrustSignals(html, lang = 'en') {
 678    const text = stripHtml(html);
 679    let score = 0;
 680  
 681    // Testimonial/review sections (language-aware)
 682    if (langKey(TRUST_SECTION_KEYWORDS, lang).test(text)) score += 2;
 683  
 684    // Star ratings (★ characters or rating patterns) — universal
 685    if (/[★☆⭐]|(\d(\.\d)?)\s*\/\s*5\s*(stars?)?|\bstar[s]?\b.*\brating\b/i.test(html)) score += 2;
 686  
 687    // Trust badges and certifications (language-aware)
 688    if (langKey(CERT_KEYWORDS, lang).test(text)) score += 2;
 689  
 690    // Industry associations or awards — partially language-neutral
 691    if (
 692      /\b(award|winner|finalist|recognized|featured in|as seen on|partner|prix|lauréat|nagroda|premio|preis|gewinner|premio|vincitore)\b/i.test(
 693        text
 694      )
 695    )
 696      score += 1;
 697  
 698    // "Since YYYY" or years in business (mostly language-neutral with some additions)
 699    if (
 700      /\b(since|established|est\.?|depuis|od roku|od|desde|seit|dal|fondat\w*)\s*(19|20)\d{2}\b/i.test(
 701        text
 702      )
 703    )
 704      score += 1;
 705  
 706    // Review platform mentions — universal brand names
 707    if (
 708      /\b(bbb|better business|google review|yelp|trustpilot|angi|homeadvisor|houzz|bark|checkatrade|trusted trader|avis vérifiés|avis verifies|opinie|opineo|ekomi|provenexpert)\b/i.test(
 709        text
 710      )
 711    )
 712      score += 1;
 713  
 714    // Guarantee language (language-aware)
 715    if (langKey(GUARANTEE_KEYWORDS, lang).test(text)) score += 1;
 716  
 717    // Phone number visible (proxy for legitimacy — virtually all real businesses display one) — universal
 718    if (/(\+?[0-9][\d\s\-().]{7,}[0-9]|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b)/.test(text)) score += 2;
 719  
 720    return {
 721      score: Math.min(score, 10),
 722      reasoning: `Trust elements: ${score === 0 ? 'none found' : 'present'}`,
 723      evidence:
 724        [
 725          langKey(TRUST_SECTION_KEYWORDS, lang).test(text) ? 'reviews' : null,
 726          langKey(CERT_KEYWORDS, lang).test(text) ? 'certifications' : null,
 727          /since.*\d{4}|depuis.*\d{4}|od roku.*\d{4}|desde.*\d{4}|seit.*\d{4}|dal.*\d{4}/i.test(text)
 728            ? 'establishment date'
 729            : null,
 730          langKey(GUARANTEE_KEYWORDS, lang).test(text) ? 'guarantee' : null,
 731        ]
 732          .filter(Boolean)
 733          .join(', ') || 'None',
 734    };
 735  }
 736  
 737  /**
 738   * Factor 8: Imagery/Design (weight: 8%)
 739   * Image count, alt text quality, responsive design indicators
 740   */
 741  export function scoreImageryDesign(html) {
 742    let score = 3; // Base: page renders (we can't see it, assume basic design)
 743  
 744    const images = html.match(/<img[^>]+>/gi) || [];
 745    if (images.length >= 5) score += 2;
 746    else if (images.length >= 2) score += 1;
 747  
 748    // Alt text quality
 749    const withAlt = images.filter(img => /alt\s*=\s*["'][^"']+["']/i.test(img));
 750    if (images.length > 0 && withAlt.length / images.length > 0.7) score += 1;
 751  
 752    // Responsive design indicators
 753    if (/<meta[^>]*viewport/i.test(html)) score += 1;
 754  
 755    // CSS framework indicators (Bootstrap, Tailwind, etc.)
 756    if (/bootstrap|tailwind|foundation|bulma|material/i.test(html)) score += 1;
 757  
 758    // Lazy loading (performance indicator)
 759    if (/loading\s*=\s*["']lazy["']/i.test(html)) score += 1;
 760  
 761    // WebP or modern image formats
 762    if (/\.webp|\.avif/i.test(html)) score += 1;
 763  
 764    return {
 765      score: Math.min(score, 10),
 766      reasoning: `${images.length} images, ${withAlt.length} with alt text, ${/<meta[^>]*viewport/i.test(html) ? 'responsive' : 'not responsive'}`,
 767      evidence: `Images: ${images.length}, Alt coverage: ${images.length > 0 ? Math.round((withAlt.length / images.length) * 100) : 0}%`,
 768    };
 769  }
 770  
 771  /**
 772   * Factor 9: Offer Clarity (weight: 4%)
 773   * Pricing patterns, specific terms, clear service descriptions
 774   */
 775  export function scoreOfferClarity(html, lang = 'en') {
 776    const text = stripHtml(html);
 777    let score = 4; // Base: real business page (HTML alone can't verify offer clarity)
 778  
 779    // Pricing visible — currency symbols are language-neutral
 780    if (
 781      /\$\d+|€\d+|£\d+|\d+\s*(USD|AUD|GBP|EUR|CAD|NZD|PLN|CHF|MXN|COP|ARS)|\bpric(e|ing)\b|\bprix\b|\bcena\b|\bpreis\b|\bprezzo\b|\bprecio\b/i.test(
 782        text
 783      )
 784    )
 785      score += 3;
 786  
 787    // Service list or menu (language-aware)
 788    if (langKey(SERVICE_KEYWORDS, lang).test(text)) score += 2;
 789  
 790    // Specific terms (areas served, hours, etc.) (language-aware)
 791    if (langKey(HOURS_KEYWORDS, lang).test(text)) score += 1;
 792  
 793    // FAQ section — language-neutral acronym + language-specific
 794    if (
 795      /\b(faq|frequently asked|common questions|questions fréquentes|questions frequentes|często zadawane|preguntas frecuentes|häufige fragen|domande frequenti)\b/i.test(
 796        text
 797      )
 798    )
 799      score += 1;
 800  
 801    // Process or "how it works" (language-aware)
 802    if (langKey(PROCESS_KEYWORDS, lang).test(text)) score += 1;
 803  
 804    return {
 805      score: Math.min(score, 10),
 806      reasoning: `${/pric|prix|cena|preis|prezzo|precio|€|\$|£/i.test(text) ? 'Pricing present' : 'No pricing'}, ${langKey(SERVICE_KEYWORDS, lang).test(text) ? 'services listed' : 'no service list'}`,
 807      evidence: `Pricing: ${/\$|€|£/i.test(text) ? 'yes' : 'no'}, Services: ${langKey(SERVICE_KEYWORDS, lang).test(text) ? 'yes' : 'no'}`,
 808    };
 809  }
 810  
 811  /**
 812   * Factor 10: Contextual Appropriateness (weight: 3%)
 813   * Industry-relevant content, local business indicators
 814   */
 815  export function scoreContext(html, keyword, lang = 'en') {
 816    const text = stripHtml(html);
 817    let score = 3; // Base: page has content
 818  
 819    // Keyword relevance
 820    if (keyword) {
 821      const keywordParts = keyword.toLowerCase().split(/\s+/);
 822      const textLower = text.toLowerCase();
 823      const matches = keywordParts.filter(part => part.length > 3 && textLower.includes(part));
 824      if (matches.length >= 2) score += 2;
 825      else if (matches.length >= 1) score += 1;
 826    }
 827  
 828    // Local business indicators (language-aware)
 829    if (langKey(LOCAL_KEYWORDS, lang).test(text)) score += 1;
 830  
 831    // Address/location present — numbers in addresses are language-neutral
 832    if (
 833      /\d+\s+\w+\s+(st|street|rd|road|ave|avenue|blvd|drive|lane|way|rue|avenue|boulevard|via|calle|straße|strasse|str\.|gasse|piazza|platz)\b/i.test(
 834        text
 835      )
 836    )
 837      score += 1;
 838  
 839    // Business hours — language-aware day names
 840    if (
 841      /\b(mon|tue|wed|thu|fri|sat|sun|lun|mar|mer|jeu|ven|sam|dim|pon|wt|śr|czw|pt|sob|nie|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|domingo|montag|dienstag|mittwoch|donnerstag|freitag|samstag|sonntag|lunedì|martedì|mercoledì|giovedì|venerdì|sabato|domenica)(day)?.*\d{1,2}(:\d{2})?\s*(am|pm|uhr|h)?/i.test(
 842        text
 843      )
 844    )
 845      score += 1;
 846  
 847    // Phone number on page — universal
 848    if (/\b(\+?\d[\d\s\-()]{8,})\b/.test(text)) score += 1;
 849  
 850    // Map embed — universal
 851    if (/google\.com\/maps|maps\.google|goo\.gl\/maps/i.test(html)) score += 1;
 852  
 853    return {
 854      score: Math.min(score, 10),
 855      reasoning: `${keyword ? 'Keyword alignment checked' : 'No keyword'}, local business indicators ${langKey(LOCAL_KEYWORDS, lang).test(text) ? 'present' : 'absent'}`,
 856      evidence: `Address: ${/\d+\s+\w+\s+(st|street|rd|road|ave|avenue|rue|via|calle|str\.)/i.test(text) ? 'yes' : 'no'}, Phone: ${/\b(\+?\d[\d\s\-()]{8,})\b/.test(text) ? 'yes' : 'no'}`,
 857    };
 858  }
 859  
 860  // ─── Metadata Extraction ────────────────────────────────────────────────────
 861  
 862  /**
 863   * Detect if site is an error page (soft 404, parking page, etc.)
 864   */
 865  export function detectErrorPage(html) {
 866    const text = stripHtml(html).toLowerCase();
 867  
 868    // Common error patterns
 869    if (/\b(page not found|404 error|404 not found|this page doesn.?t exist)\b/i.test(text))
 870      return { is_error_page: true, reason: '404 page' };
 871  
 872    // Parking/placeholder pages
 873    if (
 874      /\b(this domain|domain for sale|buy this domain|parked|coming soon|under construction|website is being)\b/i.test(
 875        text
 876      )
 877    )
 878      return { is_error_page: true, reason: 'Parked/placeholder page' };
 879  
 880    // GoDaddy, Wix, Squarespace default pages
 881    if (
 882      /\b(start your website|build your website|create your website)\b/i.test(text) &&
 883      text.length < 1000
 884    )
 885      return { is_error_page: true, reason: 'Platform default page' };
 886  
 887    return { is_error_page: false, reason: null };
 888  }
 889  
 890  /**
 891   * Detect if site is a business directory (not a local business)
 892   */
 893  export function detectBusinessDirectory(html) {
 894    const text = stripHtml(html).toLowerCase();
 895  
 896    if (
 897      /\b(business directory|yellow pages|find a|search for business|local listings|company directory|add your business)\b/i.test(
 898        text
 899      )
 900    )
 901      return true;
 902  
 903    // Multiple business listings pattern
 904    const listingPattern = /<div[^>]*class[^>]*(listing|result|business-card|company-item)/gi;
 905    const listings = html.match(listingPattern) || [];
 906    if (listings.length > 10) return true;
 907  
 908    return false;
 909  }
 910  
 911  /**
 912   * Classify industry from page content
 913   */
 914  export function classifyIndustry(html, keyword) {
 915    const text = stripHtml(html).toLowerCase();
 916    const kw = (keyword || '').toLowerCase();
 917  
 918    const industries = {
 919      plumber: /\b(plumb\w*|pipe|drain|faucet|water heater|leak|toilet|sewer)\b/i,
 920      electrician: /\b(electri\w*|wiring|circuit|power|outlet|panel|switch)\b/i,
 921      hvac: /\b(hvac|heat\w*|cool\w*|air condition\w*|furnace|heat pump|duct)\b/i,
 922      roofing: /\b(roof\w*|shingle|gutter|flashing|leak repair)\b/i,
 923      landscaping: /\b(landscap\w*|lawn\w*|garden\w*|mow\w*|tree|hedge|irrigation|turf)\b/i,
 924      painter: /\b(paint\w*|stain\w*|coating|wallpaper)\b/i,
 925      cleaner: /\b(clean\w*|janitorial|maid|housekeep\w*|carpet clean\w*|pressure wash\w*)\b/i,
 926      pest_control: /\b(pest\w*|termite|exterminator|rodent|insect|bug|cockroach)\b/i,
 927      locksmith: /\b(locksmith|lock|key|safe|security system|access control)\b/i,
 928      mechanic: /\b(mechanic\w*|auto repair|car repair|brake|transmission|oil change)\b/i,
 929      dentist: /\b(dent\w*|orthodont\w*|teeth|oral|filling|crown|implant)\b/i,
 930      lawyer: /\b(lawyer|attorney|law firm|legal|litigation|practice area)\b/i,
 931      accountant: /\b(account\w*|tax\w*|bookkeep\w*|cpa|audit\w*|payroll|financial)\b/i,
 932      real_estate: /\b(real estate|realtor|property|home for sale|listing|broker)\b/i,
 933      restaurant: /\b(restaurant|menu|reserv\w*|dine|cuisine|chef|takeout|delivery)\b/i,
 934      fitness: /\b(gym|fitness|workout|personal train\w*|yoga|pilates|crossfit)\b/i,
 935      salon: /\b(salon|hair|barber|spa|nails?|beauty|stylist|cosmetic)\b/i,
 936      veterinarian: /\b(vet\w*|veterinar\w*|animal|pet|clinic|surgery|spay|neuter)\b/i,
 937      photographer: /\b(photo\w*|portrait|wedding photo\w*|shoot|studio)\b/i,
 938      contractor: /\b(contractor|renovati\w*|remodel\w*|home improvement|build\w*|construct\w*)\b/i,
 939    };
 940  
 941    // Check keyword first (most reliable)
 942    for (const [industry, pattern] of Object.entries(industries)) {
 943      if (pattern.test(kw)) return industry;
 944    }
 945  
 946    // Then check page content — add global flag for counting all matches
 947    for (const [industry, pattern] of Object.entries(industries)) {
 948      const globalPattern = new RegExp(pattern.source, 'gi');
 949      const matches = (text.match(globalPattern) || []).length;
 950      if (matches >= 3) return industry;
 951    }
 952  
 953    return 'general_business';
 954  }
 955  
 956  /**
 957   * Extract location info from HTML
 958   */
 959  export function extractLocation(html) {
 960    const text = stripHtml(html);
 961  
 962    // Try to find city/state from common patterns
 963    const cityStatePattern =
 964      /\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)?),?\s+(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|ACT|NSW|NT|QLD|SA|TAS|VIC|WA)\b/;
 965    const match = text.match(cityStatePattern);
 966  
 967    return {
 968      city: match ? match[1] : null,
 969      state: match ? match[2] : null,
 970    };
 971  }
 972  
 973  // ─── Technical SEO Checks ───────────────────────────────────────────────────
 974  
 975  /**
 976   * Comprehensive Technical SEO audit.
 977   * Returns a detailed sub-report with individual check results.
 978   * These signals are woven into existing factor scores as bonuses/penalties
 979   * and also returned as a standalone technical_seo section in the result.
 980   *
 981   * @param {string} html - Raw HTML of the page
 982   * @returns {Object} Technical SEO check results
 983   */
 984  export function scoreTechnicalSEO(html) {
 985    const checks = {};
 986  
 987    // 1. Meta description — presence and optimal length (120-160 chars)
 988    const metaDescMatch = html.match(
 989      /<meta[^>]*name\s*=\s*["']description["'][^>]*content\s*=\s*["']([^"']*)["']/i
 990    ) || html.match(
 991      /<meta[^>]*content\s*=\s*["']([^"']*)["'][^>]*name\s*=\s*["']description["']/i
 992    );
 993    const metaDesc = metaDescMatch ? metaDescMatch[1].trim() : null;
 994    checks.meta_description = {
 995      present: !!metaDesc,
 996      length: metaDesc ? metaDesc.length : 0,
 997      optimal: metaDesc ? (metaDesc.length >= 120 && metaDesc.length <= 160) : false,
 998      value: metaDesc ? metaDesc.slice(0, 200) : null,
 999    };
1000  
1001    // 2. Title tag quality — presence, optimal length (30-60 chars), brand name detection
1002    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
1003    const titleText = titleMatch ? titleMatch[1].replace(/<[^>]+>/g, '').trim() : null;
1004    const titleHasSeparator = titleText ? /\s[-|–—]\s/.test(titleText) : false;
1005    checks.title_tag = {
1006      present: !!titleText,
1007      length: titleText ? titleText.length : 0,
1008      optimal: titleText ? (titleText.length >= 30 && titleText.length <= 60) : false,
1009      has_separator: titleHasSeparator, // Suggests brand name included (e.g., "Services | Brand")
1010      value: titleText ? titleText.slice(0, 100) : null,
1011    };
1012  
1013    // 3. Multiple H1 tags — count <h1> elements, flag if >1
1014    const h1Matches = html.match(/<h1[^>]*>/gi) || [];
1015    checks.h1_tags = {
1016      count: h1Matches.length,
1017      optimal: h1Matches.length === 1,
1018      issue: h1Matches.length === 0 ? 'missing' : h1Matches.length > 1 ? 'multiple' : null,
1019    };
1020  
1021    // 4. Schema.org structured data — <script type="application/ld+json">
1022    const ldJsonMatches = html.match(/<script[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi) || [];
1023    const schemaTypes = [];
1024    for (const block of ldJsonMatches) {
1025      const content = block.replace(/<\/?script[^>]*>/gi, '').trim();
1026      try {
1027        const parsed = JSON.parse(content);
1028        const type = parsed['@type'] || (Array.isArray(parsed['@graph']) ? parsed['@graph'].map(g => g['@type']).filter(Boolean).join(', ') : null);
1029        if (type) schemaTypes.push(type);
1030      } catch {
1031        // Malformed JSON-LD — still counts as an attempt
1032        schemaTypes.push('malformed');
1033      }
1034    }
1035    // Also check for microdata (itemtype attribute)
1036    const microdataTypes = (html.match(/itemtype\s*=\s*["']https?:\/\/schema\.org\/(\w+)["']/gi) || [])
1037      .map(m => {
1038        const match = m.match(/schema\.org\/(\w+)/i);
1039        return match ? match[1] : null;
1040      })
1041      .filter(Boolean);
1042  
1043    const allSchemaTypes = [...new Set([...schemaTypes, ...microdataTypes])];
1044    const valuableSchemaTypes = ['LocalBusiness', 'Organization', 'FAQ', 'AggregateRating',
1045      'Product', 'Service', 'WebSite', 'BreadcrumbList', 'Review'];
1046    const hasValuableSchema = allSchemaTypes.some(t =>
1047      valuableSchemaTypes.some(v => t.includes(v))
1048    );
1049  
1050    checks.structured_data = {
1051      present: ldJsonMatches.length > 0 || microdataTypes.length > 0,
1052      json_ld_count: ldJsonMatches.length,
1053      types: allSchemaTypes,
1054      has_valuable_types: hasValuableSchema,
1055    };
1056  
1057    // 5. Open Graph meta — og:title, og:description, og:image
1058    const ogTitle = /<meta[^>]*property\s*=\s*["']og:title["']/i.test(html);
1059    const ogDesc = /<meta[^>]*property\s*=\s*["']og:description["']/i.test(html);
1060    const ogImage = /<meta[^>]*property\s*=\s*["']og:image["']/i.test(html);
1061    checks.open_graph = {
1062      has_title: ogTitle,
1063      has_description: ogDesc,
1064      has_image: ogImage,
1065      complete: ogTitle && ogDesc && ogImage,
1066      count: [ogTitle, ogDesc, ogImage].filter(Boolean).length,
1067    };
1068  
1069    // 6. Missing favicon
1070    const hasFavicon = /<link[^>]*rel\s*=\s*["'](icon|shortcut icon|apple-touch-icon)["']/i.test(html);
1071    checks.favicon = {
1072      present: hasFavicon,
1073    };
1074  
1075    // 7. HTML lang attribute
1076    const htmlLangMatch = html.match(/<html[^>]*lang\s*=\s*["']([^"']+)["']/i);
1077    checks.html_lang = {
1078      present: !!htmlLangMatch,
1079      value: htmlLangMatch ? htmlLangMatch[1] : null,
1080    };
1081  
1082    // 8. Render-blocking resources — stylesheets and scripts in <head> without async/defer
1083    const headMatch = html.match(/<head[^>]*>([\s\S]*?)<\/head>/i);
1084    const headHtml = headMatch ? headMatch[1] : '';
1085  
1086    // Count stylesheets in head (all are render-blocking by default unless media=print or preload)
1087    const stylesheets = (headHtml.match(/<link[^>]*rel\s*=\s*["']stylesheet["'][^>]*>/gi) || []);
1088    const blockingStylesheets = stylesheets.filter(s =>
1089      !(/media\s*=\s*["']print["']/i.test(s)) && !(/rel\s*=\s*["']preload["']/i.test(s))
1090    );
1091  
1092    // Count scripts in head without async or defer
1093    const headScripts = (headHtml.match(/<script[^>]*src\s*=\s*["'][^"']+["'][^>]*>/gi) || []);
1094    const blockingScripts = headScripts.filter(s =>
1095      !(/\basync\b/i.test(s)) && !(/\bdefer\b/i.test(s))
1096    );
1097  
1098    checks.render_blocking = {
1099      blocking_stylesheets: blockingStylesheets.length,
1100      blocking_scripts: blockingScripts.length,
1101      total_blocking: blockingStylesheets.length + blockingScripts.length,
1102      issue: (blockingStylesheets.length + blockingScripts.length) > 5 ? 'excessive' : null,
1103    };
1104  
1105    // Compute an overall technical SEO score (0-10) for inclusion in results
1106    let techScore = 3; // Base: site loads and has HTML
1107    if (checks.meta_description.present) techScore += 1;
1108    if (checks.meta_description.optimal) techScore += 0.5;
1109    if (checks.title_tag.present && checks.title_tag.optimal) techScore += 0.5;
1110    if (checks.h1_tags.optimal) techScore += 0.5;
1111    if (checks.h1_tags.count > 1) techScore -= 0.5; // Penalty for multiple H1s
1112    if (checks.structured_data.has_valuable_types) techScore += 1.5;
1113    else if (checks.structured_data.present) techScore += 0.5;
1114    if (checks.open_graph.complete) techScore += 1;
1115    else if (checks.open_graph.count >= 1) techScore += 0.5;
1116    if (checks.favicon.present) techScore += 0.5;
1117    if (checks.html_lang.present) techScore += 0.5;
1118    if (checks.render_blocking.total_blocking <= 3) techScore += 0.5;
1119    else if (checks.render_blocking.total_blocking > 8) techScore -= 0.5;
1120  
1121    checks.overall_score = Math.max(0, Math.min(10, Math.round(techScore * 10) / 10));
1122  
1123    return checks;
1124  }
1125  
/**
 * Score page speed from performance timing data.
 *
 * Starts from 5 and adds or subtracts points per metric; metrics that are
 * null or undefined are skipped. The result is clamped to 0-10 and mapped to
 * a rating: fast (>=8), good (>=6), average (>=4), slow (>=2), very_slow.
 *
 * Per-metric bands applied by this function:
 *   - FCP: <1000ms +2, <1800ms +1, <3000ms issue only, otherwise -1
 *   - DCL: <1500ms +1, <3000ms neutral, otherwise -1
 *   - Load (only when > 0): <2000ms +1, <4000ms neutral, otherwise -1
 *   - DOM interactive: <1000ms +1, >3000ms -1
 *   - Transfer size: <500KB +1, <2000KB neutral, <5000KB issue only, otherwise -1
 *   - Resource count: >100 -1, <30 +1
 *
 * @param {Object|null} perfJson - Performance data from assets capture
 * @returns {Object} Page speed score, rating, issues, strengths and raw metrics.
 *   Without perfJson: score null, rating 'unknown', a details message and null metrics.
 */
export function scorePageSpeed(perfJson) {
  if (!perfJson) {
    return {
      score: null,
      rating: 'unknown',
      details: 'No performance data available',
      metrics: null,
    };
  }

  const isMeasured = value => value !== null && value !== undefined;

  const issues = [];
  const strengths = [];
  let points = 5; // Base: page loads

  const reward = (amount, note) => {
    points += amount;
    strengths.push(note);
  };
  // A penalty of 0 records the issue without changing the score
  const penalize = (amount, note) => {
    points -= amount;
    issues.push(note);
  };

  // First Contentful Paint (FCP)
  const fcp = perfJson.firstContentfulPaint;
  if (isMeasured(fcp)) {
    if (fcp < 1000) reward(2, `Fast FCP (${fcp}ms)`);
    else if (fcp < 1800) reward(1, `Good FCP (${fcp}ms)`);
    else if (fcp < 3000) penalize(0, `Slow FCP (${fcp}ms, target <1.8s)`);
    else penalize(1, `Very slow FCP (${fcp}ms, target <1.8s)`);
  }

  // DOM Content Loaded
  const dcl = perfJson.domContentLoaded;
  if (isMeasured(dcl)) {
    if (dcl < 1500) reward(1, `Fast DCL (${dcl}ms)`);
    else if (!(dcl < 3000)) penalize(1, `Slow DCL (${dcl}ms)`);
  }

  // Load time — non-positive values are ignored
  const load = perfJson.loadTime;
  if (isMeasured(load) && load > 0) {
    if (load < 2000) reward(1, `Fast load (${load}ms)`);
    else if (!(load < 4000)) penalize(1, `Slow load (${load}ms)`);
  }

  // DOM Interactive
  const di = perfJson.domInteractive;
  if (isMeasured(di)) {
    if (di < 1000) reward(1, `Fast DOM interactive (${di}ms)`);
    else if (di > 3000) penalize(1, `Slow DOM interactive (${di}ms)`);
  }

  // Transfer size (page weight), reported in whole kilobytes
  if (isMeasured(perfJson.totalTransferSize)) {
    const sizeKB = Math.round(perfJson.totalTransferSize / 1024);
    if (sizeKB < 500) reward(1, `Light page (${sizeKB}KB)`);
    else if (sizeKB < 2000) {
      /* neutral */
    } else if (sizeKB < 5000) penalize(0, `Heavy page (${sizeKB}KB)`);
    else penalize(1, `Very heavy page (${sizeKB}KB)`);
  }

  // Resource count
  const requests = perfJson.resourceCount;
  if (isMeasured(requests)) {
    if (requests > 100) penalize(1, `Too many requests (${requests})`);
    else if (requests < 30) reward(1, `Low request count (${requests})`);
  }

  const score = Math.max(0, Math.min(10, points));

  // First band whose minimum the score reaches; anything below 2 is very_slow
  const ratingBands = [
    [8, 'fast'],
    [6, 'good'],
    [4, 'average'],
    [2, 'slow'],
  ];
  const rating = ratingBands.find(([minimum]) => score >= minimum)?.[1] ?? 'very_slow';

  return {
    score,
    rating,
    issues,
    strengths,
    metrics: {
      fcp_ms: perfJson.firstContentfulPaint,
      dcl_ms: perfJson.domContentLoaded,
      load_ms: perfJson.loadTime,
      dom_interactive_ms: perfJson.domInteractive,
      transfer_size_bytes: perfJson.totalTransferSize,
      resource_count: perfJson.resourceCount,
    },
  };
}
1228  
1229  // ─── Main Scoring Function ──────────────────────────────────────────────────
1230  
1231  /**
1232   * Score a website programmatically from its HTML.
1233   *
1234   * @param {string} html - Raw HTML of the page
1235   * @param {string} pageUrl - URL of the page
1236   * @param {string} [keyword] - Search keyword that found this site
1237   * @param {Object|null} [perfJson] - Performance timing data from assets capture
1238   * @returns {Object} Score result matching the LLM output schema
1239   */
1240  export function scoreWebsiteProgrammatically(html, pageUrl, keyword = null, perfJson = null) {
1241    // Error/broken detection
1242    const errorCheck = detectErrorPage(html);
1243    const isBrokenSite = !html || html.length < 200;
1244  
1245    if (isBrokenSite || errorCheck.is_error_page) {
1246      return {
1247        website_url: pageUrl,
1248        evaluation_date: new Date().toISOString(),
1249        conversion_score: 0,
1250        letter_grade: 'F',
1251        is_error_page: errorCheck.is_error_page,
1252        is_broken_site: isBrokenSite,
1253        error_reason: errorCheck.reason || 'Insufficient HTML content',
1254        factor_scores: null,
1255        contacts: extractContactsFromHtml(html || '', pageUrl),
1256      };
1257    }
1258  
1259    // Detect JS-heavy sites and language
1260    const { isJsHeavy, lang } = detectSiteCharacteristics(html);
1261  
1262    // Score all 10 factors
1263    // For JS-heavy sites: apply calibrated neutral scores (~62 overall) since content
1264    // is rendered client-side and HTML analysis would give misleadingly low results.
1265    const neutralReasoning = 'JS-rendered site — HTML analysis unreliable, neutral score applied';
1266    const factor_scores = isJsHeavy
1267      ? {
1268          headline_quality: {
1269            score: 6,
1270            reasoning: neutralReasoning,
1271            evidence: 'Next.js/React SSR detected',
1272          },
1273          value_proposition: {
1274            score: 7,
1275            reasoning: neutralReasoning,
1276            evidence: 'Content rendered client-side',
1277          },
1278          unique_selling_proposition: {
1279            score: 5,
1280            reasoning: neutralReasoning,
1281            evidence: 'Content rendered client-side',
1282          },
1283          call_to_action: {
1284            score: 7,
1285            reasoning: neutralReasoning,
1286            evidence: 'Content rendered client-side',
1287          },
1288          urgency_messaging: {
1289            score: 2,
1290            reasoning: neutralReasoning,
1291            evidence: 'Typically low for local businesses',
1292          },
1293          hook_engagement: {
1294            score: 7,
1295            reasoning: neutralReasoning,
1296            evidence: 'JS sites typically have good imagery',
1297          },
1298          trust_signals: {
1299            score: 6,
1300            reasoning: neutralReasoning,
1301            evidence: 'Content rendered client-side',
1302          },
1303          imagery_design: {
1304            score: 8,
1305            reasoning: neutralReasoning,
1306            evidence: 'Modern JS framework implies good design',
1307          },
1308          offer_clarity: {
1309            score: 7,
1310            reasoning: neutralReasoning,
1311            evidence: 'Content rendered client-side',
1312          },
1313          contextual_appropriateness: {
1314            score: 7,
1315            reasoning: neutralReasoning,
1316            evidence: 'Content rendered client-side',
1317          },
1318        }
1319      : {
1320          headline_quality: scoreHeadlineQuality(html, lang),
1321          value_proposition: scoreValueProposition(html, lang),
1322          unique_selling_proposition: scoreUSP(html, lang),
1323          call_to_action: scoreCTA(html, lang),
1324          urgency_messaging: scoreUrgency(html, lang),
1325          hook_engagement: scoreHook(html),
1326          trust_signals: scoreTrustSignals(html, lang),
1327          imagery_design: scoreImageryDesign(html),
1328          offer_clarity: scoreOfferClarity(html, lang),
1329          contextual_appropriateness: scoreContext(html, keyword, lang),
1330        };
1331  
1332    // Run technical SEO checks and page speed scoring
1333    const technicalSEO = scoreTechnicalSEO(html);
1334    const pageSpeed = scorePageSpeed(perfJson);
1335  
1336    // Apply technical SEO signals as adjustments to existing factor scores.
1337    // These are small bonuses/penalties (max +/-1 per factor) to avoid
1338    // changing the overall factor weighting system. The detailed reports
1339    // are included separately in the result for proposal generation.
1340    if (!isJsHeavy) {
1341      // Headline quality: penalize multiple H1s, bonus for good title+H1 combo
1342      if (technicalSEO.h1_tags.count > 1) {
1343        factor_scores.headline_quality.score = Math.max(0, factor_scores.headline_quality.score - 1);
1344        factor_scores.headline_quality.reasoning += ' | Multiple H1 tags detected (SEO issue)';
1345      }
1346      if (technicalSEO.title_tag.present && technicalSEO.title_tag.optimal) {
1347        factor_scores.headline_quality.score = Math.min(10, factor_scores.headline_quality.score + 1);
1348        factor_scores.headline_quality.reasoning += ' | Good title tag length';
1349      }
1350  
1351      // Trust signals: bonus for structured data (schema.org)
1352      if (technicalSEO.structured_data.has_valuable_types) {
1353        factor_scores.trust_signals.score = Math.min(10, factor_scores.trust_signals.score + 1);
1354        factor_scores.trust_signals.reasoning += ` | Schema.org: ${technicalSEO.structured_data.types.join(', ')}`;
1355      }
1356  
1357      // Imagery/Design: bonus for OG tags and favicon, penalty for excessive render-blocking
1358      if (technicalSEO.open_graph.complete) {
1359        factor_scores.imagery_design.score = Math.min(10, factor_scores.imagery_design.score + 1);
1360        factor_scores.imagery_design.reasoning += ' | Complete Open Graph meta tags';
1361      }
1362      if (!technicalSEO.favicon.present) {
1363        factor_scores.imagery_design.score = Math.max(0, factor_scores.imagery_design.score - 1);
1364        factor_scores.imagery_design.reasoning += ' | Missing favicon';
1365      }
1366      if (technicalSEO.render_blocking.total_blocking > 8) {
1367        factor_scores.imagery_design.score = Math.max(0, factor_scores.imagery_design.score - 1);
1368        factor_scores.imagery_design.reasoning += ` | ${technicalSEO.render_blocking.total_blocking} render-blocking resources`;
1369      }
1370  
1371      // Contextual: bonus for html lang attribute
1372      if (technicalSEO.html_lang.present) {
1373        factor_scores.contextual_appropriateness.score = Math.min(10, factor_scores.contextual_appropriateness.score + 1);
1374        factor_scores.contextual_appropriateness.reasoning += ` | HTML lang="${technicalSEO.html_lang.value}"`;
1375      }
1376  
1377      // Offer clarity: bonus for meta description (helps search appearance)
1378      if (technicalSEO.meta_description.present && technicalSEO.meta_description.optimal) {
1379        factor_scores.offer_clarity.score = Math.min(10, factor_scores.offer_clarity.score + 1);
1380        factor_scores.offer_clarity.reasoning += ' | Optimal meta description length';
1381      }
1382  
1383      // Hook/Engagement: bonus/penalty from page speed
1384      if (pageSpeed.score !== null && pageSpeed.score !== undefined) {
1385        if (pageSpeed.score >= 8) {
1386          factor_scores.hook_engagement.score = Math.min(10, factor_scores.hook_engagement.score + 1);
1387          factor_scores.hook_engagement.reasoning += ` | Fast page speed (${pageSpeed.rating})`;
1388        } else if (pageSpeed.score <= 3) {
1389          factor_scores.hook_engagement.score = Math.max(0, factor_scores.hook_engagement.score - 1);
1390          factor_scores.hook_engagement.reasoning += ` | Slow page speed (${pageSpeed.rating})`;
1391        }
1392      }
1393    }
1394  
1395    // Compute weighted score (vision-aware weights)
1396    const visionEnabled = process.env.ENABLE_VISION !== 'false';
1397    const conversion_score = computeWeightedScore(factor_scores, visionEnabled);
1398    const letter_grade = computeGrade(conversion_score);
1399  
1400    // Extract contacts
1401    const contacts = extractContactsFromHtml(html, pageUrl);
1402  
1403    // Metadata
1404    const isDirectory = detectBusinessDirectory(html);
1405    const industry = classifyIndustry(html, keyword);
1406    const location = extractLocation(html);
1407    const tldResult = pageUrl ? detectCountryFromTLD(pageUrl) : null;
1408    const countryFromTLD = tldResult?.countryCode || null;
1409    const isLawFirm = industry === 'lawyer';
1410  
1411    return {
1412      website_url: pageUrl,
1413      evaluation_date: new Date().toISOString(),
1414      conversion_score,
1415      letter_grade,
1416      factor_scores,
1417      technical_seo: technicalSEO,
1418      page_speed: pageSpeed,
1419      is_error_page: false,
1420      is_broken_site: false,
1421      is_js_heavy: isJsHeavy,
1422      is_business_directory: isDirectory,
1423      is_local_business: !isDirectory,
1424      is_law_firm: isLawFirm,
1425      industry_classification: industry,
1426      country_code: countryFromTLD,
1427      city: location.city,
1428      state: location.state,
1429      contacts,
1430    };
1431  }
1432  
1433  // ─── Helpers ────────────────────────────────────────────────────────────────
1434  
/**
 * Detect site characteristics: whether the page is JS-heavy (unreliable for
 * HTML-based scoring) and which language it declares.
 *
 * A page is reported as JS-heavy when any of the following holds:
 *   - it contains a known framework hydration marker (Next.js App Router,
 *     Vue/Nuxt, Remix, Angular);
 *   - it is longer than 30,000 chars yet, once tags are removed, contains
 *     fewer than 150 ASCII words of 4+ letters;
 *   - it is longer than 2,000 chars yet has fewer than 300 chars of visible
 *     text after scripts, styles and tags are removed (LiteSpeed lazy-load
 *     skeletons, etc.);
 *   - detectLang() reports a language outside the supported set
 *     (en, fr, pl, es, de, it), so the caller falls back to neutral scores.
 *
 * Pages in a supported language are NOT flagged; they are scored with the
 * language-specific keyword patterns.
 *
 * @param {string|null|undefined} html - Raw page HTML. null/undefined yields
 *   the default result instead of throwing.
 * @returns {{ isJsHeavy: boolean, lang: string|null }} `lang` is null when a
 *   structural JS-heavy heuristic fired (language detection is skipped in that
 *   case) and defaults to 'en' when no language is detected.
 */
function detectSiteCharacteristics(html) {
  // Tolerate missing input the same way stripHtml() does rather than throwing
  // on html.replace(); the result is the default used when no lang is detected.
  if (html === null || html === undefined) {
    return { isJsHeavy: false, lang: 'en' };
  }

  // Framework hydration markers: any hit means the real content is rendered
  // client-side, so the static HTML is not representative.
  const hydrationMarkers = [
    /self\.__next_f|__next_f\.push|\$RC\(/, // Next.js App Router flight protocol
    /window\.__NUXT__|__vue_ssr_context__|nuxtState/, // Vue/Nuxt SSR hydration
    /__remixContext|window\.__remixManifest/, // Remix/React Router flight
    /ng-server-context|ng-version/, // Angular universal
  ];
  if (hydrationMarkers.some((marker) => marker.test(html))) {
    return { isJsHeavy: true, lang: null };
  }

  // Generic: huge HTML but nearly no readable words.
  // Only tags are removed here, so inline script/style text still counts.
  const wordCount = (html.replace(/<[^>]+>/g, ' ').match(/\b[a-zA-Z]{4,}\b/g) || []).length;
  if (html.length > 30000 && wordCount < 150) {
    return { isJsHeavy: true, lang: null };
  }

  // Sparse visible content after stripping scripts/styles — LiteSpeed lazy-load, etc.
  const visibleText = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  if (html.length > 2000 && visibleText.length < 300) {
    return { isJsHeavy: true, lang: null };
  }

  // Detect language from HTML lang attribute
  const lang = detectLang(html);

  // Languages without a keyword set cannot be scored meaningfully, so they are
  // routed through the JS-heavy (neutral score) path while still reporting the
  // detected language.
  const supportedLangs = new Set(['en', 'fr', 'pl', 'es', 'de', 'it']);
  if (lang && !supportedLangs.has(lang)) {
    return { isJsHeavy: true, lang };
  }

  return { isJsHeavy: false, lang: lang || 'en' };
}
1485  
/**
 * @deprecated Prefer detectSiteCharacteristics(), which also reports the language.
 * Backward-compatible shim that exposes only the JS-heavy flag as a boolean.
 *
 * @param {string} html - Raw page HTML.
 * @returns {boolean} The isJsHeavy flag computed by detectSiteCharacteristics().
 */
function isJsHeavySite(html) {
  const { isJsHeavy } = detectSiteCharacteristics(html);
  return isJsHeavy;
}
1493  
/**
 * Reduce an HTML document to its visible text.
 *
 * Removes <script> and <style> elements together with their contents, replaces
 * every remaining tag and every character reference with a space, then
 * collapses whitespace runs and trims the result.
 *
 * Character references are blanked out, not decoded: named (&amp;, &frac12;),
 * decimal (&#39;) and hexadecimal (&#x2019;) forms all become a single space so
 * that no entity markup leaks into the text used for keyword matching.
 *
 * @param {string|null|undefined} html - Raw HTML; falsy input is treated as ''.
 * @returns {string} Whitespace-normalized plain text ('' for empty input).
 */
function stripHtml(html) {
  return (html || '')
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Named, decimal and hex character references (the original pattern only
    // matched purely alphabetic names and let &#39; / &#x2019; through).
    .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}