programmatic-scorer.js
1 /** 2 * Programmatic Scorer — Rule-based website scoring (no LLM required). 3 * 4 * Replaces GPT-4o-mini scoring with HTML/DOM analysis. 5 * Uses the same factor weights and grade thresholds as score.js. 6 * 7 * Each factor is scored 0-10 using regex/DOM pattern matching: 8 * 0 = absent, 3-4 = weak, 5-6 = adequate, 7-8 = strong, 9-10 = exceptional 9 * 10 * Non-English support: French (FR/BE/CH), Polish (PL), Spanish (ES/MX/AR), 11 * German (DE/AT/CH), Italian (IT). Language detected from HTML lang attribute. 12 */ 13 14 import { computeGrade, FACTOR_WEIGHTS } from '../score.js'; 15 import { extractContactsFromHtml } from './html-contact-extractor.js'; 16 import { detectCountryFromTLD } from './tld-detector.js'; 17 18 // ─── Vision-Aware Weight Redistribution ────────────────────────────────────── 19 20 /** 21 * When ENABLE_VISION is false, imagery_design and hook_engagement can only be 22 * partially assessed from HTML (no screenshots for layout/visual hierarchy). 23 * Redistribute their weight to factors the programmatic scorer CAN assess. 24 * 25 * Default weights (vision ON): 26 * headline_quality: 0.15, value_proposition: 0.14, unique_selling_proposition: 0.13, 27 * call_to_action: 0.13, urgency_messaging: 0.10, hook_engagement: 0.09, 28 * trust_signals: 0.11, imagery_design: 0.08, offer_clarity: 0.04, contextual: 0.03 29 * 30 * No-vision weights (vision OFF): 31 * imagery_design: 0.03 (down from 0.08 — can still detect img count, alt text, responsive) 32 * hook_engagement: 0.04 (down from 0.09 — can detect video embeds, hero images in markup) 33 * Freed weight (0.10) redistributed to semantic + structural factors the scorer handles well. 
/**
 * Factor weights used when vision is disabled (no screenshots available).
 * imagery_design and hook_engagement are reduced by 0.05 each and the freed
 * 0.10 is spread over factors that can be assessed from HTML alone.
 * The ten weights sum to 1.
 */
const NO_VISION_WEIGHTS = {
  headline_quality: 0.17, // +0.02 (detectable from h1)
  value_proposition: 0.16, // +0.02 (detectable from text)
  unique_selling_proposition: 0.14, // +0.01 (detectable from text)
  call_to_action: 0.15, // +0.02 (well-detected from HTML)
  urgency_messaging: 0.1, // unchanged
  hook_engagement: 0.04, // -0.05 (can't assess visual impact)
  trust_signals: 0.13, // +0.02 (well-detected from HTML)
  imagery_design: 0.03, // -0.05 (can't assess design quality)
  offer_clarity: 0.05, // +0.01 (well-detected from HTML)
  contextual_appropriateness: 0.03, // unchanged
};

/**
 * Compute weighted total score using vision-aware weights.
 * Factors missing from `factorScores` count as 0.
 *
 * @param {Object} factorScores - Factor scores (each has .score 0-10)
 * @param {boolean} visionEnabled - Whether vision/screenshots are available
 * @returns {number|null} Score 0-100 rounded to one decimal, or null when
 *   `factorScores` is not an object
 */
export function computeWeightedScore(factorScores, visionEnabled = true) {
  if (!factorScores || typeof factorScores !== 'object') return null;
  const weights = visionEnabled ? FACTOR_WEIGHTS : NO_VISION_WEIGHTS;
  let total = 0;
  for (const [factor, weight] of Object.entries(weights)) {
    const score = factorScores[factor]?.score ?? 0;
    total += score * weight;
  }
  // total is on a 0-10 scale; x10 maps it to 0-100, x10 / 10 keeps one decimal.
  return Math.round(total * 10 * 10) / 10;
}

// ─── Text Extraction for Hybrid Scoring ──────────────────────────────────────

/** Remove markup from an HTML fragment and trim the remaining text. */
function stripTags(fragment) {
  return fragment.replace(/<[^>]+>/g, '').trim();
}

/**
 * Read the content of <meta name="description">.
 * Accepts either attribute order and keeps apostrophes inside a
 * double-quoted value (and double quotes inside a single-quoted one).
 *
 * @param {string} html
 * @returns {string|null} Trimmed description, or null when absent/empty
 */
function extractMetaDescription(html) {
  const nameFirst =
    /<meta\b[^>]*\bname\s*=\s*["']description["'][^>]*?\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
  const contentFirst =
    /<meta\b[^>]*?\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*\bname\s*=\s*["']description["']/i;
  const match = html.match(nameFirst) || html.match(contentFirst);
  if (!match) return null;
  return (match[1] ?? match[2] ?? '').trim() || null;
}

/**
 * Extract key text sections from HTML for LLM semantic scoring.
 * Returns a compact text representation that can be used to evaluate
 * headline quality, value proposition, and USP.
 *
 * @param {string} html - Full rendered DOM HTML
 * @returns {Object|null} Extracted text sections, or null when `html` is not
 *   a string or is shorter than 100 characters
 */
export function extractScoringText(html) {
  if (typeof html !== 'string' || html.length < 100) return null;

  // H1 headline
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  const h1 = h1Match ? stripTags(h1Match[1]) : null;

  // H2 subheadings (first 5 in document order, empty ones dropped afterwards)
  const h2s = [...html.matchAll(/<h2[^>]*>([\s\S]*?)<\/h2>/gi)]
    .slice(0, 5)
    .map(m => stripTags(m[1]))
    .filter(Boolean);

  // Above-fold text (first 4000 chars of HTML, stripped, capped at 800 chars)
  const aboveFold = stripHtmlForExtraction(html.slice(0, 4000)).slice(0, 800);

  // Full page text (stripped, capped)
  const fullText = stripHtmlForExtraction(html).slice(0, 2000);

  // Title tag
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? stripTags(titleMatch[1]) : null;

  return {
    title,
    meta_description: extractMetaDescription(html),
    h1,
    h2s,
    above_fold_text: aboveFold,
    body_text: fullText,
    testimonial_snippets: extractTestimonials(html),
    cta_texts: extractCTATexts(html),
  };
}

/**
 * Reduce HTML to plain text: drops script/style/nav/footer blocks, replaces
 * remaining tags and character references (named, decimal and hex) with a
 * space, then collapses whitespace.
 *
 * @param {string} html
 * @returns {string}
 */
function stripHtmlForExtraction(html) {
  return (html || '')
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&(?:[a-z][a-z0-9]*|#\d+|#x[0-9a-f]+);/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Collect up to 5 testimonial snippets (max 200 chars each, more than 20 chars
 * long): first from <blockquote> elements, then from <div>s whose tag mentions
 * testimonial/review/feedback after a `class` attribute.
 *
 * @param {string} html
 * @returns {string[]}
 */
function extractTestimonials(html) {
  const snippets = [];
  const toSnippet = fragment => stripTags(fragment).slice(0, 200);

  const blockquotes = html.match(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi) || [];
  for (const bq of blockquotes.slice(0, 3)) {
    const text = toSnippet(bq);
    if (text.length > 20) snippets.push(text);
  }

  // The lazy match ends at the first </div>, so nested markup may be cut short.
  const testimonialDivs =
    html.match(/<div[^>]*class[^>]*(testimonial|review|feedback)[^>]*>([\s\S]*?)<\/div>/gi) || [];
  for (const div of testimonialDivs.slice(0, 3)) {
    const text = toSnippet(div);
    if (text.length > 20 && !snippets.includes(text)) snippets.push(text);
  }
  return snippets.slice(0, 5);
}

/**
 * Collect up to 5 distinct CTA labels (2-50 chars) from <button>/<a> elements
 * whose tag mentions btn/button/cta after a `class` attribute.
 *
 * @param {string} html
 * @returns {string[]}
 */
function extractCTATexts(html) {
  const ctas = [];
  // \b after the tag name keeps <aside>, <article>, <abbr>, ... from being read as <a>.
  const buttons =
    html.match(/<(button|a)\b[^>]*class[^>]*(btn|button|cta)[^>]*>([\s\S]*?)<\/\1>/gi) || [];
  for (const btn of buttons.slice(0, 5)) {
    const text = stripTags(btn);
    if (text.length >= 2 && text.length <= 50) ctas.push(text);
  }
  return [...new Set(ctas)].slice(0, 5);
}

// ─── Language-Specific Keyword Sets ─────────────────────────────────────────
/**
 * CTA action keywords per language.
 * Used in scoreCTA() to detect call-to-action text in links/buttons.
 *
 * Keyed by base language code (en, fr, pl, es, de, it); each value is a
 * case-insensitive, non-global regex, so `.test()` is stateless.
 * Accented alternatives are paired with an unaccented spelling.
 *
 * NOTE(review): `\b` in JS only treats [A-Za-z0-9_] as word characters, so an
 * alternative that begins or ends with a non-ASCII letter (e.g. `zadzwoń`,
 * `scopri di più`) matches only when that edge touches an ASCII word character.
 */
const CTA_KEYWORDS = {
  en: /\b(get started|book now|call now|contact us|get a quote|free quote|schedule|request|order now|buy now|sign up|learn more|find out|get in touch|start now|claim|download|try free|shop now)\b/i,
  fr: /\b(appeler|appelez|contactez|contact|devis|devis gratuit|réserver|réservez|commander|demander|commencer|s'inscrire|en savoir plus|prendre rendez-vous|obtenir un devis|demandez un devis)\b/i,
  pl: /\b(zadzwoń|zadzwon|kontakt|wycena|zamów|zamow|zarezerwuj|zapisz się|zapisz sie|dowiedz się więcej|dowiedz sie wiecej|bezpłatna wycena|darmowa wycena|skontaktuj się|skontaktuj sie)\b/i,
  es: /\b(llamar|llame|contactar|contáctenos|contactenos|cotizar|cotización|cotizacion|reservar|solicitar|comenzar|suscribirse|más información|mas informacion|obtenga presupuesto|presupuesto gratis)\b/i,
  de: /\b(anrufen|rufen sie|kontakt|angebot|angebot anfordern|kostenloses angebot|buchen|reservieren|bestellen|anfragen|jetzt starten|mehr erfahren|termin vereinbaren|jetzt anrufen)\b/i,
  it: /\b(chiama|chiami|contatta|contattateci|preventivo|preventivo gratuito|prenota|prenotate|ordina|richiedi|inizia|iscriviti|scopri di più|scopri di piu|richiedi un preventivo)\b/i,
};
/**
 * Urgency / time-pressure keywords per language.
 * Used in scoreUrgency() for the time-bound urgency check.
 *
 * Keyed by base language code; each value is a case-insensitive, non-global
 * regex.
 *
 * NOTE(review): `\b` is ASCII-only in JS, so alternatives ending in a
 * non-ASCII letter (e.g. `offre limitée`, `pośpiesz się`) match only when
 * followed directly by an ASCII word character.
 */
const URGENCY_KEYWORDS = {
  en: /\b(limited time|hurry|act now|don't miss|expires?|deadline|last chance|ending soon|today only|this week|offer ends|while supplies last|before it's too late)\b/i,
  fr: /\b(offre limitée|offre limitee|dépêchez|depechez|agissez maintenant|ne ratez pas|expire|date limite|dernière chance|derniere chance|se termine bientôt|se termine bientot|aujourd'hui seulement|cette semaine)\b/i,
  pl: /\b(ograniczona oferta|pośpiesz się|pospieszcie sie|działaj teraz|dzialaj teraz|nie przegap|wygasa|termin|ostatnia szansa|kończy się|konczy sie|tylko dziś|tylko dzis|w tym tygodniu)\b/i,
  es: /\b(tiempo limitado|apúrese|apurese|actúe ahora|actue ahora|no se pierda|expira|fecha límite|fecha limite|última oportunidad|ultima oportunidad|termina pronto|solo hoy|esta semana)\b/i,
  de: /\b(begrenzte zeit|beeilen sie sich|jetzt handeln|verpassen sie nicht|läuft ab|lauft ab|frist|letzte chance|endet bald|nur heute|diese woche|angebot endet)\b/i,
  it: /\b(tempo limitato|affrettatevi|agite ora|non perdete|scade|scadenza|ultima possibilità|ultima possibilita|termina presto|solo oggi|questa settimana|offerta termina)\b/i,
};
/**
 * Scarcity keywords per language.
 * Used in scoreUrgency() for the quantity-scarcity check. The first
 * alternative of every language is "only <number>".
 *
 * NOTE(review): `\b` is ASCII-only in JS, so alternatives ending in a
 * non-ASCII letter (e.g. `places limitées`) match only when followed directly
 * by an ASCII word character.
 */
const SCARCITY_KEYWORDS = {
  en: /\b(only \d+|limited (spots?|seats?|availability)|few remaining|almost gone|selling fast|limited stock)\b/i,
  fr: /\b(seulement \d+|places limitées|places limitees|disponibilité limitée|disponibilite limitee|presque épuisé|presque epuise|stock limité|stock limite)\b/i,
  pl: /\b(tylko \d+|ograniczona liczba miejsc|ograniczona dostępność|ograniczona dostepnosc|prawie wyprzedane|ograniczony stan)\b/i,
  es: /\b(solo \d+|plazas limitadas|disponibilidad limitada|casi agotado|stock limitado|pocas unidades)\b/i,
  de: /\b(nur \d+|begrenzte plätze|begrenzte platze|begrenzte verfügbarkeit|begrenzte verfugbarkeit|fast ausverkauft|begrenzter bestand)\b/i,
  it: /\b(solo \d+|posti limitati|disponibilità limitata|disponibilita limitata|quasi esaurito|scorte limitate)\b/i,
};

/**
 * Trust signal keywords per language (testimonial / review section wording).
 * Case-insensitive, non-global regexes keyed by base language code.
 *
 * NOTE(review): `bewertung` is listed twice in `de`; the duplicate is harmless.
 * NOTE(review): alternatives starting with a non-ASCII letter (e.g.
 * `évaluation`) are affected by the ASCII-only `\b`.
 */
const TRUST_SECTION_KEYWORDS = {
  en: /\b(testimonial|review|what (our |people |customers? |clients? )?say|feedback|rating)\b/i,
  fr: /\b(témoignage|temoignage|avis|avis clients?|ce que (nos |les )?clients? disent|commentaire|note|évaluation|evaluation)\b/i,
  pl: /\b(opinia|opinie|recenzja|recenzje|co mówią|co mowia|komentarz|komentarze|ocena|oceny|nasi klienci)\b/i,
  es: /\b(testimonio|testimonios|reseña|reseñas|resena|resenas|lo que (nuestros |los )?clientes? (dicen|opinan)|comentario|calificación|calificacion|opinión|opinion)\b/i,
  de: /\b(erfahrungsbericht|erfahrungsberichte|bewertung|bewertungen|kundenmeinung|kundenmeinungen|was (unsere |die )?kunden sagen|rezension|bewertung)\b/i,
  it: /\b(testimonianza|testimonianze|recensione|recensioni|cosa (i nostri |i )?clienti (dicono|pensano)|commento|valutazione|opinione)\b/i,
};
/**
 * Certification / trust badge keywords per language.
 * Case-insensitive, non-global regexes keyed by base language code.
 *
 * NOTE(review): `\b` is ASCII-only in JS, so alternatives ending in a
 * non-ASCII letter (e.g. `certifié`, `agréé`) match only when followed
 * directly by an ASCII word character; the unaccented spellings are unaffected.
 */
const CERT_KEYWORDS = {
  en: /\b(certified|accredited|licensed|insured|bonded|registered|approved|verified|member of)\b/i,
  fr: /\b(certifié|certifie|accrédité|accredite|agréé|agree|assuré|assure|enregistré|enregistre|approuvé|approuve|vérifié|verifie|membre de)\b/i,
  pl: /\b(certyfikowany|certyfikat|akredytowany|licencjonowany|ubezpieczony|zarejestrowany|zatwierdzony|zweryfikowany|członek)\b/i,
  es: /\b(certificado|acreditado|licenciado|asegurado|registrado|aprobado|verificado|miembro de)\b/i,
  de: /\b(zertifiziert|akkreditiert|lizenziert|versichert|eingetragen|zugelassen|verifiziert|mitglied (von|der|des))\b/i,
  it: /\b(certificato|accreditato|autorizzato|assicurato|registrato|approvato|verificato|membro di)\b/i,
};

/**
 * Guarantee / risk-reversal keywords per language.
 * In `en`, the `.` in `money.back` and `no.risk` is a regex wildcard, so any
 * single separator character (space, hyphen, ...) is accepted.
 */
const GUARANTEE_KEYWORDS = {
  en: /\b(guarantee|money.back|satisfaction|warranty|no.risk)\b/i,
  fr: /\b(garantie|remboursement|satisfait ou remboursé|satisfait ou rembourse|sans risque)\b/i,
  pl: /\b(gwarancja|zwrot pieniędzy|zwrot pieniedzy|satysfakcja|bez ryzyka)\b/i,
  es: /\b(garantía|garantia|devolución|devolucion|reembolso|satisfacción|satisfaccion|sin riesgo)\b/i,
  de: /\b(garantie|geld-zurück|geld zurück|geld zuruck|zufriedenheit|risikofrei)\b/i,
  it: /\b(garanzia|rimborso|soddisfazione|senza rischio)\b/i,
};
/**
 * Benefit / value proposition keywords per language.
 *
 * These regexes carry the `g` flag because scoreValueProposition() counts
 * occurrences via `text.match(...)`. A global regex keeps `lastIndex` between
 * `.test()` / `.exec()` calls, so prefer `String.prototype.match` or reset
 * `lastIndex` when reusing them.
 *
 * NOTE(review): alternatives starting with a non-ASCII letter (e.g.
 * `économisez`, `éliminez`) are affected by the ASCII-only `\b`.
 */
const BENEFIT_KEYWORDS = {
  en: /\b(save|reduce|increase|improve|grow|protect|maximize|minimize|eliminate|prevent|achieve|guarantee|ensure|deliver)\b/gi,
  fr: /\b(économisez|economisez|réduisez|reduisez|augmentez|améliorez|ameliorez|développez|developpez|protégez|protegez|maximisez|éliminez|eliminez|garantissez|assurez|livrez)\b/gi,
  pl: /\b(oszczędź|oszczedz|zmniejsz|zwiększ|zwieksz|popraw|rozwijaj|chroń|chron|maksymalizuj|eliminuj|gwarantuj|zapewnij|dostarcz)\b/gi,
  es: /\b(ahorre|reduzca|aumente|mejore|crezca|proteja|maximice|minimice|elimine|prevenga|logre|garantice|asegure|entregue)\b/gi,
  de: /\b(sparen|reduzieren|steigern|verbessern|wachsen|schützen|schutzen|maximieren|minimieren|beseitigen|verhindern|erreichen|garantieren|sicherstellen|liefern)\b/gi,
  it: /\b(risparmia|riduci|aumenta|migliora|cresci|proteggi|massimizza|minimizza|elimina|previeni|raggiungi|garantisci|assicura|consegna)\b/gi,
};
/**
 * Headline benefit words per language (single word check).
 * Used in scoreHeadlineQuality() against the h1 text. Case-insensitive and
 * non-global, so `.test()` is stateless.
 *
 * NOTE(review): `\b` is ASCII-only in JS, so alternatives that begin or end
 * with a non-ASCII letter (e.g. `économisez`, `qualité`, `oszczędź`) match
 * only when that edge touches an ASCII word character.
 */
const HEADLINE_BENEFIT_WORDS = {
  en: /\b(save|grow|boost|increase|improve|transform|get|start|discover|free|best|fast|easy|simple|trusted|guaranteed|proven|results?|solution|affordable|professional|expert|quality)\b/i,
  fr: /\b(économisez|economisez|développez|developpez|améliorez|ameliorez|gratuit|meilleur|rapide|simple|fiable|garanti|résultats|resultats|solution|abordable|professionnel|expert|qualité|qualite)\b/i,
  pl: /\b(oszczędź|oszczedz|rozwijaj|popraw|bezpłatny|bezplatny|darmowy|najlepszy|szybki|prosty|zaufany|gwarantowany|wyniki|rozwiązanie|rozwiazanie|przystępny|przystepny|profesjonalny|ekspert|jakość|jakosc)\b/i,
  es: /\b(ahorre|desarrolle|mejore|gratis|mejor|rápido|rapido|simple|confiable|garantizado|resultados|solución|solucion|asequible|profesional|experto|calidad)\b/i,
  de: /\b(sparen|entwickeln|verbessern|kostenlos|beste|schnell|einfach|zuverlässig|zuverlassig|garantiert|ergebnisse|lösung|losung|erschwinglich|professionell|experte|qualität|qualitat)\b/i,
  it: /\b(risparmia|sviluppa|migliora|gratis|gratuito|migliore|veloce|semplice|affidabile|garantito|risultati|soluzione|conveniente|professionale|esperto|qualità|qualita)\b/i,
};
/**
 * USP / differentiation keywords per language.
 * Global (`g`) so scoreUSP() can count occurrences with `text.match(...)`.
 *
 * NOTE(review): the leading `\b` cannot be satisfied in front of `#1` unless
 * the `#` directly follows a word character, so that alternative is
 * effectively dead; `unikalny` is listed twice in `pl` (harmless).
 */
const USP_KEYWORDS = {
  en: /\b(only|unique|exclusive|unlike|first|pioneering|original|proprietary|patented|award[- ]winning|leading|#1|number one|best in)\b/gi,
  fr: /\b(unique|exclusif|exclusifs|contrairement|premier|pionnier|original|propriétaire|breveté|brevete|primé|prime|leader|n°1|numéro un|numero un|meilleur de)\b/gi,
  pl: /\b(jedyny|unikalny|unikalny|wyjątkowy|wyjatkowy|ekskluzywny|w przeciwieństwie|w przeciwienstwie|pierwszy|pionierski|oryginalny|opatentowany|nagrodzony|wiodący|wiodacy|nr 1|numer jeden|najlepszy)\b/gi,
  es: /\b(único|unico|exclusivo|a diferencia|primero|pionero|original|patentado|premiado|líder|lider|número uno|numero uno|el mejor)\b/gi,
  de: /\b(einzigartig|exklusiv|im gegensatz|erste|erstmals|pionier|original|patentiert|preisgekrönt|preisgekront|führend|fuhrend|nr\.? ?1|nummer eins|bestes)\b/gi,
  it: /\b(unico|esclusivo|a differenza|primo|pioniere|originale|brevettato|premiato|leader|n\. ?1|numero uno|il migliore)\b/gi,
};

/**
 * Value proposition "you-focus" pronouns per language.
 * Global (`g`): scoreValueProposition() counts matches with `text.match(...)`.
 * Because of the `g` flag, `.test()` on these objects is stateful (it resumes
 * from `lastIndex`), which matters wherever the same regex is reused.
 */
const YOU_PRONOUNS = {
  en: /\b(you|your|you're|you'll)\b/gi,
  fr: /\b(vous|votre|vos|tu|ton|ta|tes)\b/gi,
  pl: /\b(ty|twój|twoja|twoje|twoi|wasz|wasza|wasze|wasi|pana|pani)\b/gi,
  es: /\b(usted|su|sus|tú|tu|tus|vosotros|vuestro|vuestra|vuestros|vuestras)\b/gi,
  de: /\b(sie|ihr|ihre|ihrem|ihren|ihres|du|dein|deine|deinem|deinen|deines)\b/gi,
  it: /\b(lei|suo|sua|suoi|sue|tu|tuo|tua|tuoi|tue|voi|vostro|vostra)\b/gi,
};

/**
 * "We-focus" pronouns per language.
 * Global (`g`): compared against the YOU_PRONOUNS count in
 * scoreValueProposition().
 */
const WE_PRONOUNS = {
  en: /\b(we|our|we're|we'll)\b/gi,
  fr: /\b(nous|notre|nos)\b/gi,
  pl: /\b(my|nasz|nasza|nasze|nasi)\b/gi,
  es: /\b(nosotros|nuestro|nuestra|nuestros|nuestras)\b/gi,
  de: /\b(wir|unser|unsere|unserem|unseren|unseres)\b/gi,
  it: /\b(noi|nostro|nostra|nostri|nostre)\b/gi,
};
/**
 * "Specific outcomes" phrases per language.
 * Used in scoreValueProposition(); case-insensitive, non-global.
 *
 * NOTE(review): the Polish alternative `do` is an extremely common
 * preposition, so `pl` will match almost any Polish page — confirm intended.
 */
const OUTCOME_PHRASES = {
  en: /\b(up to|within|in just|only takes|as fast as|guaranteed)\b/i,
  fr: /\b(jusqu'à|jusqu'a|en seulement|en moins de|aussi vite que|garanti|en \d+ (jours?|heures?|minutes?))\b/i,
  pl: /\b(do|w ciągu|w ciagu|w zaledwie|tak szybko jak|gwarantowany|w \d+ (dniach?|godzinach?|minutach?))\b/i,
  es: /\b(hasta|en solo|en menos de|tan rápido como|tan rapido como|garantizado|en \d+ (días?|dias?|horas?|minutos?))\b/i,
  de: /\b(bis zu|innerhalb von|in nur|so schnell wie|garantiert|in \d+ (tagen?|stunden?|minuten?))\b/i,
  it: /\b(fino a|entro|in soli|così velocemente come|cosi velocemente come|garantito|in \d+ (giorni?|ore|minuti?))\b/i,
};

/**
 * Service list keywords per language.
 */
const SERVICE_KEYWORDS = {
  en: /\b(services?|what we (do|offer)|our (services?|work))\b/i,
  fr: /\b(services?|prestations?|ce que nous (faisons|proposons|offrons)|nos (services?|prestations?))\b/i,
  pl: /\b(usługi|uslugi|oferta|co (robimy|oferujemy)|nasze (usługi|uslugi))\b/i,
  es: /\b(servicios?|lo que (hacemos|ofrecemos)|nuestros (servicios?|trabajos?))\b/i,
  de: /\b(leistungen?|dienstleistungen?|was wir (tun|anbieten)|unsere (leistungen?|dienstleistungen?))\b/i,
  it: /\b(servizi?|cosa (facciamo|offriamo)|i nostri (servizi?|lavori?))\b/i,
};

/**
 * Business hours keywords per language.
 *
 * `de` pairs `geöffnet` with its unaccented spelling `geoffnet`, following the
 * accented/unaccented convention of the other keyword sets (the alternative
 * was previously a duplicate of `geöffnet`).
 */
const HOURS_KEYWORDS = {
  en: /\b(hours?|open|serving|areas? served|locations?|coverage)\b/i,
  fr: /\b(heures?|ouvert|horaires?|zones? desservies?|emplacements?|couverture)\b/i,
  pl: /\b(godziny|otwarty|otwarte|obsługiwany|obsługiwane|obszary|lokalizacje|zasięg)\b/i,
  es: /\b(horas?|abierto|horarios?|áreas? de servicio|areas? de servicio|ubicaciones?|cobertura)\b/i,
  de: /\b(stunden?|öffnungszeiten|geöffnet|geoffnet|servicegebiete|standorte?|versorgungsgebiet)\b/i,
  it: /\b(ore|aperto|orari?|aree servite|posizioni?|copertura)\b/i,
};

/**
 * Process / how-it-works keywords per language.
 */
const PROCESS_KEYWORDS = {
  en: /\b(how it works|our process|step \d|getting started)\b/i,
  fr: /\b(comment ça marche|comment cela fonctionne|notre processus|étape \d|etape \d|pour commencer)\b/i,
  pl: /\b(jak to działa|jak to dziala|nasz proces|krok \d|jak zacząć|jak zaczac)\b/i,
  es: /\b(cómo funciona|como funciona|nuestro proceso|paso \d|como empezar)\b/i,
  de: /\b(wie es funktioniert|unser prozess|schritt \d|so geht's|so geht es)\b/i,
  it: /\b(come funziona|il nostro processo|passo \d|fase \d|come iniziare)\b/i,
};

/**
 * Discount keywords per language (urgency-adjacent).
 * Used in scoreUrgency().
 *
 * In `de`, both spellings of "günstiger" require the leading percentage
 * (`20% günstiger` / `20% gunstiger`); previously the unaccented spelling was
 * a stand-alone alternative because the alternation was not grouped.
 */
const DISCOUNT_KEYWORDS = {
  en: /\b(\d+%\s*off|save\s*\$?\d+|discount|special offer|deal)\b/i,
  fr: /\b(\d+%\s*(de réduction|de reduction)|économisez|economisez|réduction|reduction|offre spéciale|offre speciale|promotion)\b/i,
  pl: /\b(\d+%\s*(zniżki|znizki|taniej)|oszczędź|oszczedz|zniżka|znizka|oferta specjalna|promocja)\b/i,
  es: /\b(\d+%\s*(de descuento|menos)|ahorre|descuento|oferta especial|promoción|promocion)\b/i,
  de: /\b(\d+%\s*rabatt|\d+%\s*(günstiger|gunstiger)|sparen|rabatt|sonderangebot|aktion)\b/i,
  it: /\b(\d+%\s*(di sconto|meno)|risparmia|sconto|offerta speciale|promozione)\b/i,
};
/**
 * Local business keywords per language.
 * Case-insensitive, non-global regexes keyed by base language code.
 */
const LOCAL_KEYWORDS = {
  en: /\b(local|nearby|serving|area|community|neighborhood|suburb)\b/i,
  fr: /\b(local|locale|à proximité|a proximite|servant|zone|quartier|ville|région|region)\b/i,
  pl: /\b(lokalny|lokalna|w pobliżu|w poblizu|obsługujemy|obszar|dzielnica|miasto|region)\b/i,
  es: /\b(local|cercano|cercana|sirviendo|zona|barrio|vecindario|ciudad|región|region)\b/i,
  de: /\b(lokal|in der nähe|in der nahe|vor ort|region|bezirk|stadtteil|gemeinde)\b/i,
  it: /\b(locale|nelle vicinanze|che serve|zona|quartiere|città|citta|regione)\b/i,
};

// ─── Helper: Detect Language ─────────────────────────────────────────────────

/**
 * Detect the primary language from the `lang` attribute of the <html> tag.
 *
 * Only a real `lang` attribute counts: `xml:lang`, `data-lang` and similar
 * attributes whose name merely ends in "lang" are ignored.
 *
 * @param {string} html - Page HTML
 * @returns {string|null} Lower-cased base language code with any region
 *   suffix removed ('fr-CA' → 'fr', 'de_AT' → 'de'). Codes without a keyword
 *   set (e.g. 'ja') are returned as-is; null when `html` is not a string or no
 *   usable `lang` attribute is present.
 */
function detectLang(html) {
  if (typeof html !== 'string') return null;
  // \s before `lang` anchors the match to the start of an attribute name.
  const match = html.match(/<html\b[^>]*?\slang\s*=\s*["']([^"']+)["']/i);
  if (!match) return null;
  const base = match[1].trim().toLowerCase().split(/[-_]/)[0];
  return base || null;
}
398 */ 399 function langKey(map, lang) { 400 if (lang && map[lang]) return map[lang]; 401 return map['en']; 402 } 403 404 // ─── Factor Scoring Functions ─────────────────────────────────────────────── 405 406 /** 407 * Factor 1: Headline Quality (weight: 15%) 408 * Checks h1 presence, word count, benefit/action language 409 */ 410 export function scoreHeadlineQuality(html, lang = 'en') { 411 const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i); 412 if (!h1Match) 413 return { score: 0, reasoning: 'No h1 headline found', evidence: 'Missing <h1> tag' }; 414 415 const h1Text = h1Match[1].replace(/<[^>]+>/g, '').trim(); 416 if (!h1Text) return { score: 1, reasoning: 'Empty h1 tag', evidence: '<h1> exists but empty' }; 417 418 const words = h1Text.split(/\s+/).length; 419 let score = 5; // Base: h1 with meaningful content exists (LLM avg for basic headlines is ~5-6) 420 421 // Word count quality (3-12 words is ideal) 422 if (words >= 3 && words <= 12) score += 1; 423 else if (words >= 2) score += 1; 424 // Very short (1 word) or very long (>12) headlines = no bonus 425 426 // Benefit/action keywords (language-aware) 427 const benefitWords = langKey(HEADLINE_BENEFIT_WORDS, lang); 428 if (benefitWords.test(h1Text)) score += 2; 429 430 // Specific numbers or quantified claims 431 if (/\d+/.test(h1Text)) score += 1; 432 433 // Questions or direct address (language-aware) 434 const youPronouns = langKey(YOU_PRONOUNS, lang); 435 if (youPronouns.test(h1Text) || h1Text.includes('?')) score += 1; 436 437 return { 438 score: Math.min(score, 10), 439 reasoning: `H1 found with ${words} words${benefitWords.test(h1Text) ? 
', includes benefit language' : ''}`, 440 evidence: h1Text.slice(0, 100), 441 }; 442 } 443 444 /** 445 * Factor 2: Value Proposition (weight: 14%) 446 * Looks for quantified claims, benefit statements, outcome language 447 */ 448 export function scoreValueProposition(html, lang = 'en') { 449 const text = stripHtml(html); 450 let score = 4; // Base: page has content (HTML alone can't assess visual quality) 451 452 // Quantified claims (percentages, dollar amounts, time frames — mostly language-neutral) 453 const quantified = text.match(/\d+\s*(%|percent|dollar|year|month|day|hour|minute|save|off)/gi); 454 if (quantified && quantified.length >= 2) score += 3; 455 else if (quantified) score += 2; 456 457 // Benefit language (language-aware) 458 const benefits = langKey(BENEFIT_KEYWORDS, lang); 459 const benefitCount = (text.match(benefits) || []).length; 460 if (benefitCount >= 3) score += 2; 461 else if (benefitCount >= 1) score += 1; 462 463 // Specific outcomes (language-aware) 464 const outcomePhrases = langKey(OUTCOME_PHRASES, lang); 465 if (outcomePhrases.test(text)) score += 1; 466 467 // "We" vs "You" focus — customer-centric language scores higher (language-aware) 468 const youCount = (text.match(langKey(YOU_PRONOUNS, lang)) || []).length; 469 const weCount = (text.match(langKey(WE_PRONOUNS, lang)) || []).length; 470 if (youCount > weCount) score += 1; 471 472 return { 473 score: Math.min(score, 10), 474 reasoning: `${quantified?.length || 0} quantified claims, ${benefitCount} benefit keywords`, 475 evidence: `You/Your: ${youCount}, We/Our: ${weCount}`, 476 }; 477 } 478 479 /** 480 * Factor 3: USP / Differentiation (weight: 13%) 481 * Looks for "only", "unique", "exclusive", comparative language 482 */ 483 export function scoreUSP(html, lang = 'en') { 484 const text = stripHtml(html); 485 let score = 4; // Base: real business exists (can't assess differentiation from HTML alone) 486 487 const uspKeywords = langKey(USP_KEYWORDS, lang); 488 const uspCount = 
(text.match(uspKeywords) || []).length; 489 if (uspCount >= 3) score += 3; 490 else if (uspCount >= 1) score += 2; 491 492 // Comparative language (mostly language-neutral patterns + language-specific) 493 const comparativeEn = 494 /\b(better than|compared to|versus|vs\.?|more than|faster than|cheaper than)\b/i; 495 const comparativeFr = 496 /\b(mieux que|par rapport à|par rapport a|versus|vs\.?|plus que|plus rapide que|moins cher que)\b/i; 497 const comparativePl = 498 /\b(lepszy niż|lepszy niz|w porównaniu do|versus|vs\.?|więcej niż|wiecej niz|szybszy niż|szybszy niz|tańszy niż|tanszy niz)\b/i; 499 const comparativeEs = 500 /\b(mejor que|comparado con|versus|vs\.?|más que|mas que|más rápido que|mas rapido que|más barato que|mas barato que)\b/i; 501 const comparativeDe = 502 /\b(besser als|verglichen mit|im vergleich zu|versus|vs\.?|mehr als|schneller als|günstiger als|gunstiger als)\b/i; 503 const comparativeIt = 504 /\b(meglio di|rispetto a|versus|vs\.?|più di|piu di|più veloce di|piu veloce di|più economico di|piu economico di)\b/i; 505 506 const comparativeMap = { 507 en: comparativeEn, 508 fr: comparativeFr, 509 pl: comparativePl, 510 es: comparativeEs, 511 de: comparativeDe, 512 it: comparativeIt, 513 }; 514 if (langKey(comparativeMap, lang).test(text)) score += 1; 515 516 // Specific differentiators (years of experience, number of customers — numbers are language-neutral) 517 if ( 518 /\b(\d+\+?\s*(years?|customers?|clients?|projects?|locations?|lat|ans?|kunden|clienti|clientes?|jahre?))\b/i.test( 519 text 520 ) 521 ) 522 score += 2; 523 524 // "Why choose us" or similar sections (language-aware) 525 const whyChooseEn = 526 /\b(why choose|what makes us|what sets us apart|our difference|our advantage)\b/i; 527 const whyChooseFr = 528 /\b(pourquoi nous choisir|pourquoi choisir|ce qui nous différencie|ce qui nous differencie|notre avantage)\b/i; 529 const whyChoosePl = 530 /\b(dlaczego my|dlaczego warto|co nas wyróżnia|co nas wyroznia|nasza przewaga)\b/i; 531 
/**
 * Factor 4: CTA Design (weight: 13%).
 *
 * Scores how easy the page makes it to act: styled button/CTA elements,
 * anchors whose label matches the language-specific CTA keywords, and
 * tel:/mailto: links or forms as additional conversion paths.
 *
 * @param {string} html - Full page HTML
 * @param {string} [lang='en'] - Language key used to pick the CTA keyword pattern
 * @returns {{score: number, reasoning: string, evidence: string}} Score (2-10) with explanation
 */
export function scoreCTA(html, lang = 'en') {
  let score = 2; // Base: most sites have some navigation/contact links

  // `\b` after the tag name keeps <article>, <aside>, <abbr>, <address>… from
  // being mistaken for <a>, which previously swallowed whole page sections.
  const styledButtons =
    html.match(/<(button|a)\b[^>]*class[^>]*(btn|button|cta)[^>]*>([\s\S]*?)<\/\1>/gi) || [];
  const anchors = html.match(/<a\b[^>]*>([\s\S]*?)<\/a>/gi) || [];

  // Strong CTA text patterns (language-aware)
  const ctaPattern = langKey(CTA_KEYWORDS, lang);

  // String#search always scans from index 0, so the count stays correct even
  // if the shared keyword pattern carries a /g or /y flag.
  const ctaElements = anchors.filter(anchor => {
    const label = anchor.replace(/<[^>]+>/g, '').trim();
    return label.search(ctaPattern) !== -1;
  }).length;

  if (styledButtons.length > 0) score += 2;
  else if (ctaElements > 0) score += 1;

  // Multiple CTAs (good for conversion)
  if (ctaElements >= 3) score += 2;
  else if (ctaElements >= 1) score += 1;

  // Phone number as CTA (tel: links) — universal
  const hasTelLink = /<a\b[^>]*href\s*=\s*["']tel:/i.test(html);
  if (hasTelLink) score += 1;

  // Form presence (another conversion path) — universal
  if (/<form/i.test(html)) score += 1;

  // Email link (mailto:) — universal
  if (/<a\b[^>]*href\s*=\s*["']mailto:/i.test(html)) score += 1;

  return {
    score: Math.min(score, 10),
    reasoning: `${styledButtons.length} button elements, ${ctaElements} CTA patterns, ${hasTelLink ? 'tel link found' : 'no tel link'}`,
    evidence: `Buttons: ${styledButtons.length}, CTA links: ${ctaElements}`,
  };
}
/**
 * Factor 5: Urgency/Scarcity (weight: 10%).
 *
 * Looks for time-bound urgency, quantity scarcity, seasonal offers and
 * discount language in the visible text. Most local business sites have
 * none of these, so the base score of 1 is the typical result.
 *
 * @param {string} html - Full page HTML
 * @param {string} [lang='en'] - Language key used to pick keyword patterns
 * @returns {{score: number, reasoning: string, evidence: string}} Score (1-10) with explanation
 */
export function scoreUrgency(html, lang = 'en') {
  const text = stripHtml(html);
  const BASE_SCORE = 1; // Awarded even when no urgency signal is present
  let score = BASE_SCORE;

  // String#search ignores lastIndex, so shared /g patterns cannot skew results.
  const found = pattern => text.search(pattern) !== -1;

  // Time-bound urgency (language-aware)
  const timeUrgency = langKey(URGENCY_KEYWORDS, lang);
  if (found(timeUrgency)) score += 4;

  // Quantity scarcity (language-aware)
  if (found(langKey(SCARCITY_KEYWORDS, lang))) score += 3;

  // Seasonal/dated offers. `\b` is ASCII-only in JavaScript, so words that
  // start or end with an accented letter (été, święta, jesień) could never
  // match; Unicode-aware lookarounds are used as word boundaries instead.
  const seasonal =
    /(?<![\p{L}\p{N}_])(spring|summer|fall|autumn|winter|holiday|christmas|new year|black friday|printemps|été|automne|hiver|noël|nouvel an|wiosna|lato|jesień|zima|święta|primavera|verano|otoño|invierno|navidad|frühling|herbst|weihnachten|neujahr|estate|autunno|natale|capodanno)(?![\p{L}\p{N}_])/iu;
  const offer =
    /(?<![\p{L}\p{N}_])(sale|offer|special|deal|discount|soldes|offre|promo|vente|oferta|especial|angebot|aktion|offerta|saldi)(?![\p{L}\p{N}_])/iu;
  if (seasonal.test(text) && offer.test(text)) score += 2;

  // Discount language (language-aware)
  if (found(langKey(DISCOUNT_KEYWORDS, lang))) score += 2;

  // Compare against the base: the score itself can never be 0, so checking
  // for 0 would always report urgency even when nothing matched.
  const hasUrgencySignal = score > BASE_SCORE;

  return {
    score: Math.min(score, 10),
    reasoning: hasUrgencySignal
      ? 'Urgency/scarcity elements found'
      : 'No urgency or scarcity messaging present',
    evidence: (text.match(timeUrgency) || ['None'])[0],
  };
}
/**
 * Factor 6: Hook/Engagement (weight: 9%).
 *
 * Approximates above-the-fold appeal from markup alone: hero imagery,
 * video embeds, CSS background images, early text content and sliders.
 *
 * @param {string} html - Full page HTML
 * @returns {{score: number, reasoning: string, evidence: string}} Score (3-10) with explanation
 */
export function scoreHook(html) {
  let score = 3; // Base: page exists and loads

  // Each signal is computed once so the reasoning text always agrees with
  // the points awarded (the video check previously used two different regexes).
  const hasImage = /<img/i.test(html);
  const hasVideo = /<video|youtube\.com|vimeo\.com|wistia\.com/i.test(html);

  // Hero image or background image
  if (/<img[^>]*(hero|banner|header|main|feature)/i.test(html)) score += 2;
  else if (hasImage) score += 1;

  // Video embed
  if (hasVideo) score += 3;

  // Background image in CSS
  if (/background(-image)?\s*:\s*url/i.test(html)) score += 1;

  // Compelling above-fold text (checking first 2000 chars of the markup)
  const aboveFold = stripHtml(html.slice(0, 2000));
  if (aboveFold.length > 50) score += 1;

  // Interactive elements (custom slider tags or slider-like class names)
  const hasSlider =
    /<(slider|carousel|swiper|slideshow)/i.test(html) ||
    /class\s*=\s*["'][^"']*\b(slider|carousel|swiper)\b/i.test(html);
  if (hasSlider) score += 1;

  return {
    score: Math.min(score, 10),
    reasoning: `${hasVideo ? 'Video present' : 'No video'}, ${hasImage ? 'images present' : 'no images'}`,
    evidence: `Images: ${(html.match(/<img/gi) || []).length}`,
  };
}
/**
 * Factor 7: Trust Signals (weight: 11%).
 *
 * Detects testimonials, star ratings, certifications, awards, "since YYYY"
 * statements, review-platform mentions, guarantees and a visible phone number.
 *
 * @param {string} html - Full page HTML
 * @param {string} [lang='en'] - Language key used to pick keyword patterns
 * @returns {{score: number, reasoning: string, evidence: string}} Score (0-10) with explanation
 */
export function scoreTrustSignals(html, lang = 'en') {
  const text = stripHtml(html);

  // String#search ignores lastIndex, so a shared /g pattern gives the same
  // answer no matter how often it has been used before.
  const found = pattern => text.search(pattern) !== -1;

  // Every signal is evaluated exactly once; score and evidence share the result.
  const hasReviews = found(langKey(TRUST_SECTION_KEYWORDS, lang));
  // Star ratings (★ characters or rating patterns) — checked on raw HTML
  const hasRatings =
    /[★☆⭐]|(\d(\.\d)?)\s*\/\s*5\s*(stars?)?|\bstar[s]?\b.*\brating\b/i.test(html);
  const hasCertifications = found(langKey(CERT_KEYWORDS, lang));
  const hasAwards =
    /\b(award|winner|finalist|recognized|featured in|as seen on|partner|prix|lauréat|nagroda|premio|preis|gewinner|vincitore)\b/i.test(
      text
    );
  // "Since YYYY" or years in business
  const hasEstablishedDate =
    /\b(since|established|est\.?|depuis|od roku|od|desde|seit|dal|fondat\w*)\s*(19|20)\d{2}\b/i.test(
      text
    );
  const hasReviewPlatform =
    /\b(bbb|better business|google review|yelp|trustpilot|angi|homeadvisor|houzz|bark|checkatrade|trusted trader|avis vérifiés|avis verifies|opinie|opineo|ekomi|provenexpert)\b/i.test(
      text
    );
  const hasGuarantee = found(langKey(GUARANTEE_KEYWORDS, lang));
  // Phone number visible (proxy for legitimacy)
  const hasPhone = /(\+?[0-9][\d\s\-().]{7,}[0-9]|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b)/.test(text);

  let score = 0;
  if (hasReviews) score += 2;
  if (hasRatings) score += 2;
  if (hasCertifications) score += 2;
  if (hasAwards) score += 1;
  if (hasEstablishedDate) score += 1;
  if (hasReviewPlatform) score += 1;
  if (hasGuarantee) score += 1;
  if (hasPhone) score += 2;

  const evidence = [
    hasReviews ? 'reviews' : null,
    hasCertifications ? 'certifications' : null,
    // Uses the scoring check; the old `since.*\d{4}` matched any later 4-digit run.
    hasEstablishedDate ? 'establishment date' : null,
    hasGuarantee ? 'guarantee' : null,
  ].filter(Boolean);

  return {
    score: Math.min(score, 10),
    reasoning: `Trust elements: ${score === 0 ? 'none found' : 'present'}`,
    evidence: evidence.join(', ') || 'None',
  };
}
/**
 * Factor 8: Imagery/Design (weight: 8%).
 *
 * Design quality cannot be seen from HTML, so this uses proxies: image count,
 * alt-text coverage, viewport meta tag, CSS framework usage, lazy loading and
 * modern image formats.
 *
 * @param {string} html - Full page HTML
 * @returns {{score: number, reasoning: string, evidence: string}} Score (3-10) with explanation
 */
export function scoreImageryDesign(html) {
  let score = 3; // Base: page renders (we can't see it, assume basic design)

  const images = html.match(/<img[^>]+>/gi) || [];
  if (images.length >= 5) score += 2;
  else if (images.length >= 2) score += 1;

  // Alt text quality: share of images carrying a non-empty alt attribute
  const withAlt = images.filter(img => /alt\s*=\s*["'][^"']+["']/i.test(img));
  const altCoverage = images.length > 0 ? withAlt.length / images.length : 0;
  if (altCoverage > 0.7) score += 1;

  // Responsive design indicators
  const isResponsive = /<meta[^>]*viewport/i.test(html);
  if (isResponsive) score += 1;

  // CSS framework indicators (Bootstrap, Tailwind, etc.). Only tags and
  // inline <style> blocks are searched: visible copy such as "foundation
  // repair" or "building materials" says nothing about the stylesheet.
  const markupOnly = (html.match(/<style\b[\s\S]*?<\/style>|<[^>]+>/gi) || []).join('\n');
  if (/bootstrap|tailwind|foundation|bulma|material/i.test(markupOnly)) score += 1;

  // Lazy loading (performance indicator)
  if (/loading\s*=\s*["']lazy["']/i.test(html)) score += 1;

  // WebP or modern image formats
  if (/\.webp|\.avif/i.test(html)) score += 1;

  return {
    score: Math.min(score, 10),
    reasoning: `${images.length} images, ${withAlt.length} with alt text, ${isResponsive ? 'responsive' : 'not responsive'}`,
    evidence: `Images: ${images.length}, Alt coverage: ${Math.round(altCoverage * 100)}%`,
  };
}
/**
 * Factor 9: Offer Clarity (weight: 4%).
 *
 * Checks for visible pricing, a service list, operating terms (hours/areas),
 * an FAQ section and a "how it works" style process description.
 *
 * @param {string} html - Full page HTML
 * @param {string} [lang='en'] - Language key used to pick keyword patterns
 * @returns {{score: number, reasoning: string, evidence: string}} Score (4-10) with explanation
 */
export function scoreOfferClarity(html, lang = 'en') {
  const text = stripHtml(html);
  let score = 4; // Base: real business page (HTML alone can't verify offer clarity)

  // String#search ignores lastIndex, so shared /g patterns stay deterministic.
  const found = pattern => text.search(pattern) !== -1;

  // Pricing visible — currency symbols are language-neutral. Computed once so
  // score, reasoning and evidence can never disagree with each other.
  const hasPricing =
    /\$\d+|€\d+|£\d+|\d+\s*(USD|AUD|GBP|EUR|CAD|NZD|PLN|CHF|MXN|COP|ARS)|\bpric(e|ing)\b|\bprix\b|\bcena\b|\bpreis\b|\bprezzo\b|\bprecio\b/i.test(
      text
    );
  if (hasPricing) score += 3;

  // Service list or menu (language-aware)
  const hasServices = found(langKey(SERVICE_KEYWORDS, lang));
  if (hasServices) score += 2;

  // Specific terms (areas served, hours, etc.) (language-aware)
  if (found(langKey(HOURS_KEYWORDS, lang))) score += 1;

  // FAQ section — language-neutral acronym + language-specific
  if (
    /\b(faq|frequently asked|common questions|questions fréquentes|questions frequentes|często zadawane|preguntas frecuentes|häufige fragen|domande frequenti)\b/i.test(
      text
    )
  )
    score += 1;

  // Process or "how it works" (language-aware)
  if (found(langKey(PROCESS_KEYWORDS, lang))) score += 1;

  return {
    score: Math.min(score, 10),
    reasoning: `${hasPricing ? 'Pricing present' : 'No pricing'}, ${hasServices ? 'services listed' : 'no service list'}`,
    evidence: `Pricing: ${hasPricing ? 'yes' : 'no'}, Services: ${hasServices ? 'yes' : 'no'}`,
  };
}
/**
 * Factor 10: Contextual Appropriateness (weight: 3%).
 *
 * Checks that the page content matches the search keyword and carries the
 * usual local-business markers: locality wording, street address, business
 * hours, phone number and an embedded map.
 *
 * @param {string} html - Full page HTML
 * @param {string|null} keyword - Search keyword that surfaced the site (optional)
 * @param {string} [lang='en'] - Language key used to pick keyword patterns
 * @returns {{score: number, reasoning: string, evidence: string}} Score (3-10) with explanation
 */
export function scoreContext(html, keyword, lang = 'en') {
  const text = stripHtml(html);
  let score = 3; // Base: page has content

  // Keyword relevance: count keyword words (longer than 3 chars) found in the text
  if (keyword) {
    const textLower = text.toLowerCase();
    const hits = keyword
      .toLowerCase()
      .split(/\s+/)
      .filter(part => part.length > 3 && textLower.includes(part));
    if (hits.length >= 2) score += 2;
    else if (hits.length >= 1) score += 1;
  }

  // Signals reported in reasoning/evidence are computed once, with the same
  // patterns used for scoring, so the explanation always matches the score.
  const hasLocalIndicators = text.search(langKey(LOCAL_KEYWORDS, lang)) !== -1;
  const hasAddress =
    /\d+\s+\w+\s+(st|street|rd|road|ave|avenue|blvd|drive|lane|way|rue|boulevard|via|calle|straße|strasse|str\.|gasse|piazza|platz)\b/i.test(
      text
    );
  const hasPhone = /\b(\+?\d[\d\s\-()]{8,})\b/.test(text);

  // Local business indicators (language-aware)
  if (hasLocalIndicators) score += 1;

  // Address/location present — numbers in addresses are language-neutral
  if (hasAddress) score += 1;

  // Business hours — day name followed somewhere later by a time-like number
  if (
    /\b(mon|tue|wed|thu|fri|sat|sun|lun|mar|mer|jeu|ven|sam|dim|pon|wt|śr|czw|pt|sob|nie|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|domingo|montag|dienstag|mittwoch|donnerstag|freitag|samstag|sonntag|lunedì|martedì|mercoledì|giovedì|venerdì|sabato|domenica)(day)?.*\d{1,2}(:\d{2})?\s*(am|pm|uhr|h)?/i.test(
      text
    )
  )
    score += 1;

  // Phone number on page — universal
  if (hasPhone) score += 1;

  // Map embed — universal
  if (/google\.com\/maps|maps\.google|goo\.gl\/maps/i.test(html)) score += 1;

  return {
    score: Math.min(score, 10),
    reasoning: `${keyword ? 'Keyword alignment checked' : 'No keyword'}, local business indicators ${hasLocalIndicators ? 'present' : 'absent'}`,
    evidence: `Address: ${hasAddress ? 'yes' : 'no'}, Phone: ${hasPhone ? 'yes' : 'no'}`,
  };
}
// ─── Metadata Extraction ────────────────────────────────────────────────────

/**
 * Detect if a page is an error page (soft 404, parking page, etc.).
 *
 * @param {string|null|undefined} html - Full page HTML; empty input is treated as "not an error page"
 * @returns {{is_error_page: boolean, reason: string|null}} Verdict and a short reason when positive
 */
export function detectErrorPage(html) {
  // Callers may probe before they have validated the HTML; never throw on empty input.
  if (!html) return { is_error_page: false, reason: null };

  const text = stripHtml(html).toLowerCase();

  // Common error patterns
  if (/\b(page not found|404 error|404 not found|this page doesn.?t exist)\b/i.test(text)) {
    return { is_error_page: true, reason: '404 page' };
  }

  // Parking/placeholder pages
  if (
    /\b(this domain|domain for sale|buy this domain|parked|coming soon|under construction|website is being)\b/i.test(
      text
    )
  ) {
    return { is_error_page: true, reason: 'Parked/placeholder page' };
  }

  // GoDaddy, Wix, Squarespace default pages: builder slogan on an almost empty page
  if (
    /\b(start your website|build your website|create your website)\b/i.test(text) &&
    text.length < 1000
  ) {
    return { is_error_page: true, reason: 'Platform default page' };
  }

  return { is_error_page: false, reason: null };
}

/**
 * Detect if a site is a business directory (not a local business).
 *
 * @param {string|null|undefined} html - Full page HTML
 * @returns {boolean} True when directory wording or more than 10 listing-style containers are found
 */
export function detectBusinessDirectory(html) {
  if (!html) return false;

  const text = stripHtml(html).toLowerCase();

  // NOTE(review): "find a" is very broad ("find a time that suits you") and may
  // flag ordinary business sites — confirm whether this phrase should be narrowed.
  if (
    /\b(business directory|yellow pages|find a|search for business|local listings|company directory|add your business)\b/i.test(
      text
    )
  ) {
    return true;
  }

  // Multiple business listings pattern
  const listings =
    html.match(/<div[^>]*class[^>]*(listing|result|business-card|company-item)/gi) || [];
  return listings.length > 10;
}

/**
 * Classify the industry of a page.
 *
 * The search keyword is trusted first (first matching industry wins). If it
 * gives no answer, the page text is scanned and the industry with the MOST
 * pattern hits is returned, provided it has at least 3. Ties keep the
 * earlier-declared industry.
 *
 * @param {string} html - Full page HTML
 * @param {string|null} [keyword] - Search keyword that surfaced the site
 * @returns {string} Industry key, or 'general_business' when nothing matches
 */
export function classifyIndustry(html, keyword) {
  const text = stripHtml(html).toLowerCase();
  const kw = (keyword || '').toLowerCase();

  const industries = {
    plumber: /\b(plumb\w*|pipe|drain|faucet|water heater|leak|toilet|sewer)\b/i,
    electrician: /\b(electri\w*|wiring|circuit|power|outlet|panel|switch)\b/i,
    hvac: /\b(hvac|heat\w*|cool\w*|air condition\w*|furnace|heat pump|duct)\b/i,
    roofing: /\b(roof\w*|shingle|gutter|flashing|leak repair)\b/i,
    landscaping: /\b(landscap\w*|lawn\w*|garden\w*|mow\w*|tree|hedge|irrigation|turf)\b/i,
    painter: /\b(paint\w*|stain\w*|coating|wallpaper)\b/i,
    cleaner: /\b(clean\w*|janitorial|maid|housekeep\w*|carpet clean\w*|pressure wash\w*)\b/i,
    pest_control: /\b(pest\w*|termite|exterminator|rodent|insect|bug|cockroach)\b/i,
    locksmith: /\b(locksmith|lock|key|safe|security system|access control)\b/i,
    mechanic: /\b(mechanic\w*|auto repair|car repair|brake|transmission|oil change)\b/i,
    dentist: /\b(dent\w*|orthodont\w*|teeth|oral|filling|crown|implant)\b/i,
    lawyer: /\b(lawyer|attorney|law firm|legal|litigation|practice area)\b/i,
    accountant: /\b(account\w*|tax\w*|bookkeep\w*|cpa|audit\w*|payroll|financial)\b/i,
    real_estate: /\b(real estate|realtor|property|home for sale|listing|broker)\b/i,
    restaurant: /\b(restaurant|menu|reserv\w*|dine|cuisine|chef|takeout|delivery)\b/i,
    fitness: /\b(gym|fitness|workout|personal train\w*|yoga|pilates|crossfit)\b/i,
    salon: /\b(salon|hair|barber|spa|nails?|beauty|stylist|cosmetic)\b/i,
    veterinarian: /\b(vet\w*|veterinar\w*|animal|pet|clinic|surgery|spay|neuter)\b/i,
    photographer: /\b(photo\w*|portrait|wedding photo\w*|shoot|studio)\b/i,
    contractor: /\b(contractor|renovati\w*|remodel\w*|home improvement|build\w*|construct\w*)\b/i,
  };
  const entries = Object.entries(industries);

  // Check keyword first (most reliable)
  for (const [industry, pattern] of entries) {
    if (pattern.test(kw)) return industry;
  }

  // Then check page content. Picking the best-scoring industry (instead of the
  // first one to reach the threshold) stops generic words like "leak" or
  // "power" from hijacking pages that are clearly about something else.
  const MIN_MATCHES = 3;
  let bestIndustry = 'general_business';
  let bestCount = 0;
  for (const [industry, pattern] of entries) {
    // Fresh global copy of the pattern so every occurrence is counted
    const count = (text.match(new RegExp(pattern.source, 'gi')) || []).length;
    if (count >= MIN_MATCHES && count > bestCount) {
      bestIndustry = industry;
      bestCount = count;
    }
  }

  return bestIndustry;
}
/**
 * Extract location info from HTML.
 *
 * Looks for a capitalised one- or two-word place name followed by a US state
 * or Australian state/territory abbreviation (e.g. "Austin, TX").
 *
 * @param {string} html - Full page HTML
 * @returns {{city: string|null, state: string|null}} First match, or nulls when none is found
 */
export function extractLocation(html) {
  const text = stripHtml(html);

  const cityStatePattern =
    /\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)?),?\s+(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|ACT|NSW|NT|QLD|SA|TAS|VIC)\b/;
  const match = text.match(cityStatePattern);

  return {
    city: match ? match[1] : null,
    state: match ? match[2] : null,
  };
}

// ─── Technical SEO Checks ───────────────────────────────────────────────────

/**
 * Read the content of the first <meta name="description"> tag that has one.
 * The value is captured up to the matching quote, so apostrophes inside a
 * double-quoted description no longer cut it short.
 *
 * @param {string} html - Full page HTML
 * @returns {string|null} Trimmed description, or null when absent
 */
function readMetaDescription(html) {
  const metaTags = html.match(/<meta\b[^>]*>/gi) || [];
  for (const tag of metaTags) {
    if (!/\bname\s*=\s*["']description["']/i.test(tag)) continue;
    const content = tag.match(/\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    if (content) return (content[1] ?? content[2]).trim();
  }
  return null;
}

/**
 * Collect every string `@type` from a parsed JSON-LD value: plain objects,
 * top-level arrays, array-valued `@type` and `@graph` containers.
 *
 * @param {*} node - Parsed JSON-LD value
 * @param {string[]} found - Accumulator that receives the type names
 */
function collectJsonLdTypes(node, found) {
  if (Array.isArray(node)) {
    for (const item of node) collectJsonLdTypes(item, found);
    return;
  }
  if (node === null || typeof node !== 'object') return;

  const declared = node['@type'];
  for (const type of Array.isArray(declared) ? declared : [declared]) {
    if (typeof type === 'string' && type) found.push(type);
  }
  if (Array.isArray(node['@graph'])) collectJsonLdTypes(node['@graph'], found);
}

/**
 * Comprehensive Technical SEO audit.
 * Returns a detailed sub-report with individual check results plus an
 * `overall_score` (0-10). The report is a standalone section of the scoring
 * result; callers may also use it to adjust factor scores.
 *
 * @param {string} html - Raw HTML of the page
 * @returns {Object} Technical SEO check results
 */
export function scoreTechnicalSEO(html) {
  const checks = {};

  // 1. Meta description — presence and optimal length (120-160 chars)
  const metaDesc = readMetaDescription(html);
  checks.meta_description = {
    present: !!metaDesc,
    length: metaDesc ? metaDesc.length : 0,
    optimal: metaDesc ? metaDesc.length >= 120 && metaDesc.length <= 160 : false,
    value: metaDesc ? metaDesc.slice(0, 200) : null,
  };

  // 2. Title tag quality — presence, optimal length (30-60 chars), brand separator
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const titleText = titleMatch ? titleMatch[1].replace(/<[^>]+>/g, '').trim() : null;
  checks.title_tag = {
    present: !!titleText,
    length: titleText ? titleText.length : 0,
    optimal: titleText ? titleText.length >= 30 && titleText.length <= 60 : false,
    // A " | " or " - " separator suggests a brand name (e.g., "Services | Brand")
    has_separator: titleText ? /\s[-|–—]\s/.test(titleText) : false,
    value: titleText ? titleText.slice(0, 100) : null,
  };

  // 3. H1 tags — exactly one is optimal
  const h1Count = (html.match(/<h1[^>]*>/gi) || []).length;
  let h1Issue = null;
  if (h1Count === 0) h1Issue = 'missing';
  else if (h1Count > 1) h1Issue = 'multiple';
  checks.h1_tags = { count: h1Count, optimal: h1Count === 1, issue: h1Issue };

  // 4. Schema.org structured data — JSON-LD blocks and microdata
  const ldJsonBlocks =
    html.match(
      /<script[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi
    ) || [];
  const jsonLdTypes = [];
  for (const block of ldJsonBlocks) {
    const content = block.replace(/<\/?script[^>]*>/gi, '').trim();
    try {
      collectJsonLdTypes(JSON.parse(content), jsonLdTypes);
    } catch {
      // Malformed JSON-LD — still counts as an attempt
      jsonLdTypes.push('malformed');
    }
  }
  const microdataTypes = [
    ...html.matchAll(/itemtype\s*=\s*["']https?:\/\/schema\.org\/(\w+)["']/gi),
  ].map(match => match[1]);

  const allSchemaTypes = [...new Set([...jsonLdTypes, ...microdataTypes])];
  const valuableSchemaTypes = [
    'LocalBusiness',
    'Organization',
    'FAQ',
    'AggregateRating',
    'Product',
    'Service',
    'WebSite',
    'BreadcrumbList',
    'Review',
  ];
  checks.structured_data = {
    present: ldJsonBlocks.length > 0 || microdataTypes.length > 0,
    json_ld_count: ldJsonBlocks.length,
    types: allSchemaTypes,
    // Substring match so that e.g. 'FAQPage' counts as 'FAQ'
    has_valuable_types: allSchemaTypes.some(type =>
      valuableSchemaTypes.some(valuable => type.includes(valuable))
    ),
  };

  // 5. Open Graph meta — og:title, og:description, og:image
  const hasOgTag = property =>
    new RegExp(`<meta[^>]*property\\s*=\\s*["']og:${property}["']`, 'i').test(html);
  const ogFlags = [hasOgTag('title'), hasOgTag('description'), hasOgTag('image')];
  checks.open_graph = {
    has_title: ogFlags[0],
    has_description: ogFlags[1],
    has_image: ogFlags[2],
    complete: ogFlags.every(Boolean),
    count: ogFlags.filter(Boolean).length,
  };

  // 6. Favicon
  checks.favicon = {
    present: /<link[^>]*rel\s*=\s*["'](icon|shortcut icon|apple-touch-icon)["']/i.test(html),
  };

  // 7. HTML lang attribute
  const htmlLangMatch = html.match(/<html[^>]*lang\s*=\s*["']([^"']+)["']/i);
  checks.html_lang = {
    present: !!htmlLangMatch,
    value: htmlLangMatch ? htmlLangMatch[1] : null,
  };

  // 8. Render-blocking resources in <head>
  const headMatch = html.match(/<head[^>]*>([\s\S]*?)<\/head>/i);
  const headHtml = headMatch ? headMatch[1] : '';

  // Stylesheets block rendering unless print-only or loaded through the
  // rel="preload" + onload swap pattern.
  const blockingStylesheets = (
    headHtml.match(/<link[^>]*rel\s*=\s*["']stylesheet["'][^>]*>/gi) || []
  ).filter(
    tag => !/media\s*=\s*["']print["']/i.test(tag) && !/rel\s*=\s*["']preload["']/i.test(tag)
  );

  // External scripts block unless async/defer; module scripts are deferred by default.
  const blockingScripts = (
    headHtml.match(/<script[^>]*src\s*=\s*["'][^"']+["'][^>]*>/gi) || []
  ).filter(
    tag =>
      !/\basync\b/i.test(tag) &&
      !/\bdefer\b/i.test(tag) &&
      !/\btype\s*=\s*["']module["']/i.test(tag)
  );

  const totalBlocking = blockingStylesheets.length + blockingScripts.length;
  checks.render_blocking = {
    blocking_stylesheets: blockingStylesheets.length,
    blocking_scripts: blockingScripts.length,
    total_blocking: totalBlocking,
    issue: totalBlocking > 5 ? 'excessive' : null,
  };

  // Overall technical SEO score (0-10)
  let techScore = 3; // Base: site loads and has HTML
  if (checks.meta_description.present) techScore += 1;
  if (checks.meta_description.optimal) techScore += 0.5;
  if (checks.title_tag.present && checks.title_tag.optimal) techScore += 0.5;
  if (checks.h1_tags.optimal) techScore += 0.5;
  if (checks.h1_tags.count > 1) techScore -= 0.5; // Penalty for multiple H1s
  if (checks.structured_data.has_valuable_types) techScore += 1.5;
  else if (checks.structured_data.present) techScore += 0.5;
  if (checks.open_graph.complete) techScore += 1;
  else if (checks.open_graph.count >= 1) techScore += 0.5;
  if (checks.favicon.present) techScore += 0.5;
  if (checks.html_lang.present) techScore += 0.5;
  if (totalBlocking <= 3) techScore += 0.5;
  else if (totalBlocking > 8) techScore -= 0.5;

  checks.overall_score = Math.max(0, Math.min(10, Math.round(techScore * 10) / 10));

  return checks;
}
/**
 * Score page speed from performance timing data.
 * Returns a speed rating (0-10) and detailed metrics.
 *
 * Thresholds based on Google's Core Web Vitals guidelines:
 * - Good: FCP < 1.8s, LCP < 2.5s, DCL < 2s
 * - Needs improvement: FCP 1.8-3s, LCP 2.5-4s
 * - Poor: FCP > 3s, LCP > 4s
 *
 * Metrics that are null/undefined are skipped; a load time of 0 or less is
 * also skipped.
 *
 * @param {Object|null} perfJson - Performance data from assets capture
 * @returns {Object} Page speed score, rating, issues/strengths and raw metrics
 */
export function scorePageSpeed(perfJson) {
  if (!perfJson) {
    return {
      score: null,
      rating: 'unknown',
      details: 'No performance data available',
      metrics: null,
    };
  }

  const {
    firstContentfulPaint: fcp,
    domContentLoaded: dcl,
    loadTime: load,
    domInteractive: di,
    totalTransferSize: transferBytes,
    resourceCount: requests,
  } = perfJson;

  const issues = [];
  const strengths = [];
  let score = 5; // Base: page loads

  const isMeasured = value => value !== null && value !== undefined;
  const reward = (points, message) => {
    score += points;
    strengths.push(message);
  };
  const penalize = (points, message) => {
    score -= points;
    issues.push(message);
  };

  // First Contentful Paint (FCP) — Core Web Vital
  if (isMeasured(fcp)) {
    if (fcp < 1000) reward(2, `Fast FCP (${fcp}ms)`);
    else if (fcp < 1800) reward(1, `Good FCP (${fcp}ms)`);
    else if (fcp < 3000) penalize(0, `Slow FCP (${fcp}ms, target <1.8s)`);
    else penalize(1, `Very slow FCP (${fcp}ms, target <1.8s)`);
  }

  // DOM Content Loaded (1500-3000ms is neutral)
  if (isMeasured(dcl)) {
    if (dcl < 1500) reward(1, `Fast DCL (${dcl}ms)`);
    else if (dcl >= 3000) penalize(1, `Slow DCL (${dcl}ms)`);
  }

  // Load time (2000-4000ms is neutral)
  if (isMeasured(load) && load > 0) {
    if (load < 2000) reward(1, `Fast load (${load}ms)`);
    else if (load >= 4000) penalize(1, `Slow load (${load}ms)`);
  }

  // DOM Interactive (1000-3000ms is neutral)
  if (isMeasured(di)) {
    if (di < 1000) reward(1, `Fast DOM interactive (${di}ms)`);
    else if (di > 3000) penalize(1, `Slow DOM interactive (${di}ms)`);
  }

  // Transfer size (page weight); 500-2000KB is neutral
  if (isMeasured(transferBytes)) {
    const sizeKB = Math.round(transferBytes / 1024);
    if (sizeKB < 500) reward(1, `Light page (${sizeKB}KB)`);
    else if (sizeKB >= 5000) penalize(1, `Very heavy page (${sizeKB}KB)`);
    else if (sizeKB >= 2000) penalize(0, `Heavy page (${sizeKB}KB)`);
  }

  // Resource count (30-100 requests is neutral)
  if (isMeasured(requests)) {
    if (requests > 100) penalize(1, `Too many requests (${requests})`);
    else if (requests < 30) reward(1, `Low request count (${requests})`);
  }

  score = Math.max(0, Math.min(10, score));

  // Highest threshold the score reaches decides the label
  const ratingBands = [
    [8, 'fast'],
    [6, 'good'],
    [4, 'average'],
    [2, 'slow'],
  ];
  const band = ratingBands.find(([minimum]) => score >= minimum);
  const rating = band ? band[1] : 'very_slow';

  return {
    score,
    rating,
    issues,
    strengths,
    metrics: {
      fcp_ms: fcp,
      dcl_ms: dcl,
      load_ms: load,
      dom_interactive_ms: di,
      transfer_size_bytes: transferBytes,
      resource_count: requests,
    },
  };
}
// ─── Main Scoring Function ──────────────────────────────────────────────────

/**
 * Score a website programmatically from its HTML.
 *
 * Flow: reject broken/error pages with a zero score, score the 10 conversion
 * factors (or apply calibrated neutral scores for JS-rendered sites), nudge
 * factors by at most ±1 per technical SEO / page speed signal, then compute
 * the weighted score, grade and metadata.
 *
 * @param {string} html - Raw HTML of the page
 * @param {string} pageUrl - URL of the page
 * @param {string} [keyword] - Search keyword that found this site
 * @param {Object|null} [perfJson] - Performance timing data from assets capture
 * @returns {Object} Score result matching the LLM output schema
 */
export function scoreWebsiteProgrammatically(html, pageUrl, keyword = null, perfJson = null) {
  // Error/broken detection. Emptiness is checked first so that missing HTML
  // yields the "broken site" result instead of failing inside text analysis.
  const isBrokenSite = !html || html.length < 200;
  const errorCheck = html ? detectErrorPage(html) : { is_error_page: false, reason: null };

  if (isBrokenSite || errorCheck.is_error_page) {
    return {
      website_url: pageUrl,
      evaluation_date: new Date().toISOString(),
      conversion_score: 0,
      letter_grade: 'F',
      is_error_page: errorCheck.is_error_page,
      is_broken_site: isBrokenSite,
      error_reason: errorCheck.reason || 'Insufficient HTML content',
      factor_scores: null,
      contacts: extractContactsFromHtml(html || '', pageUrl),
    };
  }

  // Detect JS-heavy sites and language
  const { isJsHeavy, lang } = detectSiteCharacteristics(html);

  // Score all 10 factors.
  // For JS-heavy sites: apply calibrated neutral scores (~62 overall) since content
  // is rendered client-side and HTML analysis would give misleadingly low results.
  const neutralReasoning = 'JS-rendered site — HTML analysis unreliable, neutral score applied';
  const neutral = (score, evidence) => ({ score, reasoning: neutralReasoning, evidence });

  const factor_scores = isJsHeavy
    ? {
        headline_quality: neutral(6, 'Next.js/React SSR detected'),
        value_proposition: neutral(7, 'Content rendered client-side'),
        unique_selling_proposition: neutral(5, 'Content rendered client-side'),
        call_to_action: neutral(7, 'Content rendered client-side'),
        urgency_messaging: neutral(2, 'Typically low for local businesses'),
        hook_engagement: neutral(7, 'JS sites typically have good imagery'),
        trust_signals: neutral(6, 'Content rendered client-side'),
        imagery_design: neutral(8, 'Modern JS framework implies good design'),
        offer_clarity: neutral(7, 'Content rendered client-side'),
        contextual_appropriateness: neutral(7, 'Content rendered client-side'),
      }
    : {
        headline_quality: scoreHeadlineQuality(html, lang),
        value_proposition: scoreValueProposition(html, lang),
        unique_selling_proposition: scoreUSP(html, lang),
        call_to_action: scoreCTA(html, lang),
        urgency_messaging: scoreUrgency(html, lang),
        hook_engagement: scoreHook(html),
        trust_signals: scoreTrustSignals(html, lang),
        imagery_design: scoreImageryDesign(html),
        offer_clarity: scoreOfferClarity(html, lang),
        contextual_appropriateness: scoreContext(html, keyword, lang),
      };

  // Run technical SEO checks and page speed scoring
  const technicalSEO = scoreTechnicalSEO(html);
  const pageSpeed = scorePageSpeed(perfJson);

  // Apply technical SEO signals as small adjustments (±1 per signal, clamped
  // to 0-10) so the factor weighting system itself stays untouched. Neutral
  // JS-heavy scores are left as calibrated.
  const adjust = (factor, delta, note) => {
    const entry = factor_scores[factor];
    entry.score = Math.max(0, Math.min(10, entry.score + delta));
    entry.reasoning += note;
  };

  if (!isJsHeavy) {
    // Headline quality: penalize multiple H1s, bonus for a well-sized title
    if (technicalSEO.h1_tags.count > 1) {
      adjust('headline_quality', -1, ' | Multiple H1 tags detected (SEO issue)');
    }
    if (technicalSEO.title_tag.present && technicalSEO.title_tag.optimal) {
      adjust('headline_quality', 1, ' | Good title tag length');
    }

    // Trust signals: bonus for structured data (schema.org)
    if (technicalSEO.structured_data.has_valuable_types) {
      adjust(
        'trust_signals',
        1,
        ` | Schema.org: ${technicalSEO.structured_data.types.join(', ')}`
      );
    }

    // Imagery/Design: bonus for OG tags, penalties for missing favicon and
    // excessive render-blocking resources
    if (technicalSEO.open_graph.complete) {
      adjust('imagery_design', 1, ' | Complete Open Graph meta tags');
    }
    if (!technicalSEO.favicon.present) {
      adjust('imagery_design', -1, ' | Missing favicon');
    }
    if (technicalSEO.render_blocking.total_blocking > 8) {
      adjust(
        'imagery_design',
        -1,
        ` | ${technicalSEO.render_blocking.total_blocking} render-blocking resources`
      );
    }

    // Contextual: bonus for html lang attribute
    if (technicalSEO.html_lang.present) {
      adjust(
        'contextual_appropriateness',
        1,
        ` | HTML lang="${technicalSEO.html_lang.value}"`
      );
    }

    // Offer clarity: bonus for meta description (helps search appearance)
    if (technicalSEO.meta_description.present && technicalSEO.meta_description.optimal) {
      adjust('offer_clarity', 1, ' | Optimal meta description length');
    }

    // Hook/Engagement: bonus/penalty from page speed (skipped without perf data)
    if (pageSpeed.score !== null && pageSpeed.score !== undefined) {
      if (pageSpeed.score >= 8) {
        adjust('hook_engagement', 1, ` | Fast page speed (${pageSpeed.rating})`);
      } else if (pageSpeed.score <= 3) {
        adjust('hook_engagement', -1, ` | Slow page speed (${pageSpeed.rating})`);
      }
    }
  }

  // Compute weighted score (vision-aware weights); vision is on unless
  // ENABLE_VISION is explicitly the string 'false'.
  const visionEnabled = process.env.ENABLE_VISION !== 'false';
  const conversion_score = computeWeightedScore(factor_scores, visionEnabled);
  const letter_grade = computeGrade(conversion_score);

  // Extract contacts
  const contacts = extractContactsFromHtml(html, pageUrl);

  // Metadata
  const isDirectory = detectBusinessDirectory(html);
  const industry = classifyIndustry(html, keyword);
  const location = extractLocation(html);
  const tldResult = pageUrl ? detectCountryFromTLD(pageUrl) : null;

  return {
    website_url: pageUrl,
    evaluation_date: new Date().toISOString(),
    conversion_score,
    letter_grade,
    factor_scores,
    technical_seo: technicalSEO,
    page_speed: pageSpeed,
    is_error_page: false,
    is_broken_site: false,
    is_js_heavy: isJsHeavy,
    is_business_directory: isDirectory,
    is_local_business: !isDirectory,
    is_law_firm: industry === 'lawyer',
    industry_classification: industry,
    country_code: tldResult?.countryCode || null,
    city: location.city,
    state: location.state,
    contacts,
  };
}
/**
 * Classify a page as JS-heavy (not enough server-rendered content for the
 * HTML-based scorers) and determine its language.
 *
 * A page is reported as JS-heavy when any of the following holds:
 *   - it contains a Next.js, Vue/Nuxt, Remix or Angular marker;
 *   - the HTML is longer than 30000 chars but yields fewer than 150 ASCII
 *     words of 4+ letters once tags are removed;
 *   - the HTML is longer than 2000 chars but yields fewer than 300 chars of
 *     visible text once scripts, styles and tags are removed;
 *   - detectLang() reports a language outside en/fr/pl/es/de/it.
 *
 * @param {string} html - Page HTML; null/undefined is treated as an empty page.
 * @returns {{isJsHeavy: boolean, lang: (string|null)}} `lang` is null when a
 *   framework marker or sparse-content check fired (the language is not
 *   inspected in that case), the detected code when the language is
 *   unsupported, and the detected code (or 'en' when none was detected)
 *   otherwise.
 */
function detectSiteCharacteristics(html) {
  // Same null-tolerance as stripHtml(): a missing document is an empty one.
  const source = html || '';

  // None of these patterns carries the /g flag, so .test() is stateless.
  const frameworkMarkers = [
    // Next.js App Router flight protocol markers
    /self\.__next_f|__next_f\.push|\$RC\(/,
    // Vue/Nuxt SSR hydration markers
    /window\.__NUXT__|__vue_ssr_context__|nuxtState/,
    // Remix/React Router flight
    /__remixContext|window\.__remixManifest/,
    // Angular universal
    /ng-server-context|ng-version/,
  ];
  if (frameworkMarkers.some((marker) => marker.test(source))) {
    return { isJsHeavy: true, lang: null };
  }

  // Generic: huge HTML but nearly no readable words. Only tags are removed
  // here, so inline script/style text still contributes to the count.
  // NOTE(review): the word pattern is ASCII-only, so accented words on
  // fr/pl/es/de/it pages are only partially counted — confirm the 150-word
  // threshold is still appropriate for those languages.
  const readableWords = source.replace(/<[^>]+>/g, ' ').match(/\b[a-zA-Z]{4,}\b/g) || [];
  if (source.length > 30000 && readableWords.length < 150) {
    return { isJsHeavy: true, lang: null };
  }

  // Sparse visible content after stripping scripts/styles (LiteSpeed
  // lazy-load skeletons, etc.). Unlike stripHtml(), character references are
  // left in place so the 300-char threshold keeps its existing meaning.
  const visibleText = source
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  if (source.length > 2000 && visibleText.length < 300) {
    return { isJsHeavy: true, lang: null };
  }

  // Language comes from detectLang(). Languages without a keyword set are
  // still routed to neutral scores by flagging the page as JS-heavy; the
  // supported ones get proper language-specific scoring.
  const lang = detectLang(source);
  const supportedLangs = new Set(['en', 'fr', 'pl', 'es', 'de', 'it']);
  if (lang && !supportedLangs.has(lang)) {
    return { isJsHeavy: true, lang };
  }

  return { isJsHeavy: false, lang: lang || 'en' };
}

/**
 * @deprecated Use detectSiteCharacteristics() instead.
 * Kept for backward compatibility — returns only the isJsHeavy boolean.
 *
 * @param {string} html - Page HTML.
 * @returns {boolean} True when detectSiteCharacteristics() flags the page.
 */
function isJsHeavySite(html) {
  return detectSiteCharacteristics(html).isJsHeavy;
}

/**
 * Reduce HTML to a single line of visible text.
 *
 * Script and style elements are dropped together with their content, every
 * remaining tag and every character reference becomes a space, and runs of
 * whitespace are collapsed.
 *
 * @param {string} html - Markup to strip; falsy input yields ''.
 * @returns {string} Trimmed, whitespace-normalised text.
 */
function stripHtml(html) {
  return (
    (html || '')
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      // Named (&nbsp;, &frac12;), decimal (&#39;) and hex (&#x2014;) character
      // references. They are replaced by a space rather than decoded, which is
      // how named references were already handled.
      .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}