/**
 * Keyword Filter Module (keyword-filter.js)
 *
 * LLM-based filtering for business/service keywords using Sofia's templates:
 * 1. Pre-overview: Remove obvious non-service keywords (jobs, education, products)
 * 2. Post-overview: Rank by business value (intent + volume + competition)
 */

import Logger from './logger.js';
import { callLLM } from './llm-provider.js';
import { parse } from 'csv-parse/sync';
import fs from 'fs';
import path from 'path';

const logger = new Logger('KeywordFilter');

// Deterministic regex patterns for strong negatives (run before LLM).
// None of these use the /g flag, so `.test()` is stateless and safe to reuse.
const NEGATIVE_PATTERNS = {
  job: /\b(salary|wage|pay|jobs?|vacancy|vacancies|apprentice|apprenticeship|hiring|recruit|recruitment|cv|resume|career)\b/i,
  education:
    /\b(course|courses|training|certification|certificat|class|degree|school|university|diploma)\b/i,
  products: /\b(supplies|tools?|equipment|parts?|replacement|spare|buy|for sale|shop|store)\b/i,
  entertainment: /\b(band|movie|song|concert|show|lyrics|tickets?)\b/i,
  informational: /\b(how to|how do i|tutorial|guide|diy|do it yourself|meaning|definition)\b/i,
  nearme: /\b(near me|nearby|close to me|around me|in my area)\b/i, // Won't combine well with region names
};

// Categories that always cause removal, even when a positive token is present.
const HARD_FILTER_CATEGORIES = new Set(['nearme']);

// Positive tokens that indicate service/hiring intent (but NOT "near me" - we filter those out)
const POSITIVE_PATTERNS =
  /\b(hire|hire a|call|book|appointment|emergency|24\/7|repair|service|install|fix|maintenance|company|local)\b/i;

// Default weights used by the post-overview ranking prompt.
const DEFAULT_METRIC_WEIGHTS = Object.freeze({
  intent_weight: 0.5,
  volume_weight: 0.3,
  competition_weight: 0.2,
});

// System prompt for the pre-overview LLM filter.
const PRE_FILTER_SYSTEM_PROMPT = `You are a precise keyword filter assistant for local service businesses (plumbers, electricians, landscapers, etc.).
Your task: from the provided list of keywords, remove non-service queries that are not generic business types.

REMOVE these categories:
- Jobs/careers: salary, apprenticeship, hiring, recruitment
- Education: course, training, certification, degree
- Products/supplies: tools, equipment, parts, buy, shop
- Entertainment: band, movie, concert, show
- Informational: how to, tutorial, guide, meaning
- Brand/company names: specific company names (e.g., "reliance home comfort", "great northern insulation")
- Wrong category: keywords for unrelated activities (e.g., "fencing sword" is sport, not fence installation)
- Location modifiers: "near me", "nearby", "close to me" (these will be combined with region names later)

KEEP only:
- Generic service types: "plumber", "electrician", "landscaping"
- Generic service + action: "plumber emergency", "hvac repair", "tree removal"

Rules:
- Output only JSON matching the schema described below.
- Be aggressive: remove anything that isn't a GENERIC service type or service+action.
- Remove brand-specific keywords (company names).
- Remove "near me" variants (these don't combine well with region names).
- Always return "reason" for each removed keyword: job, education, products, entertainment, informational, brand, wrong_category, near_me, other.

Few-shot examples:
Keep: "plumber" -> generic service type
Keep: "emergency plumber" -> generic service + modifier
Remove: "plumber near me" -> reason: near_me (won't combine with regions)
Remove: "reliance home comfort" -> reason: brand (company name, not service type)
Remove: "fencing sword" -> reason: wrong_category (sport, not fence installation)
Remove: "carpenter salary" -> reason: job
Remove: "plumber supplies" -> reason: products`;

/**
 * Escape every regex metacharacter in `text` so it can be embedded literally
 * in a RegExp source string.
 *
 * @param {string} text - Raw text to embed.
 * @returns {string} The escaped text.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Parse a JSON object out of an LLM reply, tolerating an optional Markdown
 * code fence (with or without a language tag, LF or CRLF line endings).
 *
 * @param {unknown} raw - The `content` field of the LLM response.
 * @returns {any} The parsed JSON value.
 * @throws {SyntaxError} If the remaining text is not valid JSON.
 */
function parseLLMJson(raw) {
  const text = String(raw ?? '').trim();
  const unfenced = text.replace(/^```[a-z]*\s*/i, '').replace(/\s*```$/, '');
  return JSON.parse(unfenced);
}

/**
 * Load region names for a specific country from validated CSV
 * (`data/<countryCode>/regions-final-filtered.csv` under the current working
 * directory). Used to detect place-specific keywords in business lists.
 *
 * @param {string} countryCode - Country code used as the data sub-directory name.
 * @returns {string[]} Lower-cased, trimmed, non-empty region names; an empty
 *   array when the file is missing or cannot be read/parsed.
 */
function loadRegionNames(countryCode) {
  try {
    const regionsPath = path.join(
      process.cwd(),
      'data',
      countryCode.toLowerCase(),
      'regions-final-filtered.csv'
    );
    if (!fs.existsSync(regionsPath)) {
      return [];
    }

    const content = fs.readFileSync(regionsPath, 'utf-8');
    const records = parse(content, {
      columns: true,
      skip_empty_lines: true,
    });

    // Trim before use: stray whitespace in the CSV would otherwise end up in
    // the matching regex and silently prevent matches. Rows without a
    // `keyword` value are skipped instead of discarding the whole list.
    return records.map(row => String(row.keyword ?? '').trim().toLowerCase()).filter(Boolean);
  } catch (error) {
    logger.warn(`Failed to load regions for ${countryCode}: ${error.message}`);
    return [];
  }
}

/**
 * Filter out place-specific keywords from business lists.
 * Removes keywords like "plumber auckland", "electrician sydney".
 *
 * @param {string[]} keywords - Candidate keywords.
 * @param {string|null} countryCode - Country whose region list is used; when
 *   falsy (or when no regions are available) nothing is removed.
 * @returns {{kept: string[], removed: Array<{keyword: string, reason: string}>}}
 */
function filterPlaceSpecific(keywords, countryCode) {
  if (!countryCode) {
    return { kept: keywords, removed: [] };
  }

  const regions = loadRegionNames(countryCode);
  if (regions.length === 0) {
    return { kept: keywords, removed: [] };
  }

  // Compile one matcher per region up front instead of one per
  // (keyword, region) pair. Whole-word matching uses Unicode-aware
  // lookarounds rather than `\b`, which only understands ASCII word
  // characters and therefore misses regions that start or end with a
  // non-ASCII letter or punctuation (e.g. "malmö", "washington d.c.").
  // "sydney" still matches "plumber sydney" but not "sydneysider".
  const matchers = regions.map(region => ({
    region,
    regex: new RegExp(`(?<![\\p{L}\\p{N}_])${escapeRegExp(region)}(?![\\p{L}\\p{N}_])`, 'iu'),
  }));

  const kept = [];
  const removed = [];

  for (const keyword of keywords) {
    const kw = keyword.toLowerCase();
    const hit = matchers.find(({ regex }) => regex.test(kw));

    if (hit) {
      removed.push({ keyword, reason: `place-specific (${hit.region})` });
    } else {
      kept.push(keyword);
    }
  }

  return { kept, removed };
}

/**
 * Deterministic pre-filter using regex patterns.
 * Removes obvious non-service keywords before LLM.
 *
 * Decision order per keyword:
 * 1. No negative pattern matches -> kept.
 * 2. Any hard-filter category matches (currently "nearme") -> removed, even
 *    if another negative category matched first and positive tokens exist.
 * 3. A negative category matches together with a positive token -> kept
 *    (e.g., "plumber tools repair").
 * 4. Otherwise -> removed with the first matching category as the reason.
 *
 * @param {string[]} keywords - Candidate keywords.
 * @returns {{kept: string[], removed: Array<{keyword: string, reason: string}>}}
 */
function deterministicFilter(keywords) {
  const kept = [];
  const removed = [];

  for (const keyword of keywords) {
    const kw = keyword.toLowerCase();

    const matchedCategories = Object.entries(NEGATIVE_PATTERNS)
      .filter(([, pattern]) => pattern.test(kw))
      .map(([category]) => category);

    // No negative pattern matched
    if (matchedCategories.length === 0) {
      kept.push(keyword);
      continue;
    }

    // Hard filters are checked across ALL matches, not only the first one,
    // so "tool repair near me" cannot slip through via the positive-token rule.
    const hardCategory = matchedCategories.find(category => HARD_FILTER_CATEGORIES.has(category));
    if (hardCategory) {
      removed.push({ keyword, reason: hardCategory });
      continue;
    }

    const [category] = matchedCategories;
    if (POSITIVE_PATTERNS.test(kw)) {
      logger.debug(`Keeping "${keyword}" despite ${category} match (has positive tokens)`);
      kept.push(keyword);
    } else {
      removed.push({ keyword, reason: category });
    }
  }

  return { kept, removed };
}

/**
 * Pre-overview LLM filter.
 * Removes non-service keywords by asking the LLM to classify them.
 *
 * On any failure (LLM error, unparsable reply, reply without a
 * `filtered_keywords` array) all input keywords are kept.
 *
 * @param {string[]} keywords - Keywords to filter.
 * @param {string} [language='en'] - Language tag passed to the LLM.
 * @param {number} [maxOutput=80] - Accepted for interface compatibility; it is
 *   not currently sent to the LLM.
 * @returns {Promise<{filtered_keywords: string[], removed: Array<{keyword: string, reason: string}>}>}
 */
async function preOverviewFilter(keywords, language = 'en', maxOutput = 80) {
  if (keywords.length === 0) return { filtered_keywords: [], removed: [] };

  // The reason enum mirrors the list required by the system prompt, and
  // `language` is JSON-encoded so quotes in it cannot break the payload.
  const userPrompt = `Input JSON:
{ "language": ${JSON.stringify(language)}, "keywords": ${JSON.stringify(keywords)} }

Output JSON schema:
{ "filtered_keywords": [...], "removed": [{"keyword":"...","reason":"job|education|products|entertainment|informational|brand|wrong_category|near_me|other"}] }

Output only the JSON object.`;

  try {
    const response = await callLLM({
      model: process.env.SCORING_MODEL || 'openai/gpt-4o-mini',
      messages: [
        { role: 'system', content: PRE_FILTER_SYSTEM_PROMPT },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.0,
      // NOTE(review): this budget may truncate the reply for long keyword
      // lists, which would trigger the keep-all fallback below - confirm.
      max_tokens: 1000,
      stage: 'keywords',
    });

    const result = parseLLMJson(response.content);
    if (!Array.isArray(result?.filtered_keywords)) {
      throw new Error('LLM response is missing the "filtered_keywords" array');
    }

    // Callers read `.removed.length` and spread it, so always hand back an array.
    return {
      ...result,
      removed: Array.isArray(result.removed) ? result.removed : [],
    };
  } catch (error) {
    logger.error(`LLM pre-filter failed: ${error.message}`);
    // Fallback: keep all keywords
    return { filtered_keywords: keywords, removed: [] };
  }
}

/**
 * Post-overview LLM ranking.
 * Scores and ranks keywords by business value (intent + volume + competition).
 *
 * On any failure (LLM error, unparsable reply, reply without a `ranked`
 * array) the keywords are ranked by descending `search_volume` instead.
 * The input array is never mutated.
 *
 * @param {Array<{keyword: string, search_volume?: number, competition?: number, cpc?: number}>} keywordsData
 *   Keywords with their metrics.
 * @param {string} [language='en'] - Language tag passed to the LLM.
 * @param {{intent_weight?: number, volume_weight?: number, competition_weight?: number}|null} [metricWeights=null]
 *   Optional weight overrides; missing entries fall back to the defaults.
 * @returns {Promise<{ranked: Array<{keyword: string, score: number, rank: number, explanation: string}>}>}
 */
async function postOverviewRank(keywordsData, language = 'en', metricWeights = null) {
  if (keywordsData.length === 0) return { ranked: [] };

  // Merge so a partial override never renders "undefined" into the prompt.
  const weights = { ...DEFAULT_METRIC_WEIGHTS, ...metricWeights };

  const systemPrompt = `You are an expert SEO analyst. Given a list of keywords with metrics (search_volume, competition [0..1], cpc), score and rank them by business value for a local service business. Use a combined score of:
- Intent score (0-100) based on keyword text: high for hiring/service intent (emergency, hire, near me, repair, install), low for informational queries (how to, tutorial, meaning).
- Volume component: normalize search_volume across the dataset to 0-100.
- Competition component: invert competition (1 - competition) to reward lower competition.

Combine as:
final_score = round( intent_weight*intent + volume_weight*volume_norm + competition_weight*(1-competition)*100 , 1 )

Where weights: intent ${weights.intent_weight}, volume ${weights.volume_weight}, competition ${weights.competition_weight}.

Intent detection rules for intent_score (0-100):
- 90..100 for clear hiring/commercial: contains "emergency", "hire", "call", "book", "near me", "24/7", "service", "repair", "install", "company", "local".
- 70..89 for transactional/commercial: "cost", "price", "quote", "rates", "estimate".
- 40..69 for mixed/comparative: "best plumber", "plumber reviews".
- 0..39 for informational/how-to/research/education: "how to", "what is", "tutorial", "course", "apprentice", "salary".

Rules:
- Provide intent as part of explanation with a short justification.
- Break ties by higher volume and then lower competition.
- Output only the JSON object.`;

  const userPrompt = `Input JSON: { "language": ${JSON.stringify(language)}, "keywords_data": ${JSON.stringify(keywordsData)} }

Output JSON:
{ "ranked": [ { "keyword":"...", "score":float, "rank":int, "explanation":"..." }, ... ] }

Output only the JSON object.`;

  try {
    const response = await callLLM({
      model: process.env.SCORING_MODEL || 'openai/gpt-4o-mini',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.0,
      max_tokens: 2000,
      stage: 'keywords',
    });

    const result = parseLLMJson(response.content);
    if (!Array.isArray(result?.ranked)) {
      throw new Error('LLM response is missing the "ranked" array');
    }
    return result;
  } catch (error) {
    logger.error(`LLM post-rank failed: ${error.message}`);

    // Fallback: return sorted by volume. Sort a copy so the caller's array
    // keeps its order, and treat a missing/non-numeric volume as 0 so the
    // comparator never returns NaN.
    const volumeOf = entry => Number(entry.search_volume) || 0;
    return {
      ranked: [...keywordsData]
        .sort((a, b) => volumeOf(b) - volumeOf(a))
        .map((kw, i) => ({
          keyword: kw.keyword,
          score: 100 - i,
          rank: i + 1,
          explanation: 'Fallback: sorted by volume',
        })),
    };
  }
}

/**
 * Combined filter: place-specific + deterministic + LLM pre-filter.
 *
 * The LLM step only runs when more than `maxOutput` keywords survive the
 * deterministic steps.
 *
 * @param {string[]} keywords - Candidate keywords.
 * @param {string} [language='en'] - Language tag passed to the LLM.
 * @param {number} [maxOutput=80] - Threshold above which the LLM filter runs.
 * @param {string|null} [countryCode=null] - Enables the place-specific filter
 *   when provided.
 * @returns {Promise<{filtered_keywords: string[], removed: Array<{keyword: string, reason: string}>}>}
 */
async function filterKeywordsPreOverview(
  keywords,
  language = 'en',
  maxOutput = 80,
  countryCode = null
) {
  logger.info(`Pre-overview filtering: ${keywords.length} keywords`);

  const allRemoved = [];
  let currentKeywords = keywords;

  // Step 0: Filter place-specific keywords (for businesses only)
  if (countryCode) {
    const { kept: placeKept, removed: placeRemoved } = filterPlaceSpecific(
      currentKeywords,
      countryCode
    );
    if (placeRemoved.length > 0) {
      logger.info(
        ` Place-specific filter: ${placeKept.length} kept, ${placeRemoved.length} removed`
      );
      allRemoved.push(...placeRemoved);
      currentKeywords = placeKept;
    }
  }

  // Step 1: Deterministic filter
  const { kept, removed: detRemoved } = deterministicFilter(currentKeywords);
  logger.info(` Deterministic filter: ${kept.length} kept, ${detRemoved.length} removed`);

  if (detRemoved.length > 0) {
    logger.debug(
      ` Removed (deterministic): ${detRemoved.map(r => `${r.keyword} (${r.reason})`).join(', ')}`
    );
    allRemoved.push(...detRemoved);
  }

  // Step 2: LLM filter, only when the list is still above the threshold
  if (kept.length <= maxOutput) {
    return {
      filtered_keywords: kept,
      removed: allRemoved,
    };
  }

  logger.info(` LLM filter: reducing from ${kept.length} to ~${maxOutput}...`);
  const llmResult = await preOverviewFilter(kept, language, maxOutput);
  logger.success(
    ` LLM filter: ${llmResult.filtered_keywords.length} kept, ${llmResult.removed.length} removed`
  );

  return {
    filtered_keywords: llmResult.filtered_keywords,
    removed: [...allRemoved, ...llmResult.removed],
  };
}

/**
 * Rank keywords after Overview using LLM.
 *
 * @param {Array<{keyword: string, search_volume?: number, competition?: number, cpc?: number}>} keywordsData
 *   Keywords with their metrics.
 * @param {string} [language='en'] - Language tag passed to the LLM.
 * @param {number} [topN=50] - Maximum number of ranked entries to return.
 * @returns {Promise<Array<{keyword: string, score: number, rank: number, explanation: string}>>}
 *   The first `topN` entries of the ranking.
 */
async function rankKeywordsPostOverview(keywordsData, language = 'en', topN = 50) {
  logger.info(`Post-overview ranking: ${keywordsData.length} keywords`);

  const result = await postOverviewRank(keywordsData, language);

  const ranked = result.ranked || [];
  logger.success(` Ranked ${ranked.length} keywords`);

  // Return top N
  return ranked.slice(0, topN);
}

export { filterKeywordsPreOverview, rankKeywordsPostOverview, deterministicFilter };