/ src / utils / keyword-filter.js
keyword-filter.js
  1  /**
  2   * Keyword Filter Module
  3   *
  4   * LLM-based filtering for business/service keywords using Sofia's templates:
  5   * 1. Pre-overview: Remove obvious non-service keywords (jobs, education, products)
  6   * 2. Post-overview: Rank by business value (intent + volume + competition)
  7   */
  8  
  9  import Logger from './logger.js';
 10  import { callLLM } from './llm-provider.js';
 11  import { parse } from 'csv-parse/sync';
 12  import fs from 'fs';
 13  import path from 'path';
 14  
 15  const logger = new Logger('KeywordFilter');
 16  
 17  // Deterministic regex patterns for strong negatives (run before LLM)
 18  const NEGATIVE_PATTERNS = {
 19    job: /\b(salary|wage|pay|jobs?|vacancy|vacancies|apprentice|apprenticeship|hiring|recruit|recruitment|cv|resume|career)\b/i,
 20    education:
 21      /\b(course|courses|training|certification|certificat|class|degree|school|university|diploma)\b/i,
 22    products: /\b(supplies|tools?|equipment|parts?|replacement|spare|buy|for sale|shop|store)\b/i,
 23    entertainment: /\b(band|movie|song|concert|show|lyrics|tickets?)\b/i,
 24    informational: /\b(how to|how do i|tutorial|guide|diy|do it yourself|meaning|definition)\b/i,
 25    nearme: /\b(near me|nearby|close to me|around me|in my area)\b/i, // Won't combine well with region names
 26  };
 27  
 28  // Positive tokens that indicate service/hiring intent (but NOT "near me" - we filter those out)
 29  const POSITIVE_PATTERNS =
 30    /\b(hire|hire a|call|book|appointment|emergency|24\/7|repair|service|install|fix|maintenance|company|local)\b/i;
 31  
 32  /**
 33   * Load region names for a specific country from validated CSV
 34   * Used to detect place-specific keywords in business lists
 35   */
 36  function loadRegionNames(countryCode) {
 37    try {
 38      const regionsPath = path.join(
 39        process.cwd(),
 40        'data',
 41        countryCode.toLowerCase(),
 42        'regions-final-filtered.csv'
 43      );
 44      if (!fs.existsSync(regionsPath)) {
 45        return [];
 46      }
 47  
 48      const content = fs.readFileSync(regionsPath, 'utf-8');
 49      const records = parse(content, {
 50        columns: true,
 51        skip_empty_lines: true,
 52      });
 53  
 54      const regions = records.map(row => row.keyword.toLowerCase()).filter(r => r && r.trim());
 55  
 56      return regions;
 57    } catch (error) {
 58      logger.warn(`Failed to load regions for ${countryCode}: ${error.message}`);
 59      return [];
 60    }
 61  }
 62  
 63  /**
 64   * Filter out place-specific keywords from business lists
 65   * Removes keywords like "plumber auckland", "electrician sydney"
 66   */
 67  function filterPlaceSpecific(keywords, countryCode) {
 68    if (!countryCode) {
 69      return { kept: keywords, removed: [] };
 70    }
 71  
 72    const regions = loadRegionNames(countryCode);
 73    if (regions.length === 0) {
 74      return { kept: keywords, removed: [] };
 75    }
 76  
 77    const kept = [];
 78    const removed = [];
 79  
 80    for (const keyword of keywords) {
 81      const kw = keyword.toLowerCase();
 82      let hasPlace = false;
 83  
 84      // Check if keyword contains any region name
 85      for (const region of regions) {
 86        // Match whole word boundaries to avoid false positives
 87        // e.g., "sydney" should match "plumber sydney" but not "sydneysider"
 88        const regex = new RegExp(`\\b${region.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'i');
 89        if (regex.test(kw)) {
 90          hasPlace = true;
 91          removed.push({ keyword, reason: `place-specific (${region})` });
 92          break;
 93        }
 94      }
 95  
 96      if (!hasPlace) {
 97        kept.push(keyword);
 98      }
 99    }
100  
101    return { kept, removed };
102  }
103  
/**
 * Deterministic pre-filter using regex patterns.
 * Removes obvious non-service keywords before the LLM is consulted.
 *
 * Decision per keyword (matched case-insensitively):
 *  - no NEGATIVE_PATTERNS category matches            -> kept
 *  - a hard-filter category ("nearme") matches        -> removed, always
 *  - another category matches, no positive token      -> removed
 *  - another category matches, positive token present -> kept
 *    (e.g. "plumber tools repair")
 *
 * Hard filters are enforced even when an earlier category matched first and
 * a positive token is present, so "plumber tools repair near me" is removed.
 *
 * @param {string[]} keywords - Candidate keywords.
 * @returns {{kept: string[], removed: Array<{keyword: string, reason: string}>}}
 *   Both lists preserve input order; `reason` is the category that caused the
 *   removal.
 */
function deterministicFilter(keywords) {
  // Hard filters: always remove, even with positive tokens
  const hardFilters = ['nearme'];
  const categories = Object.entries(NEGATIVE_PATTERNS);

  const kept = [];
  const removed = [];

  for (const keyword of keywords) {
    const kw = keyword.toLowerCase();

    // First matching category in declaration order decides the reason.
    const firstMatch = categories.find(([, pattern]) => pattern.test(kw));
    if (!firstMatch) {
      // No negative pattern matched
      kept.push(keyword);
      continue;
    }

    const [category] = firstMatch;
    if (hardFilters.includes(category) || !POSITIVE_PATTERNS.test(kw)) {
      removed.push({ keyword, reason: category });
      continue;
    }

    // A softer category matched first and positive tokens would rescue the
    // keyword; hard filters still have to be checked so they cannot be
    // bypassed just because they come later in NEGATIVE_PATTERNS.
    const hardHit = hardFilters.find(name => NEGATIVE_PATTERNS[name].test(kw));
    if (hardHit) {
      removed.push({ keyword, reason: hardHit });
      continue;
    }

    logger.debug(`Keeping "${keyword}" despite ${category} match (has positive tokens)`);
    kept.push(keyword);
  }

  return { kept, removed };
}
154  
/**
 * Pre-overview LLM filter.
 * Asks the LLM to remove non-service keywords (jobs, education, products,
 * entertainment, informational, brands, wrong category, "near me").
 *
 * @param {string[]} keywords - Keywords to filter.
 * @param {string} [language='en'] - Language tag forwarded to the model in the input JSON.
 * @param {number} [maxOutput=80] - Accepted for interface compatibility; it is
 *   not currently forwarded to the model and does not cap the result.
 * @returns {Promise<{filtered_keywords: string[], removed: Array<{keyword: string, reason: string}>}>}
 *   The parsed model response, with `removed` defaulted to [] when absent.
 *   If the call fails, the reply is not valid JSON, or it lacks a
 *   `filtered_keywords` array, the error is logged and every input keyword is
 *   kept (`removed` is empty). Never rejects for those failures.
 */
async function preOverviewFilter(keywords, language = 'en', maxOutput = 80) {
  if (keywords.length === 0) return { filtered_keywords: [], removed: [] };

  const systemPrompt = `You are a precise keyword filter assistant for local service businesses (plumbers, electricians, landscapers, etc.).
Your task: from the provided list of keywords, remove non-service queries that are not generic business types.

REMOVE these categories:
- Jobs/careers: salary, apprenticeship, hiring, recruitment
- Education: course, training, certification, degree
- Products/supplies: tools, equipment, parts, buy, shop
- Entertainment: band, movie, concert, show
- Informational: how to, tutorial, guide, meaning
- Brand/company names: specific company names (e.g., "reliance home comfort", "great northern insulation")
- Wrong category: keywords for unrelated activities (e.g., "fencing sword" is sport, not fence installation)
- Location modifiers: "near me", "nearby", "close to me" (these will be combined with region names later)

KEEP only:
- Generic service types: "plumber", "electrician", "landscaping"
- Generic service + action: "plumber emergency", "hvac repair", "tree removal"

Rules:
- Output only JSON matching the schema described below.
- Be aggressive: remove anything that isn't a GENERIC service type or service+action.
- Remove brand-specific keywords (company names).
- Remove "near me" variants (these don't combine well with region names).
- Always return "reason" for each removed keyword: job, education, products, entertainment, informational, brand, wrong_category, near_me, other.

Few-shot examples:
Keep: "plumber" -> generic service type
Keep: "emergency plumber" -> generic service + modifier
Remove: "plumber near me" -> reason: near_me (won't combine with regions)
Remove: "reliance home comfort" -> reason: brand (company name, not service type)
Remove: "fencing sword" -> reason: wrong_category (sport, not fence installation)
Remove: "carpenter salary" -> reason: job
Remove: "plumber supplies" -> reason: products`;

  // `language` goes through JSON.stringify so quotes/backslashes cannot break
  // the input JSON. The reason enum lists the same values the system prompt
  // demands (brand, wrong_category, near_me included).
  const userPrompt = `Input JSON:
{ "language": ${JSON.stringify(language)}, "keywords": ${JSON.stringify(keywords)} }

Output JSON schema:
{ "filtered_keywords": [...], "removed": [{"keyword":"...","reason":"job|education|products|entertainment|informational|brand|wrong_category|near_me|other"}] }

Output only the JSON object.`;

  try {
    const response = await callLLM({
      model: process.env.SCORING_MODEL || 'openai/gpt-4o-mini',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.0,
      max_tokens: 1000,
      stage: 'keywords',
    });

    // Parse JSON response (strip markdown code fences if present).
    // \s* tolerates CRLF line endings and stray spaces around the fences.
    const content = response.content
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '');
    const result = JSON.parse(content);

    // Callers read `.length` on both arrays, so a structurally wrong reply is
    // treated like any other LLM failure and routed to the fallback below.
    if (!result || !Array.isArray(result.filtered_keywords)) {
      throw new Error('response JSON has no "filtered_keywords" array');
    }

    return Object.assign({}, result, {
      removed: Array.isArray(result.removed) ? result.removed : [],
    });
  } catch (error) {
    logger.error(`LLM pre-filter failed: ${error.message}`);
    // Fallback: keep all keywords
    return { filtered_keywords: keywords, removed: [] };
  }
}
230  
/**
 * Post-overview LLM ranking.
 * Asks the LLM to score and rank keywords by business value
 * (intent + volume + competition).
 *
 * @param {Array<{keyword: string, search_volume?: number}>} keywordsData -
 *   Keyword records; serialized as-is into the prompt. Only `keyword` and
 *   `search_volume` are read locally (by the fallback).
 * @param {string} [language='en'] - Language tag forwarded to the model in the input JSON.
 * @param {?{intent_weight?: number, volume_weight?: number, competition_weight?: number}} [metricWeights=null]
 *   Score weights; any weight left out falls back to 0.5 / 0.3 / 0.2.
 * @returns {Promise<{ranked: Array<{keyword: string, score: number, rank: number, explanation: string}>}>}
 *   The parsed model response. If the call fails, the reply is not valid JSON,
 *   or it lacks a `ranked` array, the error is logged and a ranking sorted by
 *   descending search volume is returned instead. Never rejects for those
 *   failures.
 */
async function postOverviewRank(keywordsData, language = 'en', metricWeights = null) {
  if (keywordsData.length === 0) return { ranked: [] };

  // Merge over the defaults so a partial override cannot put "undefined"
  // into the prompt. Object.assign ignores a null source.
  const weights = Object.assign(
    {
      intent_weight: 0.5,
      volume_weight: 0.3,
      competition_weight: 0.2,
    },
    metricWeights
  );

  const systemPrompt = `You are an expert SEO analyst. Given a list of keywords with metrics (search_volume, competition [0..1], cpc), score and rank them by business value for a local service business. Use a combined score of:
- Intent score (0-100) based on keyword text: high for hiring/service intent (emergency, hire, near me, repair, install), low for informational queries (how to, tutorial, meaning).
- Volume component: normalize search_volume across the dataset to 0-100.
- Competition component: invert competition (1 - competition) to reward lower competition.

Combine as:
final_score = round( intent_weight*intent + volume_weight*volume_norm + competition_weight*(1-competition)*100 , 1 )

Where weights: intent ${weights.intent_weight}, volume ${weights.volume_weight}, competition ${weights.competition_weight}.

Intent detection rules for intent_score (0-100):
- 90..100 for clear hiring/commercial: contains "emergency", "hire", "call", "book", "near me", "24/7", "service", "repair", "install", "company", "local".
- 70..89 for transactional/commercial: "cost", "price", "quote", "rates", "estimate".
- 40..69 for mixed/comparative: "best plumber", "plumber reviews".
- 0..39 for informational/how-to/research/education: "how to", "what is", "tutorial", "course", "apprentice", "salary".

Rules:
- Provide intent as part of explanation with a short justification.
- Break ties by higher volume and then lower competition.
- Output only the JSON object.`;

  // `language` goes through JSON.stringify so quotes/backslashes cannot break
  // the input JSON.
  const userPrompt = `Input JSON: { "language": ${JSON.stringify(language)}, "keywords_data": ${JSON.stringify(keywordsData)} }

Output JSON:
{ "ranked": [ { "keyword":"...", "score":float, "rank":int, "explanation":"..." }, ... ] }

Output only the JSON object.`;

  try {
    const response = await callLLM({
      model: process.env.SCORING_MODEL || 'openai/gpt-4o-mini',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.0,
      max_tokens: 2000,
      stage: 'keywords',
    });

    // Parse JSON response (strip markdown code fences if present).
    // \s* tolerates CRLF line endings and stray spaces around the fences.
    const content = response.content
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '');
    const result = JSON.parse(content);

    // A structurally wrong reply is handled like any other LLM failure.
    if (!result || !Array.isArray(result.ranked)) {
      throw new Error('response JSON has no "ranked" array');
    }
    return result;
  } catch (error) {
    logger.error(`LLM post-rank failed: ${error.message}`);

    // Fallback: return sorted by volume. Sort a copy so the caller's array
    // keeps its order, and treat a missing/non-numeric volume as 0 so the
    // comparator never returns NaN.
    const volumeOf = kw => Number(kw.search_volume) || 0;
    return {
      ranked: [...keywordsData]
        .sort((a, b) => volumeOf(b) - volumeOf(a))
        .map((kw, i) => ({
          keyword: kw.keyword,
          score: 100 - i,
          rank: i + 1,
          explanation: 'Fallback: sorted by volume',
        })),
    };
  }
}
308  
/**
 * Combined filter: place-specific + deterministic + LLM pre-filter.
 *
 * Steps:
 *  0. When `countryCode` is given, drop keywords that contain a region name.
 *  1. Apply the deterministic regex filter.
 *  2. Only if more than `maxOutput` keywords remain, run the LLM filter.
 *
 * @param {string[]} keywords - Candidate keywords.
 * @param {string} [language='en'] - Language tag forwarded to the LLM filter.
 * @param {number} [maxOutput=80] - Threshold above which the LLM filter runs;
 *   it is also forwarded to preOverviewFilter().
 * @param {string|null} [countryCode=null] - Enables the place-specific step.
 * @returns {Promise<{filtered_keywords: string[], removed: Array<{keyword: string, reason: string}>}>}
 *   Surviving keywords plus every removal, in the order the steps ran.
 */
async function filterKeywordsPreOverview(
  keywords,
  language = 'en',
  maxOutput = 80,
  countryCode = null
) {
  logger.info(`Pre-overview filtering: ${keywords.length} keywords`);

  const allRemoved = [];
  let currentKeywords = keywords;

  // Step 0: Filter place-specific keywords (for businesses only)
  if (countryCode) {
    const { kept: placeKept, removed: placeRemoved } = filterPlaceSpecific(
      currentKeywords,
      countryCode
    );
    if (placeRemoved.length > 0) {
      logger.info(
        `  Place-specific filter: ${placeKept.length} kept, ${placeRemoved.length} removed`
      );
      allRemoved.push(...placeRemoved);
      currentKeywords = placeKept;
    }
  }

  // Step 1: Deterministic filter
  const { kept, removed: detRemoved } = deterministicFilter(currentKeywords);
  logger.info(`  Deterministic filter: ${kept.length} kept, ${detRemoved.length} removed`);

  if (detRemoved.length > 0) {
    logger.debug(
      `  Removed (deterministic): ${detRemoved.map(r => `${r.keyword} (${r.reason})`).join(', ')}`
    );
    allRemoved.push(...detRemoved);
  }

  // Step 2: LLM filter (conservative) - skipped while the list is small enough
  if (kept.length <= maxOutput) {
    return {
      filtered_keywords: kept,
      removed: allRemoved,
    };
  }

  logger.info(`  LLM filter: reducing from ${kept.length} to ~${maxOutput}...`);
  const llmResult = await preOverviewFilter(kept, language, maxOutput);

  // The LLM result is parsed model output; make sure it has the expected
  // arrays before reading `.length` or spreading, and keep the deterministic
  // result when it does not.
  if (!llmResult || !Array.isArray(llmResult.filtered_keywords)) {
    logger.warn('  LLM filter returned an unexpected shape; keeping deterministic result');
    return {
      filtered_keywords: kept,
      removed: allRemoved,
    };
  }

  const llmRemoved = Array.isArray(llmResult.removed) ? llmResult.removed : [];
  logger.success(
    `  LLM filter: ${llmResult.filtered_keywords.length} kept, ${llmRemoved.length} removed`
  );

  return {
    filtered_keywords: llmResult.filtered_keywords,
    removed: [...allRemoved, ...llmRemoved],
  };
}
368  
/**
 * Rank keywords after Overview using the LLM and keep the best ones.
 *
 * @param {Array<object>} keywordsData - Keyword records handed to postOverviewRank().
 * @param {string} [language='en'] - Language tag forwarded to postOverviewRank().
 * @param {number} [topN=50] - Maximum number of ranked entries to return.
 * @returns {Promise<Array<object>>} The first `topN` entries of the `ranked`
 *   list, in the order postOverviewRank() produced them; empty when that list
 *   is missing.
 */
async function rankKeywordsPostOverview(keywordsData, language = 'en', topN = 50) {
  logger.info(`Post-overview ranking: ${keywordsData.length} keywords`);

  const { ranked: rankedFromLlm } = await postOverviewRank(keywordsData, language);
  const orderedEntries = rankedFromLlm ? rankedFromLlm : [];

  logger.success(`  Ranked ${orderedEntries.length} keywords`);

  // Only the leading topN entries are handed back
  const topEntries = orderedEntries.slice(0, topN);
  return topEntries;
}
383  
384  export { filterKeywordsPreOverview, rankKeywordsPostOverview, deterministicFilter };