/ src / utils / keyword-manager.js
keyword-manager.js
  1  /**
  2   * Keyword Manager
  3   * Utilities for managing keywords table and generating keyword combinations
  4   *
  5   * MIGRATION STATUS: Functions that accept a `db` parameter (upsertKeyword,
  6   * incrementProcessedCount, incrementLowScoringCount, incrementReworkLowScoringCount,
  7   * getKeywordByName, getKeywordStats, etc.) still use the SQLite .prepare() API.
  8   * They are called by process.js and cli/keywords.js via the active pipeline.
  9   * Convert each to use async getAll/getOne/run from db.js before removing this notice.
 10   */
 11  
 12  import { readFileSync } from 'fs';
 13  import { join, dirname } from 'path';
 14  import { fileURLToPath } from 'url';
 15  import { parse } from 'csv-parse/sync';
 16  import Logger from './logger.js';
 17  import { getCountryByCode } from '../config/countries.js';
 18  
 19  const __filename = fileURLToPath(import.meta.url);
 20  const __dirname = dirname(__filename);
 21  const projectRoot = join(__dirname, '../..');
 22  
 23  const logger = new Logger('KeywordManager');
 24  
 25  /**
 26   * Load keywords from CSV file (extracts keyword column only)
 27   * @param {string} filename - CSV file to load (relative to data/)
 28   * @returns {string[]} Array of keywords
 29   */
 30  export function loadKeywordsFromCSV(filename) {
 31    // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal
 32    const filePath = join(projectRoot, 'data', filename);
 33    // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir
 34    const content = readFileSync(filePath, 'utf-8');
 35  
 36    const records = parse(content, {
 37      columns: true,
 38      skip_empty_lines: true,
 39    });
 40  
 41    const keywords = records.map(row => row.keyword).filter(k => k && k.trim());
 42  
 43    logger.info(`Loaded ${keywords.length} keywords from ${filename}`);
 44    return keywords;
 45  }
 46  
 47  /**
 48   * Load search volumes from CSV file
 49   * @param {string} filename - CSV file to load (relative to data/)
 50   * @returns {Map<string, number>} Map of keyword → search_volume
 51   */
 52  export function loadSearchVolumesFromCSV(filename) {
 53    // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal
 54    const filePath = join(projectRoot, 'data', filename);
 55    // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir
 56    const content = readFileSync(filePath, 'utf-8');
 57  
 58    const records = parse(content, {
 59      columns: true,
 60      skip_empty_lines: true,
 61    });
 62  
 63    const searchVolumes = new Map();
 64    for (const row of records) {
 65      if (row.keyword && row.search_volume) {
 66        searchVolumes.set(row.keyword, parseInt(row.search_volume, 10) || 0);
 67      }
 68    }
 69  
 70    logger.info(`Loaded ${searchVolumes.size} search volumes from ${filename}`);
 71    return searchVolumes;
 72  }
 73  
 74  /**
 75   * Calculate priority (1-10) based on search volume
 76   * Uses percentile-based bucketing for fair distribution
 77   * @param {number} searchVolume - Combined search volume
 78   * @param {number[]} allVolumes - All search volumes for percentile calculation
 79   * @returns {number} Priority 1-10
 80   */
 81  export function calculatePriority(searchVolume, allVolumes) {
 82    // Sort volumes for percentile calculation
 83    const sorted = [...allVolumes].sort((a, b) => a - b);
 84    const n = sorted.length;
 85  
 86    // Find percentile of this search volume
 87    const index = sorted.findIndex(v => v >= searchVolume);
 88    const percentile = index === -1 ? 100 : (index / n) * 100;
 89  
 90    // Map percentiles to priority (1-10)
 91    // Bottom 10% = 1, Top 10% = 10
 92    if (percentile >= 90) return 10;
 93    if (percentile >= 80) return 9;
 94    if (percentile >= 70) return 8;
 95    if (percentile >= 60) return 7;
 96    if (percentile >= 50) return 6;
 97    if (percentile >= 40) return 5;
 98    if (percentile >= 30) return 4;
 99    if (percentile >= 20) return 3;
100    if (percentile >= 10) return 2;
101    return 1;
102  }
103  
104  /**
105   * Load keywords from text file (for native script files only)
106   * @param {string} filename - File to load (relative to data/)
107   * @returns {string[]} Array of keywords
108   */
109  export function loadKeywordsFromFile(filename) {
110    // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal
111    const filePath = join(projectRoot, 'data', filename);
112    // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir
113    const content = readFileSync(filePath, 'utf-8');
114  
115    const keywords = content
116      .split('\n')
117      .map(line => line.trim())
118      .filter(line => line && !line.startsWith('#')); // Remove comments and empty lines
119  
120    logger.info(`Loaded ${keywords.length} keywords from ${filename}`);
121    return keywords;
122  }
123  
124  /**
125   * Generate keyword combinations (business keyword + region)
126   * @param {string} countryCode - ISO country code (e.g., 'AU', 'US', 'UK')
127   * @returns {Array} Array of {keyword, countryCode, googleDomain, searchVolume, priority} objects
128   * @throws {Error} If region or business file for country doesn't exist
129   */
130  export function generateKeywordCombinations(countryCode = 'AU') {
131    const country = getCountryByCode(countryCode);
132    const lowerCode = countryCode.toLowerCase();
133  
134    // Load country-specific business keywords from validated CSV (REQUIRED - no fallback)
135    const businessFile = `${lowerCode}/businesses-final-filtered.csv`;
136    let businessKeywords;
137    let businessSearchVolumes;
138    try {
139      businessKeywords = loadKeywordsFromCSV(businessFile);
140      businessSearchVolumes = loadSearchVolumesFromCSV(businessFile);
141    } catch {
142      throw new Error(
143        `Business keywords file ${businessFile} not found for ${country.name}. Run keyword validation first: npm run keywords generate-csv -- --type businesses --country ${countryCode}`
144      );
145    }
146  
147    // Load romanized regions from validated CSV (REQUIRED)
148    const regionFile = `${lowerCode}/regions-final-filtered.csv`;
149    let regions;
150    let regionSearchVolumes;
151    try {
152      regions = loadKeywordsFromCSV(regionFile);
153      regionSearchVolumes = loadSearchVolumesFromCSV(regionFile);
154    } catch {
155      throw new Error(
156        `Region file ${regionFile} not found for ${country.name}. Run keyword validation first: npm run keywords generate-csv -- --type regions --country ${countryCode}`
157      );
158    }
159  
160    const combinations = [];
161    const allSearchVolumes = []; // Collect all SVs for percentile calculation
162  
163    // ✅ Generate romanized × romanized combinations
164    // Example: "plumber" × "tokyo" → "plumber tokyo"
165    for (const business of businessKeywords) {
166      for (const region of regions) {
167        const businessSV = businessSearchVolumes.get(business) || 0;
168        const regionSV = regionSearchVolumes.get(region) || 0;
169  
170        // Calculate geometric mean: sqrt(business_sv × region_sv)
171        const combinedSV =
172          businessSV > 0 && regionSV > 0 ? Math.round(Math.sqrt(businessSV * regionSV)) : 0;
173  
174        allSearchVolumes.push(combinedSV);
175  
176        combinations.push({
177          keyword: `${business} ${region}`,
178          countryCode: country.code,
179          googleDomain: country.googleDomain,
180          searchVolume: combinedSV,
181        });
182      }
183    }
184  
185    // Try to load native script files (JP, KR, CN only)
186    const nativeBusinessFile = `${lowerCode}/businesses-native.txt`;
187    const nativeRegionFile = `${lowerCode}/regions-native.txt`;
188  
189    try {
190      const nativeBusinesses = loadKeywordsFromFile(nativeBusinessFile);
191      const nativeRegions = loadKeywordsFromFile(nativeRegionFile);
192  
193      // For native scripts, use median SV as default (no DataForSEO data available)
194      const medianSV =
195        allSearchVolumes.length > 0
196          ? allSearchVolumes.sort((a, b) => a - b)[Math.floor(allSearchVolumes.length / 2)]
197          : 0;
198  
199      // ✅ Generate native × native combinations ONLY
200      // Example: "配管工" × "東京" → "配管工 東京"
201      // ❌ NOT: "plumber" × "東京" (mixed)
202      // ❌ NOT: "配管工" × "tokyo" (mixed)
203      for (const business of nativeBusinesses) {
204        for (const region of nativeRegions) {
205          allSearchVolumes.push(medianSV);
206  
207          combinations.push({
208            keyword: `${business} ${region}`,
209            countryCode: country.code,
210            googleDomain: country.googleDomain,
211            searchVolume: medianSV,
212          });
213        }
214      }
215  
216      logger.info(
217        `Added ${nativeBusinesses.length * nativeRegions.length} native script combinations for ${country.name}`
218      );
219    } catch {
220      // No native script files - this is expected for most countries
221      logger.debug(`No native script files for ${country.name} (${country.code})`);
222    }
223  
224    // Calculate priority for all combinations based on search volume percentiles
225    for (const combo of combinations) {
226      combo.priority = calculatePriority(combo.searchVolume, allSearchVolumes);
227    }
228  
229    logger.success(
230      `Generated ${combinations.length} keyword combinations for ${country.name} (${country.code})`
231    );
232    logger.info(
233      `Search volume range: ${Math.min(...allSearchVolumes)}-${Math.max(...allSearchVolumes)}, median: ${allSearchVolumes.sort((a, b) => a - b)[Math.floor(allSearchVolumes.length / 2)]}`
234    );
235  
236    return combinations;
237  }
238  
239  /**
240   * Insert or update keyword in database
241   * @param {Database.Database} db - SQLite database instance
242   * @param {string} keyword - Keyword to insert/update
243   * @param {Object} updates - Fields to update (can include country_code, google_domain)
244   * @returns {Object} The keyword record
245   */
246  export function upsertKeyword(db, keyword, updates = {}) {
247    // Require country code
248    if (!updates.country_code) {
249      throw new Error('country_code is required for keyword operations');
250    }
251    const country = getCountryByCode(updates.country_code);
252  
253    // Ensure country fields are set
254    const updatesWithCountry = {
255      country_code: country.code,
256      google_domain: country.googleDomain,
257      ...updates,
258    };
259  
260    // Check if keyword exists with this country_code (keywords are unique per country)
261    const existing = db
262      .prepare('SELECT * FROM keywords WHERE keyword = ? AND country_code = ?')
263      .get(keyword, country.code);
264  
265    if (existing) {
266      // Update existing keyword
267      const updateFields = [];
268      const updateValues = [];
269  
270      for (const [key, value] of Object.entries(updatesWithCountry)) {
271        if (key !== 'country_code') {
272          // Don't update country_code in WHERE clause
273          updateFields.push(`${key} = ?`);
274          updateValues.push(value);
275        }
276      }
277  
278      if (updateFields.length > 0) {
279        updateValues.push(keyword, country.code);
280        const sql = `UPDATE keywords SET ${updateFields.join(', ')} WHERE keyword = ? AND country_code = ?`;
281        db.prepare(sql).run(...updateValues);
282      }
283  
284      return db
285        .prepare('SELECT * FROM keywords WHERE keyword = ? AND country_code = ?')
286        .get(keyword, country.code);
287    } else {
288      // Insert new keyword
289      const fields = ['keyword', ...Object.keys(updatesWithCountry)];
290      const placeholders = fields.map(() => '?');
291      const values = [keyword, ...Object.values(updatesWithCountry)];
292  
293      const sql = `INSERT INTO keywords (${fields.join(', ')}) VALUES (${placeholders.join(', ')})`;
294      const result = db.prepare(sql).run(...values);
295  
296      return db.prepare('SELECT * FROM keywords WHERE id = ?').get(result.lastInsertRowid);
297    }
298  }
299  
300  /**
301   * Increment processed count for a keyword
302   * @param {Database.Database} db - SQLite database instance
303   * @param {string} keyword - Keyword to update
304   */
305  export function incrementProcessedCount(db, keyword) {
306    db.prepare(
307      `UPDATE keywords 
308       SET processed_count = processed_count + 1, 
309           updated_at = CURRENT_TIMESTAMP 
310       WHERE keyword = ?`
311    ).run(keyword);
312  }
313  
314  /**
315   * Increment low scoring count for a keyword
316   * @param {Database.Database} db - SQLite database instance
317   * @param {string} keyword - Keyword to update
318   */
319  export function incrementLowScoringCount(db, keyword) {
320    db.prepare(
321      `UPDATE keywords 
322       SET low_scoring_count = low_scoring_count + 1, 
323           updated_at = CURRENT_TIMESTAMP 
324       WHERE keyword = ?`
325    ).run(keyword);
326  }
327  
328  /**
329   * Increment rework low scoring count for a keyword
330   * @param {Database.Database} db - SQLite database instance
331   * @param {string} keyword - Keyword to update
332   */
333  export function incrementReworkLowScoringCount(db, keyword) {
334    db.prepare(
335      `UPDATE keywords 
336       SET rework_low_scoring_count = rework_low_scoring_count + 1, 
337           updated_at = CURRENT_TIMESTAMP 
338       WHERE keyword = ?`
339    ).run(keyword);
340  }
341  
342  /**
343   * Backfill keywords table from existing sites
344   * @param {Database.Database} db - SQLite database instance
345   * @returns {number} Number of keywords backfilled
346   */
347  export function backfillKeywordsFromSites(db) {
348    logger.info('Backfilling keywords table from existing sites...');
349  
350    // Get all unique keywords from sites table
351    const keywords = db
352      .prepare(
353        `SELECT 
354          keyword,
355          COUNT(*) as processed_count,
356          SUM(CASE WHEN processing_status = 'scored' THEN 1 ELSE 0 END) as scored_count
357        FROM sites 
358        GROUP BY keyword`
359      )
360      .all();
361  
362    let backfilled = 0;
363  
364    for (const row of keywords) {
365      const { keyword, processed_count } = row;
366  
367      // Count low-scoring sites (would need to parse conversion_score_json)
368      const lowScoringCount = db
369        .prepare(
370          `SELECT COUNT(*) as count 
371           FROM sites 
372           WHERE keyword = ? 
373           AND conversion_score_json IS NOT NULL 
374           AND (
375             conversion_score_json LIKE '%"letter_grade":"B-"%' OR
376             conversion_score_json LIKE '%"letter_grade":"C%' OR
377             conversion_score_json LIKE '%"letter_grade":"D%' OR
378             conversion_score_json LIKE '%"letter_grade":"F"%'
379           )`
380        )
381        .get(keyword).count;
382  
383      upsertKeyword(db, keyword, {
384        processed_count,
385        low_scoring_count: lowScoringCount,
386      });
387  
388      backfilled++;
389    }
390  
391    logger.success(`Backfilled ${backfilled} keywords from sites table`);
392    return backfilled;
393  }
394  
395  /**
396   * Get keyword statistics
397   * @param {Database.Database} db - SQLite database instance
398   * @param {string} keyword - Keyword to get stats for
399   * @returns {Object} Keyword stats
400   */
401  export function getKeywordStats(db, keyword) {
402    return db.prepare('SELECT * FROM keywords WHERE keyword = ?').get(keyword);
403  }
404  
405  /**
406   * Get all keywords sorted by priority
407   * @param {Database.Database} db - SQLite database instance
408   * @param {number} limit - Maximum number to return
409   * @returns {Array} Keywords ordered by priority
410   */
411  export function getKeywordsByPriority(db, limit = 100) {
412    // Priority: unscraped keywords first, then by zenrows_count DESC
413    return db
414      .prepare(
415        `SELECT * FROM keywords 
416         ORDER BY 
417           CASE WHEN last_scraped_at IS NULL THEN 0 ELSE 1 END,
418           zenrows_count DESC,
419           processed_count ASC
420         LIMIT ?`
421      )
422      .all(limit);
423  }
424  
425  export default {
426    loadKeywordsFromCSV,
427    loadKeywordsFromFile,
428    loadSearchVolumesFromCSV,
429    calculatePriority,
430    generateKeywordCombinations,
431    upsertKeyword,
432    incrementProcessedCount,
433    incrementLowScoringCount,
434    incrementReworkLowScoringCount,
435    backfillKeywordsFromSites,
436    getKeywordStats,
437    getKeywordsByPriority,
438  };