keyword-manager.js
1 /** 2 * Keyword Manager 3 * Utilities for managing keywords table and generating keyword combinations 4 * 5 * MIGRATION STATUS: Functions that accept a `db` parameter (upsertKeyword, 6 * incrementProcessedCount, incrementLowScoringCount, incrementReworkLowScoringCount, 7 * getKeywordByName, getKeywordStats, etc.) still use the SQLite .prepare() API. 8 * They are called by process.js and cli/keywords.js via the active pipeline. 9 * Convert each to use async getAll/getOne/run from db.js before removing this notice. 10 */ 11 12 import { readFileSync } from 'fs'; 13 import { join, dirname } from 'path'; 14 import { fileURLToPath } from 'url'; 15 import { parse } from 'csv-parse/sync'; 16 import Logger from './logger.js'; 17 import { getCountryByCode } from '../config/countries.js'; 18 19 const __filename = fileURLToPath(import.meta.url); 20 const __dirname = dirname(__filename); 21 const projectRoot = join(__dirname, '../..'); 22 23 const logger = new Logger('KeywordManager'); 24 25 /** 26 * Load keywords from CSV file (extracts keyword column only) 27 * @param {string} filename - CSV file to load (relative to data/) 28 * @returns {string[]} Array of keywords 29 */ 30 export function loadKeywordsFromCSV(filename) { 31 // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal 32 const filePath = join(projectRoot, 'data', filename); 33 // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir 34 const content = readFileSync(filePath, 'utf-8'); 35 36 const records = parse(content, { 37 columns: true, 38 skip_empty_lines: true, 39 }); 40 41 const keywords = records.map(row => row.keyword).filter(k => k && k.trim()); 42 43 logger.info(`Loaded ${keywords.length} keywords from ${filename}`); 44 return keywords; 45 } 46 47 /** 48 * Load search volumes from CSV file 49 * @param {string} filename - CSV file to load (relative to data/) 50 * @returns {Map<string, number>} Map of keyword → search_volume 51 */ 52 export function loadSearchVolumesFromCSV(filename) { 53 // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal 54 const filePath = join(projectRoot, 'data', filename); 55 // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir 56 const content = readFileSync(filePath, 'utf-8'); 57 58 const records = parse(content, { 59 columns: true, 60 skip_empty_lines: true, 61 }); 62 63 const searchVolumes = new Map(); 64 for (const row of records) { 65 if (row.keyword && row.search_volume) { 66 searchVolumes.set(row.keyword, parseInt(row.search_volume, 10) || 0); 67 } 68 } 69 70 logger.info(`Loaded ${searchVolumes.size} search volumes from ${filename}`); 71 return searchVolumes; 72 } 73 74 /** 75 * Calculate priority (1-10) based on search volume 76 * Uses percentile-based bucketing for fair distribution 77 * @param {number} searchVolume - Combined search volume 78 * @param {number[]} allVolumes - All search volumes for percentile calculation 79 * @returns {number} Priority 1-10 80 */ 81 export function calculatePriority(searchVolume, allVolumes) { 82 // Sort volumes for percentile calculation 83 const sorted = [...allVolumes].sort((a, b) => a - b); 84 const n = sorted.length; 85 86 // Find percentile of this search volume 87 const index = sorted.findIndex(v => v >= searchVolume); 88 const percentile = index === -1 ? 100 : (index / n) * 100; 89 90 // Map percentiles to priority (1-10) 91 // Bottom 10% = 1, Top 10% = 10 92 if (percentile >= 90) return 10; 93 if (percentile >= 80) return 9; 94 if (percentile >= 70) return 8; 95 if (percentile >= 60) return 7; 96 if (percentile >= 50) return 6; 97 if (percentile >= 40) return 5; 98 if (percentile >= 30) return 4; 99 if (percentile >= 20) return 3; 100 if (percentile >= 10) return 2; 101 return 1; 102 } 103 104 /** 105 * Load keywords from text file (for native script files only) 106 * @param {string} filename - File to load (relative to data/) 107 * @returns {string[]} Array of keywords 108 */ 109 export function loadKeywordsFromFile(filename) { 110 // nosemgrep: javascript.lang.security.audit.path-traversal.path-join-resolve-traversal.path-join-resolve-traversal 111 const filePath = join(projectRoot, 'data', filename); 112 // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: path is env var + known data dir 113 const content = readFileSync(filePath, 'utf-8'); 114 115 const keywords = content 116 .split('\n') 117 .map(line => line.trim()) 118 .filter(line => line && !line.startsWith('#')); // Remove comments and empty lines 119 120 logger.info(`Loaded ${keywords.length} keywords from ${filename}`); 121 return keywords; 122 } 123 124 /** 125 * Generate keyword combinations (business keyword + region) 126 * @param {string} countryCode - ISO country code (e.g., 'AU', 'US', 'UK') 127 * @returns {Array} Array of {keyword, countryCode, googleDomain, searchVolume, priority} objects 128 * @throws {Error} If region or business file for country doesn't exist 129 */ 130 export function generateKeywordCombinations(countryCode = 'AU') { 131 const country = getCountryByCode(countryCode); 132 const lowerCode = countryCode.toLowerCase(); 133 134 // Load country-specific business keywords from validated CSV (REQUIRED - no fallback) 135 const businessFile = `${lowerCode}/businesses-final-filtered.csv`; 136 let businessKeywords; 137 let businessSearchVolumes; 138 try { 139 businessKeywords = loadKeywordsFromCSV(businessFile); 140 businessSearchVolumes = loadSearchVolumesFromCSV(businessFile); 141 } catch { 142 throw new Error( 143 `Business keywords file ${businessFile} not found for ${country.name}. Run keyword validation first: npm run keywords generate-csv -- --type businesses --country ${countryCode}` 144 ); 145 } 146 147 // Load romanized regions from validated CSV (REQUIRED) 148 const regionFile = `${lowerCode}/regions-final-filtered.csv`; 149 let regions; 150 let regionSearchVolumes; 151 try { 152 regions = loadKeywordsFromCSV(regionFile); 153 regionSearchVolumes = loadSearchVolumesFromCSV(regionFile); 154 } catch { 155 throw new Error( 156 `Region file ${regionFile} not found for ${country.name}. Run keyword validation first: npm run keywords generate-csv -- --type regions --country ${countryCode}` 157 ); 158 } 159 160 const combinations = []; 161 const allSearchVolumes = []; // Collect all SVs for percentile calculation 162 163 // ✅ Generate romanized × romanized combinations 164 // Example: "plumber" × "tokyo" → "plumber tokyo" 165 for (const business of businessKeywords) { 166 for (const region of regions) { 167 const businessSV = businessSearchVolumes.get(business) || 0; 168 const regionSV = regionSearchVolumes.get(region) || 0; 169 170 // Calculate geometric mean: sqrt(business_sv × region_sv) 171 const combinedSV = 172 businessSV > 0 && regionSV > 0 ? Math.round(Math.sqrt(businessSV * regionSV)) : 0; 173 174 allSearchVolumes.push(combinedSV); 175 176 combinations.push({ 177 keyword: `${business} ${region}`, 178 countryCode: country.code, 179 googleDomain: country.googleDomain, 180 searchVolume: combinedSV, 181 }); 182 } 183 } 184 185 // Try to load native script files (JP, KR, CN only) 186 const nativeBusinessFile = `${lowerCode}/businesses-native.txt`; 187 const nativeRegionFile = `${lowerCode}/regions-native.txt`; 188 189 try { 190 const nativeBusinesses = loadKeywordsFromFile(nativeBusinessFile); 191 const nativeRegions = loadKeywordsFromFile(nativeRegionFile); 192 193 // For native scripts, use median SV as default (no DataForSEO data available) 194 const medianSV = 195 allSearchVolumes.length > 0 196 ? allSearchVolumes.sort((a, b) => a - b)[Math.floor(allSearchVolumes.length / 2)] 197 : 0; 198 199 // ✅ Generate native × native combinations ONLY 200 // Example: "配管工" × "東京" → "配管工 東京" 201 // ❌ NOT: "plumber" × "東京" (mixed) 202 // ❌ NOT: "配管工" × "tokyo" (mixed) 203 for (const business of nativeBusinesses) { 204 for (const region of nativeRegions) { 205 allSearchVolumes.push(medianSV); 206 207 combinations.push({ 208 keyword: `${business} ${region}`, 209 countryCode: country.code, 210 googleDomain: country.googleDomain, 211 searchVolume: medianSV, 212 }); 213 } 214 } 215 216 logger.info( 217 `Added ${nativeBusinesses.length * nativeRegions.length} native script combinations for ${country.name}` 218 ); 219 } catch { 220 // No native script files - this is expected for most countries 221 logger.debug(`No native script files for ${country.name} (${country.code})`); 222 } 223 224 // Calculate priority for all combinations based on search volume percentiles 225 for (const combo of combinations) { 226 combo.priority = calculatePriority(combo.searchVolume, allSearchVolumes); 227 } 228 229 logger.success( 230 `Generated ${combinations.length} keyword combinations for ${country.name} (${country.code})` 231 ); 232 logger.info( 233 `Search volume range: ${Math.min(...allSearchVolumes)}-${Math.max(...allSearchVolumes)}, median: ${allSearchVolumes.sort((a, b) => a - b)[Math.floor(allSearchVolumes.length / 2)]}` 234 ); 235 236 return combinations; 237 } 238 239 /** 240 * Insert or update keyword in database 241 * @param {Database.Database} db - SQLite database instance 242 * @param {string} keyword - Keyword to insert/update 243 * @param {Object} updates - Fields to update (can include country_code, google_domain) 244 * @returns {Object} The keyword record 245 */ 246 export function upsertKeyword(db, keyword, updates = {}) { 247 // Require country code 248 if (!updates.country_code) { 249 throw new Error('country_code is required for keyword operations'); 250 } 251 const country = getCountryByCode(updates.country_code); 252 253 // Ensure country fields are set 254 const updatesWithCountry = { 255 country_code: country.code, 256 google_domain: country.googleDomain, 257 ...updates, 258 }; 259 260 // Check if keyword exists with this country_code (keywords are unique per country) 261 const existing = db 262 .prepare('SELECT * FROM keywords WHERE keyword = ? AND country_code = ?') 263 .get(keyword, country.code); 264 265 if (existing) { 266 // Update existing keyword 267 const updateFields = []; 268 const updateValues = []; 269 270 for (const [key, value] of Object.entries(updatesWithCountry)) { 271 if (key !== 'country_code') { 272 // Don't update country_code in WHERE clause 273 updateFields.push(`${key} = ?`); 274 updateValues.push(value); 275 } 276 } 277 278 if (updateFields.length > 0) { 279 updateValues.push(keyword, country.code); 280 const sql = `UPDATE keywords SET ${updateFields.join(', ')} WHERE keyword = ? AND country_code = ?`; 281 db.prepare(sql).run(...updateValues); 282 } 283 284 return db 285 .prepare('SELECT * FROM keywords WHERE keyword = ? AND country_code = ?') 286 .get(keyword, country.code); 287 } else { 288 // Insert new keyword 289 const fields = ['keyword', ...Object.keys(updatesWithCountry)]; 290 const placeholders = fields.map(() => '?'); 291 const values = [keyword, ...Object.values(updatesWithCountry)]; 292 293 const sql = `INSERT INTO keywords (${fields.join(', ')}) VALUES (${placeholders.join(', ')})`; 294 const result = db.prepare(sql).run(...values); 295 296 return db.prepare('SELECT * FROM keywords WHERE id = ?').get(result.lastInsertRowid); 297 } 298 } 299 300 /** 301 * Increment processed count for a keyword 302 * @param {Database.Database} db - SQLite database instance 303 * @param {string} keyword - Keyword to update 304 */ 305 export function incrementProcessedCount(db, keyword) { 306 db.prepare( 307 `UPDATE keywords 308 SET processed_count = processed_count + 1, 309 updated_at = CURRENT_TIMESTAMP 310 WHERE keyword = ?` 311 ).run(keyword); 312 } 313 314 /** 315 * Increment low scoring count for a keyword 316 * @param {Database.Database} db - SQLite database instance 317 * @param {string} keyword - Keyword to update 318 */ 319 export function incrementLowScoringCount(db, keyword) { 320 db.prepare( 321 `UPDATE keywords 322 SET low_scoring_count = low_scoring_count + 1, 323 updated_at = CURRENT_TIMESTAMP 324 WHERE keyword = ?` 325 ).run(keyword); 326 } 327 328 /** 329 * Increment rework low scoring count for a keyword 330 * @param {Database.Database} db - SQLite database instance 331 * @param {string} keyword - Keyword to update 332 */ 333 export function incrementReworkLowScoringCount(db, keyword) { 334 db.prepare( 335 `UPDATE keywords 336 SET rework_low_scoring_count = rework_low_scoring_count + 1, 337 updated_at = CURRENT_TIMESTAMP 338 WHERE keyword = ?` 339 ).run(keyword); 340 } 341 342 /** 343 * Backfill keywords table from existing sites 344 * @param {Database.Database} db - SQLite database instance 345 * @returns {number} Number of keywords backfilled 346 */ 347 export function backfillKeywordsFromSites(db) { 348 logger.info('Backfilling keywords table from existing sites...'); 349 350 // Get all unique keywords from sites table 351 const keywords = db 352 .prepare( 353 `SELECT 354 keyword, 355 COUNT(*) as processed_count, 356 SUM(CASE WHEN processing_status = 'scored' THEN 1 ELSE 0 END) as scored_count 357 FROM sites 358 GROUP BY keyword` 359 ) 360 .all(); 361 362 let backfilled = 0; 363 364 for (const row of keywords) { 365 const { keyword, processed_count } = row; 366 367 // Count low-scoring sites (would need to parse conversion_score_json) 368 const lowScoringCount = db 369 .prepare( 370 `SELECT COUNT(*) as count 371 FROM sites 372 WHERE keyword = ? 373 AND conversion_score_json IS NOT NULL 374 AND ( 375 conversion_score_json LIKE '%"letter_grade":"B-"%' OR 376 conversion_score_json LIKE '%"letter_grade":"C%' OR 377 conversion_score_json LIKE '%"letter_grade":"D%' OR 378 conversion_score_json LIKE '%"letter_grade":"F"%' 379 )` 380 ) 381 .get(keyword).count; 382 383 upsertKeyword(db, keyword, { 384 processed_count, 385 low_scoring_count: lowScoringCount, 386 }); 387 388 backfilled++; 389 } 390 391 logger.success(`Backfilled ${backfilled} keywords from sites table`); 392 return backfilled; 393 } 394 395 /** 396 * Get keyword statistics 397 * @param {Database.Database} db - SQLite database instance 398 * @param {string} keyword - Keyword to get stats for 399 * @returns {Object} Keyword stats 400 */ 401 export function getKeywordStats(db, keyword) { 402 return db.prepare('SELECT * FROM keywords WHERE keyword = ?').get(keyword); 403 } 404 405 /** 406 * Get all keywords sorted by priority 407 * @param {Database.Database} db - SQLite database instance 408 * @param {number} limit - Maximum number to return 409 * @returns {Array} Keywords ordered by priority 410 */ 411 export function getKeywordsByPriority(db, limit = 100) { 412 // Priority: unscraped keywords first, then by zenrows_count DESC 413 return db 414 .prepare( 415 `SELECT * FROM keywords 416 ORDER BY 417 CASE WHEN last_scraped_at IS NULL THEN 0 ELSE 1 END, 418 zenrows_count DESC, 419 processed_count ASC 420 LIMIT ?` 421 ) 422 .all(limit); 423 } 424 425 export default { 426 loadKeywordsFromCSV, 427 loadKeywordsFromFile, 428 loadSearchVolumesFromCSV, 429 calculatePriority, 430 generateKeywordCombinations, 431 upsertKeyword, 432 incrementProcessedCount, 433 incrementLowScoringCount, 434 incrementReworkLowScoringCount, 435 backfillKeywordsFromSites, 436 getKeywordStats, 437 getKeywordsByPriority, 438 };