compare-sitemaps.ts
import * as fs from 'fs';
import { XMLParser } from 'fast-xml-parser';
import fetch from 'node-fetch';

/**
 * Loads the raw text of a sitemap.
 *
 * @param input - An `http://` / `https://` URL, or otherwise a local file path.
 * @returns The response body (for URLs) or the file contents decoded as UTF-8.
 * @throws Error if the HTTP response is not OK; the message includes the
 *   status, the status text and the response body.
 */
async function readSitemap(input: string): Promise<string> {
  if (/^https?:\/\//.test(input)) {
    const res = await fetch(input);
    if (!res.ok) {
      const responseBody = await res.text();
      throw new Error(
        `Failed to fetch ${input}: ${res.status} ${res.statusText}. Response body: ${responseBody}`,
      );
    }
    return await res.text();
  }
  return await fs.promises.readFile(input, 'utf8');
}

/**
 * Reduces a URL to the part that follows the first `/latest` marker so that
 * URLs with different prefixes can be compared.
 *
 * The slice starts on the slash that terminates `/latest/`, so the result
 * keeps a leading `/`. URLs that do not contain `/latest/` are returned
 * unchanged.
 */
function normalizePath(url: string): string {
  const idx = url.indexOf('/latest/');
  return idx >= 0 ? url.slice(idx + 'latest/'.length) : url;
}

/**
 * Reads and parses a sitemap into a map of normalized path -> original `<loc>`.
 *
 * When several entries normalize to the same path, the last one wins.
 * Entries without a string `<loc>` are skipped.
 *
 * @param input - URL or file path of the sitemap (see `readSitemap`).
 * @throws Error if the document has no `<urlset>` root element.
 */
async function parseSitemap(input: string): Promise<Map<string, string>> {
  const xml = await readSitemap(input);
  const parser = new XMLParser();
  const parsed: unknown = parser.parse(xml);

  const urlset = (parsed as { urlset?: unknown } | null | undefined)?.urlset;
  if (urlset === undefined) {
    throw new Error(`No <urlset> element found in ${input}`);
  }

  // The parser yields a single object (not an array) when there is exactly one
  // <url> child, and nothing at all when <urlset> is empty; normalize both
  // shapes to an array before iterating.
  const rawEntries = (urlset as { url?: unknown } | null)?.url;
  let entries: unknown[];
  if (rawEntries === undefined) {
    entries = [];
  } else if (Array.isArray(rawEntries)) {
    entries = rawEntries;
  } else {
    entries = [rawEntries];
  }

  const urlMap = new Map<string, string>();
  for (const entry of entries) {
    const loc = (entry as { loc?: unknown } | null)?.loc;
    if (typeof loc !== 'string') {
      continue; // no usable <loc> in this entry
    }
    urlMap.set(normalizePath(loc), loc);
  }
  return urlMap;
}

/**
 * Partitions the keys of two sitemap maps.
 *
 * @returns `onlyInA` / `onlyInB` (keys present in just one map) and `inBoth`
 *   (keys present in both), each in the iteration order of its source map.
 */
function compareSitemaps(mapA: Map<string, string>, mapB: Map<string, string>) {
  const onlyInA: string[] = [];
  const onlyInB: string[] = [];
  const inBoth: string[] = [];

  for (const url of mapA.keys()) {
    if (mapB.has(url)) {
      inBoth.push(url);
    } else {
      onlyInA.push(url);
    }
  }

  for (const url of mapB.keys()) {
    if (!mapA.has(url)) {
      onlyInB.push(url);
    }
  }

  return { onlyInA, onlyInB, inBoth };
}

/**
 * CLI entry point: compares the two sitemaps named on the command line and
 * prints a summary of shared and unique URLs.
 */
async function main(): Promise<void> {
  const fileA = process.argv[2];
  const fileB = process.argv[3];

  if (!fileA || !fileB) {
    console.error('Usage: tsx compare-sitemaps.ts <fileA|urlA> <fileB|urlB>');
    process.exit(1);
  }

  const [mapA, mapB] = await Promise.all([parseSitemap(fileA), parseSitemap(fileB)]);

  const { onlyInA, onlyInB, inBoth } = compareSitemaps(mapA, mapB);

  console.log(`URLs in both: ${inBoth.length}`);
  console.log(`Only in ${fileA}: ${onlyInA.length}`);
  onlyInA.forEach((url) => console.log(` ${url}`));

  console.log(`Only in ${fileB}: ${onlyInB.length}`);
  onlyInB.forEach((url) => console.log(` ${url}`));
}

// Report failures (network, file system, malformed XML) as a readable message
// with a non-zero exit status instead of leaving the rejection unhandled.
main().catch((err: unknown) => {
  console.error(err instanceof Error ? err.message : err);
  process.exit(1);
});