/ docs / scripts / compare-sitemaps.ts
compare-sitemaps.ts
 1  import * as fs from 'fs';
 2  import { XMLParser } from 'fast-xml-parser';
 3  import fetch from 'node-fetch';
 4  
 5  async function readSitemap(input: string): Promise<string> {
 6    if (/^https?:\/\//.test(input)) {
 7      const res = await fetch(input);
 8      if (!res.ok) {
 9        const responseBody = await res.text();
10        throw new Error(`Failed to fetch ${input}: ${res.status} ${res.statusText}. Response body: ${responseBody}`);
11      }
12      return await res.text();
13    } else {
14      return await fs.promises.readFile(input, 'utf8');
15    }
16  }
17  
/**
 * Reduces a URL to the part that follows its first `/latest/` segment.
 *
 * The slash that closes `/latest/` is kept, so a matching URL yields a result
 * that begins with `/`. A URL that does not contain `/latest/` is returned
 * unchanged.
 *
 * @param url - URL (or any string) to normalize.
 * @returns The path after the first `/latest/`, or `url` itself when absent.
 */
function normalizePath(url: string): string {
  const marker = '/latest/';
  const markerAt = url.indexOf(marker);

  if (markerAt < 0) {
    return url;
  }

  // Start at the marker's trailing slash so the result keeps a leading "/".
  return url.slice(markerAt + marker.length - 1);
}
22  
/**
 * Loads a sitemap and indexes its URLs by normalized path.
 *
 * Each `<url><loc>` value becomes one map entry whose key is
 * `normalizePath(loc)` and whose value is the original `loc`. When two
 * entries normalize to the same key, the later one wins.
 *
 * @param input - Sitemap URL or file path, passed through to `readSitemap`.
 * @returns Map from normalized path to the original `<loc>` value. Empty when
 *   the `<urlset>` contains no `<url>` entries.
 * @throws Error when the document has no `<urlset>` root or when a `<url>`
 *   entry lacks a textual `<loc>`.
 */
async function parseSitemap(input: string): Promise<Map<string, string>> {
  const xml = await readSitemap(input);
  const parsed: unknown = new XMLParser().parse(xml);

  const root: Record<string, unknown> =
    typeof parsed === 'object' && parsed !== null ? (parsed as Record<string, unknown>) : {};
  if (!('urlset' in root)) {
    throw new Error(`No <urlset> element found in ${input}`);
  }

  // An empty <urlset/> is parsed as an empty string, so only read `.url`
  // when the element actually has children.
  const urlset = root.urlset;
  const rawEntries =
    typeof urlset === 'object' && urlset !== null ? (urlset as { url?: unknown }).url : undefined;

  // The parser yields a single object (not a one-element array) when the
  // sitemap contains exactly one <url>; normalize both shapes to an array.
  let entries: unknown[];
  if (rawEntries == null) {
    entries = [];
  } else if (Array.isArray(rawEntries)) {
    entries = rawEntries;
  } else {
    entries = [rawEntries];
  }

  const urlMap = new Map<string, string>();
  for (const entry of entries) {
    const loc =
      typeof entry === 'object' && entry !== null ? (entry as { loc?: unknown }).loc : undefined;
    if (typeof loc !== 'string') {
      throw new Error(`Found a <url> entry without a textual <loc> in ${input}`);
    }
    urlMap.set(normalizePath(loc), loc);
  }
  return urlMap;
}
36  
/**
 * Partitions the keys of two sitemap maps into three groups.
 *
 * Only keys are compared; the mapped values are ignored.
 *
 * @param mapA - First sitemap, keyed by the path used for comparison.
 * @param mapB - Second sitemap, keyed the same way.
 * @returns `onlyInA` and `inBoth` in `mapA` iteration order, and `onlyInB`
 *   in `mapB` iteration order.
 */
function compareSitemaps(mapA: Map<string, string>, mapB: Map<string, string>) {
  const keysOfA = [...mapA.keys()];
  const keysOfB = [...mapB.keys()];

  return {
    onlyInA: keysOfA.filter((key) => !mapB.has(key)),
    onlyInB: keysOfB.filter((key) => !mapA.has(key)),
    inBoth: keysOfA.filter((key) => mapB.has(key)),
  };
}
58  
// CLI entry point: compares the two sitemaps named on the command line and
// prints how many normalized paths they share, followed by the paths that
// appear in only one of them.
(async () => {
  const fileA = process.argv[2];
  const fileB = process.argv[3];

  if (!fileA || !fileB) {
    console.error('Usage: tsx compare-sitemaps.ts <fileA|urlA> <fileB|urlB>');
    process.exit(1);
  }

  // The two sitemaps are independent, so load and parse them concurrently.
  const [mapA, mapB] = await Promise.all([parseSitemap(fileA), parseSitemap(fileB)]);

  const { onlyInA, onlyInB, inBoth } = compareSitemaps(mapA, mapB);

  console.log(`URLs in both: ${inBoth.length}`);
  console.log(`Only in ${fileA}: ${onlyInA.length}`);
  onlyInA.forEach((url) => console.log(`  ${url}`));

  console.log(`Only in ${fileB}: ${onlyInB.length}`);
  onlyInB.forEach((url) => console.log(`  ${url}`));
})().catch((err: unknown) => {
  // Without a handler, a fetch/read/parse failure surfaces as an unhandled
  // promise rejection; report it concisely and signal failure to the shell.
  const reason = err instanceof Error ? err.message : String(err);
  console.error(`Sitemap comparison failed: ${reason}`);
  process.exitCode = 1;
});