#!/usr/bin/env node
/**
 * scrape-rss.mjs
 *
 * Scrape podcast episodes from RSS feed → one markdown file per episode.
 *
 * Usage:
 *   node scrape-rss.mjs <rss-url>                 # all episodes from RSS
 *   node scrape-rss.mjs <rss-url> --limit 5       # first 5 episodes only
 *   node scrape-rss.mjs <rss-url> --dry-run       # preview without writing files
 *   node scrape-rss.mjs <rss-url> --out ./eps     # custom output directory
 */

import { parseArgs } from 'node:util';
import { writeFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';

const { values: flags, positionals } = parseArgs({
  options: {
    limit: { type: 'string', short: 'l' },
    'dry-run': { type: 'boolean', short: 'd' },
    out: { type: 'string', short: 'o' },
  },
  allowPositionals: true,
});

const RSS_URL = positionals[0];
if (!RSS_URL) {
  console.error('Usage: node scrape-rss.mjs <rss-url> [--limit N] [--dry-run] [--out dir]');
  process.exit(1);
}

// Validate --limit up front so a typo fails loudly instead of silently
// processing zero episodes (parseInt('abc') is NaN, and slice(0, NaN) is empty).
if (flags.limit !== undefined && !/^\d+$/.test(flags.limit)) {
  console.error(`Invalid --limit value: ${flags.limit} (expected a non-negative integer)`);
  process.exit(1);
}
const LIMIT = flags.limit === undefined ? Infinity : Number.parseInt(flags.limit, 10);

const EPISODES_DIR = flags.out || join(process.cwd(), 'episodes');

// Named character references understood by decodeEntities.
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

// One <item>…</item> element; tolerates attributes on the opening tag.
const ITEM_PATTERN = /<item(?:\s[^>]*)?>[\s\S]*?<\/item\s*>/g;

// --- Helpers ---

/**
 * Escape a string so it can be embedded literally in a RegExp source.
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Turn an episode title into a filesystem-safe base name (no extension).
 * @param {string} title
 * @returns {string} Sanitized name, or 'untitled' when nothing usable remains.
 */
function sanitizeFilename(title) {
  const cleaned = title
    .replace(/[<>:"/\\|?*]/g, '-')
    .replace(/\s+/g, ' ')
    .trim()
    // Strip trailing dots/spaces only after trimming, otherwise "Title. "
    // keeps its dot (trailing dots are invalid on Windows).
    .replace(/[. ]+$/, '');
  return cleaned || 'untitled';
}

/**
 * Pick a filename not yet used in this run, so episodes sharing a title do
 * not overwrite each other. Comparison is case-insensitive.
 * @param {string} base - Sanitized base name without extension.
 * @param {Set<string>} used - Lower-cased names already handed out; mutated.
 * @returns {string} Filename including the `.md` extension.
 */
function uniqueFilename(base, used) {
  let candidate = base;
  for (let n = 2; used.has(candidate.toLowerCase()); n += 1) {
    candidate = `${base} (${n})`;
  }
  used.add(candidate.toLowerCase());
  return `${candidate}.md`;
}

/**
 * Parse an <itunes:duration> value into whole seconds.
 * Accepts plain seconds ("3723"), "MM:SS" and "HH:MM:SS".
 * @param {string} raw
 * @returns {number} Seconds, or 0 when the value is missing or malformed.
 */
function parseDuration(raw) {
  const text = raw.trim();
  if (text === '') return 0;
  const parts = text.split(':').map(Number);
  if (parts.length > 3 || parts.some((part) => !Number.isFinite(part) || part < 0)) {
    return 0;
  }
  return Math.round(parts.reduce((total, part) => total * 60 + part, 0));
}

/**
 * Format a number of seconds as `H:MM:SS`, or `M:SS` when under an hour.
 * @param {number} seconds
 * @returns {string}
 */
function formatDuration(seconds) {
  const total = Math.max(0, Math.floor(Number(seconds) || 0));
  const h = Math.floor(total / 3600);
  const m = Math.floor((total % 3600) / 60);
  const s = total % 60;
  return h > 0
    ? `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`
    : `${m}:${String(s).padStart(2, '0')}`;
}

/**
 * Return the trimmed text content of the first `<tag>…</tag>` element in
 * `xml`, unwrapping a single CDATA section if present.
 * @param {string} xml
 * @param {string} tag - Element name, e.g. 'title' or 'itunes:summary'.
 * @returns {string} The content, or '' when the element is absent.
 */
function extractText(xml, tag) {
  const name = escapeRegExp(tag);
  // `(?:\s[^>]*)?` keeps 'title' from matching '<titles>'; `(?<!/)` skips
  // self-closing elements; `\s*` allows whitespace around the CDATA markers.
  const re = new RegExp(
    `<${name}(?:\\s[^>]*)?(?<!/)>\\s*(?:<!\\[CDATA\\[)?([\\s\\S]*?)(?:\\]\\]>)?\\s*</${name}\\s*>`,
    'i',
  );
  const m = xml.match(re);
  return m ? m[1].trim() : '';
}

/**
 * Return the raw value of attribute `attr` on the first `<tag …>` in `xml`.
 * Supports double- and single-quoted values; the value is not entity-decoded.
 * @param {string} xml
 * @param {string} tag
 * @param {string} attr
 * @returns {string} The attribute value, or '' when not found.
 */
function extractAttr(xml, tag, attr) {
  // The attribute must be preceded by whitespace so 'url' does not match
  // inside e.g. 'data-url'.
  const re = new RegExp(
    `<${escapeRegExp(tag)}(?:\\s[^>]*?)?\\s${escapeRegExp(attr)}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`,
    'i',
  );
  const m = xml.match(re);
  return m ? (m[1] ?? m[2]) : '';
}

/**
 * Decode the common named entities plus decimal and hexadecimal numeric
 * character references. Runs in a single pass so already-decoded output is
 * never decoded again ("&amp;lt;" becomes "&lt;", not "<").
 * @param {string} text
 * @returns {string}
 */
function decodeEntities(text) {
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (match, dec, hex, name) => {
      if (name !== undefined) {
        return NAMED_ENTITIES.get(name) ?? match;
      }
      const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
      // fromCodePoint handles astral characters (emoji) and throws on
      // out-of-range values, so leave those references untouched.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}

/**
 * Convert an HTML fragment to plain text: <br> and </p> become line breaks,
 * all other tags are dropped, entities are decoded, and runs of blank lines
 * are collapsed.
 * @param {string} html
 * @returns {string}
 */
function stripHtml(html) {
  return decodeEntities(
    html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/p>/gi, '\n\n')
      .replace(/<[^>]+>/g, '')
  )
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Convert a feed date string to `YYYY-MM-DD` (UTC).
 * @param {string} dateStr
 * @returns {string} ISO date, or 'Unknown' when the input cannot be parsed.
 */
function parseDate(dateStr) {
  const d = new Date(dateStr);
  // toISOString() throws RangeError on an invalid date; one bad <pubDate>
  // must not abort the whole run.
  return Number.isNaN(d.getTime()) ? 'Unknown' : d.toISOString().split('T')[0];
}

/**
 * Build the markdown document for one `<item>` element.
 * @param {string} item - XML source of a single <item>.
 * @returns {{ title: string, md: string }}
 */
function renderEpisode(item) {
  const title = decodeEntities(extractText(item, 'title'));
  const description = extractText(item, 'description');
  const summary = extractText(item, 'itunes:summary');
  const pubDate = extractText(item, 'pubDate');
  const duration = parseDuration(extractText(item, 'itunes:duration'));
  const season = extractText(item, 'itunes:season');
  const episode = extractText(item, 'itunes:episode');
  // Attribute values are XML-escaped ("&amp;" in query strings), so decode.
  const audioUrl = decodeEntities(extractAttr(item, 'enclosure', 'url'));

  const date = parseDate(pubDate);
  const descClean = stripHtml(description || summary);

  const meta = [`- **Date**: ${date}`];
  if (season || episode) {
    const parts = [];
    if (season) parts.push(`**Season**: ${season}`);
    if (episode) parts.push(`**Episode**: ${episode}`);
    meta.push(`- ${parts.join(' | ')}`);
  }
  meta.push(`- **Duration**: ${formatDuration(duration)}`);
  meta.push(`- **Audio**: ${audioUrl}`);

  const md = `# ${title}

## Metadata
${meta.join('\n')}

## Description

${descClean}

## Transcript

_Not yet transcribed._
`;

  return { title, md };
}

// --- Main ---

/**
 * Fetch the feed at RSS_URL and write (or, with --dry-run, preview) one
 * markdown file per episode into EPISODES_DIR.
 * @returns {Promise<void>}
 * @throws {Error} When the HTTP response status is not OK.
 */
async function main() {
  console.log(`Fetching RSS feed: ${RSS_URL}`);
  const res = await fetch(RSS_URL);
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  const xml = await res.text();

  // Extract channel metadata from the feed with all items removed, so an
  // item's <title> or <itunes:author> is never mistaken for the channel's.
  const channelXml = xml.replace(ITEM_PATTERN, '');
  const channelTitle = decodeEntities(extractText(channelXml, 'title'));
  const channelAuthor = decodeEntities(extractText(channelXml, 'itunes:author'));
  console.log(`Podcast: ${channelTitle} by ${channelAuthor}`);

  // Split into items
  const items = xml.match(ITEM_PATTERN) ?? [];
  const episodes = items.slice(0, LIMIT);

  console.log(`Found ${items.length} episodes in feed, processing ${episodes.length}`);

  if (!flags['dry-run']) {
    mkdirSync(EPISODES_DIR, { recursive: true });
  }

  const usedNames = new Set();
  for (const item of episodes) {
    const { title, md } = renderEpisode(item);
    const filename = uniqueFilename(sanitizeFilename(title), usedNames);

    if (flags['dry-run']) {
      console.log(`\n--- ${filename} ---`);
      console.log(md.slice(0, 300) + '...');
    } else {
      const filepath = join(EPISODES_DIR, filename);
      writeFileSync(filepath, md);
      console.log(` ✓ ${filename}`);
    }
  }

  console.log(`\nDone. ${episodes.length} episodes processed.`);
}

main().catch(err => { console.error(err); process.exit(1); });