article-download.ts
/**
 * Article download helper — shared logic for downloading articles as Markdown.
 *
 * Used by: zhihu/download, weixin/download, and future article adapters.
 *
 * Flow: ArticleData → TurndownService → image download → frontmatter → .md file
 */

import * as fs from 'node:fs';
import * as path from 'node:path';
import TurndownService from 'turndown';
import { gfm } from 'turndown-plugin-gfm';
import { httpDownload, sanitizeFilename } from './index.js';
import { formatBytes } from './progress.js';

/** Number of images fetched in parallel per batch. */
const IMAGE_CONCURRENCY = 5;

// ============================================================
// Types
// ============================================================

/** Article payload handed over by a site adapter. */
export interface ArticleData {
  /** Article title; rendered as the top-level heading and used (sanitized) for the output names. */
  title: string;
  /** Author name; emitted as a metadata line when present. */
  author?: string;
  /** Publish time; emitted as a metadata line when present. */
  publishTime?: string;
  /** Source URL; emitted as a metadata line when present. */
  sourceUrl?: string;
  /** Article body as HTML; this is what gets converted to Markdown. */
  contentHtml: string;
  /** Code blocks extracted upstream, put back in place of their placeholders after conversion. */
  codeBlocks?: { lang: string; code: string }[];
  /** Image URLs collected from the article DOM ahead of time. */
  imageUrls?: string[];
}

/** Text used in front of each metadata line of the generated header. */
export interface FrontmatterLabels {
  author?: string;
  publishTime?: string;
  sourceUrl?: string;
}

/** Knobs accepted by `downloadArticle`. */
export interface ArticleDownloadOptions {
  /** Base output directory; the article gets its own sub-directory inside it. */
  output: string;
  /** Whether to download images and point the Markdown at the local copies (default: true). */
  downloadImages?: boolean;
  /** Extra headers for image downloads (e.g. { Referer: '...' }). */
  imageHeaders?: Record<string, string>;
  /** Length limit handed to `sanitizeFilename` for the title (default: 80). */
  maxTitleLength?: number;
  /** Hook for customizing the TurndownService instance; runs after the built-in rules are registered. */
  configureTurndown?: (td: TurndownService) => void;
  /** Custom image extension detector (default: infer from the URL extension). */
  detectImageExt?: (url: string) => string;
  /** Custom frontmatter labels (default: Chinese labels). */
  frontmatterLabels?: FrontmatterLabels;
  /**
   * Extra CSS selectors removed from the article before Turndown conversion.
   * Use this to drop site-specific noise the adapter can't always trim upstream
   * (e.g. zhihu collapsed cards, weixin appreciation bar, wiki infobox).
   */
  cleanSelectors?: string[];
  /**
   * Write the markdown to `process.stdout` instead of a file on disk. Image
   * download and directory creation are skipped — remote image URLs are kept
   * as-is so the output is self-contained when piped.
   */
  stdout?: boolean;
}

/** One result row describing the outcome of a download. */
export interface ArticleDownloadResult {
  title: string;
  /** Author name, or '-' when unknown. */
  author: string;
  /** Publish time, or '-' when unknown. */
  publish_time: string;
  /** 'success', or a 'failed — …' message. */
  status: string;
  /** Human-readable size of the generated Markdown, or '-' on failure. */
  size: string;
  /** Path of the written file, or '-' when nothing was written to disk. */
  saved: string;
}

/** Labels applied when the caller does not override them. */
const DEFAULT_LABELS: Required<FrontmatterLabels> = {
  author: '作者',
  publishTime: '发布时间',
  sourceUrl: '原文链接',
};

// ============================================================
// Markdown Conversion
// ============================================================

// Nodes that never carry article content. Turndown keeps them by default — if an
// adapter's contentHtml extraction misses one, CSS / scripts / widget markup
// ends up inline in the .md. Strip them unconditionally at the converter level.
// `svg` is not in HTMLElementTagNameMap, so we type-narrow manually.
// `header/footer/nav/aside` cover page chrome that adapters occasionally
// forget to trim — the article's own title/author/publishTime are supplied
// as separate fields on ArticleData, so duplicated nodes are redundant.
// `iframe` is NOT in this set — it's handled by a dedicated rule below that
// degrades to a link so embedded content (YouTube, Twitter, CodePen …) keeps
// a reachable URL in the exported markdown.
/**
 * Tags removed wholesale before conversion: scripts, styles, widgets and page
 * chrome. `svg` and `iframe` are not listed here — `createTurndown` registers
 * dedicated rules for them.
 */
const STRIPPED_TAGS: Array<keyof HTMLElementTagNameMap> = [
  'script', 'style', 'noscript',
  'canvas',
  'form', 'button', 'dialog',
  'header', 'footer', 'nav', 'aside',
];

/** Matches the placeholders that stand in for pre-extracted code blocks. */
const CODE_BLOCK_PLACEHOLDER_PATTERN = /CODEBLOCK-PLACEHOLDER-(\d+)/g;

/** Characters that could terminate or break out of a double-quoted HTML attribute. */
const HTML_ATTR_ESCAPES: Record<string, string> = {
  '"': '&quot;',
  '<': '&lt;',
  '>': '&gt;',
};

/**
 * Escape a value for use inside a double-quoted HTML attribute.
 *
 * Article HTML is untrusted; without this a `"` in a `src` would close the
 * attribute and let the remainder be parsed as markup. `&` is intentionally
 * left alone so ordinary query strings are emitted unchanged.
 */
function escapeHtmlAttr(value: string): string {
  return value.replace(/["<>]/g, (ch) => HTML_ATTR_ESCAPES[ch] ?? ch);
}

/** Resolve the media URL of a `<video>`/`<audio>` node: own `src`, else the first `<source>` child. */
function mediaSource(el: Element): string {
  return el.getAttribute('src')
    || el.querySelector('source')?.getAttribute('src')
    || '';
}

/**
 * Build the TurndownService used for article conversion.
 *
 * @param configure Optional hook invoked last, so it can override any built-in rule.
 * @param cleanSelectors Optional CSS selectors; matching nodes are dropped from the output.
 * @returns A configured TurndownService instance.
 */
function createTurndown(
  configure?: (td: TurndownService) => void,
  cleanSelectors?: string[],
): TurndownService {
  const td = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  });
  td.use(gfm);
  td.remove(STRIPPED_TAGS);

  // turndown-plugin-gfm@1.0.2 emits single-tilde strikethrough (`~x~`), which
  // is not the canonical GFM form. Override it so exported markdown is
  // portable across common renderers.
  td.addRule('canonicalStrikethrough', {
    filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
    replacement: (content) => `~~${content}~~`,
  });

  // SVG isn't in the static HTML tag map; match by name with a custom filter.
  td.addRule('stripSvg', {
    filter: (node) => node.nodeName === 'svg' || node.nodeName === 'SVG',
    replacement: () => '',
  });

  td.addRule('linebreak', {
    filter: 'br',
    replacement: () => '\n',
  });

  // Inline base64 images would land as huge `![](data:image/...;base64,...)`
  // strings that the image downloader can't localize. Drop them.
  td.addRule('ignoreBase64Images', {
    filter: (node) => {
      if (node.nodeName !== 'IMG') return false;
      const src = (node as HTMLImageElement).getAttribute?.('src') ?? '';
      return src.startsWith('data:');
    },
    replacement: () => '',
  });

  // Markdown has no native video/audio primitive. Emit inline HTML so
  // renderers that support it (GitHub, VS Code preview …) still play the
  // media; viewers that don't simply show the tag as text, which is still
  // more information than dropping the node outright.
  td.addRule('videoElement', {
    filter: (node) => node.nodeName === 'VIDEO',
    replacement: (_content, node) => {
      const el = node as Element;
      const src = mediaSource(el);
      if (!src) return '';
      const poster = el.getAttribute('poster') || '';
      const posterAttr = poster ? ` poster="${escapeHtmlAttr(poster)}"` : '';
      return `\n<video src="${escapeHtmlAttr(src)}" controls${posterAttr}></video>\n`;
    },
  });
  td.addRule('audioElement', {
    filter: (node) => node.nodeName === 'AUDIO',
    replacement: (_content, node) => {
      const src = mediaSource(node as Element);
      return src ? `\n<audio src="${escapeHtmlAttr(src)}" controls></audio>\n` : '';
    },
  });

  // Iframes (YouTube, Twitter, CodePen …) degrade to a markdown link so the
  // embedded resource is still reachable from the exported file.
  td.addRule('iframeToLink', {
    filter: (node) => node.nodeName === 'IFRAME',
    replacement: (_content, node) => {
      const el = node as Element;
      const src = el.getAttribute('src') || '';
      if (!src) return '';
      const title = el.getAttribute('title') || 'Embedded content';
      return `\n[${title}](${src})\n`;
    },
  });

  // Per-adapter dirty-node removal. Adapters know their site's specific noise
  // (zhihu collapsed cards, weixin appreciation bar, wiki collapsible
  // infobox …); the default set is empty so the generic converter stays
  // untouched.
  const selectors = (cleanSelectors ?? [])
    .map((sel) => sel.trim())
    .filter(Boolean);
  if (selectors.length > 0) {
    td.addRule('cleanSelectors', {
      filter: (node) => {
        const matches = (node as Element).matches;
        // Not every node handed to the filter necessarily implements `matches`.
        if (typeof matches !== 'function') return false;
        return selectors.some((sel) => {
          try {
            return matches.call(node, sel);
          } catch {
            // An invalid selector must not abort the whole conversion.
            return false;
          }
        });
      },
      replacement: () => '',
    });
  }

  if (configure) configure(td);
  return td;
}

/**
 * Put pre-extracted code blocks back in place of their
 * `CODEBLOCK-PLACEHOLDER-<index>` markers.
 *
 * Done in a single pass with a replacer function so that:
 * - `$`-sequences inside code (`$$`, `$&`, `$1` …) are inserted literally
 *   instead of being interpreted as replacement patterns;
 * - `…-1` can never match the prefix of `…-10`;
 * - already-restored code is never rescanned for further placeholders.
 *
 * Placeholders whose index has no matching block are left untouched.
 */
function restoreCodeBlocks(
  md: string,
  codeBlocks: Array<{ lang: string; code: string }>,
): string {
  if (codeBlocks.length === 0) return md;
  return md.replace(CODE_BLOCK_PLACEHOLDER_PATTERN, (placeholder: string, digits: string) => {
    const index = Number(digits);
    // Only the canonical decimal form (no leading zeros) is a real placeholder.
    const block = String(index) === digits ? codeBlocks[index] : undefined;
    if (!block) return placeholder;
    return `\n\`\`\`${block.lang}\n${block.code}\n\`\`\`\n`;
  });
}

/**
 * Convert article HTML to Markdown, restore code blocks and tidy whitespace.
 *
 * @param contentHtml Article body HTML.
 * @param codeBlocks Code blocks to restore, indexed by placeholder number.
 * @param configure Optional TurndownService customization hook.
 * @param cleanSelectors Optional CSS selectors of nodes to drop.
 * @returns The cleaned-up Markdown body (without frontmatter).
 */
function convertToMarkdown(
  contentHtml: string,
  codeBlocks: Array<{ lang: string; code: string }>,
  configure?: (td: TurndownService) => void,
  cleanSelectors?: string[],
): string {
  const td = createTurndown(configure, cleanSelectors);
  let md = restoreCodeBlocks(td.turndown(contentHtml), codeBlocks);

  // Non-breaking spaces become regular spaces.
  md = md.replace(/\u00a0/g, ' ');
  // Turndown leaves behind lone dashes / middle dots when list bullets or
  // decorative separators lose their surrounding inline context.
  md = md.replace(/^[ \t]*[-·][ \t]*$/gm, '');
  // Whitespace-only lines, then trailing whitespace.
  md = md.replace(/^[ \t]+$/gm, '');
  md = md.replace(/[ \t]+$/gm, '');
  // Never more than one blank line in a row.
  md = md.replace(/\n{3,}/g, '\n\n');

  return md;
}

/**
 * Point Markdown image references at their downloaded local copies.
 *
 * @param md Markdown text.
 * @param urlMap Remote URL → local relative path, for images that downloaded successfully.
 * @returns Markdown where every mapped image uses its local path; images that
 *   are not in the map are left exactly as they were. An optional
 *   `"title"` suffix in the image target is preserved.
 */
function replaceImageUrls(md: string, urlMap: Record<string, string>): string {
  return md.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match: string, alt: string, target: string) => {
    // Turndown renders titled images as `![alt](src "title")`.
    const titled = /^(\S+)(\s+"[^"]*")$/.exec(target);
    const imgUrl = titled?.[1] ?? target;
    const title = titled?.[2] ?? '';
    // Own-property check: a URL such as `toString` must not resolve to an
    // inherited Object.prototype member.
    if (!Object.prototype.hasOwnProperty.call(urlMap, imgUrl)) return match;
    const local = urlMap[imgUrl];
    return local ? `![${alt}](${local}${title})` : match;
  });
}

// ============================================================
// Image Downloading
// ============================================================

/**
 * Infer an image file extension from a URL.
 *
 * Looks for a 3–4 character extension directly followed by the query string,
 * the fragment, or the end of the URL.
 *
 * @returns The extension without the dot, or `'jpg'` when none is found.
 */
function defaultDetectImageExt(url: string): string {
  const extMatch = /\.(\w{3,4})(?:[?#]|$)/.exec(url);
  return extMatch?.[1] ?? 'jpg';
}

/**
 * Download images in batches of `IMAGE_CONCURRENCY`.
 *
 * Duplicate URLs are fetched once. Protocol-relative URLs (`//host/…`) are
 * fetched over https. Failed downloads are skipped (best effort) and simply
 * do not appear in the result.
 *
 * @param imgUrls Image URLs as they appear in the article.
 * @param imgDir Directory the files are written to.
 * @param headers Optional extra request headers.
 * @param detectExt Optional extension detector; defaults to `defaultDetectImageExt`.
 * @returns Map from the original URL to the local `images/<file>` path.
 */
async function downloadImages(
  imgUrls: string[],
  imgDir: string,
  headers?: Record<string, string>,
  detectExt?: (url: string) => string,
): Promise<Record<string, string>> {
  const urlMap: Record<string, string> = {};
  if (imgUrls.length === 0) return urlMap;

  const detect = detectExt || defaultDetectImageExt;
  // A Set keeps first-seen order, so file numbering follows article order.
  const uniqueUrls = [...new Set(imgUrls)];

  for (let offset = 0; offset < uniqueUrls.length; offset += IMAGE_CONCURRENCY) {
    const batch = uniqueUrls.slice(offset, offset + IMAGE_CONCURRENCY);
    const results = await Promise.all(
      batch.map(async (rawUrl, j) => {
        const index = offset + j + 1;
        const imgUrl = rawUrl.startsWith('//') ? `https:${rawUrl}` : rawUrl;

        const ext = detect(imgUrl);
        const filename = `img_${String(index).padStart(3, '0')}.${ext}`;
        const filepath = path.join(imgDir, filename);

        try {
          const result = await httpDownload(imgUrl, filepath, {
            headers,
            timeout: 15000,
          });
          if (result.success) {
            // Keyed by the URL as written in the article so the Markdown lookup matches.
            return { remoteUrl: rawUrl, localPath: `images/${filename}` };
          }
        } catch {
          // Best effort: a failed image keeps its remote URL in the Markdown.
        }
        return null;
      }),
    );

    for (const r of results) {
      if (r) urlMap[r.remoteUrl] = r.localPath;
    }
  }
  return urlMap;
}

// ============================================================
// Main API
// ============================================================

/** Build a result row for `data`, substituting '-' for missing author / publish time. */
function buildResult(
  data: ArticleData,
  status: string,
  size: string,
  saved: string,
): ArticleDownloadResult {
  return {
    title: data.title,
    author: data.author || '-',
    publish_time: data.publishTime || '-',
    status,
    size,
    saved,
  };
}

/**
 * Download an article to Markdown with optional image localization.
 *
 * Handles the full pipeline:
 * 1. HTML → Markdown (via TurndownService)
 * 2. Code block placeholder restoration
 * 3. Batch image downloading with concurrency + deduplication
 * 4. Image URL replacement in Markdown
 * 5. Frontmatter generation (customizable labels)
 * 6. File write (or write to stdout when `options.stdout` is set)
 *
 * @param data Article payload from the adapter.
 * @param options Output location and conversion options.
 * @returns A single-element array describing the outcome. Missing title or
 *   content yields a `failed — …` status instead of throwing; filesystem
 *   errors while writing are not caught here.
 */
export async function downloadArticle(
  data: ArticleData,
  options: ArticleDownloadOptions,
): Promise<ArticleDownloadResult[]> {
  const {
    output,
    downloadImages: shouldDownloadImages = true,
    imageHeaders,
    maxTitleLength = 80,
    configureTurndown,
    detectImageExt,
    frontmatterLabels,
    cleanSelectors,
    stdout = false,
  } = options;

  const labels = { ...DEFAULT_LABELS, ...frontmatterLabels };

  if (!data.title) {
    return [{
      title: 'Error',
      author: '-',
      publish_time: '-',
      status: 'failed — no title',
      size: '-',
      saved: '-',
    }];
  }

  if (!data.contentHtml) {
    return [buildResult(data, 'failed — no content', '-', '-')];
  }

  // Convert HTML to Markdown
  let markdown = convertToMarkdown(
    data.contentHtml,
    data.codeBlocks || [],
    configureTurndown,
    cleanSelectors,
  );

  const safeTitle = sanitizeFilename(data.title, maxTitleLength);
  // Resolved only when writing to disk; stdout mode never touches `output`.
  const articleDir = stdout ? undefined : path.join(output, safeTitle);

  // Download images only when writing to disk. In stdout mode remote URLs
  // stay intact so the piped output is self-contained.
  const imageUrls = data.imageUrls ?? [];
  if (articleDir !== undefined && shouldDownloadImages && imageUrls.length > 0) {
    const imagesDir = path.join(articleDir, 'images');
    // `recursive` creates the article directory as well.
    fs.mkdirSync(imagesDir, { recursive: true });

    const urlMap = await downloadImages(imageUrls, imagesDir, imageHeaders, detectImageExt);
    markdown = replaceImageUrls(markdown, urlMap);
  }

  // Build frontmatter with customizable labels.
  // Shape: `# Title\n[> meta\n...]\n---\n\n<markdown>` — exactly one blank
  // line separates every section, so we never produce ≥3 consecutive newlines.
  const headerLines = [`# ${data.title}`];
  if (data.author) headerLines.push(`> ${labels.author}: ${data.author}`);
  if (data.publishTime) headerLines.push(`> ${labels.publishTime}: ${data.publishTime}`);
  if (data.sourceUrl) headerLines.push(`> ${labels.sourceUrl}: ${data.sourceUrl}`);
  const frontmatter = headerLines.join('\n') + '\n\n---\n\n';
  const fullContent = frontmatter + markdown;
  const size = Buffer.byteLength(fullContent, 'utf-8');

  if (articleDir === undefined) {
    process.stdout.write(fullContent.endsWith('\n') ? fullContent : fullContent + '\n');
    return [buildResult(data, 'success', formatBytes(size), '-')];
  }

  fs.mkdirSync(articleDir, { recursive: true });
  const filePath = path.join(articleDir, `${safeTitle}.md`);
  fs.writeFileSync(filePath, fullContent, 'utf-8');

  return [buildResult(data, 'success', formatBytes(size), filePath)];
}