Cradicle Explorer

/ src / download / article-download.ts
article-download.ts
  1  /**
  2   * Article download helper — shared logic for downloading articles as Markdown.
  3   *
  4   * Used by: zhihu/download, weixin/download, and future article adapters.
  5   *
  6   * Flow: ArticleData → TurndownService → image download → frontmatter → .md file
  7   */
  8  
  9  import * as fs from 'node:fs';
 10  import * as path from 'node:path';
 11  import TurndownService from 'turndown';
 12  import { gfm } from 'turndown-plugin-gfm';
 13  import { httpDownload, sanitizeFilename } from './index.js';
 14  import { formatBytes } from './progress.js';
 15  
/** Number of image downloads started together in a single batch by `downloadImages`. */
const IMAGE_CONCURRENCY = 5;
 17  
 18  // ============================================================
 19  // Types
 20  // ============================================================
 21  
 22  export interface ArticleData {
 23    title: string;
 24    author?: string;
 25    publishTime?: string;
 26    sourceUrl?: string;
 27    contentHtml: string;
 28    /** Pre-extracted code blocks to restore after Markdown conversion */
 29    codeBlocks?: Array<{ lang: string; code: string }>;
 30    /** Image URLs found in the article (pre-collected from DOM) */
 31    imageUrls?: string[];
 32  }
 33  
/**
 * Text labels printed in front of each metadata line of the generated header
 * block (`> <label>: <value>`). Omitted entries fall back to `DEFAULT_LABELS`.
 */
export interface FrontmatterLabels {
  /** Label for the author line. */
  author?: string;
  /** Label for the publish-time line. */
  publishTime?: string;
  /** Label for the source-URL line. */
  sourceUrl?: string;
}
 39  
/** Options controlling how `downloadArticle` converts and stores an article. */
export interface ArticleDownloadOptions {
  /** Base output directory; the article is written to `<output>/<sanitized title>/`. */
  output: string;
  /** Download images listed in `ArticleData.imageUrls` next to the article (default: true). */
  downloadImages?: boolean;
  /** Extra headers for image downloads (e.g. { Referer: '...' }) */
  imageHeaders?: Record<string, string>;
  /** Maximum length passed to `sanitizeFilename` for the title (default: 80). */
  maxTitleLength?: number;
  /** Custom TurndownService configuration callback */
  configureTurndown?: (td: TurndownService) => void;
  /** Custom image extension detector (default: infer from URL extension) */
  detectImageExt?: (url: string) => string;
  /** Custom frontmatter labels (default: Chinese labels) */
  frontmatterLabels?: FrontmatterLabels;
  /**
   * Extra CSS selectors removed from the article before Turndown conversion.
   * Use this to drop site-specific noise the adapter can't always trim upstream
   * (e.g. zhihu collapsed cards, weixin reward bar, wiki infobox).
   */
  cleanSelectors?: string[];
  /**
   * Write the markdown to `process.stdout` instead of a file on disk. Image
   * download and directory creation are skipped — remote image URLs are kept
   * as-is so the output is self-contained when piped.
   */
  stdout?: boolean;
}
 65  
/**
 * One summary row describing the outcome of `downloadArticle`.
 * Fields that do not apply are filled with `'-'`.
 */
export interface ArticleDownloadResult {
  /** Article title, or `'Error'` when the input had no title. */
  title: string;
  /** Author name, or `'-'` when unknown. */
  author: string;
  /** Publish time, or `'-'` when unknown. */
  publish_time: string;
  /** `'success'`, or a `'failed — …'` message naming the reason. */
  status: string;
  /** Human-readable byte size of the generated Markdown (via `formatBytes`), or `'-'` on failure. */
  size: string;
  /** Path of the written `.md` file, or `'-'` on failure or in stdout mode. */
  saved: string;
}
 74  
/**
 * Header labels used when the caller supplies none. The values are Chinese:
 * "author", "publish time" and "original link" respectively.
 */
const DEFAULT_LABELS: Required<FrontmatterLabels> = {
  author: '作者',
  publishTime: '发布时间',
  sourceUrl: '原文链接',
};
 80  
 81  // ============================================================
 82  // Markdown Conversion
 83  // ============================================================
 84  
 85  // Nodes that never carry article content. Turndown keeps them by default — if an
 86  // adapter's contentHtml extraction misses one, CSS / scripts / widget markup
 87  // ends up inline in the .md. Strip them unconditionally at the converter level.
 88  // `svg` is not in HTMLElementTagNameMap, so we type-narrow manually.
 89  // `header/footer/nav/aside` cover page chrome that adapters occasionally
 90  // forget to trim — the article's own title/author/publishTime are supplied
 91  // as separate fields on ArticleData, so duplicated nodes are redundant.
 92  // `iframe` is NOT in this set — it's handled by a dedicated rule below that
 93  // degrades to a link so embedded content (YouTube, Twitter, CodePen …) keeps
 94  // a reachable URL in the exported markdown.
 95  const STRIPPED_TAGS: Array<keyof HTMLElementTagNameMap> = [
 96    'script', 'style', 'noscript',
 97    'canvas',
 98    'form', 'button', 'dialog',
 99    'header', 'footer', 'nav', 'aside',
100  ];
101  
/**
 * Escape a value for use inside a double-quoted HTML attribute.
 *
 * Only `"`, `<` and `>` are rewritten: that is enough to stop a hostile value
 * from closing the attribute or the tag, while ordinary URLs (including ones
 * with `&` query separators) are emitted byte-for-byte as before.
 */
function escapeHtmlAttribute(value: string): string {
  return value
    .replace(/"/g, '&quot;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/** Make arbitrary text safe as Markdown link text: one line, brackets escaped. */
function escapeMarkdownLinkText(text: string): string {
  return text.replace(/\s+/g, ' ').replace(/[[\]]/g, '\\$&');
}

/** Percent-encode the characters that would terminate a Markdown link destination. */
function escapeMarkdownLinkDestination(url: string): string {
  return url.replace(/[()\s]/g, (ch) => {
    if (ch === '(') return '%28';
    if (ch === ')') return '%29';
    return encodeURIComponent(ch);
  });
}

/**
 * Build the TurndownService used for article conversion.
 *
 * Rules are registered in a fixed order (Turndown gives later rules
 * precedence): GFM plugin, tag stripping, strikethrough fix, SVG removal,
 * `<br>` handling, data-URI image removal, media elements, iframes and
 * finally the caller's `cleanSelectors`. The optional `configure` callback
 * runs last so adapters can override anything.
 *
 * @param configure Optional hook to customize the service after the defaults.
 * @param cleanSelectors Optional CSS selectors whose matching nodes are dropped.
 * @returns A configured TurndownService instance.
 */
function createTurndown(
  configure?: (td: TurndownService) => void,
  cleanSelectors?: string[],
): TurndownService {
  const td = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
  });
  td.use(gfm);
  td.remove(STRIPPED_TAGS);
  // turndown-plugin-gfm@1.0.2 emits single-tilde strikethrough (`~x~`), which
  // is not the canonical GFM form. Override it so exported markdown is
  // portable across common renderers.
  td.addRule('canonicalStrikethrough', {
    filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
    replacement: (content) => `~~${content}~~`,
  });
  // SVG isn't in the static HTML tag map; match by name with a custom filter.
  td.addRule('stripSvg', {
    filter: (node) => node.nodeName === 'svg' || node.nodeName === 'SVG',
    replacement: () => '',
  });
  td.addRule('linebreak', {
    filter: 'br',
    replacement: () => '\n',
  });
  // Inline base64 images would land as huge `![](data:image/...;base64,...)`
  // strings that the image downloader can't localize. Drop them.
  td.addRule('ignoreBase64Images', {
    filter: (node) => {
      if (node.nodeName !== 'IMG') return false;
      const src = (node as HTMLImageElement).getAttribute?.('src') ?? '';
      return src.startsWith('data:');
    },
    replacement: () => '',
  });
  // Markdown has no native video/audio primitive. Emit inline HTML so
  // renderers that support it (GitHub, VS Code preview …) still play the
  // media; viewers that don't simply show the tag as text, which is still
  // more information than dropping the node outright.
  // Attribute values come from untrusted article HTML, so they are escaped
  // before being placed back into markup.
  td.addRule('videoElement', {
    filter: (node) => node.nodeName === 'VIDEO',
    replacement: (_content, node) => {
      const el = node as Element;
      const src = el.getAttribute('src')
        || el.querySelector('source')?.getAttribute('src')
        || '';
      if (!src) return '';
      const poster = el.getAttribute('poster') || '';
      const posterAttr = poster ? ` poster="${escapeHtmlAttribute(poster)}"` : '';
      return `\n<video src="${escapeHtmlAttribute(src)}" controls${posterAttr}></video>\n`;
    },
  });
  td.addRule('audioElement', {
    filter: (node) => node.nodeName === 'AUDIO',
    replacement: (_content, node) => {
      const el = node as Element;
      const src = el.getAttribute('src')
        || el.querySelector('source')?.getAttribute('src')
        || '';
      return src ? `\n<audio src="${escapeHtmlAttribute(src)}" controls></audio>\n` : '';
    },
  });
  // Iframes (YouTube, Twitter, CodePen …) degrade to a markdown link so the
  // embedded resource is still reachable from the exported file. Title and
  // URL are escaped so brackets, parentheses or whitespace cannot break the
  // link syntax.
  td.addRule('iframeToLink', {
    filter: (node) => node.nodeName === 'IFRAME',
    replacement: (_content, node) => {
      const el = node as Element;
      const src = el.getAttribute('src') || '';
      if (!src) return '';
      const title = el.getAttribute('title') || 'Embedded content';
      return `\n[${escapeMarkdownLinkText(title)}](${escapeMarkdownLinkDestination(src)})\n`;
    },
  });
  // Per-adapter dirty-node removal. Adapters know their site's specific noise
  // (zhihu collapsed cards, weixin reward bar, collapsed wiki infoboxes …);
  // the default set is empty so the generic converter stays untouched.
  const selectorRules = (cleanSelectors ?? [])
    .map(sel => sel.trim())
    .filter(Boolean);
  if (selectorRules.length > 0) {
    td.addRule('cleanSelectors', {
      filter: (node) => {
        const match = (node as Element).matches;
        if (typeof match !== 'function') return false;
        return selectorRules.some((sel) => {
          try {
            return match.call(node, sel);
          } catch {
            // An invalid selector makes `matches` throw; treat it as "no match".
            return false;
          }
        });
      },
      replacement: () => '',
    });
  }
  if (configure) configure(td);
  return td;
}
202  
/**
 * Convert article HTML to Markdown and restore pre-extracted code blocks.
 *
 * @param contentHtml Article body HTML.
 * @param codeBlocks Code blocks; entry `i` replaces `CODEBLOCK-PLACEHOLDER-i`.
 * @param configure Optional Turndown customization hook.
 * @param cleanSelectors Optional CSS selectors whose nodes are dropped.
 * @returns Cleaned-up Markdown text.
 */
function convertToMarkdown(
  contentHtml: string,
  codeBlocks: Array<{ lang: string; code: string }>,
  configure?: (td: TurndownService) => void,
  cleanSelectors?: string[],
): string {
  const td = createTurndown(configure, cleanSelectors);
  let md = td.turndown(contentHtml);

  // Restore code block placeholders in a single pass.
  // - The replacer is a function, so `$&`, `$1`, `$$` … inside the code are
  //   inserted literally instead of being treated as replacement patterns.
  // - `\d+` consumes the whole index, so `…-1` can never match the front of
  //   `…-10`.
  // - Text inserted by the replacer is not rescanned, so placeholder-looking
  //   text inside restored code is left alone.
  md = md.replace(/CODEBLOCK-PLACEHOLDER-(\d+)/g, (placeholder: string, rawIndex: string) => {
    const block = codeBlocks[Number(rawIndex)];
    // Unknown index: keep the marker visible rather than dropping content.
    if (!block) return placeholder;
    return `\n\`\`\`${block.lang}\n${block.code}\n\`\`\`\n`;
  });

  // Clean up
  // NOTE(review): these passes also run over the restored code block bodies
  // (e.g. 3+ newlines inside code are collapsed) — confirm that is intended.
  md = md.replace(/\u00a0/g, ' ');
  // Turndown leaves behind lone dashes / middle dots when list bullets or
  // decorative separators lose their surrounding inline context.
  md = md.replace(/^[ \t]*[-·][ \t]*$/gm, '');
  md = md.replace(/^[ \t]+$/gm, '');
  md = md.replace(/[ \t]+$/gm, '');
  md = md.replace(/\n{3,}/g, '\n\n');

  return md;
}
230  
/**
 * Rewrite Markdown image targets to their downloaded local paths.
 *
 * Handles both `![alt](url)` and the titled form `![alt](url "title")`; the
 * title is preserved. Images whose URL is not in `urlMap` are left unchanged.
 *
 * @param md Markdown text.
 * @param urlMap Remote URL → local relative path.
 * @returns Markdown with known image URLs replaced.
 */
function replaceImageUrls(md: string, urlMap: Record<string, string>): string {
  // Own-property lookup so URLs such as "constructor" never resolve to
  // members inherited from Object.prototype.
  const lookup = (url: string): string | undefined =>
    Object.prototype.hasOwnProperty.call(urlMap, url) ? urlMap[url] : undefined;

  return md.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match: string, alt: string, target: string) => {
    const direct = lookup(target);
    if (direct) return `![${alt}](${direct})`;

    // Titled images: split `url "title"` and look up the URL part only.
    const titled = /^(\S+)(\s+"[^"]*")$/.exec(target);
    const url = titled?.[1];
    const titlePart = titled?.[2];
    if (url !== undefined && titlePart !== undefined) {
      const local = lookup(url);
      if (local) return `![${alt}](${local}${titlePart})`;
    }
    return match;
  });
}
237  
238  // ============================================================
239  // Image Downloading
240  // ============================================================
241  
/**
 * Infer an image file extension (3–4 word characters) from a URL.
 *
 * The extension at the end of the URL path wins; if the path has none, an
 * extension at the end of the query string is used. The scheme/host part and
 * any `#fragment` are ignored, so `https://cdn.example.com` is not read as
 * `.com` and `a.png#x` is still recognized as `png`.
 *
 * @param url Image URL (absolute, protocol-relative or relative).
 * @returns The detected extension without the dot, or `'jpg'` as fallback.
 */
function defaultDetectImageExt(url: string): string {
  const hashAt = url.indexOf('#');
  const withoutFragment = hashAt === -1 ? url : url.slice(0, hashAt);

  const queryAt = withoutFragment.indexOf('?');
  const locator = queryAt === -1 ? withoutFragment : withoutFragment.slice(0, queryAt);
  const query = queryAt === -1 ? '' : withoutFragment.slice(queryAt + 1);

  // Drop `scheme://host` (or `//host`) so a host TLD is never taken as an extension.
  const pathOnly = locator.replace(/^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/]*/i, '');

  const trailingExt = /\.(\w{3,4})$/;
  const found = trailingExt.exec(pathOnly) ?? trailingExt.exec(query);
  return found?.[1] ?? 'jpg';
}
246  
/**
 * Download article images into `imgDir` and report where each one landed.
 *
 * URLs are de-duplicated (first occurrence wins) and fetched in consecutive
 * batches of `IMAGE_CONCURRENCY`. Files are named `img_NNN.<ext>` after their
 * 1-based position in the de-duplicated list. Failed downloads are skipped
 * and simply do not appear in the result.
 *
 * @param imgUrls Image URLs as they appear in the article.
 * @param imgDir Directory the files are written to.
 * @param headers Optional extra request headers.
 * @param detectExt Optional extension detector; defaults to `defaultDetectImageExt`.
 * @returns Map from the original URL to its `images/<file>` relative path.
 */
async function downloadImages(
  imgUrls: string[],
  imgDir: string,
  headers?: Record<string, string>,
  detectExt?: (url: string) => string,
): Promise<Record<string, string>> {
  const localized: Record<string, string> = {};
  if (imgUrls.length === 0) return localized;

  const resolveExt = detectExt || defaultDetectImageExt;

  // A Set keeps insertion order, so this preserves first-occurrence ordering.
  const distinctUrls = Array.from(new Set(imgUrls));

  // Fetch one image; resolves to its mapping entry, or null when it failed.
  const fetchOne = async (sourceUrl: string, position: number) => {
    // Protocol-relative URLs are fetched over https.
    const absoluteUrl = sourceUrl.startsWith('//') ? `https:${sourceUrl}` : sourceUrl;
    const extension = resolveExt(absoluteUrl);
    const name = `img_${String(position + 1).padStart(3, '0')}.${extension}`;
    const target = path.join(imgDir, name);

    try {
      const outcome = await httpDownload(absoluteUrl, target, {
        headers,
        timeout: 15000,
      });
      if (outcome.success) {
        // Keyed by the URL as written in the article so it can be matched later.
        return { remoteUrl: sourceUrl, localPath: `images/${name}` };
      }
    } catch {
      // Best effort: a failed image must not abort the whole article.
    }
    return null;
  };

  let offset = 0;
  while (offset < distinctUrls.length) {
    const base = offset;
    const chunk = distinctUrls.slice(base, base + IMAGE_CONCURRENCY);
    const settled = await Promise.all(chunk.map((url, k) => fetchOne(url, base + k)));

    settled.forEach((entry) => {
      if (entry) localized[entry.remoteUrl] = entry.localPath;
    });
    offset += IMAGE_CONCURRENCY;
  }
  return localized;
}
299  
300  // ============================================================
301  // Main API
302  // ============================================================
303  
/**
 * Export one article as Markdown, optionally localizing its images.
 *
 * Steps, in order:
 * 1. Reject input without a title or without content (returns a failed row).
 * 2. Convert the HTML to Markdown and restore code block placeholders.
 * 3. When writing to disk and images are wanted, download them in batches
 *    and point the Markdown at the local copies.
 * 4. Prepend the header block (title plus labelled metadata lines).
 * 5. Write to `process.stdout`, or to `<output>/<title>/<title>.md`.
 *
 * @param data Article content and metadata.
 * @param options Output location and conversion settings.
 * @returns A single-element array with the outcome row.
 */
export async function downloadArticle(
  data: ArticleData,
  options: ArticleDownloadOptions,
): Promise<ArticleDownloadResult[]> {
  const {
    output,
    downloadImages: shouldDownloadImages = true,
    imageHeaders,
    maxTitleLength = 80,
    configureTurndown,
    detectImageExt,
    frontmatterLabels,
    cleanSelectors,
    stdout = false,
  } = options;

  const labels = { ...DEFAULT_LABELS, ...frontmatterLabels };

  // Result row for this article; only status / size / saved vary per outcome.
  const resultRow = (status: string, size: string, saved: string): ArticleDownloadResult[] => [{
    title: data.title,
    author: data.author || '-',
    publish_time: data.publishTime || '-',
    status,
    size,
    saved,
  }];

  if (!data.title) {
    // Without a title there is nothing meaningful to report from `data`.
    return [{
      title: 'Error',
      author: '-',
      publish_time: '-',
      status: 'failed — no title',
      size: '-',
      saved: '-',
    }];
  }
  if (!data.contentHtml) {
    return resultRow('failed — no content', '-', '-');
  }

  let markdown = convertToMarkdown(
    data.contentHtml,
    data.codeBlocks || [],
    configureTurndown,
    cleanSelectors,
  );

  const safeTitle = sanitizeFilename(data.title, maxTitleLength);
  // Resolved lazily: stdout mode never touches the output directory.
  const resolveArticleDir = (): string => path.join(output, safeTitle);

  // Images are only localized when writing to disk. In stdout mode the remote
  // URLs stay in place so the piped output is self-contained.
  const imageUrls = data.imageUrls;
  if (!stdout && shouldDownloadImages && imageUrls && imageUrls.length > 0) {
    const imageParent = resolveArticleDir();
    fs.mkdirSync(imageParent, { recursive: true });
    const imagesDir = path.join(imageParent, 'images');
    fs.mkdirSync(imagesDir, { recursive: true });

    const localPaths = await downloadImages(imageUrls, imagesDir, imageHeaders, detectImageExt);
    markdown = replaceImageUrls(markdown, localPaths);
  }

  // Header block: `# Title`, then one `> label: value` line per metadata field
  // that is present, then a `---` rule set off by blank lines.
  const metaLines = [
    [labels.author, data.author],
    [labels.publishTime, data.publishTime],
    [labels.sourceUrl, data.sourceUrl],
  ]
    .filter(([, value]) => Boolean(value))
    .map(([label, value]) => `> ${label}: ${value}`);
  const header = [`# ${data.title}`, ...metaLines].join('\n') + '\n\n---\n\n';

  const fullContent = header + markdown;
  const size = Buffer.byteLength(fullContent, 'utf-8');

  if (stdout) {
    // Make sure piped output ends with a newline.
    process.stdout.write(fullContent.endsWith('\n') ? fullContent : fullContent + '\n');
    return resultRow('success', formatBytes(size), '-');
  }

  const articleDir = resolveArticleDir();
  fs.mkdirSync(articleDir, { recursive: true });
  const filePath = path.join(articleDir, `${safeTitle}.md`);
  fs.writeFileSync(filePath, fullContent, 'utf-8');

  return resultRow('success', formatBytes(size), filePath);
}