utils.js
import { clamp } from '../_shared/common.js';

/** Result count used when the caller supplies no usable limit. */
const DEFAULT_LIMIT = 20;
/** Smallest result count a caller may request. */
const MIN_LIMIT = 1;
/** Largest result count a caller may request. */
const MAX_LIMIT = 50;

/**
 * Turn a caller-supplied limit into the value handed to `clamp`.
 *
 * The result is interpolated verbatim into in-page script source
 * (`const limit = ...;`), so the input is coerced to a number first: anything
 * that does not parse as a number, or is zero/empty, falls back to the default
 * instead of leaking arbitrary text into the evaluated script.
 *
 * @param {*} limit - Requested number of results (number or numeric string).
 * @returns {*} Whatever `clamp` returns for the coerced value and the bounds
 *   1 and 50 — presumably a number in that range; `clamp` is defined outside
 *   this file, confirm there.
 */
const clampLimit = (limit) => {
  const requested = Number(limit);
  const isUnusable = Number.isNaN(requested) || requested === 0;
  return clamp(isUnusable ? DEFAULT_LIMIT : requested, MIN_LIMIT, MAX_LIMIT);
};

// The loaders below run source text inside the page via `page.evaluate`.
// The snippets here are spliced into that source so every script shares a
// single definition of each in-page helper.

/** In-page helper: collapse whitespace runs to one space and trim. */
const PAGE_NORMALIZE = `const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();`;

/** In-page helper: resolve an href against https://blog.sina.com.cn. */
const PAGE_ABS_URL = `const abs = (href) => {
        if (!href) return '';
        if (href.startsWith('//')) return 'https:' + href;
        if (href.startsWith('http')) return href;
        return 'https://blog.sina.com.cn' + (href.startsWith('/') ? '' : '/') + href;
      };`;

/** In-page helper: pull the id out of a `.../blog_<id>.html` URL ('' if absent). */
const PAGE_ARTICLE_ID = `const articleIdOf = (url) => url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || '';`;

/**
 * Build the Sina search URL for a keyword.
 *
 * @param {string} keyword - Search phrase; it is URI-encoded here.
 * @returns {string} Search page URL with `tp=mix`.
 */
export function buildSinaBlogSearchUrl(keyword) {
  return `https://search.sina.com.cn/search?q=${encodeURIComponent(keyword)}&tp=mix`;
}

/**
 * Build the URL of the first article-list page of a blog.
 *
 * @param {string} uid - Blog user id; it is URI-encoded here.
 * @returns {string} Article list URL.
 */
export function buildSinaBlogUserUrl(uid) {
  return `https://blog.sina.com.cn/s/articlelist_${encodeURIComponent(uid)}_0_1.html`;
}

/**
 * Open an article page and scrape its metadata and a content preview.
 *
 * NOTE(review): the character class `/[::]/` used for the author fallback lists
 * the same ASCII colon twice; a full-width variant may have been lost in an
 * earlier copy of this file — confirm against the upstream source.
 *
 * @param {object} page - Browser page handle; this function calls its `goto`,
 *   `wait` and `evaluate` methods.
 * @param {string} url - Article URL to open; echoed back in the result.
 * @returns {Promise<object>} Whatever `page.evaluate` yields for the script,
 *   which builds `{ title, author, date, category, tags, readCount,
 *   commentCount, content, images, url }`. `tags` and `images` are
 *   comma-joined strings, `readCount`/`commentCount` are always `''`, and
 *   `content` is at most 500 characters followed by `'...'` only when the
 *   article text was actually longer than that.
 */
export async function loadSinaBlogArticle(page, url) {
  await page.goto(url);
  await page.wait({ selector: 'h1', timeout: 3 });
  return page.evaluate(`
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, 1500));
      ${PAGE_NORMALIZE}
      const title = normalize(document.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent);
      const titleParts = normalize(document.title).split('_').map((part) => normalize(part)).filter(Boolean);
      const author = titleParts[1] || title.split(/[::]/)[0] || '';
      const timeText = normalize(document.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, '');
      const date = timeText || normalize(document.body.innerText.match(/\\b\\d{4}-\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2}:\\d{2})?\\b/)?.[0]);
      const category = normalize(document.querySelector('.articalTag .blog_class a, .blog_class a')?.textContent);
      const tags = Array.from(document.querySelectorAll('.blog_tag h3, .blog_tag a, .tag a, .artical_tag a'))
        .map((node) => normalize(node.textContent))
        .filter(Boolean);
      const fullText = normalize(document.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent);
      const content = fullText.length > 500 ? fullText.slice(0, 500) + '...' : fullText;
      const images = Array.from(document.querySelectorAll('.articalContent img, .blog_content img, .content img'))
        .map((img) => img.getAttribute('src') || img.getAttribute('real_src') || '')
        .filter((src) => src && !src.includes('icon'))
        .slice(0, 5);
      return {
        title,
        author,
        date,
        category,
        tags: tags.join(', '),
        readCount: '',
        commentCount: '',
        content,
        images: images.join(', '),
        url: ${JSON.stringify(url)},
      };
    })()
  `);
}

/**
 * Scrape the hot-article ranking from the Sina Blog home page, then fetch each
 * ranked article from inside the page to fill in author/date/description.
 *
 * Articles are fetched one after another; a failed or non-OK fetch leaves that
 * entry with only the data taken from the ranking list.
 *
 * @param {object} page - Browser page handle; this function calls its `goto`,
 *   `wait` and `evaluate` methods.
 * @param {number} [limit] - Maximum number of entries (see `clampLimit`).
 * @returns {Promise<object[]>} Entries shaped `{ rank, articleId, title,
 *   author, date, readCount, description, url }`, or `[]` when the evaluated
 *   script does not yield an array.
 */
export async function loadSinaBlogHot(page, limit) {
  const safeLimit = clampLimit(limit);
  await page.goto('https://blog.sina.com.cn/');
  await page.wait({ selector: 'h1', timeout: 3 });
  const data = await page.evaluate(`
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, 1500));
      ${PAGE_NORMALIZE}
      ${PAGE_ABS_URL}
      ${PAGE_ARTICLE_ID}
      const limit = ${safeLimit};
      const parseArticle = (doc, fallback) => {
        const title = normalize(doc.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent) || fallback.title;
        const titleParts = normalize(doc.title).split('_').map((part) => normalize(part)).filter(Boolean);
        const timeText = normalize(doc.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, '');
        return {
          articleId: articleIdOf(fallback.url),
          title,
          author: titleParts[1] || title.split(/[::]/)[0] || '',
          date: timeText || '',
          readCount: '',
          description: normalize(doc.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent).slice(0, 150),
        };
      };

      const seeds = [];
      const seen = new Set();
      for (const link of Array.from(document.querySelectorAll('.day-hot-rank .art-list a[href*="/s/blog_"], .hot-rank .art-list a[href*="/s/blog_"]'))) {
        const title = normalize(link.textContent);
        const url = abs(link.getAttribute('href') || '');
        if (!title || !url || seen.has(url)) continue;
        seen.add(url);
        seeds.push({ rank: seeds.length + 1, title, url });
        if (seeds.length >= limit) break;
      }

      const results = [];
      for (const item of seeds) {
        const merged = {
          rank: item.rank,
          articleId: articleIdOf(item.url),
          title: item.title,
          author: '',
          date: '',
          readCount: '',
          description: '',
          url: item.url,
        };
        try {
          const resp = await fetch(item.url, { credentials: 'include' });
          if (resp.ok) {
            const html = await resp.text();
            const doc = new DOMParser().parseFromString(html, 'text/html');
            Object.assign(merged, parseArticle(doc, item));
          }
        } catch {
          /* best effort: keep the ranking-list data when the article cannot be fetched */
        }
        results.push(merged);
      }
      return results;
    })()
  `);
  return Array.isArray(data) ? data : [];
}

/**
 * Run a Sina search and collect the results that link to blog articles.
 *
 * After `page.wait` returns, the in-page script additionally polls for
 * `.result-item` up to 20 times, 500 ms apart, before reading the list.
 *
 * @param {object} page - Browser page handle; this function calls its `goto`,
 *   `wait` and `evaluate` methods.
 * @param {string} keyword - Search phrase.
 * @param {number} [limit] - Maximum number of entries (see `clampLimit`).
 * @returns {Promise<object[]>} Entries shaped `{ rank, title, author, date,
 *   description, url }`, or `[]` when the evaluated script does not yield an
 *   array.
 */
export async function loadSinaBlogSearch(page, keyword, limit) {
  const safeLimit = clampLimit(limit);
  await page.goto(buildSinaBlogSearchUrl(keyword));
  await page.wait({ selector: '.result-item', timeout: 5 });
  const data = await page.evaluate(`
    (async () => {
      const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
      for (let i = 0; i < 20; i += 1) {
        if (document.querySelector('.result-item')) break;
        await sleep(500);
      }
      ${PAGE_NORMALIZE}
      const limit = ${safeLimit};
      const items = Array.from(document.querySelectorAll('.result-item'));
      const results = [];
      for (const item of items) {
        const link = item.querySelector('.result-title a[href*="blog.sina.com.cn/s/blog_"]');
        const title = normalize(link?.textContent);
        const url = link?.getAttribute('href') || '';
        if (!title || !url) continue;
        results.push({
          rank: results.length + 1,
          title,
          author: normalize(item.querySelector('.result-meta .source')?.textContent),
          date: normalize(item.querySelector('.result-meta .time')?.textContent),
          description: normalize(item.querySelector('.result-intro')?.textContent).slice(0, 150),
          url,
        });
        if (results.length >= limit) break;
      }
      return results;
    })()
  `);
  return Array.isArray(data) ? data : [];
}

/**
 * List the articles on the first article-list page of a blog.
 *
 * The author name is taken from the second `_`-separated part of the page
 * title and applied to every entry.
 *
 * @param {object} page - Browser page handle; this function calls its `goto`,
 *   `wait` and `evaluate` methods.
 * @param {string} uid - Blog user id.
 * @param {number} [limit] - Maximum number of entries (see `clampLimit`).
 * @returns {Promise<object[]>} Entries shaped `{ rank, articleId, title,
 *   author, date, readCount, description, url }` (`readCount` and
 *   `description` are always `''`), or `[]` when the evaluated script does not
 *   yield an array.
 */
export async function loadSinaBlogUser(page, uid, limit) {
  const safeLimit = clampLimit(limit);
  await page.goto(buildSinaBlogUserUrl(uid));
  await page.wait({ selector: 'h1', timeout: 3 });
  const data = await page.evaluate(`
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, 1000));
      ${PAGE_NORMALIZE}
      ${PAGE_ABS_URL}
      ${PAGE_ARTICLE_ID}
      const limit = ${safeLimit};
      const author = normalize(document.title).split('_').map((part) => normalize(part)).filter(Boolean)[1] || '';
      const results = [];
      for (const item of Array.from(document.querySelectorAll('.articleList .articleCell'))) {
        const link = item.querySelector('.atc_title a[href*="/s/blog_"]');
        const title = normalize(link?.textContent);
        const url = abs(link?.getAttribute('href') || '');
        if (!title || !url) continue;
        results.push({
          rank: results.length + 1,
          articleId: articleIdOf(url),
          title,
          author,
          date: normalize(item.querySelector('.atc_tm')?.textContent),
          readCount: '',
          description: '',
          url,
        });
        if (results.length >= limit) break;
      }
      return results;
    })()
  `);
  return Array.isArray(data) ? data : [];
}