/ clis / sinablog / utils.js
utils.js
  1  import { clamp } from '../_shared/common.js';
  2  const clampLimit = (limit) => clamp(limit || 20, 1, 50);
  3  export function buildSinaBlogSearchUrl(keyword) {
  4      return `https://search.sina.com.cn/search?q=${encodeURIComponent(keyword)}&tp=mix`;
  5  }
  6  export function buildSinaBlogUserUrl(uid) {
  7      return `https://blog.sina.com.cn/s/articlelist_${encodeURIComponent(uid)}_0_1.html`;
  8  }
  9  export async function loadSinaBlogArticle(page, url) {
 10      await page.goto(url);
 11      await page.wait({ selector: 'h1', timeout: 3 });
 12      return page.evaluate(`
 13      (async () => {
 14        await new Promise((resolve) => setTimeout(resolve, 1500));
 15        const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
 16        const title = normalize(document.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent);
 17        const titleParts = normalize(document.title).split('_').map((part) => normalize(part)).filter(Boolean);
 18        const author = titleParts[1] || title.split(/[::]/)[0] || '';
 19        const timeText = normalize(document.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, '');
 20        const date = timeText || normalize(document.body.innerText.match(/\\b\\d{4}-\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2}:\\d{2})?\\b/)?.[0]);
 21        const category = normalize(document.querySelector('.articalTag .blog_class a, .blog_class a')?.textContent);
 22        const tags = Array.from(document.querySelectorAll('.blog_tag h3, .blog_tag a, .tag a, .artical_tag a'))
 23          .map((node) => normalize(node.textContent))
 24          .filter(Boolean);
 25        const content = normalize(document.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent).slice(0, 500);
 26        const images = Array.from(document.querySelectorAll('.articalContent img, .blog_content img, .content img'))
 27          .map((img) => img.getAttribute('src') || img.getAttribute('real_src') || '')
 28          .filter((src) => src && !src.includes('icon'))
 29          .slice(0, 5);
 30        return {
 31          title,
 32          author,
 33          date,
 34          category,
 35          tags: tags.join(', '),
 36          readCount: '',
 37          commentCount: '',
 38          content: content + (content.length >= 500 ? '...' : ''),
 39          images: images.join(', '),
 40          url: ${JSON.stringify(url)},
 41        };
 42      })()
 43    `);
 44  }
 45  export async function loadSinaBlogHot(page, limit) {
 46      const safeLimit = clampLimit(limit);
 47      await page.goto('https://blog.sina.com.cn/');
 48      await page.wait({ selector: 'h1', timeout: 3 });
 49      const data = await page.evaluate(`
 50      (async () => {
 51        await new Promise((resolve) => setTimeout(resolve, 1500));
 52        const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
 53        const limit = ${safeLimit};
 54        const abs = (href) => {
 55          if (!href) return '';
 56          if (href.startsWith('//')) return 'https:' + href;
 57          if (href.startsWith('http')) return href;
 58          return 'https://blog.sina.com.cn' + (href.startsWith('/') ? '' : '/') + href;
 59        };
 60        const parseArticle = (doc, fallback) => {
 61          const title = normalize(doc.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent) || fallback.title;
 62          const titleParts = normalize(doc.title).split('_').map((part) => normalize(part)).filter(Boolean);
 63          const timeText = normalize(doc.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, '');
 64          const articleId = fallback.url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || '';
 65          return {
 66            articleId,
 67            title,
 68            author: titleParts[1] || title.split(/[::]/)[0] || '',
 69            date: timeText || '',
 70            readCount: '',
 71            description: normalize(doc.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent).slice(0, 150),
 72          };
 73        };
 74  
 75        const seeds = [];
 76        const seen = new Set();
 77        for (const link of Array.from(document.querySelectorAll('.day-hot-rank .art-list a[href*="/s/blog_"], .hot-rank .art-list a[href*="/s/blog_"]'))) {
 78          const title = normalize(link.textContent);
 79          const url = abs(link.getAttribute('href') || '');
 80          if (!title || !url || seen.has(url)) continue;
 81          seen.add(url);
 82          seeds.push({ rank: seeds.length + 1, title, url });
 83          if (seeds.length >= limit) break;
 84        }
 85  
 86        const results = [];
 87        for (const item of seeds) {
 88          let merged = {
 89            rank: item.rank,
 90            articleId: item.url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || '',
 91            title: item.title,
 92            author: '',
 93            date: '',
 94            readCount: '',
 95            description: '',
 96            url: item.url,
 97          };
 98          try {
 99            const resp = await fetch(item.url, { credentials: 'include' });
100            if (resp.ok) {
101              const html = await resp.text();
102              const doc = new DOMParser().parseFromString(html, 'text/html');
103              merged = Object.assign(merged, parseArticle(doc, item));
104            }
105          } catch {}
106          results.push(merged);
107        }
108        return results;
109      })()
110    `);
111      return Array.isArray(data) ? data : [];
112  }
/**
 * Runs a Sina search for `keyword` and returns the blog-article hits.
 *
 * The page is navigated to the URL from `buildSinaBlogSearchUrl`, waited on
 * for a `.result-item`, and then a script evaluated inside the page polls up
 * to 20 times (500 ms apart) for `.result-item`, and reads every result whose
 * title links to `blog.sina.com.cn/s/blog_`.
 *
 * Protocol-relative links (`//blog.sina.com.cn/...`) are returned with an
 * explicit `https:` scheme so the URL is usable outside the page, matching
 * how the other loaders in this file normalise links.
 *
 * @param {object} page - Browser page handle exposing `goto`, `wait` and
 *   `evaluate`. NOTE(review): the unit of `timeout: 5` is defined by the page
 *   implementation (presumably seconds) - confirm.
 * @param {string} keyword - Search text.
 * @param {number} [limit] - Requested number of hits; normalised by
 *   `clampLimit`.
 * @returns {Promise<object[]>} Hits shaped `{ rank, title, author, date,
 *   description, url }`, or `[]` when the evaluation does not yield an array.
 */
export async function loadSinaBlogSearch(page, keyword, limit) {
    const safeLimit = clampLimit(limit);
    await page.goto(buildSinaBlogSearchUrl(keyword));
    await page.wait({ selector: '.result-item', timeout: 5 });
    const data = await page.evaluate(`
    (async () => {
      const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
      for (let i = 0; i < 20; i += 1) {
        if (document.querySelector('.result-item')) break;
        await sleep(500);
      }
      const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const limit = ${safeLimit};
      // Give protocol-relative links an explicit scheme; leave other hrefs untouched.
      const abs = (href) => (href.startsWith('//') ? 'https:' + href : href);
      const items = Array.from(document.querySelectorAll('.result-item'));
      const results = [];
      for (const item of items) {
        const link = item.querySelector('.result-title a[href*="blog.sina.com.cn/s/blog_"]');
        const title = normalize(link?.textContent);
        const url = abs(link?.getAttribute('href') || '');
        if (!title || !url) continue;
        results.push({
          rank: results.length + 1,
          title,
          author: normalize(item.querySelector('.result-meta .source')?.textContent),
          date: normalize(item.querySelector('.result-meta .time')?.textContent),
          description: normalize(item.querySelector('.result-intro')?.textContent).slice(0, 150),
          url,
        });
        if (results.length >= limit) break;
      }
      return results;
    })()
  `);
    return Array.isArray(data) ? data : [];
}
/**
 * Lists the articles on the first article-list page of a Sina blog user.
 *
 * The page is navigated to the URL from `buildSinaBlogUserUrl`, waited on for
 * an `h1`, and then a script evaluated inside the page pauses 1 s and reads
 * each `.articleList .articleCell` row that has a `/s/blog_` title link. The
 * author is taken from the second `_`-separated segment of the document title.
 *
 * @param {object} page - Browser page handle exposing `goto`, `wait` and
 *   `evaluate`. NOTE(review): the unit of `timeout: 3` is defined by the page
 *   implementation (presumably seconds) - confirm.
 * @param {string} uid - Blog user id.
 * @param {number} [limit] - Requested number of articles; normalised by
 *   `clampLimit`.
 * @returns {Promise<object[]>} Rows shaped `{ rank, articleId, title, author,
 *   date, readCount, description, url }`, or `[]` when the evaluation does
 *   not yield an array.
 */
export async function loadSinaBlogUser(page, uid, limit) {
    const maxItems = clampLimit(limit);
    const listUrl = buildSinaBlogUserUrl(uid);
    await page.goto(listUrl);
    await page.wait({ selector: 'h1', timeout: 3 });
    const script = `
    (async () => {
      await new Promise((done) => setTimeout(done, 1000));
      const squash = (text) => (text || '').replace(/\\s+/g, ' ').trim();
      const maxItems = ${maxItems};
      const titleSegments = squash(document.title).split('_').map((segment) => squash(segment)).filter(Boolean);
      const author = titleSegments[1] || '';
      const toAbsolute = (href) => {
        if (!href) return '';
        if (href.startsWith('//')) return 'https:' + href;
        if (href.startsWith('http')) return href;
        const separator = href.startsWith('/') ? '' : '/';
        return 'https://blog.sina.com.cn' + separator + href;
      };
      const rows = [];
      const cells = Array.from(document.querySelectorAll('.articleList .articleCell'));
      for (const cell of cells) {
        const anchor = cell.querySelector('.atc_title a[href*="/s/blog_"]');
        const title = squash(anchor?.textContent);
        const url = toAbsolute(anchor?.getAttribute('href') || '');
        if (title && url) {
          const idMatch = url.match(/blog_([a-zA-Z0-9]+)\\.html/);
          rows.push({
            rank: rows.length + 1,
            articleId: idMatch?.[1] || '',
            title,
            author,
            date: squash(cell.querySelector('.atc_tm')?.textContent),
            readCount: '',
            description: '',
            url,
          });
          if (rows.length >= maxItems) break;
        }
      }
      return rows;
    })()
  `;
    const rows = await page.evaluate(script);
    if (!Array.isArray(rows)) {
        return [];
    }
    return rows;
}