assets.js
import { cli, Strategy } from '@jackwener/opencli/registry';
import {
  assertAuthenticatedState,
  buildDetailUrl,
  buildProvenance,
  cleanText,
  extractOfferId,
  gotoAndReadState,
  uniqueMediaSources,
} from './shared.js';

/**
 * Build the source text of the in-page script that collects media assets.
 *
 * The returned string is an IIFE meant to be passed to `page.evaluate`. When
 * run inside the product page it:
 *   - reads `window.context` for the offer model and the gallery fields;
 *   - queries a fixed list of selectors per group (main / video / sku /
 *     detail) across the document and every reachable open shadow root;
 *   - pulls candidate URLs from element properties/attributes, or from the
 *     computed `background-image` for the sku group;
 *   - scans inline `<script>` text for http(s) URLs ending in .mp4 / .m3u8;
 *   - normalizes each URL (strips a `url(...)` wrapper and surrounding quotes,
 *     unescapes `\u002F`, decodes `&amp;`, drops `blob:` / `data:` URLs,
 *     resolves against `location.href`) and de-duplicates by type + URL.
 *
 * Backslashes inside the template literal are doubled so that the evaluated
 * script receives single-backslash regex escapes.
 *
 * @returns {string} Script source evaluating to
 *   `{ href, title, offerTitle, offerId, gallery, scannedAssets }`.
 */
function scriptToReadAssets() {
  return `
    (() => {
      const root = window.context ?? {};
      const model = root.result?.global?.globalData?.model ?? null;
      const gallery = root.result?.data?.gallery?.fields ?? null;
      const defaultSrcProps = ['data-lazyload-src', 'data-src', 'data-ks-lazyload', 'currentSrc', 'src'];
      const groups = [
        { key: 'main', type: 'image', selectors: ['#dt-tab img', '.detail-gallery-turn img.detail-gallery-img', '.img-list-wrapper img.od-gallery-img', '.od-scroller-item span'] },
        { key: 'video', type: 'video', selectors: ['.lib-video video', 'video[src]', 'video source[src]'] },
        { key: 'sku', type: 'image', selectors: ['.pc-sku-wrapper .prop-item-inner-wrapper', '.sku-item-wrapper', '.specification-cell', '.sku-filter-button', '.expand-view-item', '.feature-item img'], srcProps: ['backgroundImage'] },
        { key: 'detail', type: 'image', selectors: ['.de-description-detail img', '#detailContentContainer img', '.html-description img', '.html-description source', '.desc-lazyload-container img'] },
      ];
      const assets = [];
      const seen = new Set();

      const normalizeUrl = (value) => {
        if (typeof value !== 'string') return '';
        let next = value
          .replace(/^url\\((.*)\\)$/i, '$1')
          .replace(/^['"]|['"]$/g, '')
          .replace(/\\\\u002F/g, '/')
          .replace(/&amp;/g, '&')
          .trim();
        if (!next || next.startsWith('blob:') || next.startsWith('data:')) return '';
        if (next.startsWith('//')) next = 'https:' + next;
        try {
          return new URL(next, location.href).toString();
        } catch {
          return '';
        }
      };

      const push = (type, group, url, source) => {
        const normalized = normalizeUrl(url);
        if (!normalized) return;
        const key = type + ':' + normalized;
        if (seen.has(key)) return;
        seen.add(key);
        assets.push({ type, group, url: normalized, source });
      };

      const queryAllDeep = (selector) => {
        const results = [];
        const visitedRoots = new Set();
        const walkRoots = (root, fn) => {
          if (!root || visitedRoots.has(root)) return;
          visitedRoots.add(root);
          fn(root);
          const childElements = root.querySelectorAll ? Array.from(root.querySelectorAll('*')) : [];
          for (const child of childElements) {
            if (child && child.shadowRoot) {
              walkRoots(child.shadowRoot, fn);
            }
          }
        };
        walkRoots(document, (root) => {
          if (root.querySelectorAll) {
            results.push(...Array.from(root.querySelectorAll(selector)));
          }
        });
        return results;
      };

      const valuesFromElement = (element, srcProps) => {
        const values = [];
        const props = srcProps && srcProps.length ? srcProps : defaultSrcProps;
        for (const prop of props) {
          try {
            if (prop === 'backgroundImage') {
              const bg = getComputedStyle(element).backgroundImage || '';
              const matches = bg.match(/url\\(([^)]+)\\)/g) || [];
              for (const match of matches) {
                const clean = match.replace(/^url\\(/, '').replace(/\\)$/, '');
                values.push(clean);
              }
              continue;
            }

            const direct = element[prop];
            if (typeof direct === 'string' && direct) values.push(direct);
            const attr = element.getAttribute ? element.getAttribute(prop) : '';
            if (attr) values.push(attr);
          } catch {}
        }

        if (element.tagName === 'SOURCE' && element.parentElement?.tagName === 'VIDEO') {
          values.push(element.src || element.getAttribute('src') || '');
        }

        if (element.tagName === 'VIDEO') {
          values.push(element.currentSrc || '');
          values.push(element.src || '');
        }

        return values;
      };

      for (const group of groups) {
        for (const selector of group.selectors) {
          for (const element of queryAllDeep(selector)) {
            for (const value of valuesFromElement(element, group.srcProps)) {
              push(group.type, group.key, value, 'dom:' + selector);
            }
          }
        }
      }

      const scriptTexts = Array.from(document.scripts).map((script) => script.textContent || '');
      const videoRegex = /https?:\\/\\/[^"'\\s]+\\.(?:mp4|m3u8)(?:\\?[^"'\\s]*)?/gi;
      for (const scriptText of scriptTexts) {
        const matches = scriptText.match(videoRegex) || [];
        for (const match of matches) {
          push('video', 'video', match, 'script');
        }
      }

      const toJson = (value) => JSON.parse(JSON.stringify(value ?? null));
      return {
        href: window.location.href,
        title: document.title || '',
        offerTitle: model?.offerTitleModel?.subject ?? '',
        offerId: model?.tradeModel?.offerId ?? '',
        gallery: toJson(gallery),
        scannedAssets: assets,
      };
    })()
  `;
}

/**
 * Turn the raw in-page payload into the flat record emitted by the command.
 *
 * Gallery URLs taken from page state (`mainImage`, `offerImgList`,
 * `wlImageInfos[].fullPathImageURI`) are placed ahead of the DOM/script-scanned
 * assets, the combined list is passed through `uniqueMediaSources`, and the
 * result is split by type and group.
 *
 * @param {object} payload - Object produced by the script from
 *   `scriptToReadAssets` (`href`, `title`, `offerTitle`, `offerId`, `gallery`,
 *   `scannedAssets`).
 * @returns {object} Record with `offer_id`, `title`, `item_url`, the per-group
 *   URL arrays, `raw_assets`, the distinct `source` labels, the per-group
 *   counts, plus whatever fields `buildProvenance` contributes.
 */
function normalizeAssets(payload) {
  const pageHref = cleanText(payload.href);
  const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(pageHref) || null;
  const itemUrl = offerId ? buildDetailUrl(offerId) : pageHref;

  // Page state is untrusted in shape: anything that is not an array is treated
  // as "no entries" instead of crashing on `.map`.
  const toList = (value) => (Array.isArray(value) ? value : []);
  const seedMain = (url, source) => ({ type: 'image', group: 'main', url, source });
  const gallery = payload.gallery;
  const seededAssets = [
    ...toList(gallery?.mainImage).map((url) => seedMain(url, 'page_state:mainImage')),
    ...toList(gallery?.offerImgList).map((url) => seedMain(url, 'page_state:offerImgList')),
    ...toList(gallery?.wlImageInfos).map((item) => seedMain(item?.fullPathImageURI ?? '', 'page_state:wlImageInfos')),
  ];
  const assets = uniqueMediaSources([...seededAssets, ...toList(payload.scannedAssets)]);

  const knownGroups = ['main', 'sku', 'detail'];
  const imageUrls = (acceptsGroup) => assets
    .filter((item) => item.type === 'image' && acceptsGroup(item.group))
    .map((item) => item.url);

  const mainImages = imageUrls((group) => group === 'main');
  const skuImages = imageUrls((group) => group === 'sku');
  const detailImages = imageUrls((group) => group === 'detail');
  const otherImages = imageUrls((group) => !knownGroups.includes(group));
  const videos = assets.filter((item) => item.type === 'video').map((item) => item.url);

  return {
    offer_id: offerId,
    title: cleanText(payload.offerTitle) || cleanText(payload.title) || null,
    item_url: itemUrl,
    main_images: mainImages,
    sku_images: skuImages,
    detail_images: detailImages,
    videos,
    other_images: otherImages,
    raw_assets: assets,
    source: [...new Set(assets.map((item) => cleanText(item.source)).filter(Boolean))],
    main_count: mainImages.length,
    sku_count: skuImages.length,
    detail_count: detailImages.length,
    video_count: videos.length,
    ...buildProvenance(pageHref || itemUrl),
  };
}

/**
 * Open the product page and run the asset-collection script in it.
 *
 * Steps: navigate and read state via `gotoAndReadState`, verify the state with
 * `assertAuthenticatedState`, auto-scroll three times (presumably to trigger
 * lazy-loaded media - confirm against the page behavior), wait, then evaluate
 * the script.
 *
 * @param {object} page - Browser page handle supplied by the CLI runtime.
 * @param {string} itemUrl - Detail-page URL to open.
 * @returns {Promise<object>} Raw payload returned by the in-page script.
 */
async function readAssetsPayload(page, itemUrl) {
  const state = await gotoAndReadState(page, itemUrl, 2500, 'assets');
  assertAuthenticatedState(state, 'assets');
  await page.autoScroll({ times: 3, delayMs: 400 });
  // NOTE(review): the unit of `page.wait(1)` is defined by the runtime, not here.
  await page.wait(1);
  return await page.evaluate(scriptToReadAssets());
}

/**
 * Extract the normalized asset record for a product URL or offer ID.
 *
 * @param {object} page - Browser page handle supplied by the CLI runtime.
 * @param {string} input - Value handed to `buildDetailUrl`; nullish becomes ''.
 * @returns {Promise<object>} Result of `normalizeAssets` for the loaded page.
 */
export async function extractAssetsForInput(page, input) {
  const itemUrl = buildDetailUrl(String(input ?? ''));
  const payload = await readAssetsPayload(page, itemUrl);
  return normalizeAssets(payload);
}

// Register the `assets` command for the 1688 site; it yields a single row.
cli({
  site: '1688',
  name: 'assets',
  description: '列出 1688 商品页可提取的图片/视频素材',
  domain: 'www.1688.com',
  strategy: Strategy.COOKIE,
  args: [
    {
      name: 'input',
      required: true,
      positional: true,
      help: '1688 商品 URL 或 offer ID（如 887904326744）',
    },
  ],
  columns: ['offer_id', 'title', 'main_count', 'sku_count', 'detail_count', 'video_count'],
  func: async (page, kwargs) => {
    return [await extractAssetsForInput(page, String(kwargs.input ?? ''))];
  },
});

// Internal helpers exposed for unit tests only.
export const __test__ = {
  normalizeAssets,
};