/ clis / 1688 / assets.js
assets.js
  1  import { cli, Strategy } from '@jackwener/opencli/registry';
  2  import { assertAuthenticatedState, buildDetailUrl, buildProvenance, cleanText, extractOfferId, gotoAndReadState, uniqueMediaSources, } from './shared.js';
  3  function scriptToReadAssets() {
  4      return `
  5      (() => {
  6        const root = window.context ?? {};
  7        const model = root.result?.global?.globalData?.model ?? null;
  8        const gallery = root.result?.data?.gallery?.fields ?? null;
  9        const defaultSrcProps = ['data-lazyload-src', 'data-src', 'data-ks-lazyload', 'currentSrc', 'src'];
 10        const groups = [
 11          { key: 'main', type: 'image', selectors: ['#dt-tab img', '.detail-gallery-turn img.detail-gallery-img', '.img-list-wrapper img.od-gallery-img', '.od-scroller-item span'] },
 12          { key: 'video', type: 'video', selectors: ['.lib-video video', 'video[src]', 'video source[src]'] },
 13          { key: 'sku', type: 'image', selectors: ['.pc-sku-wrapper .prop-item-inner-wrapper', '.sku-item-wrapper', '.specification-cell', '.sku-filter-button', '.expand-view-item', '.feature-item img'], srcProps: ['backgroundImage'] },
 14          { key: 'detail', type: 'image', selectors: ['.de-description-detail img', '#detailContentContainer img', '.html-description img', '.html-description source', '.desc-lazyload-container img'] },
 15        ];
 16        const assets = [];
 17        const seen = new Set();
 18  
 19        const normalizeUrl = (value) => {
 20          if (typeof value !== 'string') return '';
 21          let next = value
 22            .replace(/^url\\((.*)\\)$/i, '$1')
 23            .replace(/^['"]|['"]$/g, '')
 24            .replace(/\\\\u002F/g, '/')
 25            .replace(/&/g, '&')
 26            .trim();
 27          if (!next || next.startsWith('blob:') || next.startsWith('data:')) return '';
 28          if (next.startsWith('//')) next = 'https:' + next;
 29          try {
 30            return new URL(next, location.href).toString();
 31          } catch {
 32            return '';
 33          }
 34        };
 35  
 36        const push = (type, group, url, source) => {
 37          const normalized = normalizeUrl(url);
 38          if (!normalized) return;
 39          const key = type + ':' + normalized;
 40          if (seen.has(key)) return;
 41          seen.add(key);
 42          assets.push({ type, group, url: normalized, source });
 43        };
 44  
 45        const queryAllDeep = (selector) => {
 46          const results = [];
 47          const visitedRoots = new Set();
 48          const walkRoots = (root, fn) => {
 49            if (!root || visitedRoots.has(root)) return;
 50            visitedRoots.add(root);
 51            fn(root);
 52            const childElements = root.querySelectorAll ? Array.from(root.querySelectorAll('*')) : [];
 53            for (const child of childElements) {
 54              if (child && child.shadowRoot) {
 55                walkRoots(child.shadowRoot, fn);
 56              }
 57            }
 58          };
 59          walkRoots(document, (root) => {
 60            if (root.querySelectorAll) {
 61              results.push(...Array.from(root.querySelectorAll(selector)));
 62            }
 63          });
 64          return results;
 65        };
 66  
 67        const valuesFromElement = (element, srcProps) => {
 68          const values = [];
 69          const props = srcProps && srcProps.length ? srcProps : defaultSrcProps;
 70          for (const prop of props) {
 71            try {
 72              if (prop === 'backgroundImage') {
 73                const bg = getComputedStyle(element).backgroundImage || '';
 74                const matches = bg.match(/url\\(([^)]+)\\)/g) || [];
 75                for (const match of matches) {
 76                  const clean = match.replace(/^url\\(/, '').replace(/\\)$/, '');
 77                  values.push(clean);
 78                }
 79                continue;
 80              }
 81  
 82              const direct = element[prop];
 83              if (typeof direct === 'string' && direct) values.push(direct);
 84              const attr = element.getAttribute ? element.getAttribute(prop) : '';
 85              if (attr) values.push(attr);
 86            } catch {}
 87          }
 88  
 89          if (element.tagName === 'SOURCE' && element.parentElement?.tagName === 'VIDEO') {
 90            values.push(element.src || element.getAttribute('src') || '');
 91          }
 92  
 93          if (element.tagName === 'VIDEO') {
 94            values.push(element.currentSrc || '');
 95            values.push(element.src || '');
 96          }
 97  
 98          return values;
 99        };
100  
101        for (const group of groups) {
102          for (const selector of group.selectors) {
103            for (const element of queryAllDeep(selector)) {
104              for (const value of valuesFromElement(element, group.srcProps)) {
105                push(group.type, group.key, value, 'dom:' + selector);
106              }
107            }
108          }
109        }
110  
111        const scriptTexts = Array.from(document.scripts).map((script) => script.textContent || '');
112        const videoRegex = /https?:\\/\\/[^"'\\s]+\\.(?:mp4|m3u8)(?:\\?[^"'\\s]*)?/gi;
113        for (const scriptText of scriptTexts) {
114          const matches = scriptText.match(videoRegex) || [];
115          for (const match of matches) {
116            push('video', 'video', match, 'script');
117          }
118        }
119  
120        const toJson = (value) => JSON.parse(JSON.stringify(value ?? null));
121        return {
122          href: window.location.href,
123          title: document.title || '',
124          offerTitle: model?.offerTitleModel?.subject ?? '',
125          offerId: model?.tradeModel?.offerId ?? '',
126          gallery: toJson(gallery),
127          scannedAssets: assets,
128        };
129      })()
130    `;
131  }
/**
 * Converts the raw payload produced by the in-page asset script into the
 * flat record returned by the `assets` command.
 *
 * Gallery entries from page state (`mainImage`, `offerImgList`,
 * `wlImageInfos`) are turned into `main` image assets and placed ahead of
 * the DOM/script-scanned assets before the combined list is passed through
 * `uniqueMediaSources` (defined in ./shared.js; presumably de-duplicates —
 * confirm there).
 *
 * @param {object} payload - `{ href, title, offerTitle, offerId, gallery, scannedAssets }`.
 * @returns {object} Record with the offer id, title, item URL, per-group URL
 *   lists, the raw asset list, the distinct sources, per-group counts, and
 *   whatever fields `buildProvenance` contributes.
 */
function normalizeAssets(payload) {
    // Prefer the offer id from page state; otherwise try to extract it from the page URL.
    const offerIdFromState = cleanText(String(payload.offerId ?? ''));
    const offerId = offerIdFromState || extractOfferId(cleanText(payload.href)) || null;
    let itemUrl;
    if (offerId) {
        itemUrl = buildDetailUrl(offerId);
    } else {
        itemUrl = cleanText(payload.href);
    }

    // Every page-state gallery entry is treated as a "main" image.
    const asMainImage = (url, source) => ({ type: 'image', group: 'main', url, source });
    const fromMainImage = (payload.gallery?.mainImage ?? [])
        .map((url) => asMainImage(url, 'page_state:mainImage'));
    const fromOfferImgList = (payload.gallery?.offerImgList ?? [])
        .map((url) => asMainImage(url, 'page_state:offerImgList'));
    const fromWlImageInfos = (payload.gallery?.wlImageInfos ?? [])
        .map((info) => asMainImage(info?.fullPathImageURI ?? '', 'page_state:wlImageInfos'));
    const seededAssets = [...fromMainImage, ...fromOfferImgList, ...fromWlImageInfos];

    const assets = uniqueMediaSources([...seededAssets, ...(payload.scannedAssets ?? [])]);

    // Small selectors so each bucket below reads as a single line.
    const urlsWhere = (predicate) => assets.filter(predicate).map((asset) => asset.url);
    const imageInGroup = (group) => (asset) => asset.type === 'image' && asset.group === group;
    const namedGroups = ['main', 'sku', 'detail'];

    const mainImages = urlsWhere(imageInGroup('main'));
    const skuImages = urlsWhere(imageInGroup('sku'));
    const detailImages = urlsWhere(imageInGroup('detail'));
    const videos = urlsWhere((asset) => asset.type === 'video');
    const otherImages = urlsWhere((asset) => asset.type === 'image' && !namedGroups.includes(asset.group));

    const title = cleanText(payload.offerTitle) || cleanText(payload.title) || null;
    const distinctSources = Array.from(
        new Set(assets.map((asset) => cleanText(asset.source)).filter(Boolean)),
    );
    const provenance = buildProvenance(cleanText(payload.href) || itemUrl);

    return {
        offer_id: offerId,
        title,
        item_url: itemUrl,
        main_images: mainImages,
        sku_images: skuImages,
        detail_images: detailImages,
        videos,
        other_images: otherImages,
        raw_assets: assets,
        source: distinctSources,
        main_count: mainImages.length,
        sku_count: skuImages.length,
        detail_count: detailImages.length,
        video_count: videos.length,
        ...provenance,
    };
}
/**
 * Loads the item page, verifies the session state, scrolls the page, and
 * then evaluates the asset-collection script in it.
 *
 * @param {object} page - Browser page handle; must provide `autoScroll`, `wait` and `evaluate`.
 * @param {string} itemUrl - URL of the product detail page to open.
 * @returns {Promise<*>} Whatever the in-page script evaluates to.
 */
async function readAssetsPayload(page, itemUrl) {
    const pageState = await gotoAndReadState(page, itemUrl, 2500, 'assets');
    assertAuthenticatedState(pageState, 'assets');

    // Scroll and pause before scanning (presumably so lazy-loaded media is present — confirm).
    const scrollOptions = { times: 3, delayMs: 400 };
    await page.autoScroll(scrollOptions);
    await page.wait(1);

    const payload = await page.evaluate(scriptToReadAssets());
    return payload;
}
/**
 * Resolves the user input to a detail-page URL, reads the raw asset payload
 * from that page and returns the normalized asset record.
 *
 * @param {object} page - Browser page handle passed through to the page reader.
 * @param {*} input - Product URL or offer id; `null`/`undefined` become an empty string.
 * @returns {Promise<object>} The record built by `normalizeAssets`.
 */
export async function extractAssetsForInput(page, input) {
    const rawInput = String(input ?? '');
    const detailUrl = buildDetailUrl(rawInput);
    return normalizeAssets(await readAssetsPayload(page, detailUrl));
}
// The single positional argument of the `assets` command: a product URL or an offer id.
const assetsInputArg = {
    name: 'input',
    required: true,
    positional: true,
    help: '1688 商品 URL 或 offer ID(如 887904326744)',
};

// Record fields listed under `columns` for this command.
const assetsColumns = ['offer_id', 'title', 'main_count', 'sku_count', 'detail_count', 'video_count'];

// Passes the `assets` command definition for site 1688 to `cli`.
cli({
    site: '1688',
    name: 'assets',
    description: '列出 1688 商品页可提取的图片/视频素材',
    domain: 'www.1688.com',
    strategy: Strategy.COOKIE,
    args: [assetsInputArg],
    columns: assetsColumns,
    // The handler yields a one-element array holding the normalized asset record.
    func: async (page, kwargs) => {
        const record = await extractAssetsForInput(page, String(kwargs.input ?? ''));
        return [record];
    },
});

// Exposes the internal normalizer under `__test__` (presumably for unit tests).
export const __test__ = {
    normalizeAssets,
};