/ clis / bloomberg / utils.js
utils.js
  1  import { CliError } from '@jackwener/opencli/errors';
  /**
   * Bloomberg RSS endpoints keyed by the short feed name that
   * `fetchBloombergFeed` accepts as its `name` argument.
   */
  export const BLOOMBERG_FEEDS = Object.fromEntries([
      ['main', 'https://feeds.bloomberg.com/news.rss'],
      ['markets', 'https://feeds.bloomberg.com/markets/news.rss'],
      ['economics', 'https://feeds.bloomberg.com/economics/news.rss'],
      ['industries', 'https://feeds.bloomberg.com/industries/news.rss'],
      ['tech', 'https://feeds.bloomberg.com/technology/news.rss'],
      ['politics', 'https://feeds.bloomberg.com/politics/news.rss'],
      ['businessweek', 'https://feeds.bloomberg.com/businessweek/news.rss'],
      ['opinions', 'https://feeds.bloomberg.com/bview/news.rss'],
  ]);
  /** Value of the User-Agent header attached to feed requests. */
  const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; opencli)';
 /**
  * Download one Bloomberg RSS feed and return its leading items.
  *
  * @param {string} name - Key of `BLOOMBERG_FEEDS` selecting the feed.
  * @param {number} [limit=1] - Maximum number of items to return. Coerced with
  *   `Number()`; values that coerce to 0/NaN fall back to 1 and the result is
  *   clamped to the range 1..20.
  * @returns {Promise<Array<{title: string, summary: string, link: string, mediaLinks: string[]}>>}
  *   The first `limit` items produced by `parseBloombergRss`.
  * @throws {CliError} `ARGUMENT` for an unknown feed name, `FETCH_ERROR` for a
  *   non-2xx HTTP response, `NOT_FOUND` when no items could be parsed.
  */
 export async function fetchBloombergFeed(name, limit = 1) {
     // Own-property lookup only: inherited keys such as "constructor" or
     // "toString" are truthy on a plain object and would otherwise slip past
     // the guard below and be handed to fetch() as if they were a URL.
     const feedUrl = Object.hasOwn(BLOOMBERG_FEEDS, name) ? BLOOMBERG_FEEDS[name] : undefined;
     if (!feedUrl) {
         throw new CliError('ARGUMENT', `Unknown Bloomberg feed: ${name}`);
     }
     const resp = await fetch(feedUrl, {
         headers: { 'User-Agent': DEFAULT_USER_AGENT },
     });
     if (!resp.ok) {
         throw new CliError('FETCH_ERROR', `Bloomberg RSS HTTP ${resp.status}`, 'Bloomberg may be temporarily unavailable; try again later.');
     }
     const xml = await resp.text();
     const items = parseBloombergRss(xml);
     if (!items.length) {
         throw new CliError('NOT_FOUND', 'Bloomberg RSS feed returned no items', 'Bloomberg may have changed the feed format.');
     }
     // Clamp the requested size to 1..20; anything non-numeric becomes 1.
     const count = Math.max(1, Math.min(Number(limit) || 1, 20));
     return items.slice(0, count);
 }
 /**
  * Pull story entries out of raw RSS text by scanning for `<item>` elements
  * with a regular expression.
  *
  * @param {string} xml - RSS document text.
  * @returns {Array<{title: string, summary: string, link: string, mediaLinks: string[]}>}
  *   One entry per `<item>` that has both a title and a link (the link falls
  *   back to the `<guid>` text when `<link>` is empty).
  */
 export function parseBloombergRss(xml) {
     const itemPattern = /<item\b[^>]*>([\s\S]*?)<\/item>/gi;
     const stories = [];
     for (let hit = itemPattern.exec(xml); hit; hit = itemPattern.exec(xml)) {
         const itemXml = hit[1];
         const title = extractTagText(itemXml, 'title');
         const summary = extractTagText(itemXml, 'description');
         const link = extractTagText(itemXml, 'link') || extractTagText(itemXml, 'guid');
         const mediaLinks = extractMediaLinksFromRssItem(itemXml);
         // Entries without a title or a link are dropped.
         if (title && link) {
             stories.push({ title, summary, link, mediaLinks });
         }
     }
     return stories;
 }
 /**
  * Turn user input into an absolute Bloomberg URL string where possible.
  *
  * - `/path`                  -> `https://www.bloomberg.com/path`
  * - `//host/path`            -> `https://host/path` (protocol-relative)
  * - `www.bloomberg.com/path` -> `https://www.bloomberg.com/path` (scheme-less)
  * - anything else is returned trimmed and otherwise unchanged.
  *
  * @param {unknown} input - Link text; falsy values are treated as empty.
  * @returns {string} The normalized link.
  * @throws {CliError} `ARGUMENT` when the input is empty after trimming.
  */
 export function normalizeBloombergLink(input) {
     const raw = String(input || '').trim();
     if (!raw) {
         throw new CliError('ARGUMENT', 'A Bloomberg link is required');
     }
     // Must be tested before the single-slash case: a protocol-relative link
     // already names its host and must not be glued onto www.bloomberg.com.
     if (raw.startsWith('//'))
         return `https:${raw}`;
     if (raw.startsWith('/'))
         return `https://www.bloomberg.com${raw}`;
     // Bare bloomberg.com host (optionally with subdomains) typed without a scheme.
     if (/^(?:[a-z0-9-]+\.)*bloomberg\.com(?:[/?#]|$)/i.test(raw))
         return `https://${raw}`;
     return raw;
 }
 /**
  * Normalize a link and verify that it is an http(s) URL on bloomberg.com or
  * one of its subdomains.
  *
  * @param {unknown} input - Link text, passed through `normalizeBloombergLink`.
  * @returns {string} The serialized URL (`URL#toString()`).
  * @throws {CliError} `ARGUMENT` when the input is empty, cannot be parsed as
  *   a URL, uses a scheme other than http/https, or points at another host.
  */
 export function validateBloombergLink(input) {
     const normalized = normalizeBloombergLink(input);
     let url;
     try {
         url = new URL(normalized);
     }
     catch {
         throw new CliError('ARGUMENT', `Invalid Bloomberg link: ${input}`, 'Pass a full https://www.bloomberg.com/... URL or a relative Bloomberg path.');
     }
     // A hostname check alone is not enough: schemes such as "javascript:" or
     // "ftp:" can still carry a bloomberg.com authority component.
     if (url.protocol !== 'https:' && url.protocol !== 'http:') {
         throw new CliError('ARGUMENT', `Expected an http(s) Bloomberg link, got protocol: ${url.protocol}`, 'Pass a full https://www.bloomberg.com/... URL or a relative Bloomberg path.');
     }
     // Accept bloomberg.com itself or any subdomain; reject look-alikes such
     // as "notbloomberg.com".
     if (!/(?:\.|^)bloomberg\.com$/i.test(url.hostname)) {
         throw new CliError('ARGUMENT', `Expected a bloomberg.com link, got: ${url.hostname}`, 'Pass a Bloomberg article URL from bloomberg.com.');
     }
     return url.toString();
 }
 /**
  * Render a story body to plain text.
  *
  * @param {{content?: unknown[]}|null|undefined} body - Story body; only an
  *   array-valued `content` is rendered.
  * @returns {string} Non-empty rendered blocks separated by one blank line,
  *   with runs of three or more newlines collapsed and outer whitespace trimmed.
  */
 export function renderStoryBody(body) {
     const rendered = [];
     if (Array.isArray(body?.content)) {
         for (const block of body.content) {
             const text = normalizeBlockText(renderBlock(block, 0));
             if (text) {
                 rendered.push(text);
             }
         }
     }
     const joined = rendered.join('\n\n');
     return joined.replace(/\n{3,}/g, '\n\n').trim();
 }
 /**
  * Gather the distinct media URLs referenced by a story object.
  *
  * Looks at the lede/social image fields, the image and video attachments and
  * the body blocks whose `type` is `'media'`.
  *
  * @param {object|null|undefined} story - Story payload.
  * @returns {string[]} Unique URLs in the order they were first encountered.
  */
 export function extractStoryMediaLinks(story) {
     const found = new Set();
     const topLevelSources = [
         story?.ledeImageUrl,
         story?.socialImageUrl,
         story?.lede,
         story?.imageAttachments,
         story?.videoAttachments,
     ];
     // One call per source so every traversal starts with its own visited set.
     for (const source of topLevelSources) {
         collectMediaUrls(source, found);
     }
     const bodyBlocks = story?.body?.content;
     if (Array.isArray(bodyBlocks)) {
         collectMediaUrls(bodyBlocks.filter((block) => block?.type === 'media'), found);
     }
     return Array.from(found);
 }
 /**
  * Render one body block to text, dispatching on `block.type`.
  *
  * @param {object} block - Block node; non-objects render as ''.
  * @param {number} depth - Nesting depth, forwarded to list rendering and
  *   incremented for nested children of unrecognized block types.
  * @returns {string} Rendered text, or '' when the block yields nothing.
  */
 function renderBlock(block, depth) {
     if (!block || typeof block !== 'object') {
         return '';
     }
     const kind = block.type;
     if (kind === 'paragraph') {
         return renderInlineNodes(block.content || []);
     }
     if (kind === 'heading') {
         const title = renderInlineNodes(block.content || []);
         if (!title) {
             return '';
         }
         const level = Number(block.data?.level ?? block.data?.weight ?? 2);
         // Level 1 or lower -> "#", exactly 2 -> "##", everything else -> "###".
         let marker = '### ';
         if (level <= 1) {
             marker = '# ';
         }
         else if (level === 2) {
             marker = '## ';
         }
         return `${marker}${title}`;
     }
     if (kind === 'blockquote') {
         const quote = renderInlineNodes(block.content || []);
         if (!quote) {
             return '';
         }
         const quoted = quote.split('\n').map((line) => (line ? `> ${line}` : '>'));
         return quoted.join('\n');
     }
     if (kind === 'list') {
         return renderListBlock(block, depth);
     }
     if (kind === 'tabularData') {
         return renderTabularDataBlock(block);
     }
     if (kind === 'media') {
         return renderMediaBlock(block);
     }
     // These block types are intentionally rendered as nothing.
     if (kind === 'inline-newsletter' || kind === 'newsletter' || kind === 'ad') {
         return '';
     }
     // Unrecognized type: try inline rendering, then nested blocks, then a
     // generic text scrape of the whole block.
     const children = block.content;
     if (Array.isArray(children) && children.length > 0) {
         const inlineText = renderInlineNodes(children);
         if (inlineText) {
             return inlineText;
         }
         const nested = children.map((child) => renderBlock(child, depth + 1)).filter(Boolean);
         if (nested.length) {
             return nested.join('\n');
         }
     }
     return extractGenericText(block);
 }
/**
 * Render a sequence of inline nodes and concatenate the results with no
 * separator.
 *
 * @param {unknown} nodes - Normally an array of inline nodes. `null` and
 *   `undefined` are treated as an empty list, and any other non-array value
 *   is treated as a single node, so malformed content does not throw here.
 * @returns {string} Concatenated text of all nodes.
 */
function renderInlineNodes(nodes) {
    let list;
    if (Array.isArray(nodes)) {
        list = nodes;
    }
    else if (nodes == null) {
        list = [];
    }
    else {
        list = [nodes];
    }
    return list.map((node) => renderInlineNode(node)).join('');
}
/**
 * Render a single inline node to text.
 *
 * @param {unknown} node - A raw string, or an object with `type` and
 *   optionally `content` (child nodes) and `value` (text).
 * @returns {string} Entity-decoded text for the node, '\n' for a linebreak,
 *   or '' when nothing can be rendered.
 */
function renderInlineNode(node) {
    if (node == null) {
        return '';
    }
    if (typeof node === 'string') {
        return decodeXmlEntities(node);
    }
    const kind = node.type;
    if (kind === 'linebreak') {
        return '\n';
    }
    if (kind === 'text') {
        return decodeXmlEntities(node.value || '');
    }
    // Every remaining type prefers its children when it has any.
    if (Array.isArray(node.content) && node.content.length > 0) {
        return renderInlineNodes(node.content);
    }
    const wrapperKinds = ['link', 'entity', 'strong', 'emphasis', 'italic', 'underline', 'span'];
    if (wrapperKinds.includes(kind)) {
        return decodeXmlEntities(node.value || '');
    }
    // Unknown types only contribute a value that is already a string.
    return typeof node.value === 'string' ? decodeXmlEntities(node.value) : '';
}
/**
 * Render a list block as one line group per item.
 *
 * The list counts as ordered when `subType`, `data.style` or `data.listType`
 * contains the word "ordered", "number" or "numbered"; ordered items get
 * "N. " prefixes, all others get "- ".
 *
 * @param {object} block - List block; items are read from an array-valued `content`.
 * @param {number} depth - Nesting depth forwarded to `renderListItem`.
 * @returns {string} Rendered items joined with newlines, '' for an empty list.
 */
function renderListBlock(block, depth) {
    const items = Array.isArray(block.content) ? block.content : [];
    if (!items.length)
        return '';
    const listStyle = String(block.subType || block.data?.style || block.data?.listType || '');
    const ordered = /\bordered\b|\bnumber(?:ed)?\b/i.test(listStyle);
    const lines = [];
    for (const item of items) {
        // Number by the count of items actually emitted, so an item that
        // renders to nothing does not leave a gap in the sequence.
        const prefix = ordered ? `${lines.length + 1}. ` : '- ';
        const rendered = renderListItem(item, prefix, depth);
        if (rendered)
            lines.push(rendered);
    }
    return lines.join('\n');
}
/**
 * Render one list item with its bullet/number prefix and indentation.
 *
 * @param {unknown} item - List item node.
 * @param {string} prefix - Marker placed before the first line (e.g. "- ").
 * @param {number} depth - Nesting depth; each level indents by two spaces.
 * @returns {string} The formatted item, or '' when the item has no text.
 *   Lines after the first are aligned under the text of the first line.
 */
function renderListItem(item, prefix, depth) {
    const margin = '  '.repeat(depth);
    const text = normalizeBlockText(renderListItemBody(item, depth + 1));
    if (!text) {
        return '';
    }
    const [firstLine, ...otherLines] = text.split('\n');
    // Continuation lines are padded by the prefix width to form a hanging indent.
    const hangingIndent = `${margin}${' '.repeat(prefix.length)}`;
    const output = [`${margin}${prefix}${firstLine}`];
    for (const line of otherLines) {
        output.push(`${hangingIndent}${line}`);
    }
    return output.join('\n');
}
/**
 * Render the textual content of a list item, without prefix or indentation.
 *
 * @param {unknown} item - List item node; non-objects render as ''.
 * @param {number} depth - Depth passed on to nested block rendering.
 * @returns {string} For a `list-item` with array content, its non-empty
 *   children joined by newlines; otherwise whatever `renderBlock` returns.
 */
function renderListItemBody(item, depth) {
    if (!item || typeof item !== 'object') {
        return '';
    }
    const isListItem = item.type === 'list-item' && Array.isArray(item.content);
    if (!isListItem) {
        return renderBlock(item, depth);
    }
    const pieces = [];
    for (const child of item.content) {
        // Paragraph children are rendered inline; anything else goes through
        // the general block renderer.
        const raw = child?.type === 'paragraph'
            ? renderInlineNodes(child.content || [])
            : renderBlock(child, depth);
        const cleaned = normalizeBlockText(raw);
        if (cleaned) {
            pieces.push(cleaned);
        }
    }
    return pieces.join('\n');
}
/**
 * Render a tabular-data block as one text line per row.
 *
 * Rows are taken from `data.rows`, then `data.table.rows`, then `content`.
 * When no non-empty row array is found, the text of `data`, `content` or the
 * block itself is scraped instead.
 *
 * @param {object} block - Tabular data block.
 * @returns {string} Non-empty row texts joined with newlines.
 */
function renderTabularDataBlock(block) {
    const rows = block?.data?.rows ?? block?.data?.table?.rows ?? block?.content;
    const hasRows = Array.isArray(rows) && rows.length > 0;
    if (!hasRows) {
        return extractGenericText(block.data || block.content || block);
    }
    const rendered = [];
    for (const row of rows) {
        const line = normalizeBlockText(extractGenericText(row));
        if (line) {
            rendered.push(line);
        }
    }
    return rendered.join('\n');
}
/**
 * Pick a caption for a media block.
 *
 * Candidates are tried in this order: chart caption, attachment caption,
 * attachment title, attachment subtitle, video caption. The first one that
 * is non-empty after tag stripping and whitespace normalization is returned.
 *
 * @param {object} block - Media block.
 * @returns {string} The chosen caption, or '' when none is usable.
 */
function renderMediaBlock(block) {
    const data = block?.data;
    const attachment = data?.attachment;
    const sources = [
        data?.chart?.caption,
        attachment?.caption,
        attachment?.title,
        attachment?.subtitle,
        data?.video?.caption,
    ];
    for (const source of sources) {
        const text = normalizeBlockText(stripHtml(String(source || '')));
        if (text) {
            return text;
        }
    }
    return '';
}
/**
 * Scrape all text found inside an arbitrary value into one line.
 *
 * @param {unknown} value - String, array or object to walk.
 * @returns {string} Collected text fragments separated by single spaces,
 *   with every whitespace run collapsed and the ends trimmed.
 */
function extractGenericText(value) {
    const fragments = [];
    collectText(value, fragments);
    const joined = fragments.join(' ');
    return joined.replace(/\s+/g, ' ').trim();
}
/**
 * Walk a value depth-first and append each piece of cleaned text to `out`.
 *
 * For objects, a string `value` property takes precedence, then an array
 * `content` property; only when neither exists are all property values walked.
 *
 * @param {unknown} value - Value to walk.
 * @param {string[]} out - Accumulator that receives the text fragments.
 * @returns {void}
 */
function collectText(value, out) {
    if (value == null) {
        return;
    }
    // Decode entities, drop markup, tidy whitespace; keep only non-empty text.
    const appendText = (raw) => {
        const text = normalizeBlockText(stripHtml(decodeXmlEntities(raw)));
        if (text) {
            out.push(text);
        }
    };
    if (typeof value === 'string') {
        appendText(value);
        return;
    }
    if (typeof value !== 'object') {
        return;
    }
    if (Array.isArray(value)) {
        value.forEach((item) => collectText(item, out));
        return;
    }
    if (typeof value.value === 'string') {
        appendText(value.value);
    }
    else if (Array.isArray(value.content)) {
        collectText(value.content, out);
    }
    else {
        Object.values(value).forEach((entry) => collectText(entry, out));
    }
}
/**
 * Read the text of the first `<tag>...</tag>` element inside an XML fragment.
 *
 * @param {string} block - XML fragment to search.
 * @param {string} tag - Element name; matched case-insensitively.
 * @returns {string} The element text with CDATA unwrapped, entities decoded,
 *   markup removed and whitespace normalized; '' when the element is absent.
 */
function extractTagText(block, tag) {
    const tagName = escapeRegExp(tag);
    const pattern = new RegExp(`<${tagName}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${tagName}>`, 'i');
    const found = block.match(pattern);
    if (!found) {
        return '';
    }
    const unwrapped = stripCdata(found[1]);
    const decoded = decodeXmlEntities(unwrapped);
    const plain = stripHtml(decoded);
    return normalizeBlockText(plain);
}
/**
 * Collect the distinct `url` attribute values of `<media:content>`,
 * `<media:thumbnail>` and `<enclosure>` elements in an RSS item.
 *
 * @param {string} block - XML of one RSS item.
 * @returns {string[]} Unique, entity-decoded URLs in document order.
 */
function extractMediaLinksFromRssItem(block) {
    const links = new Set();
    // `\surl` requires a standalone attribute (so `data-url` is ignored), and
    // the value may be wrapped in double or single quotes, both valid in XML.
    const mediaRegex = /<(?:media:content|media:thumbnail|enclosure)\b[^>]*?\surl\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>/gi;
    for (const match of String(block ?? '').matchAll(mediaRegex)) {
        const url = decodeXmlEntities(match[1] ?? match[2] ?? '').trim();
        if (url)
            links.add(url);
    }
    return [...links];
}
/**
 * Recursively walk a value and add every string that normalizes to a media
 * URL into `out`.
 *
 * @param {unknown} value - String, array or object to walk.
 * @param {Set<string>} out - Receives the accepted URLs.
 * @param {WeakSet<object>} [seen] - Non-array objects already visited; guards
 *   against walking the same object twice.
 * @returns {void}
 */
function collectMediaUrls(value, out, seen = new WeakSet()) {
    if (value == null) {
        return;
    }
    const addIfMedia = (candidate) => {
        const accepted = normalizeMediaUrl(candidate);
        if (accepted) {
            out.add(accepted);
        }
    };
    if (typeof value === 'string') {
        addIfMedia(value);
        return;
    }
    if (typeof value !== 'object') {
        return;
    }
    if (Array.isArray(value)) {
        value.forEach((item) => collectMediaUrls(item, out, seen));
        return;
    }
    if (seen.has(value)) {
        return;
    }
    seen.add(value);
    // Well-known URL-bearing properties are read first, then every property
    // value is walked.
    const urlKeys = ['url', 'src', 'fallback', 'poster'];
    urlKeys.forEach((key) => {
        const candidate = value[key];
        if (typeof candidate === 'string') {
            addIfMedia(candidate);
        }
    });
    Object.values(value).forEach((entry) => collectMediaUrls(entry, out, seen));
}
/**
 * Clean up a candidate media URL and decide whether to keep it.
 *
 * @param {unknown} value - Candidate URL text.
 * @returns {string|null} The entity-decoded, trimmed URL when it starts with
 *   http:// or https:// and passes `looksLikeMediaUrl`; otherwise null.
 */
function normalizeMediaUrl(value) {
    const candidate = decodeXmlEntities(String(value || '')).trim();
    const isHttp = /^https?:\/\//i.test(candidate);
    return isHttp && looksLikeMediaUrl(candidate) ? candidate : null;
}
/**
 * Heuristic test for media URLs.
 *
 * @param {string} url - URL text to test.
 * @returns {boolean} True when the text mentions one of the known media hosts
 *   or ends in a known image/video extension (optionally followed by a query
 *   string or fragment).
 */
function looksLikeMediaUrl(url) {
    const knownHost = /(?:assets\.bwbx\.io|resource\.bloomberg\.com|media\.bloomberg\.com)/i;
    if (knownHost.test(url)) {
        return true;
    }
    const mediaExtension = /\.(?:jpg|jpeg|png|webp|gif|svg|mp4|m3u8)(?:[?#].*)?$/i;
    return mediaExtension.test(url);
}
/**
 * Unwrap a string that consists of exactly one CDATA section.
 *
 * @param {string} value - Text to inspect.
 * @returns {string} The text between `<![CDATA[` and the final `]]>` when the
 *   whole string is wrapped that way; otherwise the input unchanged.
 */
function stripCdata(value) {
    const opener = '<![CDATA[';
    const closer = ']]>';
    const isWrapped = value.startsWith(opener)
        && value.endsWith(closer)
        && value.length >= opener.length + closer.length;
    if (!isWrapped) {
        return value;
    }
    return value.slice(opener.length, value.length - closer.length);
}
/**
 * Replace every `<...>` tag-like span with a single space.
 *
 * @param {unknown} value - Text to clean; falsy values are treated as ''.
 * @returns {string} The text with tags replaced by spaces.
 */
function stripHtml(value) {
    const text = value ? String(value) : '';
    const tagPattern = /<[^>]+>/g;
    return text.replace(tagPattern, ' ');
}
/**
 * Unwrap CDATA sections and decode XML character/entity references.
 *
 * Handles decimal (`&#38;`) and hexadecimal (`&#x26;`) character references
 * plus the named entities amp, lt, gt, quot, apos and nbsp. Decoding happens
 * in a single pass, so already-decoded output is never decoded again
 * (`&amp;lt;` yields `&lt;`, not `<`).
 *
 * @param {unknown} value - Text to decode; falsy values are treated as ''.
 * @returns {string} The decoded text. Numeric references outside the Unicode
 *   range are left untouched instead of throwing.
 */
function decodeXmlEntities(value) {
    const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };
    return String(value || '')
        .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1')
        .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g, (entity, decimal, hex, named) => {
        if (named !== undefined)
            return namedEntities[named];
        const codePoint = decimal !== undefined ? Number(decimal) : Number.parseInt(hex, 16);
        // String.fromCodePoint throws RangeError above U+10FFFF; keep such
        // malformed references verbatim rather than crashing the caller.
        return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : entity;
    });
}
/**
 * Tidy whitespace in a block of text while keeping its line breaks.
 *
 * @param {unknown} value - Text to clean; falsy values are treated as ''.
 * @returns {string} The text with carriage returns removed, spaces/tabs next
 *   to newlines stripped, runs of two or more spaces/tabs collapsed to one
 *   space, and leading/trailing whitespace trimmed.
 */
function normalizeBlockText(value) {
    let text = String(value || '');
    text = text.replace(/\r/g, '');
    // Drop horizontal whitespace on either side of a line break.
    text = text.replace(/[ \t]+\n/g, '\n');
    text = text.replace(/\n[ \t]+/g, '\n');
    text = text.replace(/[ \t]{2,}/g, ' ');
    return text.trim();
}
/**
 * Escape regular-expression metacharacters so the text can be embedded in a
 * `RegExp` source and match literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(value) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metaCharacters, (character) => `\\${character}`);
}