Cradicle Explorer

/ tests / web-fetch.test.ts
web-fetch.test.ts
  1  import { describe, expect, it } from 'vitest'
  2  
  3  import { __testing__ } from '@/server/tools/web-fetch'
  4  
  5  const {
  6    extractHtmlContent,
  7    convertHtmlToMarkdown,
  8    stripHtmlTags,
  9    removeScriptsAndStyles,
 10    normalizeHtmlStructure,
 11    cleanMarkdownOutput,
 12    convertTableToMarkdown,
 13    extractTitleFromHtml,
 14    isCssLikeText,
 15  } = __testing__
 16  
 17  describe('extractTitleFromHtml', () => {
 18    it('extracts the title from HTML', () => {
 19      const html = '<html><head><title>Test Page</title></head></html>'
 20      expect(extractTitleFromHtml(html)).toBe('Test Page')
 21    })
 22  
 23    it('returns null when no title exists', () => {
 24      const html = '<html><body>No title here</body></html>'
 25      expect(extractTitleFromHtml(html)).toBeNull()
 26    })
 27  
 28    it('handles title with extra attributes', () => {
 29      const html = '<title class="foo">Title with attributes</title>'
 30      expect(extractTitleFromHtml(html)).toBe('Title with attributes')
 31    })
 32  
 33    it('is case insensitive', () => {
 34      const html = '<TITLE>Lowercase extraction</title>'
 35      expect(extractTitleFromHtml(html)).toBe('Lowercase extraction')
 36    })
 37  })
 38  
 39  describe('stripHtmlTags', () => {
 40    it('removes all HTML tags', () => {
 41      expect(stripHtmlTags('<p>Hello <strong>World</strong></p>')).toBe(
 42        'Hello World',
 43      )
 44    })
 45  
 46    it('handles nested tags', () => {
 47      expect(stripHtmlTags('<div><span><a href="#">Link</a></span></div>')).toBe(
 48        'Link',
 49      )
 50    })
 51  
 52    it('handles self-closing tags', () => {
 53      expect(stripHtmlTags('<br/><hr/><img src="x"/>')).toBe('')
 54    })
 55  })
 56  
 57  describe('removeScriptsAndStyles', () => {
 58    it('removes script tags', () => {
 59      const html =
 60        '<html><script>alert("xss")</script><body>Content</body></html>'
 61      const result = removeScriptsAndStyles(html)
 62      expect(result).not.toContain('<script>')
 63      expect(result).toContain('Content')
 64    })
 65  
 66    it('removes style tags', () => {
 67      const html =
 68        '<html><style>.foo { color: red; }</style><body>Content</body></html>'
 69      const result = removeScriptsAndStyles(html)
 70      expect(result).not.toContain('<style>')
 71      expect(result).toContain('Content')
 72    })
 73  
 74    it('handles multiline scripts', () => {
 75      const html = `<script>
 76        const x = 1;
 77        console.log(x);
 78      </script><p>Text</p>`
 79      expect(removeScriptsAndStyles(html)).toContain('<p>Text</p>')
 80    })
 81  
 82    it('removes style tags with nested braces and pseudo selectors', () => {
 83      const html = `
 84        <style>
 85          .foo{display:flex}
 86          .foo:hover{color:red}
 87          .foo::before{content:"<"}
 88        </style>
 89        <p>Visible</p>
 90      `
 91      const result = removeScriptsAndStyles(html)
 92      expect(result).toContain('<p>Visible</p>')
 93      expect(result).not.toContain('.foo{display:flex}')
 94      expect(result).not.toContain('foo::before')
 95    })
 96  })
 97  
 98  describe('normalizeHtmlStructure', () => {
 99    it('normalizes br tags', () => {
100      expect(normalizeHtmlStructure('<br>')).toBe('<br />')
101      expect(normalizeHtmlStructure('<br/>')).toContain('<br')
102      expect(normalizeHtmlStructure('<br/>')).toContain('/>')
103    })
104  
105    it('normalizes hr tags', () => {
106      expect(normalizeHtmlStructure('<hr>')).toBe('<hr />')
107    })
108  
109    it('normalizes img tags', () => {
110      expect(normalizeHtmlStructure('<img src="x">')).toBe('<img src="x" />')
111    })
112  
113    it('removes empty p tags', () => {
114      expect(normalizeHtmlStructure('<p></p>')).toBe('')
115      expect(normalizeHtmlStructure('<p> </p>')).toBe('')
116    })
117  
118    it('removes excessive whitespace between tags', () => {
119      expect(normalizeHtmlStructure('>  <')).toBe('><')
120    })
121  })
122  
// convertHtmlToMarkdown: one case per supported HTML construct, each checking
// that the expected Markdown fragment appears in the output.
describe('convertHtmlToMarkdown', () => {
  it('converts h1-h6 to markdown', () => {
    expect(convertHtmlToMarkdown('<h1>Title 1</h1>')).toContain('# Title 1')
    expect(convertHtmlToMarkdown('<h2>Title 2</h2>')).toContain('## Title 2')
    expect(convertHtmlToMarkdown('<h3>Title 3</h3>')).toContain('### Title 3')
    expect(convertHtmlToMarkdown('<h4>Title 4</h4>')).toContain('#### Title 4')
    expect(convertHtmlToMarkdown('<h5>Title 5</h5>')).toContain('##### Title 5')
    expect(convertHtmlToMarkdown('<h6>Title 6</h6>')).toContain(
      '###### Title 6',
    )
  })

  it('converts bold and italic', () => {
    expect(convertHtmlToMarkdown('<strong>bold</strong>')).toContain('**bold**')
    expect(convertHtmlToMarkdown('<b>bold</b>')).toContain('**bold**')
    expect(convertHtmlToMarkdown('<em>italic</em>')).toContain('*italic*')
    expect(convertHtmlToMarkdown('<i>italic</i>')).toContain('*italic*')
  })

  it('converts code and pre tags', () => {
    expect(convertHtmlToMarkdown('<code>code</code>')).toContain('`code`')
    expect(convertHtmlToMarkdown('<pre>code</pre>')).toContain('```\ncode\n```')
  })

  it('converts links', () => {
    const result = convertHtmlToMarkdown(
      '<a href="https://example.com">Link</a>',
    )
    expect(result).toContain('[Link](https://example.com)')
  })

  it('converts unordered lists', () => {
    const result = convertHtmlToMarkdown(
      '<ul><li>Item 1</li><li>Item 2</li></ul>',
    )
    expect(result).toContain('- Item 1')
    expect(result).toContain('- Item 2')
  })

  it('converts ordered lists', () => {
    const result = convertHtmlToMarkdown(
      '<ol><li>Item 1</li><li>Item 2</li></ol>',
    )
    expect(result).toMatch(/\d+\./)
    // A bare numeric marker would also match if the item text were dropped or
    // only one item were rendered, so require every item after a marker.
    // The exact number is left open on purpose.
    expect(result).toMatch(/\d+\.\s+Item 1/)
    expect(result).toMatch(/\d+\.\s+Item 2/)
  })

  it('converts blockquotes', () => {
    const result = convertHtmlToMarkdown('<blockquote>Quote</blockquote>')
    expect(result).toContain('> Quote')
  })

  it('converts paragraphs', () => {
    const result = convertHtmlToMarkdown('<p>Paragraph text</p>')
    expect(result).toContain('Paragraph text')
  })

  it('converts line breaks', () => {
    // All three spellings of <br> must yield a single newline.
    expect(convertHtmlToMarkdown('Line 1<br>Line 2')).toContain(
      'Line 1\nLine 2',
    )
    expect(convertHtmlToMarkdown('Line 1<br/>Line 2')).toContain(
      'Line 1\nLine 2',
    )
    expect(convertHtmlToMarkdown('Line 1<br />Line 2')).toContain(
      'Line 1\nLine 2',
    )
  })

  it('converts horizontal rules', () => {
    expect(convertHtmlToMarkdown('<hr>')).toContain('---')
  })

  it('converts images', () => {
    const result = convertHtmlToMarkdown('<img src="image.jpg" alt="Alt text">')
    expect(result).toContain('![Alt text](image.jpg)')
  })
})
200  
// convertTableToMarkdown: header/body rows become a pipe table; input without
// rows yields an empty string.
describe('convertTableToMarkdown', () => {
  it('converts basic table', () => {
    const rows = `
      <tr><th>Header 1</th><th>Header 2</th></tr>
      <tr><td>Cell 1</td><td>Cell 2</td></tr>
    `
    const markdown = convertTableToMarkdown(rows)

    // Header row, separator row, then the single body row.
    const expectedLines = [
      '| Header 1 | Header 2 |',
      '| --- | --- |',
      '| Cell 1 | Cell 2 |',
    ]
    for (const line of expectedLines) {
      expect(markdown).toContain(line)
    }
  })

  it('returns empty string for empty content', () => {
    for (const input of ['', 'no table here']) {
      expect(convertTableToMarkdown(input)).toBe('')
    }
  })
})
218  
// cleanMarkdownOutput: whitespace is trimmed per line and around the whole text.
describe('cleanMarkdownOutput', () => {
  it('trims leading and trailing whitespace on lines', () => {
    const cleaned = cleanMarkdownOutput('  Hello\n  World')
    expect(cleaned).toBe('Hello\nWorld')
  })

  it('trims overall content', () => {
    // Whitespace-only first and last lines must vanish entirely.
    const cleaned = cleanMarkdownOutput('  \nHello\n  ')
    expect(cleaned).toBe('Hello')
  })
})
228  
// extractHtmlContent: end-to-end checks of the markdown and text modes and of
// the title field on the returned object.
describe('extractHtmlContent', () => {
  it('extracts markdown content from HTML', () => {
    const page = `
      <html>
        <head><title>Test Page</title></head>
        <body>
          <h1>Main Title</h1>
          <p>Some content here.</p>
        </body>
      </html>
    `
    const { content, title } = extractHtmlContent(page, { mode: 'markdown' })
    expect(content).toContain('# Main Title')
    expect(content).toContain('Some content here')
    expect(title).toBe('Test Page')
  })

  it('extracts plain text from HTML', () => {
    // Style and script bodies must not show up in the text output.
    const page =
      '<style>.hidden{display:none}</style><script>console.log(1)</script><p>Hello <strong>World</strong></p>'
    const { content } = extractHtmlContent(page, { mode: 'text' })
    expect(content).toBe('Hello World')
  })

  it('returns title as undefined if not found', () => {
    const { title } = extractHtmlContent('<p>No title here</p>', {
      mode: 'markdown',
    })
    expect(title).toBeUndefined()
  })
})
259  
// isCssLikeText: a long run of CSS rules is flagged, ordinary prose is not.
describe('isCssLikeText', () => {
  it('detects stylesheet-like payloads', () => {
    const rules = `
      .VUoKZ{display:none;position:absolute;z-index:1001;}
      .TRHLAc{position:absolute;top:0;left:0;right:0;height:4px;}
      .mTM26c .VUoKZ{display:block;}
      .gm3-sys-color--on-error{color:#fff;background:#111;}
      .abc{margin:0;padding:0;display:flex;transform:scaleX(0);}
    `
    // Repeated twelve times to produce a large payload.
    const verdict = isCssLikeText(rules.repeat(12))
    expect(verdict).toBe(true)
  })

  it('does not mark normal article text as stylesheet-like', () => {
    const paragraph = `
      This is a plain news article paragraph discussing current events and
      includes punctuation, quotes, and normal sentence structure.
    `
    const verdict = isCssLikeText(paragraph.repeat(12))
    expect(verdict).toBe(false)
  })
})