web-fetch.test.ts
/**
 * Unit tests for the HTML-processing helpers that the web-fetch tool exposes
 * through its `__testing__` export. Each `describe` block below exercises one
 * helper in isolation using small inline HTML fixtures.
 */
import { describe, expect, it } from 'vitest'

import { __testing__ } from '@/server/tools/web-fetch'

const {
  extractHtmlContent,
  convertHtmlToMarkdown,
  stripHtmlTags,
  removeScriptsAndStyles,
  normalizeHtmlStructure,
  cleanMarkdownOutput,
  convertTableToMarkdown,
  extractTitleFromHtml,
  isCssLikeText,
} = __testing__

// Title lookup: expected to yield the <title> text, or null when absent.
describe('extractTitleFromHtml', () => {
  it('extracts the title from HTML', () => {
    const html = '<html><head><title>Test Page</title></head></html>'
    expect(extractTitleFromHtml(html)).toBe('Test Page')
  })

  it('returns null when no title exists', () => {
    const html = '<html><body>No title here</body></html>'
    expect(extractTitleFromHtml(html)).toBeNull()
  })

  it('handles title with extra attributes', () => {
    const html = '<title class="foo">Title with attributes</title>'
    expect(extractTitleFromHtml(html)).toBe('Title with attributes')
  })

  it('is case insensitive', () => {
    // Opening and closing tags deliberately differ in case.
    const html = '<TITLE>Lowercase extraction</title>'
    expect(extractTitleFromHtml(html)).toBe('Lowercase extraction')
  })
})

// Tag stripping: only the text content should survive.
describe('stripHtmlTags', () => {
  it('removes all HTML tags', () => {
    expect(stripHtmlTags('<p>Hello <strong>World</strong></p>')).toBe(
      'Hello World',
    )
  })

  it('handles nested tags', () => {
    expect(stripHtmlTags('<div><span><a href="#">Link</a></span></div>')).toBe(
      'Link',
    )
  })

  it('handles self-closing tags', () => {
    expect(stripHtmlTags('<br/><hr/><img src="x"/>')).toBe('')
  })
})

// <script>/<style> removal: surrounding markup must be left in place.
describe('removeScriptsAndStyles', () => {
  it('removes script tags', () => {
    const html =
      '<html><script>alert("xss")</script><body>Content</body></html>'
    const result = removeScriptsAndStyles(html)
    expect(result).not.toContain('<script>')
    expect(result).toContain('Content')
  })

  it('removes style tags', () => {
    const html =
      '<html><style>.foo { color: red; }</style><body>Content</body></html>'
    const result = removeScriptsAndStyles(html)
    expect(result).not.toContain('<style>')
    expect(result).toContain('Content')
  })

  it('handles multiline scripts', () => {
    const html = `<script>
      const x = 1;
      console.log(x);
    </script><p>Text</p>`
    expect(removeScriptsAndStyles(html)).toContain('<p>Text</p>')
  })

  it('removes style tags with nested braces and pseudo selectors', () => {
    // The `content:"<"` rule puts a raw "<" inside the stylesheet body.
    const html = `
      <style>
        .foo{display:flex}
        .foo:hover{color:red}
        .foo::before{content:"<"}
      </style>
      <p>Visible</p>
    `
    const result = removeScriptsAndStyles(html)
    expect(result).toContain('<p>Visible</p>')
    expect(result).not.toContain('.foo{display:flex}')
    expect(result).not.toContain('foo::before')
  })
})

// Structural normalization: void tags become self-closing, empty <p> is dropped.
describe('normalizeHtmlStructure', () => {
  it('normalizes br tags', () => {
    expect(normalizeHtmlStructure('<br>')).toBe('<br />')
    expect(normalizeHtmlStructure('<br/>')).toContain('<br')
    expect(normalizeHtmlStructure('<br/>')).toContain('/>')
  })

  it('normalizes hr tags', () => {
    expect(normalizeHtmlStructure('<hr>')).toBe('<hr />')
  })

  it('normalizes img tags', () => {
    expect(normalizeHtmlStructure('<img src="x">')).toBe('<img src="x" />')
  })

  it('removes empty p tags', () => {
    expect(normalizeHtmlStructure('<p></p>')).toBe('')
    expect(normalizeHtmlStructure('<p> </p>')).toBe('')
  })

  it('removes excessive whitespace between tags', () => {
    expect(normalizeHtmlStructure('> <')).toBe('><')
  })
})

// HTML -> Markdown conversion, one element family per test.
describe('convertHtmlToMarkdown', () => {
  it('converts h1-h6 to markdown', () => {
    expect(convertHtmlToMarkdown('<h1>Title 1</h1>')).toContain('# Title 1')
    expect(convertHtmlToMarkdown('<h2>Title 2</h2>')).toContain('## Title 2')
    expect(convertHtmlToMarkdown('<h3>Title 3</h3>')).toContain('### Title 3')
    expect(convertHtmlToMarkdown('<h4>Title 4</h4>')).toContain('#### Title 4')
    expect(convertHtmlToMarkdown('<h5>Title 5</h5>')).toContain('##### Title 5')
    expect(convertHtmlToMarkdown('<h6>Title 6</h6>')).toContain(
      '###### Title 6',
    )
  })

  it('converts bold and italic', () => {
    expect(convertHtmlToMarkdown('<strong>bold</strong>')).toContain('**bold**')
    expect(convertHtmlToMarkdown('<b>bold</b>')).toContain('**bold**')
    expect(convertHtmlToMarkdown('<em>italic</em>')).toContain('*italic*')
    expect(convertHtmlToMarkdown('<i>italic</i>')).toContain('*italic*')
  })

  it('converts code and pre tags', () => {
    expect(convertHtmlToMarkdown('<code>code</code>')).toContain('`code`')
    expect(convertHtmlToMarkdown('<pre>code</pre>')).toContain('```\ncode\n```')
  })

  it('converts links', () => {
    const result = convertHtmlToMarkdown(
      '<a href="https://example.com">Link</a>',
    )
    expect(result).toContain('[Link](https://example.com)')
  })

  it('converts unordered lists', () => {
    const result = convertHtmlToMarkdown(
      '<ul><li>Item 1</li><li>Item 2</li></ul>',
    )
    expect(result).toContain('- Item 1')
    expect(result).toContain('- Item 2')
  })

  it('converts ordered lists', () => {
    const result = convertHtmlToMarkdown(
      '<ol><li>Item 1</li><li>Item 2</li></ol>',
    )
    expect(result).toMatch(/\d+\./)
  })

  it('converts blockquotes', () => {
    const result = convertHtmlToMarkdown('<blockquote>Quote</blockquote>')
    expect(result).toContain('> Quote')
  })

  it('converts paragraphs', () => {
    const result = convertHtmlToMarkdown('<p>Paragraph text</p>')
    expect(result).toContain('Paragraph text')
  })

  it('converts line breaks', () => {
    expect(convertHtmlToMarkdown('Line 1<br>Line 2')).toContain(
      'Line 1\nLine 2',
    )
    expect(convertHtmlToMarkdown('Line 1<br/>Line 2')).toContain(
      'Line 1\nLine 2',
    )
    expect(convertHtmlToMarkdown('Line 1<br />Line 2')).toContain(
      'Line 1\nLine 2',
    )
  })

  it('converts horizontal rules', () => {
    expect(convertHtmlToMarkdown('<hr>')).toContain('---')
  })

  it('converts images', () => {
    const result = convertHtmlToMarkdown('<img src="image.jpg" alt="Alt text">')
    // An empty-string expectation matches any output and can never fail, so
    // pin the Markdown image syntax instead.
    // NOTE(review): assumes the converter emits the standard `![alt](src)`
    // form — confirm against the implementation.
    expect(result).toContain('![Alt text](image.jpg)')
  })
})

// Table conversion operates on row markup (<tr>/<th>/<td>) directly.
describe('convertTableToMarkdown', () => {
  it('converts basic table', () => {
    const html = `
      <tr><th>Header 1</th><th>Header 2</th></tr>
      <tr><td>Cell 1</td><td>Cell 2</td></tr>
    `
    const result = convertTableToMarkdown(html)
    expect(result).toContain('| Header 1 | Header 2 |')
    expect(result).toContain('| --- | --- |')
    expect(result).toContain('| Cell 1 | Cell 2 |')
  })

  it('returns empty string for empty content', () => {
    expect(convertTableToMarkdown('')).toBe('')
    expect(convertTableToMarkdown('no table here')).toBe('')
  })
})

// Markdown post-processing: per-line and whole-document trimming.
describe('cleanMarkdownOutput', () => {
  it('trims leading and trailing whitespace on lines', () => {
    expect(cleanMarkdownOutput(' Hello\n World')).toBe('Hello\nWorld')
  })

  it('trims overall content', () => {
    expect(cleanMarkdownOutput(' \nHello\n ')).toBe('Hello')
  })
})

// End-to-end extraction in both supported modes ('markdown' and 'text').
describe('extractHtmlContent', () => {
  it('extracts markdown content from HTML', () => {
    const html = `
      <html>
        <head><title>Test Page</title></head>
        <body>
          <h1>Main Title</h1>
          <p>Some content here.</p>
        </body>
      </html>
    `
    const result = extractHtmlContent(html, { mode: 'markdown' })
    expect(result.content).toContain('# Main Title')
    expect(result.content).toContain('Some content here')
    expect(result.title).toBe('Test Page')
  })

  it('extracts plain text from HTML', () => {
    const html =
      '<style>.hidden{display:none}</style><script>console.log(1)</script><p>Hello <strong>World</strong></p>'
    const result = extractHtmlContent(html, { mode: 'text' })
    expect(result.content).toBe('Hello World')
  })

  it('returns title as undefined if not found', () => {
    const html = '<p>No title here</p>'
    const result = extractHtmlContent(html, { mode: 'markdown' })
    expect(result.title).toBeUndefined()
  })
})

// Heuristic that flags text which looks like a stylesheet rather than prose.
describe('isCssLikeText', () => {
  it('detects stylesheet-like payloads', () => {
    // Repeated to produce a long payload made up entirely of CSS rules.
    const cssPayload = `
      .VUoKZ{display:none;position:absolute;z-index:1001;}
      .TRHLAc{position:absolute;top:0;left:0;right:0;height:4px;}
      .mTM26c .VUoKZ{display:block;}
      .gm3-sys-color--on-error{color:#fff;background:#111;}
      .abc{margin:0;padding:0;display:flex;transform:scaleX(0);}
    `.repeat(12)
    expect(isCssLikeText(cssPayload)).toBe(true)
  })

  it('does not mark normal article text as stylesheet-like', () => {
    const articleText = `
      This is a plain news article paragraph discussing current events and
      includes punctuation, quotes, and normal sentence structure.
    `.repeat(12)
    expect(isCssLikeText(articleText)).toBe(false)
  })
})