parse.rs
1 //! HTML parsing and rendering helpers. 2 //! 3 //! This module intentionally does *not* attempt to implement a full browser. 4 //! It does two things for the POC: 5 //! - Convert HTML → wrapped text (via `html2text`) 6 //! - Extract `<img src ...>` references (via `scraper`) 7 8 use scraper::{ElementRef, Html, Selector}; 9 use url::Url; 10 11 /// A discovered `<img>` reference from the HTML document. 12 #[derive(Debug, Clone, PartialEq, Eq)] 13 pub struct ImageRef { 14 /// 1-based index (for user-friendly display). 15 pub index: usize, 16 pub url: Url, 17 pub alt: String, 18 } 19 20 /// A discovered `<a href>` reference from the HTML document. 21 #[derive(Debug, Clone, PartialEq, Eq)] 22 pub struct LinkRef { 23 /// 1-based index (for user-friendly display). 24 pub index: usize, 25 pub url: Url, 26 /// Visible anchor text (best effort). 27 pub text: String, 28 /// Optional title attribute. 29 pub title: Option<String>, 30 } 31 32 /// A discovered heading from the document. 33 #[derive(Debug, Clone, PartialEq, Eq)] 34 pub struct HeadingRef { 35 /// Heading level (1..=6). 36 pub level: u8, 37 pub text: String, 38 } 39 40 /// Convert HTML into wrapped plain text lines. 41 pub fn render_text_lines(html: &str, wrap_width: usize) -> Vec<String> { 42 let w = wrap_width.max(20); // avoid pathological tiny widths 43 let text = html2text::from_read(html.as_bytes(), w); 44 // Preserve empty lines so the output has shape. 45 text.split('\n').map(|l| l.to_string()).collect() 46 } 47 48 /// Extract `<a href>` elements from the HTML and resolve their `href` against `base_url`. 49 pub fn extract_links(html: &str, base_url: &Url) -> Vec<LinkRef> { 50 let doc = Html::parse_document(html); 51 let sel = Selector::parse("a[href]").expect("valid selector"); 52 53 let mut out = Vec::new(); 54 for el in doc.select(&sel) { 55 let href = match el.value().attr("href") { 56 Some(s) if !s.trim().is_empty() => s.trim(), 57 _ => continue, 58 }; 59 60 let resolved = match base_url.join(href) { 61 Ok(u) => u, 62 Err(_) => continue, 63 }; 64 65 let mut text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" ")); 66 let title = el.value().attr("title").map(|s| s.to_string()); 67 if text.is_empty() { 68 if let Some(t) = &title { 69 text = normalize_whitespace(t); 70 } 71 } 72 if text.is_empty() { 73 text = href.to_string(); 74 } 75 76 let index = out.len() + 1; 77 out.push(LinkRef { 78 index, 79 url: resolved, 80 text, 81 title, 82 }); 83 } 84 85 out 86 } 87 88 /// Extract document headings in order. 89 pub fn extract_headings(html: &str) -> Vec<HeadingRef> { 90 let doc = Html::parse_document(html); 91 let sel = Selector::parse("h1,h2,h3,h4,h5,h6").expect("valid selector"); 92 93 let mut out = Vec::new(); 94 for el in doc.select(&sel) { 95 let tag = el.value().name(); 96 let level = tag 97 .strip_prefix('h') 98 .and_then(|n| n.parse::<u8>().ok()) 99 .filter(|lvl| (1..=6).contains(lvl)) 100 .unwrap_or(1); 101 let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" ")); 102 if text.is_empty() { 103 continue; 104 } 105 out.push(HeadingRef { level, text }); 106 } 107 108 out 109 } 110 111 /// Extract the document `<title>` text, if present. 112 pub fn extract_title(html: &str) -> Option<String> { 113 let doc = Html::parse_document(html); 114 let sel = Selector::parse("title").expect("valid selector"); 115 let el = doc.select(&sel).next()?; 116 let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" ")); 117 if text.is_empty() { 118 None 119 } else { 120 Some(text) 121 } 122 } 123 124 /// Attempt to extract a readable "article" HTML fragment using a simple heuristic. 125 /// 126 /// Returns `None` when no better candidate is found. 127 pub fn extract_readable_html(html: &str) -> Option<String> { 128 let doc = Html::parse_document(html); 129 let article_sel = Selector::parse("article").expect("valid selector"); 130 let main_sel = Selector::parse("main").expect("valid selector"); 131 let role_main_sel = Selector::parse("[role=main]").expect("valid selector"); 132 let section_sel = Selector::parse("section").expect("valid selector"); 133 let div_sel = Selector::parse("div").expect("valid selector"); 134 135 let primary = [&article_sel, &main_sel, &role_main_sel]; 136 if let Some(best) = best_candidate(&doc, &primary, 20) { 137 return Some(best.html()); 138 } 139 140 let secondary = [§ion_sel, &div_sel]; 141 best_candidate(&doc, &secondary, 20).map(|el| el.html()) 142 } 143 144 /// Extract `<img>` elements from the HTML and resolve their `src` against `base_url`. 145 pub fn extract_images(html: &str, base_url: &Url) -> Vec<ImageRef> { 146 let doc = Html::parse_document(html); 147 let sel = Selector::parse("img").expect("valid selector"); 148 149 let mut out = Vec::new(); 150 for el in doc.select(&sel) { 151 let src = match el.value().attr("src") { 152 Some(s) if !s.trim().is_empty() => s.trim(), 153 _ => continue, 154 }; 155 156 let resolved = match base_url.join(src) { 157 Ok(u) => u, 158 Err(_) => continue, 159 }; 160 161 let alt = el.value().attr("alt").unwrap_or("").to_string(); 162 let index = out.len() + 1; 163 out.push(ImageRef { 164 index, 165 url: resolved, 166 alt, 167 }); 168 } 169 170 out 171 } 172 173 /// Collapse any internal whitespace runs and trim ends. 174 fn normalize_whitespace(input: &str) -> String { 175 let mut out = String::with_capacity(input.len()); 176 let mut last_was_space = false; 177 for ch in input.chars() { 178 if ch.is_whitespace() { 179 if !last_was_space { 180 out.push(' '); 181 last_was_space = true; 182 } 183 } else { 184 out.push(ch); 185 last_was_space = false; 186 } 187 } 188 out.trim().to_string() 189 } 190 191 fn best_candidate<'a>( 192 doc: &'a Html, 193 selectors: &[&Selector], 194 min_len: usize, 195 ) -> Option<ElementRef<'a>> { 196 let mut best: Option<(ElementRef<'a>, usize)> = None; 197 let a_sel = Selector::parse("a").expect("valid selector"); 198 199 for sel in selectors { 200 for el in doc.select(sel) { 201 if is_noise_element(&el) { 202 continue; 203 } 204 let text_len = element_text_len(&el); 205 if text_len < min_len { 206 continue; 207 } 208 let link_len = element_link_text_len(&el, &a_sel); 209 let score = text_len.saturating_sub(link_len / 2); 210 match &mut best { 211 Some((_, best_score)) if score <= *best_score => {} 212 _ => best = Some((el, score)), 213 } 214 } 215 } 216 217 best.map(|(el, _)| el) 218 } 219 220 fn element_text_len(el: &ElementRef<'_>) -> usize { 221 let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" ")); 222 text.len() 223 } 224 225 fn element_link_text_len(el: &ElementRef<'_>, a_sel: &Selector) -> usize { 226 let mut total = 0; 227 for link in el.select(a_sel) { 228 let text = normalize_whitespace(&link.text().collect::<Vec<_>>().join(" ")); 229 total += text.len(); 230 } 231 total 232 } 233 234 fn is_noise_element(el: &ElementRef<'_>) -> bool { 235 let mut haystack = String::new(); 236 if let Some(id) = el.value().attr("id") { 237 haystack.push_str(id); 238 haystack.push(' '); 239 } 240 if let Some(class) = el.value().attr("class") { 241 haystack.push_str(class); 242 } 243 let haystack = haystack.to_ascii_lowercase(); 244 let noise = [ 245 "nav", "menu", "header", "footer", "sidebar", "comment", "ads", "advert", 246 ]; 247 noise.iter().any(|n| haystack.contains(n)) 248 } 249 250 #[cfg(test)] 251 mod tests { 252 use super::*; 253 use pretty_assertions::assert_eq; 254 255 #[test] 256 fn extract_images_resolves_relative_urls() { 257 let base = Url::parse("https://example.com/path/page.html").unwrap(); 258 let html = r#" 259 <html> 260 <body> 261 <img src="a.png" alt="A"> 262 <img src="/static/b.jpg" alt="B"> 263 <img src="https://cdn.example.net/c.webp" alt="C"> 264 <img alt="no-src"> 265 </body> 266 </html> 267 "#; 268 269 let imgs = extract_images(html, &base); 270 assert_eq!(imgs.len(), 3); 271 assert_eq!(imgs[0].index, 1); 272 assert_eq!(imgs[0].url.as_str(), "https://example.com/path/a.png"); 273 assert_eq!(imgs[1].url.as_str(), "https://example.com/static/b.jpg"); 274 assert_eq!(imgs[2].url.as_str(), "https://cdn.example.net/c.webp"); 275 } 276 277 #[test] 278 fn render_text_lines_produces_lines() { 279 let html = "<h1>Hello</h1><p>World</p>"; 280 let lines = render_text_lines(html, 40); 281 assert!(lines.iter().any(|l| l.contains("Hello"))); 282 assert!(lines.iter().any(|l| l.contains("World"))); 283 } 284 285 #[test] 286 fn extract_links_resolves_relative_urls_and_text() { 287 let base = Url::parse("https://example.com/path/page.html").unwrap(); 288 let html = r#" 289 <a href="a.html"> A </a> 290 <a href="/b"><span>B</span> <em>bee</em></a> 291 <a href="">ignored</a> 292 "#; 293 294 let links = extract_links(html, &base); 295 assert_eq!(links.len(), 2); 296 assert_eq!(links[0].index, 1); 297 assert_eq!(links[0].url.as_str(), "https://example.com/path/a.html"); 298 assert_eq!(links[0].text, "A"); 299 assert_eq!(links[1].url.as_str(), "https://example.com/b"); 300 assert_eq!(links[1].text, "B bee"); 301 } 302 303 #[test] 304 fn extract_links_uses_title_when_text_empty() { 305 let base = Url::parse("https://example.com/").unwrap(); 306 let html = r#"<a href="/x" title="Read more"></a>"#; 307 308 let links = extract_links(html, &base); 309 assert_eq!(links.len(), 1); 310 assert_eq!(links[0].text, "Read more"); 311 } 312 313 #[test] 314 fn extract_title_returns_text() { 315 let html = "<html><head><title> Hello world </title></head></html>"; 316 assert_eq!(extract_title(html), Some("Hello world".to_string())); 317 } 318 319 #[test] 320 fn extract_title_none_when_missing() { 321 let html = "<html><head></head><body>Hi</body></html>"; 322 assert_eq!(extract_title(html), None); 323 } 324 325 #[test] 326 fn extract_headings_collects_levels() { 327 let html = r#" 328 <h1>Title</h1> 329 <h2>Section</h2> 330 <h3><span>Sub</span> heading</h3> 331 "#; 332 333 let heads = extract_headings(html); 334 assert_eq!(heads.len(), 3); 335 assert_eq!(heads[0].level, 1); 336 assert_eq!(heads[0].text, "Title"); 337 assert_eq!(heads[1].level, 2); 338 assert_eq!(heads[1].text, "Section"); 339 assert_eq!(heads[2].level, 3); 340 assert_eq!(heads[2].text, "Sub heading"); 341 } 342 343 #[test] 344 fn extract_readable_html_prefers_article() { 345 let html = r#" 346 <nav id="menu"> 347 <a href="/a">A</a> 348 <a href="/b">B</a> 349 <a href="/c">C</a> 350 </nav> 351 <article> 352 <h1>Story</h1> 353 <p>This is the main content.</p> 354 </article> 355 "#; 356 357 let extracted = extract_readable_html(html).unwrap(); 358 assert!(extracted.contains("main content")); 359 assert!(!extracted.contains("menu")); 360 } 361 362 #[test] 363 fn extract_readable_html_falls_back_to_div() { 364 let html = r#" 365 <div id="content"> 366 <p>Primary content lives here.</p> 367 </div> 368 "#; 369 370 let extracted = extract_readable_html(html).unwrap(); 371 assert!(extracted.contains("Primary content")); 372 } 373 }