Cradicle Explorer

/ src / parse.rs
parse.rs
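//! HTML parsing and rendering helpers.
//!
//! This module intentionally does *not* attempt to implement a full browser.
//! It does a handful of things for the POC:
//! - Convert HTML → wrapped text (via `html2text`)
//! - Extract `<img src ...>` references (via `scraper`)
//! - Extract `<a href>` links, headings, and the document `<title>` (via `scraper`)
//! - Pick a readable "article" fragment using a simple scoring heuristic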
  7  
  8  use scraper::{ElementRef, Html, Selector};
  9  use url::Url;
 10  
 11  /// A discovered `<img>` reference from the HTML document.
 12  #[derive(Debug, Clone, PartialEq, Eq)]
 13  pub struct ImageRef {
 14      /// 1-based index (for user-friendly display).
 15      pub index: usize,
 16      pub url: Url,
 17      pub alt: String,
 18  }
 19  
 20  /// A discovered `<a href>` reference from the HTML document.
 21  #[derive(Debug, Clone, PartialEq, Eq)]
 22  pub struct LinkRef {
 23      /// 1-based index (for user-friendly display).
 24      pub index: usize,
 25      pub url: Url,
 26      /// Visible anchor text (best effort).
 27      pub text: String,
 28      /// Optional title attribute.
 29      pub title: Option<String>,
 30  }
 31  
 32  /// A discovered heading from the document.
 33  #[derive(Debug, Clone, PartialEq, Eq)]
 34  pub struct HeadingRef {
 35      /// Heading level (1..=6).
 36      pub level: u8,
 37      pub text: String,
 38  }
 39  
 40  /// Convert HTML into wrapped plain text lines.
 41  pub fn render_text_lines(html: &str, wrap_width: usize) -> Vec<String> {
 42      let w = wrap_width.max(20); // avoid pathological tiny widths
 43      let text = html2text::from_read(html.as_bytes(), w);
 44      // Preserve empty lines so the output has shape.
 45      text.split('\n').map(|l| l.to_string()).collect()
 46  }
 47  
 48  /// Extract `<a href>` elements from the HTML and resolve their `href` against `base_url`.
 49  pub fn extract_links(html: &str, base_url: &Url) -> Vec<LinkRef> {
 50      let doc = Html::parse_document(html);
 51      let sel = Selector::parse("a[href]").expect("valid selector");
 52  
 53      let mut out = Vec::new();
 54      for el in doc.select(&sel) {
 55          let href = match el.value().attr("href") {
 56              Some(s) if !s.trim().is_empty() => s.trim(),
 57              _ => continue,
 58          };
 59  
 60          let resolved = match base_url.join(href) {
 61              Ok(u) => u,
 62              Err(_) => continue,
 63          };
 64  
 65          let mut text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" "));
 66          let title = el.value().attr("title").map(|s| s.to_string());
 67          if text.is_empty() {
 68              if let Some(t) = &title {
 69                  text = normalize_whitespace(t);
 70              }
 71          }
 72          if text.is_empty() {
 73              text = href.to_string();
 74          }
 75  
 76          let index = out.len() + 1;
 77          out.push(LinkRef {
 78              index,
 79              url: resolved,
 80              text,
 81              title,
 82          });
 83      }
 84  
 85      out
 86  }
 87  
 88  /// Extract document headings in order.
 89  pub fn extract_headings(html: &str) -> Vec<HeadingRef> {
 90      let doc = Html::parse_document(html);
 91      let sel = Selector::parse("h1,h2,h3,h4,h5,h6").expect("valid selector");
 92  
 93      let mut out = Vec::new();
 94      for el in doc.select(&sel) {
 95          let tag = el.value().name();
 96          let level = tag
 97              .strip_prefix('h')
 98              .and_then(|n| n.parse::<u8>().ok())
 99              .filter(|lvl| (1..=6).contains(lvl))
100              .unwrap_or(1);
101          let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" "));
102          if text.is_empty() {
103              continue;
104          }
105          out.push(HeadingRef { level, text });
106      }
107  
108      out
109  }
110  
111  /// Extract the document `<title>` text, if present.
112  pub fn extract_title(html: &str) -> Option<String> {
113      let doc = Html::parse_document(html);
114      let sel = Selector::parse("title").expect("valid selector");
115      let el = doc.select(&sel).next()?;
116      let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" "));
117      if text.is_empty() {
118          None
119      } else {
120          Some(text)
121      }
122  }
123  
124  /// Attempt to extract a readable "article" HTML fragment using a simple heuristic.
125  ///
126  /// Returns `None` when no better candidate is found.
127  pub fn extract_readable_html(html: &str) -> Option<String> {
128      let doc = Html::parse_document(html);
129      let article_sel = Selector::parse("article").expect("valid selector");
130      let main_sel = Selector::parse("main").expect("valid selector");
131      let role_main_sel = Selector::parse("[role=main]").expect("valid selector");
132      let section_sel = Selector::parse("section").expect("valid selector");
133      let div_sel = Selector::parse("div").expect("valid selector");
134  
135      let primary = [&article_sel, &main_sel, &role_main_sel];
136      if let Some(best) = best_candidate(&doc, &primary, 20) {
137          return Some(best.html());
138      }
139  
140      let secondary = [&section_sel, &div_sel];
141      best_candidate(&doc, &secondary, 20).map(|el| el.html())
142  }
143  
144  /// Extract `<img>` elements from the HTML and resolve their `src` against `base_url`.
145  pub fn extract_images(html: &str, base_url: &Url) -> Vec<ImageRef> {
146      let doc = Html::parse_document(html);
147      let sel = Selector::parse("img").expect("valid selector");
148  
149      let mut out = Vec::new();
150      for el in doc.select(&sel) {
151          let src = match el.value().attr("src") {
152              Some(s) if !s.trim().is_empty() => s.trim(),
153              _ => continue,
154          };
155  
156          let resolved = match base_url.join(src) {
157              Ok(u) => u,
158              Err(_) => continue,
159          };
160  
161          let alt = el.value().attr("alt").unwrap_or("").to_string();
162          let index = out.len() + 1;
163          out.push(ImageRef {
164              index,
165              url: resolved,
166              alt,
167          });
168      }
169  
170      out
171  }
172  
/// Replace every run of whitespace with a single ASCII space and drop
/// leading and trailing whitespace.
///
/// "Whitespace" follows the Unicode `White_Space` property, so tabs, newlines
/// and non-breaking spaces are all collapsed.
fn normalize_whitespace(input: &str) -> String {
    let words: Vec<&str> = input.split_whitespace().collect();
    words.join(" ")
}
190  
191  fn best_candidate<'a>(
192      doc: &'a Html,
193      selectors: &[&Selector],
194      min_len: usize,
195  ) -> Option<ElementRef<'a>> {
196      let mut best: Option<(ElementRef<'a>, usize)> = None;
197      let a_sel = Selector::parse("a").expect("valid selector");
198  
199      for sel in selectors {
200          for el in doc.select(sel) {
201              if is_noise_element(&el) {
202                  continue;
203              }
204              let text_len = element_text_len(&el);
205              if text_len < min_len {
206                  continue;
207              }
208              let link_len = element_link_text_len(&el, &a_sel);
209              let score = text_len.saturating_sub(link_len / 2);
210              match &mut best {
211                  Some((_, best_score)) if score <= *best_score => {}
212                  _ => best = Some((el, score)),
213              }
214          }
215      }
216  
217      best.map(|(el, _)| el)
218  }
219  
220  fn element_text_len(el: &ElementRef<'_>) -> usize {
221      let text = normalize_whitespace(&el.text().collect::<Vec<_>>().join(" "));
222      text.len()
223  }
224  
225  fn element_link_text_len(el: &ElementRef<'_>, a_sel: &Selector) -> usize {
226      let mut total = 0;
227      for link in el.select(a_sel) {
228          let text = normalize_whitespace(&link.text().collect::<Vec<_>>().join(" "));
229          total += text.len();
230      }
231      total
232  }
233  
234  fn is_noise_element(el: &ElementRef<'_>) -> bool {
235      let mut haystack = String::new();
236      if let Some(id) = el.value().attr("id") {
237          haystack.push_str(id);
238          haystack.push(' ');
239      }
240      if let Some(class) = el.value().attr("class") {
241          haystack.push_str(class);
242      }
243      let haystack = haystack.to_ascii_lowercase();
244      let noise = [
245          "nav", "menu", "header", "footer", "sidebar", "comment", "ads", "advert",
246      ];
247      noise.iter().any(|n| haystack.contains(n))
248  }
249  
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Relative, root-relative and absolute `src` values all resolve, and an
    /// `<img>` without `src` is skipped.
    #[test]
    fn extract_images_resolves_relative_urls() {
        let page = r#"
            <html>
              <body>
                <img src="a.png" alt="A">
                <img src="/static/b.jpg" alt="B">
                <img src="https://cdn.example.net/c.webp" alt="C">
                <img alt="no-src">
              </body>
            </html>
        "#;
        let base = Url::parse("https://example.com/path/page.html").unwrap();

        let found = extract_images(page, &base);
        let urls: Vec<&str> = found.iter().map(|img| img.url.as_str()).collect();

        assert_eq!(found.len(), 3);
        assert_eq!(found[0].index, 1);
        assert_eq!(
            urls,
            vec![
                "https://example.com/path/a.png",
                "https://example.com/static/b.jpg",
                "https://cdn.example.net/c.webp",
            ]
        );
    }

    /// Both the heading and the paragraph text appear somewhere in the output.
    #[test]
    fn render_text_lines_produces_lines() {
        let rendered = render_text_lines("<h1>Hello</h1><p>World</p>", 40);

        for needle in ["Hello", "World"].iter() {
            assert!(rendered.iter().any(|line| line.contains(needle)));
        }
    }

    /// Links resolve against the base URL, nested text is flattened, and an
    /// empty `href` is ignored.
    #[test]
    fn extract_links_resolves_relative_urls_and_text() {
        let page = r#"
            <a href="a.html"> A </a>
            <a href="/b"><span>B</span>  <em>bee</em></a>
            <a href="">ignored</a>
        "#;
        let base = Url::parse("https://example.com/path/page.html").unwrap();

        let found = extract_links(page, &base);

        assert_eq!(found.len(), 2);
        assert_eq!(found[0].index, 1);
        assert_eq!(found[0].url.as_str(), "https://example.com/path/a.html");
        assert_eq!(found[0].text, "A");
        assert_eq!(found[1].url.as_str(), "https://example.com/b");
        assert_eq!(found[1].text, "B bee");
    }

    /// An anchor with no visible text takes its label from `title`.
    #[test]
    fn extract_links_uses_title_when_text_empty() {
        let page = r#"<a href="/x" title="Read more"></a>"#;
        let base = Url::parse("https://example.com/").unwrap();

        let found = extract_links(page, &base);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "Read more");
    }

    /// Title text is trimmed and has its inner whitespace collapsed.
    #[test]
    fn extract_title_returns_text() {
        let page = "<html><head><title> Hello   world </title></head></html>";

        assert_eq!(extract_title(page), Some("Hello world".to_string()));
    }

    /// A document without `<title>` yields `None`.
    #[test]
    fn extract_title_none_when_missing() {
        let page = "<html><head></head><body>Hi</body></html>";

        assert_eq!(extract_title(page), None);
    }

    /// Headings come back in order with their level and flattened text.
    #[test]
    fn extract_headings_collects_levels() {
        let page = r#"
            <h1>Title</h1>
            <h2>Section</h2>
            <h3><span>Sub</span> heading</h3>
        "#;

        let found = extract_headings(page);
        let summary: Vec<(u8, &str)> = found
            .iter()
            .map(|heading| (heading.level, heading.text.as_str()))
            .collect();

        assert_eq!(found.len(), 3);
        assert_eq!(
            summary,
            vec![(1, "Title"), (2, "Section"), (3, "Sub heading")]
        );
    }

    /// An `<article>` is chosen over surrounding navigation.
    #[test]
    fn extract_readable_html_prefers_article() {
        let page = r#"
            <nav id="menu">
              <a href="/a">A</a>
              <a href="/b">B</a>
              <a href="/c">C</a>
            </nav>
            <article>
              <h1>Story</h1>
              <p>This is the main content.</p>
            </article>
        "#;

        let fragment = extract_readable_html(page).unwrap();

        assert!(fragment.contains("main content"));
        assert!(!fragment.contains("menu"));
    }

    /// With no semantic container present, a plain `<div>` is used.
    #[test]
    fn extract_readable_html_falls_back_to_div() {
        let page = r#"
            <div id="content">
              <p>Primary content lives here.</p>
            </div>
        "#;

        let fragment = extract_readable_html(page).unwrap();

        assert!(fragment.contains("Primary content"));
    }
}