Cradicle Explorer

url.py
  1  """Selenium web page reader."""
  2  
  3  import logging
  4  from typing import TYPE_CHECKING, List, Literal, Optional, Union
  5  from unstructured.partition.html import partition_html
  6  
  7  if TYPE_CHECKING:
  8      from selenium.webdriver import Chrome, Firefox
  9  
 10  from llama_index.core.readers.base import BaseReader
 11  from llama_index.core.schema import Document
 12  
 13  logger = logging.getLogger(__name__)
 14  
 15  
 16  class SeleniumWebReader(BaseReader):
 17      """BeautifulSoup web page reader.
 18  
 19      Reads pages from the web.
 20      Requires the `bs4` and `urllib` packages.
 21  
 22      Args:
 23          website_extractor (Optional[Dict[str, Callable]]): A mapping of website
 24              hostname (e.g. google.com) to a function that specifies how to
 25              extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
 26      """
 27  
 28      def __init__(
 29          self,
 30              continue_on_failure: bool = True,
 31              browser: Literal["chrome", "firefox"] = "chrome",
 32              binary_location: Optional[str] = None,
 33              executable_path: Optional[str] = None,
 34              headless: bool = True,
 35              arguments: object = None,
 36      ) -> None:
 37          
 38          """Load a list of URLs using Selenium and unstructured."""
 39          if arguments is None:
 40              arguments = []
 41          try:
 42              import selenium  # noqa:F401
 43          except ImportError:
 44              raise ImportError(
 45                  "selenium package not found, please install it with "
 46                  "`pip install selenium`"
 47              )
 48  
 49          try:
 50              import unstructured  # noqa:F401
 51          except ImportError:
 52              raise ImportError(
 53                  "unstructured package not found, please install it with "
 54                  "`pip install unstructured`"
 55              )
 56  
 57          self.continue_on_failure = continue_on_failure
 58          self.browser = browser
 59          self.binary_location = binary_location
 60          self.executable_path = executable_path
 61          self.headless = headless
 62          self.arguments = arguments
 63  
 64      def _get_driver(self) -> Union["Chrome", "Firefox"]:
 65          """Create and return a WebDriver instance based on the specified browser.
 66  
 67          Raises:
 68              ValueError: If an invalid browser is specified.
 69  
 70          Returns:
 71              Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
 72          """
 73          if self.browser.lower() == "chrome":
 74              from selenium.webdriver import Chrome
 75              from selenium.webdriver.chrome.options import Options as ChromeOptions
 76              from selenium.webdriver.chrome.service import Service
 77  
 78              chrome_options = ChromeOptions()
 79  
 80              for arg in self.arguments:
 81                  chrome_options.add_argument(arg)
 82  
 83              if self.headless:
 84                  chrome_options.add_argument("--headless")
 85                  chrome_options.add_argument("--no-sandbox")
 86                  chrome_options.add_argument("--disable-dev-shm-usage")
 87                  chrome_options.add_argument("--disable-gpu")
 88  
 89              binary = self.binary_location
 90              exec_path = self.executable_path
 91  
 92              # Auto-detect snap Chromium if no explicit paths provided
 93              if binary is None and exec_path is None:
 94                  import shutil
 95                  import os
 96                  if shutil.which("chromium-browser") and not shutil.which("google-chrome"):
 97                      binary = shutil.which("chromium-browser")
 98                      snap_driver = "/snap/chromium/current/usr/lib/chromium-browser/chromedriver"
 99                      if os.path.exists(snap_driver):
100                          exec_path = snap_driver
101  
102              if binary is not None:
103                  chrome_options.binary_location = binary
104              if exec_path is None:
105                  return Chrome(options=chrome_options)
106              return Chrome(
107                  options=chrome_options,
108                  service=Service(executable_path=exec_path),
109              )
110          elif self.browser.lower() == "firefox":
111              from selenium.webdriver import Firefox
112              from selenium.webdriver.firefox.options import Options as FirefoxOptions
113              from selenium.webdriver.firefox.service import Service
114  
115              firefox_options = FirefoxOptions()
116  
117              for arg in self.arguments:
118                  firefox_options.add_argument(arg)
119  
120              if self.headless:
121                  firefox_options.add_argument("--headless")
122              if self.binary_location is not None:
123                  firefox_options.binary_location = self.binary_location
124              if self.executable_path is None:
125                  return Firefox(options=firefox_options)
126              return Firefox(
127                  options=firefox_options,
128                  service=Service(executable_path=self.executable_path),
129              )
130          else:
131              raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")
132  
133      @staticmethod
134      def _build_metadata(url: str, driver: Union["Chrome", "Firefox"]) -> dict:
135          from selenium.common.exceptions import NoSuchElementException
136          from selenium.webdriver.common.by import By
137  
138          """Build metadata based on the contents of the webpage"""
139          metadata = {
140              "source": url,
141              "title": "No title found.",
142              "description": "No description found.",
143              "language": "No language found.",
144          }
145          if title := driver.title:
146              metadata["title"] = title
147          try:
148              if description := driver.find_element(
149                  By.XPATH, '//meta[@name="description"]'
150              ):
151                  metadata["description"] = (
152                      description.get_attribute("content") or "No description found."
153                  )
154          except NoSuchElementException:
155              pass
156          try:
157              if html_tag := driver.find_element(By.TAG_NAME, "html"):
158                  metadata["language"] = (
159                      html_tag.get_attribute("lang") or "No language found."
160                  )
161          except NoSuchElementException:
162              pass
163          return metadata
164  
165      def load_data(
166          self,
167          urls: list[str],
168      ) -> List[Document]:
169          """Load the specified URLs using Selenium and create Document instances.
170  
171          Returns:
172              List[Document]: A list of Document instances with loaded content.
173          """
174  
175          docs: List[Document] = list()
176          driver = self._get_driver()
177  
178          for url in urls:
179              try:
180                  driver.get(url)
181                  page_content = driver.page_source
182                  elements = partition_html(text=page_content)
183                  text = "\n\n".join([str(el) for el in elements])
184                  metadata = self._build_metadata(url, driver)
185                  docs.append(Document(text=text, metadata=metadata))
186              except Exception as e:
187                  if self.continue_on_failure:
188                      logger.error(f"Error fetching or processing {url}, exception: {e}")
189                  else:
190                      raise e
191  
192          driver.quit()
193          return docs