Cradicle Explorer

sources.rs
  1  use std::{
  2      collections::HashMap,
  3      error::Error,
  4      fs::{self, File},
  5      path::PathBuf,
  6  };
  7  
  8  use crate::{setup::paths::Paths, sourcing::crawl::crawl_through_source};
  9  use regex::Regex;
 10  use url::Url;
 11  
 12  pub struct Source<'a> {
 13      pub name: &'a str,
 14      pub start_url: Url,
 15      pub trail: Option<&'a str>,
 16      pub root_path: PathBuf,
 17      pub regexes: Option<Vec<Regex>>,
 18      pub header_indicators: Option<Vec<&'a str>>,
 19      pub footnote_indicators: Option<Vec<&'a str>>,
 20      pub named_artifacts: Option<Vec<&'a str>>,
 21      pub subdomains_and_pages: Option<HashMap<&'a str, Vec<&'a str>>>,
 22  }
 23  
 24  impl Source<'_> {
 25      pub fn get_paths_to_all_files(&self) -> Vec<PathBuf> {
 26          let root_dir: &PathBuf = &self.root_path;
 27          let mut paths: Vec<PathBuf> = Vec::new();
 28  
 29          for reader in fs::read_dir(root_dir).unwrap() {
 30              let entry = reader.unwrap();
 31              let file_name: String = entry.file_name().to_str().unwrap().to_string();
 32              let file_path_as_str: String =
 33                  root_dir.to_str().unwrap().to_string() + "/" + &file_name;
 34  
 35              paths.push(PathBuf::from(file_path_as_str))
 36          }
 37  
 38          paths
 39      }
 40  
 41      pub fn prepare_links_for_crawling(&self) -> Vec<Url> {
 42          let mut links_to_scrape: Vec<Url> = Vec::new();
 43  
 44          if let Some(subdomains_and_pages) = &self.subdomains_and_pages {
 45              for (subdomain, pages) in subdomains_and_pages {
 46                  for page in pages {
 47                      let tail: &str = &(subdomain.to_string() + "/" + page);
 48                      let full_url: Url = self.start_url.join(tail).unwrap();
 49                      links_to_scrape.push(full_url);
 50                  }
 51              }
 52          } else {
 53              links_to_scrape.push(self.start_url.clone());
 54          }
 55  
 56          links_to_scrape
 57      }
 58  
 59      pub fn retrieve_logged_file_names_and_links(
 60          &self,
 61      ) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
 62          let path_to_log_file = self.root_path.join("log.json");
 63          let opened_file = File::open(path_to_log_file)?;
 64  
 65          log::info!("Reading log file...");
 66          let logged_urls_and_file_names = serde_json::from_reader(opened_file)?;
 67          Ok(logged_urls_and_file_names)
 68      }
 69  
 70      pub async fn find_links_to_missing_files(&self) -> Result<Vec<String>, Box<dyn Error>> {
 71          let log_retrieval_attempt = self.retrieve_logged_file_names_and_links();
 72  
 73          match log_retrieval_attempt {
 74              Ok(logged_file_names_and_links) => {
 75  
 76                  if logged_file_names_and_links.is_empty() {
 77                      log::error!("The log file is empty! -> Crawling again...");
 78                      crawl_through_source(self).await; 
 79                      let _ = Box::pin(self.find_links_to_missing_files()).await; 
 80                  } 
 81                  let links_to_missing_files = logged_file_names_and_links
 82                      .into_iter()
 83                      .filter(|(file_name, _)| !self.root_path.join(file_name).exists())
 84                      .map(|(_, url)| url)
 85                      .collect::<Vec<String>>();
 86  
 87                  Ok(links_to_missing_files)
 88              }
 89  
 90              Err(e) => {
 91                  log::error!("Could not retrieve the log file for {}. Error: {}", self.name, e);
 92                  Err(e)
 93              }
 94          }
 95      }
 96  }
 97  
 98  pub fn get_sources() -> Vec<Source<'static>> {
 99      let paths = Paths::init();
100  
101      Vec::from([
102          Source {
103              name: "The Rust Book",
104              root_path: paths.the_book,
105              subdomains_and_pages: None,
106              trail: Some("title-page.html"),
107              start_url: Url::parse("https://doc.rust-lang.org/stable/book/title-page.html").unwrap(),
108              header_indicators: None,
109              footnote_indicators: None,
110              regexes: Some(Vec::from([
111                  Regex::new(r"^\[\d+\]: #w+$").unwrap(),
112                  Regex::new(r"^\[\d+\]: #[\w-]+$").unwrap(),
113                  Regex::new(r"^\[\d+\]: [\w.-]+#[\w-]+$").unwrap(),
114                  Regex::new(r"^\[\d+\]: [\w.-]+#[\w]+$").unwrap(),
115                  Regex::new(r"^\[\d+\]: [\w.-]+#[\w]+$").unwrap(),
116                  Regex::new(r"^\[\d+\]: https://[^\s/]+(?:/[^\s#]*)*").unwrap(),
117                  Regex::new(r"^\[\d+\]: ch\d{2}-\d{2}-[a-z]+\.html$").unwrap(), // Targets lines like [4]: ch06-00-enums.html
118                  Regex::new(r"^\[\d+\]: ch\d{2}-\d{2}-[\w-]+\.html$").unwrap(), // Takes care of lines like [5]: ch18-02-trait-objects.html
119              ])),
120              named_artifacts: Some(Vec::from([
121                  "<iframe class=\"sidebar-iframe-outer\" src=\"toc.html\"></iframe>",
122                  "* Light",
123                  "* Rust",
124                  "* Coal",
125                  "* Auto",
126                  "* Navy",
127                  "* Ayu",
128                  "# The Rust Programming Language",
129                  "[ ][1] [ ][2]",
130                  "## [Appendix A: Keywords][3]",
131                  "### [Keywords Currently in Use][5]",
132                  "### [Keywords Reserved for Future Use][7]",
133                  "### [Raw Identifiers][8]",
134                  "[1]: print.html",
135                  "[2]: https://github.com/rust-lang/book",
136                  "[3]: #appendix-a-keywords",
137                  "[4]: #raw-identifiers",
138                  "[5]: #keywords-currently-in-use",
139                  "[6]: ../reference/items/unions.html",
140                  "[7]: #keywords-reserved-for-future-use",
141                  "[8]: #raw-identifiers",
142                  "[9]: appendix-05-editions.html",
143              ])),
144          },
145          Source {
146              name: "Rust Guide",
147              root_path: paths.rust_guide,
148              trail: None,
149              start_url: Url::parse("https://rust-guide.com/en/documentation/").unwrap(),
150              named_artifacts: None,
151              regexes: None,
152              header_indicators: Some(Vec::from(["[Rust Guide][1]"])),
153              footnote_indicators: Some(Vec::from([
154                  "[1]: /", "[2]: /", "[3]: /", "[4]: /", "[5]: /", "[6]: /", "[7]: /",
155              ])),
156              subdomains_and_pages: Some(HashMap::from([
157                  (
158                      "collections",
159                      Vec::from([
160                          "Array",
161                          "BinaryHeap",
162                          "VecDeque",
163                          "HashSet",
164                          "Result",
165                          "Slice",
166                          "str",
167                      ]),
168                  ),
169                  (
170                      "iterators",
171                      Vec::from([
172                          "fold",
173                          "all",
174                          "chain",
175                          "Collect",
176                          "count",
177                          "enumerate",
178                          "filterMap",
179                          "find",
180                          "max",
181                          "skip",
182                          "zip",
183                      ]),
184                  ),
185                  (
186                      "concurrency",
187                      Vec::from([
188                          "Arc",
189                          "AtomicBool",
190                          "AtomicI32",
191                          "AtomicI64",
192                          "AtomicISize",
193                          "CondVar",
194                          "JoinHandle",
195                          "Max",
196                          "Skip",
197                          "Zip",
198                      ]),
199                  ),
200              ])),
201          },
202      ])
203  }