sources.rs
1 use std::{ 2 collections::HashMap, 3 error::Error, 4 fs::{self, File}, 5 path::PathBuf, 6 }; 7 8 use crate::{setup::paths::Paths, sourcing::crawl::crawl_through_source}; 9 use regex::Regex; 10 use url::Url; 11 12 pub struct Source<'a> { 13 pub name: &'a str, 14 pub start_url: Url, 15 pub trail: Option<&'a str>, 16 pub root_path: PathBuf, 17 pub regexes: Option<Vec<Regex>>, 18 pub header_indicators: Option<Vec<&'a str>>, 19 pub footnote_indicators: Option<Vec<&'a str>>, 20 pub named_artifacts: Option<Vec<&'a str>>, 21 pub subdomains_and_pages: Option<HashMap<&'a str, Vec<&'a str>>>, 22 } 23 24 impl Source<'_> { 25 pub fn get_paths_to_all_files(&self) -> Vec<PathBuf> { 26 let root_dir: &PathBuf = &self.root_path; 27 let mut paths: Vec<PathBuf> = Vec::new(); 28 29 for reader in fs::read_dir(root_dir).unwrap() { 30 let entry = reader.unwrap(); 31 let file_name: String = entry.file_name().to_str().unwrap().to_string(); 32 let file_path_as_str: String = 33 root_dir.to_str().unwrap().to_string() + "/" + &file_name; 34 35 paths.push(PathBuf::from(file_path_as_str)) 36 } 37 38 paths 39 } 40 41 pub fn prepare_links_for_crawling(&self) -> Vec<Url> { 42 let mut links_to_scrape: Vec<Url> = Vec::new(); 43 44 if let Some(subdomains_and_pages) = &self.subdomains_and_pages { 45 for (subdomain, pages) in subdomains_and_pages { 46 for page in pages { 47 let tail: &str = &(subdomain.to_string() + "/" + page); 48 let full_url: Url = self.start_url.join(tail).unwrap(); 49 links_to_scrape.push(full_url); 50 } 51 } 52 } else { 53 links_to_scrape.push(self.start_url.clone()); 54 } 55 56 links_to_scrape 57 } 58 59 pub fn retrieve_logged_file_names_and_links( 60 &self, 61 ) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> { 62 let path_to_log_file = self.root_path.join("log.json"); 63 let opened_file = File::open(path_to_log_file)?; 64 65 log::info!("Reading log file..."); 66 let logged_urls_and_file_names = serde_json::from_reader(opened_file)?; 67 Ok(logged_urls_and_file_names) 68 } 69 70 pub async fn find_links_to_missing_files(&self) -> Result<Vec<String>, Box<dyn Error>> { 71 let log_retrieval_attempt = self.retrieve_logged_file_names_and_links(); 72 73 match log_retrieval_attempt { 74 Ok(logged_file_names_and_links) => { 75 76 if logged_file_names_and_links.is_empty() { 77 log::error!("The log file is empty! -> Crawling again..."); 78 crawl_through_source(self).await; 79 let _ = Box::pin(self.find_links_to_missing_files()).await; 80 } 81 let links_to_missing_files = logged_file_names_and_links 82 .into_iter() 83 .filter(|(file_name, _)| !self.root_path.join(file_name).exists()) 84 .map(|(_, url)| url) 85 .collect::<Vec<String>>(); 86 87 Ok(links_to_missing_files) 88 } 89 90 Err(e) => { 91 log::error!("Could not retrieve the log file for {}. Error: {}", self.name, e); 92 Err(e) 93 } 94 } 95 } 96 } 97 98 pub fn get_sources() -> Vec<Source<'static>> { 99 let paths = Paths::init(); 100 101 Vec::from([ 102 Source { 103 name: "The Rust Book", 104 root_path: paths.the_book, 105 subdomains_and_pages: None, 106 trail: Some("title-page.html"), 107 start_url: Url::parse("https://doc.rust-lang.org/stable/book/title-page.html").unwrap(), 108 header_indicators: None, 109 footnote_indicators: None, 110 regexes: Some(Vec::from([ 111 Regex::new(r"^\[\d+\]: #w+$").unwrap(), 112 Regex::new(r"^\[\d+\]: #[\w-]+$").unwrap(), 113 Regex::new(r"^\[\d+\]: [\w.-]+#[\w-]+$").unwrap(), 114 Regex::new(r"^\[\d+\]: [\w.-]+#[\w]+$").unwrap(), 115 Regex::new(r"^\[\d+\]: [\w.-]+#[\w]+$").unwrap(), 116 Regex::new(r"^\[\d+\]: https://[^\s/]+(?:/[^\s#]*)*").unwrap(), 117 Regex::new(r"^\[\d+\]: ch\d{2}-\d{2}-[a-z]+\.html$").unwrap(), // Targets lines like [4]: ch06-00-enums.html 118 Regex::new(r"^\[\d+\]: ch\d{2}-\d{2}-[\w-]+\.html$").unwrap(), // Takes care of lines like [5]: ch18-02-trait-objects.html 119 ])), 120 named_artifacts: Some(Vec::from([ 121 "<iframe class=\"sidebar-iframe-outer\" src=\"toc.html\"></iframe>", 122 "* Light", 123 "* Rust", 124 "* Coal", 125 "* Auto", 126 "* Navy", 127 "* Ayu", 128 "# The Rust Programming Language", 129 "[ ][1] [ ][2]", 130 "## [Appendix A: Keywords][3]", 131 "### [Keywords Currently in Use][5]", 132 "### [Keywords Reserved for Future Use][7]", 133 "### [Raw Identifiers][8]", 134 "[1]: print.html", 135 "[2]: https://github.com/rust-lang/book", 136 "[3]: #appendix-a-keywords", 137 "[4]: #raw-identifiers", 138 "[5]: #keywords-currently-in-use", 139 "[6]: ../reference/items/unions.html", 140 "[7]: #keywords-reserved-for-future-use", 141 "[8]: #raw-identifiers", 142 "[9]: appendix-05-editions.html", 143 ])), 144 }, 145 Source { 146 name: "Rust Guide", 147 root_path: paths.rust_guide, 148 trail: None, 149 start_url: Url::parse("https://rust-guide.com/en/documentation/").unwrap(), 150 named_artifacts: None, 151 regexes: None, 152 header_indicators: Some(Vec::from(["[Rust Guide][1]"])), 153 footnote_indicators: Some(Vec::from([ 154 "[1]: /", "[2]: /", "[3]: /", "[4]: /", "[5]: /", "[6]: /", "[7]: /", 155 ])), 156 subdomains_and_pages: Some(HashMap::from([ 157 ( 158 "collections", 159 Vec::from([ 160 "Array", 161 "BinaryHeap", 162 "VecDeque", 163 "HashSet", 164 "Result", 165 "Slice", 166 "str", 167 ]), 168 ), 169 ( 170 "iterators", 171 Vec::from([ 172 "fold", 173 "all", 174 "chain", 175 "Collect", 176 "count", 177 "enumerate", 178 "filterMap", 179 "find", 180 "max", 181 "skip", 182 "zip", 183 ]), 184 ), 185 ( 186 "concurrency", 187 Vec::from([ 188 "Arc", 189 "AtomicBool", 190 "AtomicI32", 191 "AtomicI64", 192 "AtomicISize", 193 "CondVar", 194 "JoinHandle", 195 "Max", 196 "Skip", 197 "Zip", 198 ]), 199 ), 200 ])), 201 }, 202 ]) 203 }