// src/main.rs
// Declares the `utilities` module. Nothing from it is imported by the `use`
// block in this file; its contents are defined elsewhere in the crate.
mod utilities;
 2  
/// Groups the `crawl`, `crawler`, `scrape`, and `sources` submodules.
///
/// This file imports `crawl::scrape_all_texts` and the `sources` module from
/// here; the remaining submodules are only declared at this level.
mod sourcing {
    pub mod crawl;
    pub mod crawler;
    pub mod scrape;
    pub mod sources;
}
 9  
/// Groups the text preprocessing submodules: the nested `cleaning` module and
/// `text_merging`.
///
/// This file imports `cleaning::core::clean_all_texts` from here.
mod preprocessing {
    /// Cleaning submodules: `artifacts` and `core` (the latter provides
    /// `clean_all_texts`, which the crate root imports).
    pub mod cleaning {
        pub mod artifacts;
        pub mod core;
    }

    pub mod text_merging;
}
18  
/// Groups the tokenizer submodules: `components`, `core`, and `pairs`.
///
/// This file imports `core::train_byte_pair_encoding` from here.
mod tokenizer {
    pub mod components;
    pub mod core;
    pub mod pairs;
}
24  
/// Groups the start-up submodules: `logging` and `paths`.
///
/// This file imports `logging::setup_logging` and
/// `paths::make_fundamental_directories` from here.
mod setup {
    pub mod logging;
    pub mod paths;
}
29  
30  use crate::{
31      setup::{
32          logging::setup_logging, 
33          paths::make_fundamental_directories,
34      },
35      tokenizer::core::train_byte_pair_encoding,
36      sourcing::{crawl::scrape_all_texts, sources},
37      preprocessing::cleaning::core::clean_all_texts,
38  };
39  
40  #[tokio::main]
41  async fn run_preprocessing() {
42      setup_logging();
43      make_fundamental_directories();
44      scrape_all_texts().await;
45      clean_all_texts();
46  }
47  
/// Program entry point.
///
/// Calls `run_preprocessing` and, once it has returned, calls
/// `train_byte_pair_encoding`.
fn main() {
    // Blocking call: `run_preprocessing` is an `async fn` wrapped by
    // `#[tokio::main]`, so no `.await` is needed here.
    run_preprocessing();
    train_byte_pair_encoding();
}