main.rs
1 mod utilities; 2 3 mod sourcing { 4 pub mod crawl; 5 pub mod crawler; 6 pub mod scrape; 7 pub mod sources; 8 } 9 10 mod preprocessing { 11 pub mod cleaning { 12 pub mod artifacts; 13 pub mod core; 14 } 15 16 pub mod text_merging; 17 } 18 19 mod tokenizer { 20 pub mod components; 21 pub mod core; 22 pub mod pairs; 23 } 24 25 mod setup { 26 pub mod logging; 27 pub mod paths; 28 } 29 30 use crate::{ 31 setup::{ 32 logging::setup_logging, 33 paths::make_fundamental_directories, 34 }, 35 tokenizer::core::train_byte_pair_encoding, 36 sourcing::{crawl::scrape_all_texts, sources}, 37 preprocessing::cleaning::core::clean_all_texts, 38 }; 39 40 #[tokio::main] 41 async fn run_preprocessing() { 42 setup_logging(); 43 make_fundamental_directories(); 44 scrape_all_texts().await; 45 clean_all_texts(); 46 } 47 48 fn main() { 49 run_preprocessing(); 50 train_byte_pair_encoding(); 51 }