core.rs
1 use std::{ 2 fs::{self, File}, 3 io::{Read, Write}, 4 }; 5 6 use kdam; 7 use log; 8 9 use crate::{ 10 preprocessing::cleaning::artifacts::line_is_an_artifact, 11 sourcing::sources::{Source, get_sources}, 12 }; 13 14 pub fn clean_all_texts() { 15 for source in get_sources() { 16 let cleaning_needed: bool = source.header_indicators.is_some() 17 || source.footnote_indicators.is_some() 18 || source.named_artifacts.is_some(); 19 20 if cleaning_needed { 21 log::info!("Started cleaning \"{}\"", source.name); 22 remove_lines_with_only_artifacts(&source); 23 }; 24 } 25 } 26 27 pub fn remove_lines_with_only_artifacts(source: &Source) { 28 let file_paths = source.get_paths_to_all_files(); 29 30 for path in kdam::tqdm!(file_paths.iter(), desc = "Cleaning up texts...") { 31 let mut file_buffer = String::from(""); // This will contain the whole file as a string 32 let mut file = File::open(path).unwrap(); 33 file.read_to_string(&mut file_buffer).unwrap(); // Move contents of the file into the file_buffer 34 35 for line in file_buffer.clone().lines() { 36 if line_is_an_artifact(line, source) { 37 file_buffer = file_buffer 38 .replace(line, "") 39 .trim_start() 40 .trim_end() 41 .to_string() 42 } 43 } 44 45 fs::remove_file(path).unwrap(); 46 let mut file = File::create(path).unwrap(); 47 file.write_all(file_buffer.as_bytes()).unwrap(); 48 } 49 }