/ src / preprocessing / cleaning / core.rs
core.rs
 1  use std::{
 2      fs::{self, File},
 3      io::{Read, Write},
 4  };
 5  
 6  use kdam;
 7  use log;
 8  
 9  use crate::{
10      preprocessing::cleaning::artifacts::line_is_an_artifact,
11      sourcing::sources::{Source, get_sources},
12  };
13  
14  pub fn clean_all_texts() {
15      for source in get_sources() {
16          let cleaning_needed: bool = source.header_indicators.is_some()
17              || source.footnote_indicators.is_some()
18              || source.named_artifacts.is_some();
19  
20          if cleaning_needed {
21              log::info!("Started cleaning \"{}\"", source.name);
22              remove_lines_with_only_artifacts(&source);
23          };
24      }
25  }
26  
27  pub fn remove_lines_with_only_artifacts(source: &Source) {
28      let file_paths = source.get_paths_to_all_files();
29  
30      for path in kdam::tqdm!(file_paths.iter(), desc = "Cleaning up texts...") {
31          let mut file_buffer = String::from(""); // This will contain the whole file as a string
32          let mut file = File::open(path).unwrap();
33          file.read_to_string(&mut file_buffer).unwrap(); // Move contents of the file into the file_buffer
34  
35          for line in file_buffer.clone().lines() {
36              if line_is_an_artifact(line, source) {
37                  file_buffer = file_buffer
38                      .replace(line, "")
39                      .trim_start()
40                      .trim_end()
41                      .to_string()
42              }
43          }
44  
45          fs::remove_file(path).unwrap();
46          let mut file = File::create(path).unwrap();
47          file.write_all(file_buffer.as_bytes()).unwrap();
48      }
49  }