// src/tokenizer/core.rs
 1  use std::collections::HashSet;
 2  
 3  use crate::{
 4      preprocessing::text_merging::merge_all_texts, setup::paths::Paths, tokenizer::{
 5          components::{init_components, init_corpus_tokens, TextComponents, VOCAB_SIZE},
 6          pairs::{get_most_frequent_pair, init_unique_pairs, make_raw_pairs_of_characters, Pair},
 7      }
 8  };
 9  
/// Trains a byte-pair-encoding vocabulary over the merged project corpus and
/// writes the resulting vocabulary to disk via `TextComponents::make_vocab_file`.
///
/// Side effects: reads all source texts (through `merge_all_texts`) and writes
/// a vocab file; renders a progress bar on stderr/stdout via `kdam::tqdm!`.
pub fn train_byte_pair_encoding() {
    let full_text: String = merge_all_texts();
    // Pairs already merged into the vocabulary, kept in merge order so we can
    // skip duplicates below.
    let mut merged_pairs: Vec<Pair> = Vec::new();
    let mut components: TextComponents<'_> = init_components(&full_text);
    let characters = components.text.chars().collect::<Vec<char>>();
    // Adjacent character pairs of the raw text; built once, before the loop.
    let raw_pairs: Vec<Vec<char>> = make_raw_pairs_of_characters(&characters);

    let corpus: Vec<Vec<String>> = init_corpus_tokens(&full_text);

    // NOTE(review): the loop variable is discarded, so `corpus` only controls
    // the number of iterations (and the progress-bar length) — confirm that is
    // intentional and not a leftover from iterating tokens.
    for _ in kdam::tqdm!(corpus.into_iter(), desc="Testing") {
        // NOTE(review): `raw_pairs` is never updated inside the loop, so the
        // same pair statistics are recomputed on every iteration and (ties
        // aside) the same most-frequent pair comes back each time. Combined
        // with the `contains` guard below, at most one distinct merge is ever
        // recorded per unique result. Classic BPE re-counts pairs over the
        // *merged* corpus after each merge — verify this loop matches the
        // intended algorithm.
        let counted_pairs: HashSet<Pair> = init_unique_pairs(&raw_pairs);
        let most_frequent_pair: Pair = get_most_frequent_pair(counted_pairs);

        // Only record a pair the first time it wins the frequency count.
        if !merged_pairs.contains(&most_frequent_pair) {
            merged_pairs.push(most_frequent_pair.clone());
            let merged_pair: String = most_frequent_pair.merge();
            // Register the merged token with the components' vocabulary.
            components.add_merged_pair(&merged_pair);
        }
    }

    // Persist the accumulated vocabulary to disk.
    components.make_vocab_file();
}