// core.rs
1 use std::collections::HashSet; 2 3 use crate::{ 4 preprocessing::text_merging::merge_all_texts, setup::paths::Paths, tokenizer::{ 5 components::{init_components, init_corpus_tokens, TextComponents, VOCAB_SIZE}, 6 pairs::{get_most_frequent_pair, init_unique_pairs, make_raw_pairs_of_characters, Pair}, 7 } 8 }; 9 10 pub fn train_byte_pair_encoding() { 11 let full_text: String = merge_all_texts(); 12 let mut merged_pairs: Vec<Pair> = Vec::new(); 13 let mut components: TextComponents<'_> = init_components(&full_text); 14 let characters = components.text.chars().collect::<Vec<char>>(); 15 let raw_pairs: Vec<Vec<char>> = make_raw_pairs_of_characters(&characters); 16 17 let corpus: Vec<Vec<String>> = init_corpus_tokens(&full_text); 18 19 for _ in kdam::tqdm!(corpus.into_iter(), desc="Testing") { 20 let counted_pairs: HashSet<Pair> = init_unique_pairs(&raw_pairs); 21 let most_frequent_pair: Pair = get_most_frequent_pair(counted_pairs); 22 23 if !merged_pairs.contains(&most_frequent_pair) { 24 merged_pairs.push(most_frequent_pair.clone()); 25 let merged_pair: String = most_frequent_pair.merge(); 26 components.add_merged_pair(&merged_pair); 27 } 28 } 29 30 components.make_vocab_file(); 31 }