words.rs
1 use cjk::is_cjk_codepoint; 2 3 pub struct Words<S: AsRef<str>> { 4 source: S, 5 position: usize, 6 previous: usize, 7 preserve_whitespace: bool, 8 } 9 10 impl<S: AsRef<str>> Words<S> { 11 pub fn new(source: S) -> Self { 12 Self { 13 source, 14 previous: 0, 15 position: 0, 16 preserve_whitespace: false, 17 } 18 } 19 20 pub fn preserving_whitespace(source: S) -> Self { 21 Self { 22 source, 23 previous: 0, 24 position: 0, 25 preserve_whitespace: true, 26 } 27 } 28 } 29 30 impl<S: AsRef<str>> Words<S> { 31 pub fn undo(&mut self) { 32 self.position = self.previous; 33 } 34 } 35 36 // NOTE: this almost certainly does some extra processing... but for my sanity, 37 // we accept that 38 fn may_end_word_cjk(ch: char) -> bool { 39 // simplified chinese 40 !"$(£¥·'\"〈《「『【〔〖〝﹙﹛$(.[{£¥" 41 .chars() 42 .any(|c| c == ch) 43 // traditional chinese 44 && !"([{£¥'\"‵〈《「『〔〝︴﹙﹛({︵︷︹︻︽︿﹁﹃﹏" 45 .chars() 46 .any(|c| c == ch) 47 // japanese 48 && !"([{〔〈《「『【〘〖〝'\"⦅«" 49 .chars() 50 .any(|c| c == ch) 51 // japanese inseparable 52 && !"—...‥〳〴〵" 53 .chars() 54 .any(|c| c == ch) 55 // korean 56 && !"$([\\{£¥'\"々〇〉》」〔$([{⦆¥₩ #" 57 .chars() 58 .any(|c| c == ch) 59 } 60 61 fn may_start_word_cjk(ch: char) -> bool { 62 // simplified chinese 63 !"!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?!]}~" 64 .chars() 65 .any(|c| c == ch) 66 // traditional chinese 67 && !"!),.:;?]}¢·–— '\"• 、。〆〞〕〉》」︰︱︲︳﹐﹑﹒﹔﹕﹖﹘﹚﹜!),.:;?︶︸︺︼︾﹀﹂﹗]|}、" 68 .chars() 69 .any(|c| c == ch) 70 // japenese 71 && !")]}〕〉》」』】〙〗〟'\"⦆»" 72 .chars() 73 .any(|c| c == ch) 74 && !"ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻" 75 .chars() 76 .any(|c| c == ch) 77 && !"‐゠–〜? ! ‼ ⁇ ⁈ ⁉・、:;,。." 78 .chars() 79 .any(|c| c == ch) 80 // japanese inseparable 81 && !"—...‥〳〴〵" 82 .chars() 83 .any(|c| c == ch) 84 // korean 85 && !"!%),.:;?]}¢°'\"†‡℃〆〈《「『〕!%),.:;?]}" 86 .chars() 87 .any(|c| c == ch) 88 } 89 90 impl<S: AsRef<str>> Iterator for Words<S> { 91 type Item = String; 92 93 fn next(&mut self) -> Option<String> { 94 self.previous = self.position; 95 let chars: Vec<char> = self.source.as_ref().chars().skip(self.position).collect(); 96 let mut start = 0; 97 while start < chars.len() && chars[start].is_whitespace() { 98 start += 1; 99 } 100 self.position += start; 101 if start == chars.len() { 102 if chars.len() == 0 { 103 return None; 104 } else if self.preserve_whitespace { 105 return Some(chars[..].into_iter().collect()); 106 } else { 107 return Some(" ".to_string()); 108 } 109 } 110 let mut len = 0; 111 while start + len < chars.len() { 112 if chars[start + len] == '-' { 113 len += 1; 114 break; 115 } 116 if chars[start + len].is_whitespace() { 117 break; 118 } 119 if len != 0 120 // Before or after cjk characters, we can usually break line, unless it's one of the exceptions. 121 // I got the exceptions off Wikipedia: 122 // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages 123 && (is_cjk_codepoint(chars[start + len - 1]) || is_cjk_codepoint(chars[start + len])) 124 && may_end_word_cjk(chars[start + len - 1]) 125 && may_start_word_cjk(chars[start + len]) 126 { 127 break; 128 } 129 len += 1; 130 } 131 self.position += len; 132 if chars[0].is_whitespace() { 133 if self.preserve_whitespace { 134 return Some(chars[0..start + len].into_iter().collect::<String>()); 135 } else { 136 return Some( 137 String::from(" ") + &chars[start..start + len].into_iter().collect::<String>(), 138 ); 139 } 140 } else { 141 return Some(chars[start..start + len].into_iter().collect::<String>()); 142 } 143 } 144 }