/ src / words.rs
words.rs
  1  use cjk::is_cjk_codepoint;
  2  
  3  pub struct Words<S: AsRef<str>> {
  4      source: S,
  5      position: usize,
  6      previous: usize,
  7      preserve_whitespace: bool,
  8  }
  9  
 10  impl<S: AsRef<str>> Words<S> {
 11      pub fn new(source: S) -> Self {
 12          Self {
 13              source,
 14              previous: 0,
 15              position: 0,
 16              preserve_whitespace: false,
 17          }
 18      }
 19  
 20      pub fn preserving_whitespace(source: S) -> Self {
 21          Self {
 22              source,
 23              previous: 0,
 24              position: 0,
 25              preserve_whitespace: true,
 26          }
 27      }
 28  }
 29  
 30  impl<S: AsRef<str>> Words<S> {
 31      pub fn undo(&mut self) {
 32          self.position = self.previous;
 33      }
 34  }
 35  
 36  // NOTE: this almost certainly does some extra processing... but for my sanity,
 37  // we accept that
 38  fn may_end_word_cjk(ch: char) -> bool {
 39      // simplified chinese
 40      !"$(£¥·'\"〈《「『【〔〖〝﹙﹛$(.[{£¥"
 41          .chars()
 42          .any(|c| c == ch)
 43      // traditional chinese
 44      && !"([{£¥'\"‵〈《「『〔〝︴﹙﹛({︵︷︹︻︽︿﹁﹃﹏"
 45          .chars()
 46          .any(|c| c == ch)
 47      // japanese
 48      && !"([{〔〈《「『【〘〖〝'\"⦅«"
 49          .chars()
 50          .any(|c| c == ch)
 51      // japanese inseparable
 52      && !"—...‥〳〴〵"
 53          .chars()
 54          .any(|c| c == ch)
 55      // korean
 56      && !"$([\\{£¥'\"々〇〉》」〔$([{⦆¥₩ #"
 57          .chars()
 58          .any(|c| c == ch)
 59  }
 60  
 61  fn may_start_word_cjk(ch: char) -> bool {
 62      // simplified chinese
 63      !"!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?!]}~"
 64          .chars()
 65          .any(|c| c == ch)
 66      // traditional chinese
 67      && !"!),.:;?]}¢·–— '\"• 、。〆〞〕〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘﹚﹜!),.:;?︶︸︺︼︾﹀﹂﹗]|}、"
 68          .chars()
 69          .any(|c| c == ch)
 70      // japenese
 71      && !")]}〕〉》」』】〙〗〟'\"⦆»"
 72          .chars()
 73          .any(|c| c == ch)
 74      && !"ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻"
 75          .chars()
 76          .any(|c| c == ch)
 77      && !"‐゠–〜? ! ‼ ⁇ ⁈ ⁉・、:;,。."
 78          .chars()
 79          .any(|c| c == ch)
 80      // japanese inseparable
 81      && !"—...‥〳〴〵"
 82          .chars()
 83          .any(|c| c == ch)
 84      // korean
 85      && !"!%),.:;?]}¢°'\"†‡℃〆〈《「『〕!%),.:;?]}"
 86          .chars()
 87          .any(|c| c == ch)
 88  }
 89  
 90  impl<S: AsRef<str>> Iterator for Words<S> {
 91      type Item = String;
 92  
 93      fn next(&mut self) -> Option<String> {
 94          self.previous = self.position;
 95          let chars: Vec<char> = self.source.as_ref().chars().skip(self.position).collect();
 96          let mut start = 0;
 97          while start < chars.len() && chars[start].is_whitespace() {
 98              start += 1;
 99          }
100          self.position += start;
101          if start == chars.len() {
102              if chars.len() == 0 {
103                  return None;
104              } else if self.preserve_whitespace {
105                  return Some(chars[..].into_iter().collect());
106              } else {
107                  return Some(" ".to_string());
108              }
109          }
110          let mut len = 0;
111          while start + len < chars.len() {
112              if chars[start + len] == '-' {
113                  len += 1;
114                  break;
115              }
116              if chars[start + len].is_whitespace() {
117                  break;
118              }
119              if len != 0
120                  // Before or after cjk characters, we can usually break line, unless it's one of the exceptions.
121                  // I got the exceptions off Wikipedia:
122                  //     https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages
123                  && (is_cjk_codepoint(chars[start + len - 1]) || is_cjk_codepoint(chars[start + len]))
124                  && may_end_word_cjk(chars[start + len - 1])
125                  && may_start_word_cjk(chars[start + len])
126              {
127                  break;
128              }
129              len += 1;
130          }
131          self.position += len;
132          if chars[0].is_whitespace() {
133              if self.preserve_whitespace {
134                  return Some(chars[0..start + len].into_iter().collect::<String>());
135              } else {
136                  return Some(
137                      String::from(" ") + &chars[start..start + len].into_iter().collect::<String>(),
138                  );
139              }
140          } else {
141              return Some(chars[start..start + len].into_iter().collect::<String>());
142          }
143      }
144  }