/ lib / wind / normalize.c
normalize.c
  1  /*
  2   * Copyright (c) 2004 Kungliga Tekniska Högskolan
  3   * (Royal Institute of Technology, Stockholm, Sweden).
  4   * All rights reserved.
  5   *
  6   * Redistribution and use in source and binary forms, with or without
  7   * modification, are permitted provided that the following conditions
  8   * are met:
  9   *
 10   * 1. Redistributions of source code must retain the above copyright
 11   *    notice, this list of conditions and the following disclaimer.
 12   *
 13   * 2. Redistributions in binary form must reproduce the above copyright
 14   *    notice, this list of conditions and the following disclaimer in the
 15   *    documentation and/or other materials provided with the distribution.
 16   *
 17   * 3. Neither the name of the Institute nor the names of its contributors
 18   *    may be used to endorse or promote products derived from this software
 19   *    without specific prior written permission.
 20   *
 21   * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
 22   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 23   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 24   * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
 25   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 26   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 27   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 28   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 29   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 30   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 31   * SUCH DAMAGE.
 32   */
 33  
 34  #ifdef HAVE_CONFIG_H
 35  #include <config.h>
 36  #endif
 37  #include "windlocl.h"
 38  
 39  #include <assert.h>
 40  #include <stdlib.h>
 41  #include <errno.h>
 42  #include <stdio.h>
 43  
 44  #include "roken.h"
 45  
 46  #include "normalize_table.h"
 47  
 48  static int
 49  translation_cmp(const void *key, const void *data)
 50  {
 51      const struct translation *t1 = (const struct translation *)key;
 52      const struct translation *t2 = (const struct translation *)data;
 53  
 54      return t1->key - t2->key;
 55  }
 56  
 57  enum { s_base  = 0xAC00};
 58  enum { s_count = 11172};
 59  enum { l_base  = 0x1100};
 60  enum { l_count = 19};
 61  enum { v_base  = 0x1161};
 62  enum { v_count = 21};
 63  enum { t_base  = 0x11A7};
 64  enum { t_count = 28};
 65  enum { n_count = v_count * t_count};
 66  
 67  static int
 68  hangul_decomp(const uint32_t *in, size_t in_len,
 69  	      uint32_t *out, size_t *out_len)
 70  {
 71      uint32_t u = *in;
 72      unsigned s_index;
 73      unsigned l, v, t;
 74      unsigned o;
 75  
 76      if (u < s_base || u >= s_base + s_count)
 77  	return 0;
 78      s_index = u - s_base;
 79      l = l_base + s_index / n_count;
 80      v = v_base + (s_index % n_count) / t_count;
 81      t = t_base + s_index % t_count;
 82      o = 2;
 83      if (t != t_base)
 84  	++o;
 85      if (*out_len < o)
 86  	return WIND_ERR_OVERRUN;
 87      out[0] = l;
 88      out[1] = v;
 89      if (t != t_base)
 90  	out[2] = t;
 91      *out_len = o;
 92      return 1;
 93  }
 94  
 95  static uint32_t
 96  hangul_composition(const uint32_t *in, size_t in_len)
 97  {
 98      if (in_len < 2)
 99  	return 0;
100      if (in[0] >= l_base && in[0] < l_base + l_count) {
101  	unsigned l_index = in[0] - l_base;
102  	unsigned v_index;
103  
104  	if (in[1] < v_base || in[1] >= v_base + v_count)
105  	    return 0;
106  	v_index = in[1] - v_base;
107  	return (l_index * v_count + v_index) * t_count + s_base;
108      } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109  	unsigned s_index = in[0] - s_base;
110  	unsigned t_index;
111  
112  	if (s_index % t_count != 0)
113  	    return 0;
114  	if (in[1] < t_base || in[1] >= t_base + t_count)
115  	    return 0;
116  	t_index = in[1] - t_base;
117  	return in[0] + t_index;
118      }
119      return 0;
120  }
121  
122  static int
123  compat_decomp(const uint32_t *in, size_t in_len,
124  	      uint32_t *out, size_t *out_len)
125  {
126      unsigned i;
127      unsigned o = 0;
128  
129      for (i = 0; i < in_len; ++i) {
130  	struct translation ts = {in[i]};
131  	size_t sub_len = *out_len - o;
132  	int ret;
133  
134  	ret = hangul_decomp(in + i, in_len - i,
135  			    out + o, &sub_len);
136  	if (ret) {
137  	    if (ret == WIND_ERR_OVERRUN)
138  		return ret;
139  	    o += sub_len;
140  	} else {
141  	    void *s = bsearch(&ts,
142  			      _wind_normalize_table,
143  			      _wind_normalize_table_size,
144  			      sizeof(_wind_normalize_table[0]),
145  			      translation_cmp);
146  	    if (s != NULL) {
147  		const struct translation *t = (const struct translation *)s;
148  
149  		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150  				    t->val_len,
151  				    out + o, &sub_len);
152  		if (ret)
153  		    return ret;
154  		o += sub_len;
155  	    } else {
156  		if (o >= *out_len)
157  		    return WIND_ERR_OVERRUN;
158  		out[o++] = in[i];
159  
160  	    }
161  	}
162      }
163      *out_len = o;
164      return 0;
165  }
166  
167  static void
168  swap_char(uint32_t * a, uint32_t * b)
169  {
170      uint32_t t;
171      t = *a;
172      *a = *b;
173      *b = t;
174  }
175  
176  /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177   * that all have Canonical_Combining_Class > 0 */
178  static void
179  canonical_reorder_sequence(uint32_t * a, size_t len)
180  {
181      size_t i, j;
182  
183      if (len <= 1)
184  	return;
185  
186      for (i = 1; i < len; i++) {
187  	for (j = i;
188  	     j > 0 &&
189  		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190  	     j--)
191  	    swap_char(&a[j], &a[j-1]);
192      }
193  }
194  
195  static void
196  canonical_reorder(uint32_t *tmp, size_t tmp_len)
197  {
198      size_t i;
199  
200      for (i = 0; i < tmp_len; ++i) {
201  	int cc = _wind_combining_class(tmp[i]);
202  	if (cc) {
203  	    size_t j;
204  	    for (j = i + 1;
205  		 j < tmp_len && _wind_combining_class(tmp[j]);
206  		 ++j)
207  		;
208  	    canonical_reorder_sequence(&tmp[i], j - i);
209  	    i = j;
210  	}
211      }
212  }
213  
214  static uint32_t
215  find_composition(const uint32_t *in, unsigned in_len)
216  {
217      unsigned short canon_index = 0;
218      uint32_t cur;
219      unsigned n = 0;
220  
221      cur = hangul_composition(in, in_len);
222      if (cur)
223  	return cur;
224  
225      do {
226  	const struct canon_node *c = &_wind_canon_table[canon_index];
227  	unsigned i;
228  
229  	if (n % 5 == 0) {
230  	    cur = *in++;
231  	    if (in_len-- == 0)
232  		return c->val;
233  	}
234  
235  	i = cur >> 16;
236  	if (i < c->next_start || i >= c->next_end)
237  	    canon_index = 0;
238  	else
239  	    canon_index =
240  		_wind_canon_next_table[c->next_offset + i - c->next_start];
241  	if (canon_index != 0) {
242  	    cur = (cur << 4) & 0xFFFFF;
243  	    ++n;
244  	}
245      } while (canon_index != 0);
246      return 0;
247  }
248  
249  static int
250  combine(const uint32_t *in, size_t in_len,
251  	uint32_t *out, size_t *out_len)
252  {
253      unsigned i;
254      int ostarter;
255      unsigned o = 0;
256      int old_cc;
257  
258      for (i = 0; i < in_len;) {
259  	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260  	    out[o++] = in[i++];
261  	}
262  	if (i < in_len) {
263  	    if (o >= *out_len)
264  		return WIND_ERR_OVERRUN;
265  	    ostarter = o;
266  	    out[o++] = in[i++];
267  	    old_cc   = -1;
268  
269  	    while (i < in_len) {
270  		uint32_t comb;
271  		uint32_t v[2];
272  		int cc;
273  
274  		v[0] = out[ostarter];
275  		v[1] = in[i];
276  
277  		cc = _wind_combining_class(in[i]);
278  		if (old_cc != cc && (comb = find_composition(v, 2))) {
279  		    out[ostarter] = comb;
280  		} else if (cc == 0) {
281  		    break;
282  		} else {
283  		    if (o >= *out_len)
284  			return WIND_ERR_OVERRUN;
285  		    out[o++] = in[i];
286  		    old_cc   = cc;
287  		}
288  		++i;
289  	    }
290  	}
291      }
292      *out_len = o;
293      return 0;
294  }
295  
296  int
297  _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298  			   uint32_t *out, size_t *out_len)
299  {
300      size_t tmp_len;
301      uint32_t *tmp;
302      int ret;
303  
304      if (in_len == 0) {
305  	*out_len = 0;
306  	return 0;
307      }
308  
309      tmp_len = in_len * 4;
310      if (tmp_len < MAX_LENGTH_CANON)
311  	tmp_len = MAX_LENGTH_CANON;
312      tmp = malloc(tmp_len * sizeof(uint32_t));
313      if (tmp == NULL)
314  	return ENOMEM;
315  
316      ret = compat_decomp(in, in_len, tmp, &tmp_len);
317      if (ret) {
318  	free(tmp);
319  	return ret;
320      }
321      canonical_reorder(tmp, tmp_len);
322      ret = combine(tmp, tmp_len, out, out_len);
323      free(tmp);
324      return ret;
325  }