/ duct-tape / xnu / bsd / sys / utfconv.h
utfconv.h
  1  /*
  2   * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
  3   *
  4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. The rights granted to you under the License
 10   * may not be used to create, or enable the creation or redistribution of,
 11   * unlawful or unlicensed copies of an Apple operating system, or to
 12   * circumvent, violate, or enable the circumvention or violation of, any
 13   * terms of an Apple operating system software license agreement.
 14   *
 15   * Please obtain a copy of the License at
 16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
 17   *
 18   * The Original Code and all software distributed under the License are
 19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 23   * Please see the License for the specific language governing rights and
 24   * limitations under the License.
 25   *
 26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 27   */
 28  
 29  #ifndef _SYS_UTFCONV_H_
 30  #define _SYS_UTFCONV_H_
 31  
 32  #include <sys/appleapiopts.h>
 33  #include <sys/cdefs.h>
 34  
 35  #ifdef KERNEL
 36  #ifdef __APPLE_API_UNSTABLE
 37  
 38  /*
 39   * UTF-8 encode/decode flags (bit values for the 'flags' argument below)
 40   */
 41  #define UTF_REVERSE_ENDIAN   0x0001   /* reverse UCS-2 byte order */
 42  #define UTF_NO_NULL_TERM     0x0002   /* do not add null termination */
 43  #define UTF_DECOMPOSED       0x0004   /* generate fully decomposed UCS-2 */
 44  #define UTF_PRECOMPOSED      0x0008   /* generate precomposed UCS-2 */
 45  #define UTF_ESCAPE_ILLEGAL   0x0010   /* percent escape illegal UTF-8 input */
 46  #define UTF_SFM_CONVERSIONS  0x0020   /* use SFM mappings for illegal NTFS chars */
 47  
 48  #define UTF_BIG_ENDIAN       \
 49  	((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)   /* 0 on a big endian host, else UTF_REVERSE_ENDIAN */
 50  
 51  #define UTF_LITTLE_ENDIAN    \
 52  	((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)   /* 0 on a little endian host, else UTF_REVERSE_ENDIAN */
 53  
 54  __BEGIN_DECLS
 55  
 56  
 57  /*
 58   * unicode_combinable - Test for a combining Unicode character.
 59   *
 60   * This function is similar to __CFUniCharIsNonBaseCharacter except
 61   * that it also includes Hangul Jamo characters.
 62   */
 63  
 64  int unicode_combinable(u_int16_t character);
 65  
 66  /*
 67   * unicode_decomposeable - Test for a precomposed character.
 68   *
 69   * Similar to __CFUniCharIsDecomposableCharacter.
 70   */
 71  
 72  int unicode_decomposeable(u_int16_t character);
 73  
 74  
 75  /*
 76   * utf8_encodelen - Calculate the UTF-8 encoding length
 77   *
 78   * This function takes a Unicode input string, ucsp, of ucslen bytes
 79   * and calculates the size of the UTF-8 output in bytes (not including
 80   * a NULL termination byte). The string must reside in kernel memory.
 81   *
 82   * FLAGS
 83   *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 84   *
 85   *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 86   *
 87   *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 88   *
 89   *    UTF_DECOMPOSED:  assume fully decomposed output
 90   *
 91   * ERRORS
 92   *    None
 93   */
 94  size_t
 95  utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
 96      int flags);
 97  
 98  
 99  /*
100   * utf8_encodestr - Encodes a Unicode string into UTF-8
101   *
102   * This function takes a Unicode input string, ucsp, of ucslen bytes and
103   * produces the UTF-8 output into a buffer of buflen bytes pointed to by
104   * utf8p. The size of the output in bytes (not including a NULL termination
105   * byte) is returned in utf8len. The output is NULL terminated unless
106   * UTF_NO_NULL_TERM is set. Both buffers must reside in kernel memory.
107   *
108   * If '/' chars are possible in the Unicode input then an alternate
109   * (replacement) char must be provided in altslash.
110   *
111   * FLAGS
112   *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
113   *
114   *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
115   *
116   *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
117   *
118   *    UTF_NO_NULL_TERM:  do not add null termination to output string
119   *
120   *    UTF_DECOMPOSED:  generate fully decomposed output
121   *
122   * ERRORS
123   *    ENAMETOOLONG:  output did not fit; only utf8len bytes were encoded
124   *
125   *    EINVAL:  illegal Unicode char encountered
126   */
127  int
128  utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
129      size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
130  
131  
132  /*
133   * utf8_decodestr - Decodes a UTF-8 string into Unicode
134   *
135   * This function takes a UTF-8 input string, utf8p, of utf8len bytes
136   * and produces the Unicode output into a buffer of buflen bytes pointed
137   * to by ucsp. The size of the output in bytes (not including a NULL
138   * termination byte) is returned in ucslen. Both buffers must reside
139   * in kernel memory.
140   *
141   * If '/' chars are allowed in the Unicode output then an alternate
142   * (replacement) char must be provided in altslash.
143   *
144   * FLAGS
145   *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
146   *
147   *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
148   *
149   *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
150   *
151   *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
152   *
153   *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
154   *
155   *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
156   *
157   * ERRORS
158   *    ENAMETOOLONG:  output did not fit; only ucslen bytes were decoded.
159   *
160   *    EINVAL:  illegal UTF-8 sequence encountered.
161   */
162  int
163  utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
164      size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
165  
166  
167  /*
168   * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
169   *
170   * This function takes a UTF-8 input string, instr, of inlen bytes
171   * and produces normalized UTF-8 output into a buffer of buflen bytes
172   * pointed to by outstr. The size of the output in bytes (not including
173   * a NULL termination byte) is returned in outlen. In-place conversions
174   * are not supported (instr must differ from outstr).  Both buffers must
175   * reside in kernel memory.
176   *
177   * FLAGS
178   *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
179   *
180   *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
181   *
182   *    UTF_NO_NULL_TERM:  do not add null termination to output string
183   *
184   *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
185   *
186   * ERRORS
187   *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
188   *
189   *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
190   */
191  int
192  utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
193      size_t *outlen, size_t buflen, int flags);
194  
195  
196  /*
197   * utf8_validatestr - validates a UTF-8 string
198   *
199   * This function takes a UTF-8 input string, utf8p, of utf8len bytes
200   * and determines if it is valid UTF-8.  The string must reside in kernel
201   * memory.
202   *
203   * ERRORS
204   *    EINVAL:  illegal UTF-8 sequence encountered.
205   */
206  int
207  utf8_validatestr(const u_int8_t* utf8p, size_t utf8len);
208  
209  
210  __END_DECLS
211  
212  #endif /* __APPLE_API_UNSTABLE */
213  #endif /* KERNEL */
214  
215  #endif /* !_SYS_UTFCONV_H_ */