/ duct-tape / xnu / bsd / sys / unicode.h
unicode.h
  1  /*
  2   * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
  3   *
  4   * @APPLE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. Please obtain a copy of the License at
 10   * http://www.opensource.apple.com/apsl/ and read it before using this
 11   * file.
 12   *
 13   * The Original Code and all software distributed under the License are
 14   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 15   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 16   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 17   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 18   * Please see the License for the specific language governing rights and
 19   * limitations under the License.
 20   *
 21   * @APPLE_LICENSE_HEADER_END@
 22   */
 23  
 24  #ifndef unicode_h
 25  #define unicode_h
 26  
 27  #ifdef KERNEL_PRIVATE
 28  
 29  #include <sys/cdefs.h>
 30  #include <stdbool.h>
 31  
 32  /*
 33   * WARNING - callers that use the following Unicode normalization interface for on-disk
 34   * structures should be aware that the implementation will be periodically updated for
 35   * the latest Unicode standard version.
 36   */
 37  
 38  enum {
 39  	/* Maximum size of the UTF-32 reordering buffer for stream-safe format */
 40  	kNCFStreamSafeBufMax = 32
 41  };
 42  
 43  /*
 44   * utf8_normalizeOptCaseFoldAndHash
 45   *
 46   * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 47   * as specified by the case_sens parameter, and feed the result incrementally to
 48   * the provided hash function callback:
 49   * - "canonical caseless form" (case-folded NFD, as described by definition D145
 50   *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 51   * - standard NFD; for case-sensitive behavior (if case_sens = true).
 52   *
 53   * The input string should be valid UTF-8 that meets the criteria for stream safe
 54   * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 55   * It should not contain ASCII 0x00 or '/'.
 56   *
 57   * str:       The input UTF-8 string (need not be 0 terminated)
 58   * str_len:   The byte length of the input string (excluding any 0 terminator)
 59   * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 60   *            True for case-sensitive behavior; generates standard NFD.
 61   * hash_func: A pointer to the hashing callback to which the normalized/case-folded
 62   *            result is fed incrementally. In the callback, buf contains buf_len
 63   *            bytes of data to be added to the hash using the caller-supplied
 64   *            context (ctx).
 65   * hash_ctx:  The caller-supplied context for the hash function (its ctx argument).
 66   *
 67   * Returns: 0 on success, or
 68   *          EILSEQ: The input string contains illegal ASCII-range characters
 69   *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 70   *                  contains codepoints that are non-characters or unassigned in
 71   *                  the version of Unicode currently supported.
 72   */
 73  int utf8_normalizeOptCaseFoldAndHash(const char *str,
 74      size_t      str_len,
 75      bool        case_sens,
 76      void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
 77      void       *hash_ctx);
 78  
 79  /*
 80   * utf8_normalizeOptCaseFoldAndCompare
 81   *
 82   * Determine whether two UTF-8 strings are equal after converting each to one of the
 83   * following normalized forms, as specified by the case_sens parameter:
 84   * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 85   * - standard NFD; for case-sensitive comparison (if case_sens = true).
 86   * On success, sets *are_equal to true if the strings are equal, or false if they are not.
 87   *
 88   * The input strings should be valid UTF-8 that meet the criteria for stream safe
 89   * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 90   * They should not contain ASCII 0x00 or '/'.
 91   *
 92   * strA:      A UTF-8 string to be compared (need not be 0 terminated)
 93   * strA_len:  The byte length of strA (excluding any 0 terminator)
 94   * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
 95   * strB_len:  The byte length of strB (excluding any 0 terminator)
 96   * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
 97   *            True for case-sensitive behavior; compares standard NFD forms.
 98   * are_equal: On success, *are_equal is set to true if the strings are equal, or to
 99   *            false if they are not.
100   *
101   * Returns: 0 on success, or
102   *          EILSEQ: One or both of the input strings contains illegal ASCII-range
103   *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
104   *                  or contains codepoints that are non-characters or unassigned in
105   *                  the version of Unicode currently supported.
106   *                  Note: The comparison may terminate early when a difference is
107   *                        detected, and may return 0 and set *are_equal=false even
108   *                        if one or both strings are invalid.
109   */
110  int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
111      size_t      strA_len,
112      const char *strB,
113      size_t      strB_len,
114      bool        case_sens,
115      bool       *are_equal);
116  
117  /*
118   * utf8_normalizeOptCaseFold
119   *
120   * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
121   * as specified by the case_sens parameter, and copy the result to the ustr
122   * buffer:
123   * - "canonical caseless form" (case-folded NFD, as described by definition D145
124   *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
125   * - standard NFD; for case-sensitive behavior (if case_sens = true).
126   *
127   * The input string should be valid UTF-8 that meets the criteria for stream safe
128   * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
129   * It should not contain ASCII 0x00 or '/'.
130   *
131   * str:       The input UTF-8 string (need not be 0 terminated)
132   * str_len:   The byte length of the input string (excluding any 0 terminator)
133   * case_sens: False for case-insensitive behavior; generates canonical caseless form.
134   *            True for case-sensitive behavior; generates standard NFD.
135   * ustr:      A pointer to a caller-supplied buffer for the resulting UTF-32 string.
136   * ustr_size: The capacity of ustr, in UTF-32 units.
137   * ustr_len:  Pointer to a value that will be filled in with the actual length
138   *            in UTF-32 units of the string copied to ustr.
139   *
140   * Returns: 0 on success, or
141   *          EILSEQ: The input string contains illegal ASCII-range characters
142   *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
143   *                  contains codepoints that are non-characters or unassigned in
144   *                  the version of Unicode currently supported.
145   *          ENOMEM: ustr_size is insufficient for the resulting string. In this
146   *                  case the value returned in *ustr_len is invalid.
147   */
148  int utf8_normalizeOptCaseFold(const char *str,
149      size_t      str_len,
150      bool        case_sens,
151      int32_t    *ustr,
152      int32_t     ustr_size,
153      int32_t    *ustr_len);
154  
155  /*
156   * utf8_normalizeOptCaseFoldToUTF8
157   *
158   * Normalize a given UTF-8 string, producing UTF-8 output in one of the following forms,
159   * as specified by the case_sens parameter, and copy the result to the ustr
160   * buffer:
161   * - "canonical caseless form" (case-folded NFD, as described by definition D145
162   *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
163   * - standard NFD; for case-sensitive behavior (if case_sens = true).
164   *
165   * The input string should be valid UTF-8 that meets the criteria for stream safe
166   * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
167   * It should not contain ASCII 0x00 or '/'.
168   *
169   * str:       The input UTF-8 string (need not be 0 terminated)
170   * str_len:   The byte length of the input string (excluding any 0 terminator)
171   * case_sens: False for case-insensitive behavior; generates canonical caseless form.
172   *            True for case-sensitive behavior; generates standard NFD.
173   * ustr:      A pointer to a caller-supplied buffer for the resulting UTF-8 string.
174   * ustr_size: The capacity of ustr, in bytes.
175   * ustr_len:  Pointer to a value that will be filled in with the actual length
176   *            in bytes of the string copied to ustr.
177   *
178   * Returns: 0 on success, or
179   *          EILSEQ: The input string contains illegal ASCII-range characters
180   *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
181   *                  contains codepoints that are non-characters or unassigned in
182   *                  the version of Unicode currently supported.
183   *          ENOMEM: ustr_size is insufficient for the resulting string. In this
184   *                  case the value returned in *ustr_len is invalid.
185   */
186  int utf8_normalizeOptCaseFoldToUTF8(const char *str,
187      size_t      str_len,
188      bool        case_sens,
189      char       *ustr,
190      size_t      ustr_size,
191      size_t     *ustr_len);
192  
193  /*
194   * utf8_normalizeOptCaseFoldAndMatchSubstring
195   *
196   * Determine whether the normalized UTF-32 string derived from a specified UTF-8 string
197   * strA contains another UTF-32 string ustrB which has already been normalized, typically
198   * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the following,
199   * as specified by the case_sens parameter:
200   * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
201   * - standard NFD; for case-sensitive comparison (if case_sens = true).
202   * On success, sets *has_match to true if strA contains ustrB, or false otherwise.
203   *
204   * The input string strA should be valid UTF-8 that meets the criteria for stream safe
205   * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
206   * It should not contain ASCII 0x00 or '/'.
207   *
208   * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
209   *            substring specified by ustrB.
210   * strA_len:  The byte length of strA (excluding any 0 terminator)
211   * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
212   *            for in the UTF-32 string resulting from converting strA to the normalized
213   *            UTF-32 form specified by the case_sens parameter; ustrB must already be
214   *            in that form. Normally this will be produced using utf8_normalizeOptCaseFold.
215   * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
216   * case_sens: False for case-insensitive matching; compares canonical caseless forms.
217   *            True for case-sensitive matching; compares standard NFD forms.
218   * buf:       Pointer to caller-supplied working memory for storing the portion of
219   *            strA which has been converted to normalized UTF-32.
220   * buf_size:  The size of buf.
221   * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
222   *            per case_sens) contains ustrB, set to false otherwise.
223   *
224   * Returns: 0 on success, or
225   *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
226   *                  not well-formed stream-safe UTF-8, or contains codepoints that are
227   *                  non-characters or unassigned in the version of Unicode currently
228   *                  supported.
229   *                  Note: The search may terminate early when a match is detected, and
230   *                        may return 0 and set *has_match=true even if strA is invalid.
231   *          ENOMEM: buf_size is insufficient.
232   */
233  int utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA,
234      size_t         strA_len,
235      const int32_t *ustrB,
236      int32_t        ustrB_len,
237      bool           case_sens,
238      void          *buf,
239      size_t         buf_size,
240      bool          *has_match);
241  
242  /*
243   * utf8_normalizeOptCaseFoldGetUVersion
244   *
245   * Get the Unicode and code version currently associated with the utf8_normalizeOptCaseFold*
246   * functions. The caller allocates the 4-element version array and passes it to the function,
247   * which will fill out the array as follows:
248   * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
249   * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
250   * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
251   * version[3] = Code revision level; for any given Unicode version, this value starts
252   *              at 0 and is incremented for each significant revision to the
253   *              utf8_normalizeOptCaseFold* functions.
254   */
255  void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);
256  
257  #endif /* KERNEL_PRIVATE */
258  
259  #endif  /* unicode_h */