GDAL
utf8.h
1 // NOTE: for GDAL, this is an extract from the https://github.com/sheredom/utf8.h
2 // code
3 
4 /* The latest version of this library is available on GitHub;
5  * https://github.com/sheredom/utf8.h */
6 
7 /* This is free and unencumbered software released into the public domain.
8  *
9  * Anyone is free to copy, modify, publish, use, compile, sell, or
10  * distribute this software, either in source code form or as a compiled
11  * binary, for any purpose, commercial or non-commercial, and by any
12  * means.
13  *
14  * In jurisdictions that recognize copyright laws, the author or authors
15  * of this software dedicate any and all copyright interest in the
16  * software to the public domain. We make this dedication for the benefit
17  * of the public at large and to the detriment of our heirs and
18  * successors. We intend this dedication to be an overt act of
19  * relinquishment in perpetuity of all present and future rights to this
20  * software under copyright law.
21  *
22  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
25  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
26  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
27  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28  * OTHER DEALINGS IN THE SOFTWARE.
29  *
30  * For more information, please refer to <http://unlicense.org/> */
31 
32 #ifndef SHEREDOM_UTF8_H_INCLUDED
33 #define SHEREDOM_UTF8_H_INCLUDED
34 
35 #if defined(_MSC_VER)
36 #pragma warning(push)
37 
38 /* disable warning: no function prototype given: converting '()' to '(void)' */
39 #pragma warning(disable : 4255)
40 
41 /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
42  * replacing with '0' for '#if/#elif' */
43 #pragma warning(disable : 4668)
44 
45 /* disable warning: bytes padding added after construct */
46 #pragma warning(disable : 4820)
47 #endif
48 
49 #include <stddef.h>
50 #include <stdlib.h>
51 
52 #if defined(_MSC_VER)
53 #pragma warning(pop)
54 #endif
55 
56 #if defined(_MSC_VER) && (_MSC_VER < 1920)
57 typedef __int32 utf8_int32_t;
58 #else
59 #include <stdint.h>
60 typedef int32_t utf8_int32_t;
61 #endif
62 
63 #if defined(__clang__)
64 #pragma clang diagnostic push
65 #pragma clang diagnostic ignored "-Wold-style-cast"
66 #pragma clang diagnostic ignored "-Wcast-qual"
67 
68 #if __has_warning("-Wunsafe-buffer-usage")
69 #pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
70 #endif
71 #endif
72 
73 namespace {
74 
75 #if defined(_MSC_VER)
76 #define utf8_nonnull
77 #define utf8_pure
78 #define utf8_restrict __restrict
79 #define utf8_weak __inline
80 #elif defined(__clang__) || defined(__GNUC__)
81 #define utf8_nonnull __attribute__((nonnull))
82 #define utf8_pure __attribute__((pure))
83 #define utf8_restrict __restrict__
84 #define utf8_weak __attribute__((weak))
85 #else
86 #define utf8_nonnull
87 #define utf8_pure
88 #define utf8_restrict
89 #define utf8_weak
90 #endif
91 
92 #ifdef __cplusplus
93 #define utf8_null NULL
94 #else
95 #define utf8_null 0
96 #endif
97 
98 #if (defined(__cplusplus) && __cplusplus >= 201402L)
99 #define utf8_constexpr14 constexpr
100 #define utf8_constexpr14_impl constexpr
101 #else
102 /* constexpr and weak are incompatible. so only enable one of them */
103 #define utf8_constexpr14 utf8_weak
104 #define utf8_constexpr14_impl
105 #endif
106 
107 #if defined(__cplusplus) && __cplusplus >= 202002L
108 using utf8_int8_t = char8_t; /* Introduced in C++20 */
109 #else
110 typedef char utf8_int8_t;
111 #endif
112 
113 #if 0
114 /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
115  * src2 respectively, case insensitive. */
116 utf8_constexpr14 utf8_nonnull utf8_pure int
117 utf8casecmp(const utf8_int8_t *src1, const utf8_int8_t *src2);
118 
119 /* Append the utf8 string src onto the utf8 string dst. */
120 utf8_nonnull utf8_weak utf8_int8_t *
121 utf8cat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
122 
123 /* Find the first match of the utf8 codepoint chr in the utf8 string src. */
124 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
125 utf8chr(const utf8_int8_t *src, utf8_int32_t chr);
126 
127 /* Return less than 0, 0, greater than 0 if src1 < src2,
128  * src1 == src2, src1 > src2 respectively. */
129 utf8_constexpr14 utf8_nonnull utf8_pure int utf8cmp(const utf8_int8_t *src1,
130  const utf8_int8_t *src2);
131 
132 /* Copy the utf8 string src onto the memory allocated in dst. */
133 utf8_nonnull utf8_weak utf8_int8_t *
134 utf8cpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
135 
136 /* Number of utf8 codepoints in the utf8 string src that consists entirely
137  * of utf8 codepoints not from the utf8 string reject. */
138 utf8_constexpr14 utf8_nonnull utf8_pure size_t
139 utf8cspn(const utf8_int8_t *src, const utf8_int8_t *reject);
140 
141 /* Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
142  * copying over the data, and returning that. Or 0 if malloc failed. */
143 utf8_weak utf8_int8_t *utf8dup(const utf8_int8_t *src);
144 
145 /* Number of utf8 codepoints in the utf8 string str,
146  * excluding the null terminating byte. */
147 utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8len(const utf8_int8_t *str);
148 
/* Similar to utf8len, except that at most n bytes of str are examined. */
150 utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8nlen(const utf8_int8_t *str,
151  size_t n);
152 
153 /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
154  * src2 respectively, case insensitive. Checking at most n bytes of each utf8
155  * string. */
156 utf8_constexpr14 utf8_nonnull utf8_pure int
157 utf8ncasecmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
158 
159 /* Append the utf8 string src onto the utf8 string dst,
160  * writing at most n+1 bytes. Can produce an invalid utf8
161  * string if n falls partway through a utf8 codepoint. */
162 utf8_nonnull utf8_weak utf8_int8_t *
163 utf8ncat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
164  size_t n);
165 
166 /* Return less than 0, 0, greater than 0 if src1 < src2,
167  * src1 == src2, src1 > src2 respectively. Checking at most n
168  * bytes of each utf8 string. */
169 utf8_constexpr14 utf8_nonnull utf8_pure int
170 utf8ncmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
171 
172 /* Copy the utf8 string src onto the memory allocated in dst.
173  * Copies at most n bytes. If n falls partway through a utf8
174  * codepoint, or if dst doesn't have enough room for a null
175  * terminator, the final string will be cut short to preserve
176  * utf8 validity. */
177 
178 utf8_nonnull utf8_weak utf8_int8_t *
179 utf8ncpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
180  size_t n);
181 
182 /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
183  * longer than n, only n bytes are copied and a null byte is added.
184  *
185  * Returns a new string if successful, 0 otherwise */
186 utf8_weak utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n);
187 
188 /* Locates the first occurrence in the utf8 string str of any byte in the
189  * utf8 string accept, or 0 if no match was found. */
190 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
191 utf8pbrk(const utf8_int8_t *str, const utf8_int8_t *accept);
192 
193 /* Find the last match of the utf8 codepoint chr in the utf8 string src. */
194 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
195 utf8rchr(const utf8_int8_t *src, int chr);
196 
197 /* Number of bytes in the utf8 string str,
198  * including the null terminating byte. */
199 utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8size(const utf8_int8_t *str);
200 
201 /* Similar to utf8size, except that the null terminating byte is excluded. */
202 utf8_constexpr14 utf8_nonnull utf8_pure size_t
203 utf8size_lazy(const utf8_int8_t *str);
204 
/* Similar to utf8size, except that at most n bytes of str are examined and
 * the null terminating byte is excluded. */
207 utf8_constexpr14 utf8_nonnull utf8_pure size_t
208 utf8nsize_lazy(const utf8_int8_t *str, size_t n);
209 
210 /* Number of utf8 codepoints in the utf8 string src that consists entirely
211  * of utf8 codepoints from the utf8 string accept. */
212 utf8_constexpr14 utf8_nonnull utf8_pure size_t
213 utf8spn(const utf8_int8_t *src, const utf8_int8_t *accept);
214 
215 /* The position of the utf8 string needle in the utf8 string haystack. */
216 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
217 utf8str(const utf8_int8_t *haystack, const utf8_int8_t *needle);
218 
219 /* The position of the utf8 string needle in the utf8 string haystack, case
220  * insensitive. */
221 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
222 utf8casestr(const utf8_int8_t *haystack, const utf8_int8_t *needle);
223 
224 /* Return 0 on success, or the position of the invalid
225  * utf8 codepoint on failure. */
226 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
227 utf8valid(const utf8_int8_t *str);
228 
/* Similar to utf8valid, except that at most n bytes of str are examined. */
230 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
231 utf8nvalid(const utf8_int8_t *str, size_t n);
232 
233 /* Given a null-terminated string, makes the string valid by replacing invalid
234  * codepoints with a 1-byte replacement. Returns 0 on success. */
235 utf8_nonnull utf8_weak int utf8makevalid(utf8_int8_t *str,
236  const utf8_int32_t replacement);
237 #endif
238 /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
239  * address of the next utf8 codepoint after the current one in str. */
240 utf8_constexpr14 utf8_nonnull utf8_int8_t *
241 utf8codepoint(const utf8_int8_t *utf8_restrict str,
242  utf8_int32_t *utf8_restrict out_codepoint);
243 
244 /* Calculates the size of the next utf8 codepoint in str. */
245 utf8_constexpr14 utf8_nonnull size_t
246 utf8codepointcalcsize(const utf8_int8_t *str);
247 
248 #if 0
249 /* Returns the size of the given codepoint in bytes. */
250 utf8_constexpr14 size_t utf8codepointsize(utf8_int32_t chr);
251 
252 /* Write a codepoint to the given string, and return the address to the next
253  * place after the written codepoint. Pass how many bytes left in the buffer to
254  * n. If there is not enough space for the codepoint, this function returns
255  * null. */
256 utf8_nonnull utf8_weak utf8_int8_t *
257 utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n);
258 
259 /* Returns 1 if the given character is lowercase, or 0 if it is not. */
260 utf8_constexpr14 int utf8islower(utf8_int32_t chr);
261 
262 /* Returns 1 if the given character is uppercase, or 0 if it is not. */
263 utf8_constexpr14 int utf8isupper(utf8_int32_t chr);
264 
265 /* Transform the given string into all lowercase codepoints. */
266 utf8_nonnull utf8_weak void utf8lwr(utf8_int8_t *utf8_restrict str);
267 
268 /* Transform the given string into all uppercase codepoints. */
269 utf8_nonnull utf8_weak void utf8upr(utf8_int8_t *utf8_restrict str);
270 #endif
271 
272 /* Make a codepoint lower case if possible. */
273 utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
274 
275 /* Make a codepoint upper case if possible. */
276 utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
277 
278 #if 0
279 /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
280  * address of the previous utf8 codepoint before the current one in str. */
281 utf8_constexpr14 utf8_nonnull utf8_int8_t *
282 utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
283  utf8_int32_t *utf8_restrict out_codepoint);
284 
285 /* Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
286  * copy over data to a new buffer, and returning that. Or 0 if alloc_func_ptr
287  * returned null. */
288 utf8_weak utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
289  utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
290  size_t),
291  utf8_int8_t *user_data);
292 
293 /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
294  * longer than n, only n bytes are copied and a null byte is added.
295  *
296  * Returns a new string if successful, 0 otherwise. */
297 utf8_weak utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
298  utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
299  size_t),
300  utf8_int8_t *user_data);
301 #endif
302 
303 #undef utf8_weak
304 #undef utf8_pure
305 #undef utf8_nonnull
306 
307 #if 0
308 utf8_constexpr14_impl int utf8casecmp(const utf8_int8_t *src1,
309  const utf8_int8_t *src2) {
310  utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
311  src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
312 
313  for (;;) {
314  src1 = utf8codepoint(src1, &src1_orig_cp);
315  src2 = utf8codepoint(src2, &src2_orig_cp);
316 
317  /* lower the srcs if required */
318  src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
319  src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
320 
321  /* lower the srcs if required */
322  src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
323  src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
324 
325  /* check if the lowered codepoints match */
326  if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
327  return 0;
328  } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
329  continue;
330  }
331 
332  /* if they don't match, then we return the difference between the characters
333  */
334  return src1_lwr_cp - src2_lwr_cp;
335  }
336 }
337 
338 utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
339  const utf8_int8_t *utf8_restrict src) {
340  utf8_int8_t *d = dst;
341  /* find the null terminating byte in dst */
342  while ('\0' != *d) {
343  d++;
344  }
345 
346  /* overwriting the null terminating byte in dst, append src byte-by-byte */
347  while ('\0' != *src) {
348  *d++ = *src++;
349  }
350 
351  /* write out a new null terminating byte into dst */
352  *d = '\0';
353 
354  return dst;
355 }
356 
357 utf8_constexpr14_impl utf8_int8_t *utf8chr(const utf8_int8_t *src,
358  utf8_int32_t chr) {
359  utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
360 
361  if (0 == chr) {
362  /* being asked to return position of null terminating byte, so
363  * just run s to the end, and return! */
364  while ('\0' != *src) {
365  src++;
366  }
367  return (utf8_int8_t *)src;
368  } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
369  /* 1-byte/7-bit ascii
370  * (0b0xxxxxxx) */
371  c[0] = (utf8_int8_t)chr;
372  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
373  /* 2-byte/11-bit utf8 code point
374  * (0b110xxxxx 0b10xxxxxx) */
375  c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
376  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
377  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
378  /* 3-byte/16-bit utf8 code point
379  * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
380  c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
381  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
382  c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
383  } else { /* if (0 == ((int)0xffe00000 & chr)) { */
384  /* 4-byte/21-bit utf8 code point
385  * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
386  c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
387  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
388  c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
389  c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
390  }
391 
392  /* we've made c into a 2 utf8 codepoint string, one for the chr we are
393  * seeking, another for the null terminating byte. Now use utf8str to
394  * search */
395  return utf8str(src, c);
396 }
397 
398 utf8_constexpr14_impl int utf8cmp(const utf8_int8_t *src1,
399  const utf8_int8_t *src2) {
400  while (('\0' != *src1) || ('\0' != *src2)) {
401  if (*src1 < *src2) {
402  return -1;
403  } else if (*src1 > *src2) {
404  return 1;
405  }
406 
407  src1++;
408  src2++;
409  }
410 
411  /* both utf8 strings matched */
412  return 0;
413 }
414 
415 utf8_constexpr14_impl int utf8coll(const utf8_int8_t *src1,
416  const utf8_int8_t *src2);
417 
418 utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
419  const utf8_int8_t *utf8_restrict src) {
420  utf8_int8_t *d = dst;
421 
422  /* overwriting anything previously in dst, write byte-by-byte
423  * from src */
424  while ('\0' != *src) {
425  *d++ = *src++;
426  }
427 
428  /* append null terminating byte */
429  *d = '\0';
430 
431  return dst;
432 }
433 
434 utf8_constexpr14_impl size_t utf8cspn(const utf8_int8_t *src,
435  const utf8_int8_t *reject) {
436  size_t chars = 0;
437 
438  while ('\0' != *src) {
439  const utf8_int8_t *r = reject;
440  size_t offset = 0;
441 
442  while ('\0' != *r) {
443  /* checking that if *r is the start of a utf8 codepoint
444  * (it is not 0b10xxxxxx) and we have successfully matched
445  * a previous character (0 < offset) - we found a match */
446  if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
447  return chars;
448  } else {
449  if (*r == src[offset]) {
450  /* part of a utf8 codepoint matched, so move our checking
451  * onwards to the next byte */
452  offset++;
453  r++;
454  } else {
455  /* r could be in the middle of an unmatching utf8 code point,
456  * so we need to march it on to the next character beginning, */
457 
458  do {
459  r++;
460  } while (0x80 == (0xc0 & *r));
461 
462  /* reset offset too as we found a mismatch */
463  offset = 0;
464  }
465  }
466  }
467 
468  /* found a match at the end of *r, so didn't get a chance to test it */
469  if (0 < offset) {
470  return chars;
471  }
472 
473  /* the current utf8 codepoint in src did not match reject, but src
474  * could have been partway through a utf8 codepoint, so we need to
475  * march it onto the next utf8 codepoint starting byte */
476  do {
477  src++;
478  } while ((0x80 == (0xc0 & *src)));
479  chars++;
480  }
481 
482  return chars;
483 }
484 
485 utf8_int8_t *utf8dup(const utf8_int8_t *src) {
486  return utf8dup_ex(src, utf8_null, utf8_null);
487 }
488 
489 utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
490  utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
491  utf8_int8_t *user_data) {
492  utf8_int8_t *n = utf8_null;
493 
494  /* figure out how many bytes (including the terminator) we need to copy first
495  */
496  size_t bytes = utf8size(src);
497 
498  if (alloc_func_ptr) {
499  n = alloc_func_ptr(user_data, bytes);
500  } else {
501 #if !defined(UTF8_NO_STD_MALLOC)
502  n = (utf8_int8_t *)malloc(bytes);
503 #else
504  return utf8_null;
505 #endif
506  }
507 
508  if (utf8_null == n) {
509  /* out of memory so we bail */
510  return utf8_null;
511  } else {
512  bytes = 0;
513 
514  /* copy src byte-by-byte into our new utf8 string */
515  while ('\0' != src[bytes]) {
516  n[bytes] = src[bytes];
517  bytes++;
518  }
519 
520  /* append null terminating byte */
521  n[bytes] = '\0';
522  return n;
523  }
524 }
525 
526 utf8_constexpr14_impl utf8_int8_t *utf8fry(const utf8_int8_t *str);
527 
528 utf8_constexpr14_impl size_t utf8len(const utf8_int8_t *str) {
529  return utf8nlen(str, SIZE_MAX);
530 }
531 
532 utf8_constexpr14_impl size_t utf8nlen(const utf8_int8_t *str, size_t n) {
533  const utf8_int8_t *t = str;
534  size_t length = 0;
535 
536  while ((size_t)(str - t) < n && '\0' != *str) {
537  if (0xf0 == (0xf8 & *str)) {
538  /* 4-byte utf8 code point (began with 0b11110xxx) */
539  str += 4;
540  } else if (0xe0 == (0xf0 & *str)) {
541  /* 3-byte utf8 code point (began with 0b1110xxxx) */
542  str += 3;
543  } else if (0xc0 == (0xe0 & *str)) {
544  /* 2-byte utf8 code point (began with 0b110xxxxx) */
545  str += 2;
546  } else { /* if (0x00 == (0x80 & *s)) { */
547  /* 1-byte ascii (began with 0b0xxxxxxx) */
548  str += 1;
549  }
550 
551  /* no matter the bytes we marched s forward by, it was
552  * only 1 utf8 codepoint */
553  length++;
554  }
555 
556  if ((size_t)(str - t) > n) {
557  length--;
558  }
559  return length;
560 }
561 
562 utf8_constexpr14_impl int utf8ncasecmp(const utf8_int8_t *src1,
563  const utf8_int8_t *src2, size_t n) {
564  utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
565  src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
566 
567  do {
568  const utf8_int8_t *const s1 = src1;
569  const utf8_int8_t *const s2 = src2;
570 
571  /* first check that we have enough bytes left in n to contain an entire
572  * codepoint */
573  if (0 == n) {
574  return 0;
575  }
576 
577  if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
578  const utf8_int32_t c1 = (0xe0 & *s1);
579  const utf8_int32_t c2 = (0xe0 & *s2);
580 
581  if (c1 < c2) {
582  return c1 - c2;
583  } else {
584  return 0;
585  }
586  }
587 
588  if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
589  const utf8_int32_t c1 = (0xf0 & *s1);
590  const utf8_int32_t c2 = (0xf0 & *s2);
591 
592  if (c1 < c2) {
593  return c1 - c2;
594  } else {
595  return 0;
596  }
597  }
598 
599  if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
600  const utf8_int32_t c1 = (0xf8 & *s1);
601  const utf8_int32_t c2 = (0xf8 & *s2);
602 
603  if (c1 < c2) {
604  return c1 - c2;
605  } else {
606  return 0;
607  }
608  }
609 
610  src1 = utf8codepoint(src1, &src1_orig_cp);
611  src2 = utf8codepoint(src2, &src2_orig_cp);
612  n -= utf8codepointsize(src1_orig_cp);
613 
614  src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
615  src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
616 
617  src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
618  src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
619 
620  /* check if the lowered codepoints match */
621  if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
622  return 0;
623  } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
624  continue;
625  }
626 
627  /* if they don't match, then we return the difference between the characters
628  */
629  return src1_lwr_cp - src2_lwr_cp;
630  } while (0 < n);
631 
632  /* both utf8 strings matched */
633  return 0;
634 }
635 
636 utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
637  const utf8_int8_t *utf8_restrict src, size_t n) {
638  utf8_int8_t *d = dst;
639 
640  /* find the null terminating byte in dst */
641  while ('\0' != *d) {
642  d++;
643  }
644 
645  /* overwriting the null terminating byte in dst, append src byte-by-byte
646  * stopping if we run out of space */
647  while (('\0' != *src) && (0 != n--)) {
648  *d++ = *src++;
649  }
650 
651  /* write out a new null terminating byte into dst */
652  *d = '\0';
653 
654  return dst;
655 }
656 
657 utf8_constexpr14_impl int utf8ncmp(const utf8_int8_t *src1,
658  const utf8_int8_t *src2, size_t n) {
659  while ((0 != n--) && (('\0' != *src1) || ('\0' != *src2))) {
660  if (*src1 < *src2) {
661  return -1;
662  } else if (*src1 > *src2) {
663  return 1;
664  }
665 
666  src1++;
667  src2++;
668  }
669 
670  /* both utf8 strings matched */
671  return 0;
672 }
673 
674 utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
675  const utf8_int8_t *utf8_restrict src, size_t n) {
676  utf8_int8_t *d = dst;
677  size_t index = 0, check_index = 0;
678 
679  if (n == 0) {
680  return dst;
681  }
682 
683  /* overwriting anything previously in dst, write byte-by-byte
684  * from src */
685  for (index = 0; index < n; index++) {
686  d[index] = src[index];
687  if ('\0' == src[index]) {
688  break;
689  }
690  }
691 
692  for (check_index = index - 1;
693  check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
694  /* just moving the index */
695  }
696 
697  if (check_index < index &&
698  ((index - check_index) < utf8codepointcalcsize(&d[check_index]) ||
699  (index - check_index) == n)) {
700  index = check_index;
701  }
702 
703  /* append null terminating byte */
704  for (; index < n; index++) {
705  d[index] = 0;
706  }
707 
708  return dst;
709 }
710 
711 utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n) {
712  return utf8ndup_ex(src, n, utf8_null, utf8_null);
713 }
714 
715 utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
716  utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
717  utf8_int8_t *user_data) {
718  utf8_int8_t *c = utf8_null;
719  size_t bytes = 0;
720 
721  /* Find the end of the string or stop when n is reached */
722  while ('\0' != src[bytes] && bytes < n) {
723  bytes++;
724  }
725 
726  /* In case bytes is actually less than n, we need to set it
727  * to be used later in the copy byte by byte. */
728  n = bytes;
729 
730  if (alloc_func_ptr) {
731  c = alloc_func_ptr(user_data, bytes + 1);
732  } else {
733 #if !defined(UTF8_NO_STD_MALLOC)
734  c = (utf8_int8_t *)malloc(bytes + 1);
735 #else
736  c = utf8_null;
737 #endif
738  }
739 
740  if (utf8_null == c) {
741  /* out of memory so we bail */
742  return utf8_null;
743  }
744 
745  bytes = 0;
746 
747  /* copy src byte-by-byte into our new utf8 string */
748  while ('\0' != src[bytes] && bytes < n) {
749  c[bytes] = src[bytes];
750  bytes++;
751  }
752 
753  /* append null terminating byte */
754  c[bytes] = '\0';
755  return c;
756 }
757 
758 utf8_constexpr14_impl utf8_int8_t *utf8rchr(const utf8_int8_t *src, int chr) {
759 
760  utf8_int8_t *match = utf8_null;
761  utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
762 
763  if (0 == chr) {
764  /* being asked to return position of null terminating byte, so
765  * just run s to the end, and return! */
766  while ('\0' != *src) {
767  src++;
768  }
769  return (utf8_int8_t *)src;
770  } else if (0 == ((int)0xffffff80 & chr)) {
771  /* 1-byte/7-bit ascii
772  * (0b0xxxxxxx) */
773  c[0] = (utf8_int8_t)chr;
774  } else if (0 == ((int)0xfffff800 & chr)) {
775  /* 2-byte/11-bit utf8 code point
776  * (0b110xxxxx 0b10xxxxxx) */
777  c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
778  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
779  } else if (0 == ((int)0xffff0000 & chr)) {
780  /* 3-byte/16-bit utf8 code point
781  * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
782  c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
783  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
784  c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
785  } else { /* if (0 == ((int)0xffe00000 & chr)) { */
786  /* 4-byte/21-bit utf8 code point
787  * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
788  c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
789  c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
790  c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
791  c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
792  }
793 
794  /* we've created a 2 utf8 codepoint string in c that is
795  * the utf8 character asked for by chr, and a null
796  * terminating byte */
797 
798  while ('\0' != *src) {
799  size_t offset = 0;
800 
801  while ((src[offset] == c[offset]) && ('\0' != src[offset])) {
802  offset++;
803  }
804 
805  if ('\0' == c[offset]) {
806  /* we found a matching utf8 code point */
807  match = (utf8_int8_t *)src;
808  src += offset;
809 
810  if ('\0' == *src) {
811  break;
812  }
813  } else {
814  src += offset;
815 
816  /* need to march s along to next utf8 codepoint start
817  * (the next byte that doesn't match 0b10xxxxxx) */
818  if ('\0' != *src) {
819  do {
820  src++;
821  } while (0x80 == (0xc0 & *src));
822  }
823  }
824  }
825 
826  /* return the last match we found (or 0 if no match was found) */
827  return match;
828 }
829 
830 utf8_constexpr14_impl utf8_int8_t *utf8pbrk(const utf8_int8_t *str,
831  const utf8_int8_t *accept) {
832  while ('\0' != *str) {
833  const utf8_int8_t *a = accept;
834  size_t offset = 0;
835 
836  while ('\0' != *a) {
837  /* checking that if *a is the start of a utf8 codepoint
838  * (it is not 0b10xxxxxx) and we have successfully matched
839  * a previous character (0 < offset) - we found a match */
840  if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
841  return (utf8_int8_t *)str;
842  } else {
843  if (*a == str[offset]) {
844  /* part of a utf8 codepoint matched, so move our checking
845  * onwards to the next byte */
846  offset++;
847  a++;
848  } else {
849  /* r could be in the middle of an unmatching utf8 code point,
850  * so we need to march it on to the next character beginning, */
851 
852  do {
853  a++;
854  } while (0x80 == (0xc0 & *a));
855 
856  /* reset offset too as we found a mismatch */
857  offset = 0;
858  }
859  }
860  }
861 
862  /* we found a match on the last utf8 codepoint */
863  if (0 < offset) {
864  return (utf8_int8_t *)str;
865  }
866 
867  /* the current utf8 codepoint in src did not match accept, but src
868  * could have been partway through a utf8 codepoint, so we need to
869  * march it onto the next utf8 codepoint starting byte */
870  do {
871  str++;
872  } while ((0x80 == (0xc0 & *str)));
873  }
874 
875  return utf8_null;
876 }
877 
878 utf8_constexpr14_impl size_t utf8size(const utf8_int8_t *str) {
879  return utf8size_lazy(str) + 1;
880 }
881 
882 utf8_constexpr14_impl size_t utf8size_lazy(const utf8_int8_t *str) {
883  return utf8nsize_lazy(str, SIZE_MAX);
884 }
885 
886 utf8_constexpr14_impl size_t utf8nsize_lazy(const utf8_int8_t *str, size_t n) {
887  size_t size = 0;
888  while (size < n && '\0' != str[size]) {
889  size++;
890  }
891  return size;
892 }
893 
894 utf8_constexpr14_impl size_t utf8spn(const utf8_int8_t *src,
895  const utf8_int8_t *accept) {
896  size_t chars = 0;
897 
898  while ('\0' != *src) {
899  const utf8_int8_t *a = accept;
900  size_t offset = 0;
901 
902  while ('\0' != *a) {
903  /* checking that if *r is the start of a utf8 codepoint
904  * (it is not 0b10xxxxxx) and we have successfully matched
905  * a previous character (0 < offset) - we found a match */
906  if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
907  /* found a match, so increment the number of utf8 codepoints
908  * that have matched and stop checking whether any other utf8
909  * codepoints in a match */
910  chars++;
911  src += offset;
912  offset = 0;
913  break;
914  } else {
915  if (*a == src[offset]) {
916  offset++;
917  a++;
918  } else {
919  /* a could be in the middle of an unmatching utf8 codepoint,
920  * so we need to march it on to the next character beginning, */
921  do {
922  a++;
923  } while (0x80 == (0xc0 & *a));
924 
925  /* reset offset too as we found a mismatch */
926  offset = 0;
927  }
928  }
929  }
930 
931  /* found a match at the end of *a, so didn't get a chance to test it */
932  if (0 < offset) {
933  chars++;
934  src += offset;
935  continue;
936  }
937 
938  /* if a got to its terminating null byte, then we didn't find a match.
939  * Return the current number of matched utf8 codepoints */
940  if ('\0' == *a) {
941  return chars;
942  }
943  }
944 
945  return chars;
946 }
947 
948 utf8_constexpr14_impl utf8_int8_t *utf8str(const utf8_int8_t *haystack,
949  const utf8_int8_t *needle) {
950  utf8_int32_t throwaway_codepoint = 0;
951 
952  /* if needle has no utf8 codepoints before the null terminating
953  * byte then return haystack */
954  if ('\0' == *needle) {
955  return (utf8_int8_t *)haystack;
956  }
957 
958  while ('\0' != *haystack) {
959  const utf8_int8_t *maybeMatch = haystack;
960  const utf8_int8_t *n = needle;
961 
962  while (*haystack == *n && (*haystack != '\0' && *n != '\0')) {
963  n++;
964  haystack++;
965  }
966 
967  if ('\0' == *n) {
968  /* we found the whole utf8 string for needle in haystack at
969  * maybeMatch, so return it */
970  return (utf8_int8_t *)maybeMatch;
971  } else {
972  /* h could be in the middle of an unmatching utf8 codepoint,
973  * so we need to march it on to the next character beginning
974  * starting from the current character */
975  haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
976  }
977  }
978 
979  /* no match */
980  return utf8_null;
981 }
982 
983 utf8_constexpr14_impl utf8_int8_t *utf8casestr(const utf8_int8_t *haystack,
984  const utf8_int8_t *needle) {
985  /* if needle has no utf8 codepoints before the null terminating
986  * byte then return haystack */
987  if ('\0' == *needle) {
988  return (utf8_int8_t *)haystack;
989  }
990 
991  for (;;) {
992  const utf8_int8_t *maybeMatch = haystack;
993  const utf8_int8_t *n = needle;
994  utf8_int32_t h_cp = 0, n_cp = 0;
995 
996  /* Get the next code point and track it */
997  const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
998  n = utf8codepoint(n, &n_cp);
999 
1000  while ((0 != h_cp) && (0 != n_cp)) {
1001  h_cp = utf8lwrcodepoint(h_cp);
1002  n_cp = utf8lwrcodepoint(n_cp);
1003 
1004  /* if we find a mismatch, bail out! */
1005  if (h_cp != n_cp) {
1006  break;
1007  }
1008 
1009  haystack = utf8codepoint(haystack, &h_cp);
1010  n = utf8codepoint(n, &n_cp);
1011  }
1012 
1013  if (0 == n_cp) {
1014  /* we found the whole utf8 string for needle in haystack at
1015  * maybeMatch, so return it */
1016  return (utf8_int8_t *)maybeMatch;
1017  }
1018 
1019  if (0 == h_cp) {
1020  /* no match */
1021  return utf8_null;
1022  }
1023 
1024  /* Roll back to the next code point in the haystack to test */
1025  haystack = nextH;
1026  }
1027 }
1028 
1029 utf8_constexpr14_impl utf8_int8_t *utf8valid(const utf8_int8_t *str) {
1030  return utf8nvalid(str, SIZE_MAX);
1031 }
1032 
1033 utf8_constexpr14_impl utf8_int8_t *utf8nvalid(const utf8_int8_t *str,
1034  size_t n) {
1035  const utf8_int8_t *t = str;
1036  size_t consumed = 0;
1037 
1038  while ((void)(consumed = (size_t)(str - t)), consumed < n && '\0' != *str) {
1039  const size_t remaining = n - consumed;
1040 
1041  if (0xf0 == (0xf8 & *str)) {
1042  /* ensure that there's 4 bytes or more remaining */
1043  if (remaining < 4) {
1044  return (utf8_int8_t *)str;
1045  }
1046 
1047  /* ensure each of the 3 following bytes in this 4-byte
1048  * utf8 codepoint began with 0b10xxxxxx */
1049  if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
1050  (0x80 != (0xc0 & str[3]))) {
1051  return (utf8_int8_t *)str;
1052  }
1053 
1054  /* ensure that our utf8 codepoint ended after 4 bytes */
1055  if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
1056  return (utf8_int8_t *)str;
1057  }
1058 
1059  /* ensure that the top 5 bits of this 4-byte utf8
1060  * codepoint were not 0, as then we could have used
1061  * one of the smaller encodings */
1062  if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
1063  return (utf8_int8_t *)str;
1064  }
1065 
1066  /* 4-byte utf8 code point (began with 0b11110xxx) */
1067  str += 4;
1068  } else if (0xe0 == (0xf0 & *str)) {
1069  /* ensure that there's 3 bytes or more remaining */
1070  if (remaining < 3) {
1071  return (utf8_int8_t *)str;
1072  }
1073 
1074  /* ensure each of the 2 following bytes in this 3-byte
1075  * utf8 codepoint began with 0b10xxxxxx */
1076  if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
1077  return (utf8_int8_t *)str;
1078  }
1079 
1080  /* ensure that our utf8 codepoint ended after 3 bytes */
1081  if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
1082  return (utf8_int8_t *)str;
1083  }
1084 
1085  /* ensure that the top 5 bits of this 3-byte utf8
1086  * codepoint were not 0, as then we could have used
1087  * one of the smaller encodings */
1088  if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
1089  return (utf8_int8_t *)str;
1090  }
1091 
1092  /* 3-byte utf8 code point (began with 0b1110xxxx) */
1093  str += 3;
1094  } else if (0xc0 == (0xe0 & *str)) {
1095  /* ensure that there's 2 bytes or more remaining */
1096  if (remaining < 2) {
1097  return (utf8_int8_t *)str;
1098  }
1099 
1100  /* ensure the 1 following byte in this 2-byte
1101  * utf8 codepoint began with 0b10xxxxxx */
1102  if (0x80 != (0xc0 & str[1])) {
1103  return (utf8_int8_t *)str;
1104  }
1105 
1106  /* ensure that our utf8 codepoint ended after 2 bytes */
1107  if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
1108  return (utf8_int8_t *)str;
1109  }
1110 
1111  /* ensure that the top 4 bits of this 2-byte utf8
1112  * codepoint were not 0, as then we could have used
1113  * one of the smaller encodings */
1114  if (0 == (0x1e & str[0])) {
1115  return (utf8_int8_t *)str;
1116  }
1117 
1118  /* 2-byte utf8 code point (began with 0b110xxxxx) */
1119  str += 2;
1120  } else if (0x00 == (0x80 & *str)) {
1121  /* 1-byte ascii (began with 0b0xxxxxxx) */
1122  str += 1;
1123  } else {
1124  /* we have an invalid 0b1xxxxxxx utf8 code point entry */
1125  return (utf8_int8_t *)str;
1126  }
1127  }
1128 
1129  return utf8_null;
1130 }
1131 
1132 int utf8makevalid(utf8_int8_t *str, const utf8_int32_t replacement) {
1133  utf8_int8_t *read = str;
1134  utf8_int8_t *write = read;
1135  const utf8_int8_t r = (utf8_int8_t)replacement;
1136  utf8_int32_t codepoint = 0;
1137 
1138  if (replacement > 0x7f) {
1139  return -1;
1140  }
1141 
1142  while ('\0' != *read) {
1143  if (0xf0 == (0xf8 & *read)) {
1144  /* ensure each of the 3 following bytes in this 4-byte
1145  * utf8 codepoint began with 0b10xxxxxx */
1146  if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
1147  (0x80 != (0xc0 & read[3]))) {
1148  *write++ = r;
1149  read++;
1150  continue;
1151  }
1152 
1153  /* 4-byte utf8 code point (began with 0b11110xxx) */
1154  read = utf8codepoint(read, &codepoint);
1155  write = utf8catcodepoint(write, codepoint, 4);
1156  } else if (0xe0 == (0xf0 & *read)) {
1157  /* ensure each of the 2 following bytes in this 3-byte
1158  * utf8 codepoint began with 0b10xxxxxx */
1159  if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
1160  *write++ = r;
1161  read++;
1162  continue;
1163  }
1164 
1165  /* 3-byte utf8 code point (began with 0b1110xxxx) */
1166  read = utf8codepoint(read, &codepoint);
1167  write = utf8catcodepoint(write, codepoint, 3);
1168  } else if (0xc0 == (0xe0 & *read)) {
1169  /* ensure the 1 following byte in this 2-byte
1170  * utf8 codepoint began with 0b10xxxxxx */
1171  if (0x80 != (0xc0 & read[1])) {
1172  *write++ = r;
1173  read++;
1174  continue;
1175  }
1176 
1177  /* 2-byte utf8 code point (began with 0b110xxxxx) */
1178  read = utf8codepoint(read, &codepoint);
1179  write = utf8catcodepoint(write, codepoint, 2);
1180  } else if (0x00 == (0x80 & *read)) {
1181  /* 1-byte ascii (began with 0b0xxxxxxx) */
1182  read = utf8codepoint(read, &codepoint);
1183  write = utf8catcodepoint(write, codepoint, 1);
1184  } else {
1185  /* if we got here then we've got a dangling continuation (0b10xxxxxx) */
1186  *write++ = r;
1187  read++;
1188  continue;
1189  }
1190  }
1191 
1192  *write = '\0';
1193 
1194  return 0;
1195 }
1196 #endif
1197 
1198 utf8_constexpr14_impl utf8_int8_t *
1199 utf8codepoint(const utf8_int8_t *utf8_restrict str,
1200  utf8_int32_t *utf8_restrict out_codepoint) {
1201  if (0xf0 == (0xf8 & str[0])) {
1202  /* 4 byte utf8 codepoint */
1203  *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
1204  ((0x3f & str[2]) << 6) | (0x3f & str[3]);
1205  str += 4;
1206  } else if (0xe0 == (0xf0 & str[0])) {
1207  /* 3 byte utf8 codepoint */
1208  *out_codepoint =
1209  ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
1210  str += 3;
1211  } else if (0xc0 == (0xe0 & str[0])) {
1212  /* 2 byte utf8 codepoint */
1213  *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
1214  str += 2;
1215  } else {
1216  /* 1 byte utf8 codepoint otherwise */
1217  *out_codepoint = str[0];
1218  str += 1;
1219  }
1220 
1221  return const_cast<utf8_int8_t *>(str);
1222 }
1223 
1224 utf8_constexpr14_impl size_t utf8codepointcalcsize(const utf8_int8_t *str) {
1225  if (0xf0 == (0xf8 & str[0])) {
1226  /* 4 byte utf8 codepoint */
1227  return 4;
1228  } else if (0xe0 == (0xf0 & str[0])) {
1229  /* 3 byte utf8 codepoint */
1230  return 3;
1231  } else if (0xc0 == (0xe0 & str[0])) {
1232  /* 2 byte utf8 codepoint */
1233  return 2;
1234  }
1235 
1236  /* 1 byte utf8 codepoint otherwise */
1237  return 1;
1238 }
1239 
1240 #if 0
1241 utf8_constexpr14_impl size_t utf8codepointsize(utf8_int32_t chr) {
1242  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1243  return 1;
1244  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1245  return 2;
1246  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
1247  return 3;
1248  } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1249  return 4;
1250  }
1251 }
1252 
1253 utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n) {
1254  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1255  /* 1-byte/7-bit ascii
1256  * (0b0xxxxxxx) */
1257  if (n < 1) {
1258  return utf8_null;
1259  }
1260  str[0] = (utf8_int8_t)chr;
1261  str += 1;
1262  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1263  /* 2-byte/11-bit utf8 code point
1264  * (0b110xxxxx 0b10xxxxxx) */
1265  if (n < 2) {
1266  return utf8_null;
1267  }
1268  str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
1269  str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1270  str += 2;
1271  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
1272  /* 3-byte/16-bit utf8 code point
1273  * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
1274  if (n < 3) {
1275  return utf8_null;
1276  }
1277  str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
1278  str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1279  str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1280  str += 3;
1281  } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1282  /* 4-byte/21-bit utf8 code point
1283  * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
1284  if (n < 4) {
1285  return utf8_null;
1286  }
1287  str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
1288  str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
1289  str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1290  str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1291  str += 4;
1292  }
1293 
1294  return str;
1295 }
1296 
1297 utf8_constexpr14_impl int utf8islower(utf8_int32_t chr) {
1298  return chr != utf8uprcodepoint(chr);
1299 }
1300 
1301 utf8_constexpr14_impl int utf8isupper(utf8_int32_t chr) {
1302  return chr != utf8lwrcodepoint(chr);
1303 }
1304 
1305 void utf8lwr(utf8_int8_t *utf8_restrict str) {
1306  utf8_int32_t cp = 0;
1307  utf8_int8_t *pn = utf8codepoint(str, &cp);
1308 
1309  while (cp != 0) {
1310  const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
1311  const size_t size = utf8codepointsize(lwr_cp);
1312 
1313  if (lwr_cp != cp) {
1314  utf8catcodepoint(str, lwr_cp, size);
1315  }
1316 
1317  str = pn;
1318  pn = utf8codepoint(str, &cp);
1319  }
1320 }
1321 
1322 void utf8upr(utf8_int8_t *utf8_restrict str) {
1323  utf8_int32_t cp = 0;
1324  utf8_int8_t *pn = utf8codepoint(str, &cp);
1325 
1326  while (cp != 0) {
1327  const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
1328  const size_t size = utf8codepointsize(lwr_cp);
1329 
1330  if (lwr_cp != cp) {
1331  utf8catcodepoint(str, lwr_cp, size);
1332  }
1333 
1334  str = pn;
1335  pn = utf8codepoint(str, &cp);
1336  }
1337 }
1338 #endif
1339 
1340 utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1341  if (((0x0041 <= cp) && (0x005a >= cp)) ||
1342  ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
1343  ((0x00d8 <= cp) && (0x00de >= cp)) ||
1344  ((0x0391 <= cp) && (0x03a1 >= cp)) ||
1345  ((0x03a3 <= cp) && (0x03ab >= cp)) ||
1346  ((0x0410 <= cp) && (0x042f >= cp))) {
1347  cp += 32;
1348  } else if ((0x0400 <= cp) && (0x040f >= cp)) {
1349  cp += 80;
1350  } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1351  ((0x0132 <= cp) && (0x0137 >= cp)) ||
1352  ((0x014a <= cp) && (0x0177 >= cp)) ||
1353  ((0x0182 <= cp) && (0x0185 >= cp)) ||
1354  ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1355  ((0x01de <= cp) && (0x01ef >= cp)) ||
1356  ((0x01f8 <= cp) && (0x021f >= cp)) ||
1357  ((0x0222 <= cp) && (0x0233 >= cp)) ||
1358  ((0x0246 <= cp) && (0x024f >= cp)) ||
1359  ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1360  ((0x0460 <= cp) && (0x0481 >= cp)) ||
1361  ((0x048a <= cp) && (0x04ff >= cp))) {
1362  cp |= 0x1;
1363  } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1364  ((0x0179 <= cp) && (0x017e >= cp)) ||
1365  ((0x01af <= cp) && (0x01b0 >= cp)) ||
1366  ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1367  ((0x01cd <= cp) && (0x01dc >= cp))) {
1368  cp += 1;
1369  cp &= ~0x1;
1370  } else {
1371  switch (cp) {
1372  default:
1373  break;
1374  case 0x0178:
1375  cp = 0x00ff;
1376  break;
1377  case 0x0243:
1378  cp = 0x0180;
1379  break;
1380  case 0x018e:
1381  cp = 0x01dd;
1382  break;
1383  case 0x023d:
1384  cp = 0x019a;
1385  break;
1386  case 0x0220:
1387  cp = 0x019e;
1388  break;
1389  case 0x01b7:
1390  cp = 0x0292;
1391  break;
1392  case 0x01c4:
1393  cp = 0x01c6;
1394  break;
1395  case 0x01c7:
1396  cp = 0x01c9;
1397  break;
1398  case 0x01ca:
1399  cp = 0x01cc;
1400  break;
1401  case 0x01f1:
1402  cp = 0x01f3;
1403  break;
1404  case 0x01f7:
1405  cp = 0x01bf;
1406  break;
1407  case 0x0187:
1408  cp = 0x0188;
1409  break;
1410  case 0x018b:
1411  cp = 0x018c;
1412  break;
1413  case 0x0191:
1414  cp = 0x0192;
1415  break;
1416  case 0x0198:
1417  cp = 0x0199;
1418  break;
1419  case 0x01a7:
1420  cp = 0x01a8;
1421  break;
1422  case 0x01ac:
1423  cp = 0x01ad;
1424  break;
1425  case 0x01b8:
1426  cp = 0x01b9;
1427  break;
1428  case 0x01bc:
1429  cp = 0x01bd;
1430  break;
1431  case 0x01f4:
1432  cp = 0x01f5;
1433  break;
1434  case 0x023b:
1435  cp = 0x023c;
1436  break;
1437  case 0x0241:
1438  cp = 0x0242;
1439  break;
1440  case 0x03fd:
1441  cp = 0x037b;
1442  break;
1443  case 0x03fe:
1444  cp = 0x037c;
1445  break;
1446  case 0x03ff:
1447  cp = 0x037d;
1448  break;
1449  case 0x037f:
1450  cp = 0x03f3;
1451  break;
1452  case 0x0386:
1453  cp = 0x03ac;
1454  break;
1455  case 0x0388:
1456  cp = 0x03ad;
1457  break;
1458  case 0x0389:
1459  cp = 0x03ae;
1460  break;
1461  case 0x038a:
1462  cp = 0x03af;
1463  break;
1464  case 0x038c:
1465  cp = 0x03cc;
1466  break;
1467  case 0x038e:
1468  cp = 0x03cd;
1469  break;
1470  case 0x038f:
1471  cp = 0x03ce;
1472  break;
1473  case 0x0370:
1474  cp = 0x0371;
1475  break;
1476  case 0x0372:
1477  cp = 0x0373;
1478  break;
1479  case 0x0376:
1480  cp = 0x0377;
1481  break;
1482  case 0x03f4:
1483  cp = 0x03b8;
1484  break;
1485  case 0x03cf:
1486  cp = 0x03d7;
1487  break;
1488  case 0x03f9:
1489  cp = 0x03f2;
1490  break;
1491  case 0x03f7:
1492  cp = 0x03f8;
1493  break;
1494  case 0x03fa:
1495  cp = 0x03fb;
1496  break;
1497  }
1498  }
1499 
1500  return cp;
1501 }
1502 
1503 utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1504  if (((0x0061 <= cp) && (0x007a >= cp)) ||
1505  ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
1506  ((0x00f8 <= cp) && (0x00fe >= cp)) ||
1507  ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
1508  ((0x03c3 <= cp) && (0x03cb >= cp)) ||
1509  ((0x0430 <= cp) && (0x044f >= cp))) {
1510  cp -= 32;
1511  } else if ((0x0450 <= cp) && (0x045f >= cp)) {
1512  cp -= 80;
1513  } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1514  ((0x0132 <= cp) && (0x0137 >= cp)) ||
1515  ((0x014a <= cp) && (0x0177 >= cp)) ||
1516  ((0x0182 <= cp) && (0x0185 >= cp)) ||
1517  ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1518  ((0x01de <= cp) && (0x01ef >= cp)) ||
1519  ((0x01f8 <= cp) && (0x021f >= cp)) ||
1520  ((0x0222 <= cp) && (0x0233 >= cp)) ||
1521  ((0x0246 <= cp) && (0x024f >= cp)) ||
1522  ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1523  ((0x0460 <= cp) && (0x0481 >= cp)) ||
1524  ((0x048a <= cp) && (0x04ff >= cp))) {
1525  cp &= ~0x1;
1526  } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1527  ((0x0179 <= cp) && (0x017e >= cp)) ||
1528  ((0x01af <= cp) && (0x01b0 >= cp)) ||
1529  ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1530  ((0x01cd <= cp) && (0x01dc >= cp))) {
1531  cp -= 1;
1532  cp |= 0x1;
1533  } else {
1534  switch (cp) {
1535  default:
1536  break;
1537  case 0x00ff:
1538  cp = 0x0178;
1539  break;
1540  case 0x0180:
1541  cp = 0x0243;
1542  break;
1543  case 0x01dd:
1544  cp = 0x018e;
1545  break;
1546  case 0x019a:
1547  cp = 0x023d;
1548  break;
1549  case 0x019e:
1550  cp = 0x0220;
1551  break;
1552  case 0x0292:
1553  cp = 0x01b7;
1554  break;
1555  case 0x01c6:
1556  cp = 0x01c4;
1557  break;
1558  case 0x01c9:
1559  cp = 0x01c7;
1560  break;
1561  case 0x01cc:
1562  cp = 0x01ca;
1563  break;
1564  case 0x01f3:
1565  cp = 0x01f1;
1566  break;
1567  case 0x01bf:
1568  cp = 0x01f7;
1569  break;
1570  case 0x0188:
1571  cp = 0x0187;
1572  break;
1573  case 0x018c:
1574  cp = 0x018b;
1575  break;
1576  case 0x0192:
1577  cp = 0x0191;
1578  break;
1579  case 0x0199:
1580  cp = 0x0198;
1581  break;
1582  case 0x01a8:
1583  cp = 0x01a7;
1584  break;
1585  case 0x01ad:
1586  cp = 0x01ac;
1587  break;
1588  case 0x01b9:
1589  cp = 0x01b8;
1590  break;
1591  case 0x01bd:
1592  cp = 0x01bc;
1593  break;
1594  case 0x01f5:
1595  cp = 0x01f4;
1596  break;
1597  case 0x023c:
1598  cp = 0x023b;
1599  break;
1600  case 0x0242:
1601  cp = 0x0241;
1602  break;
1603  case 0x037b:
1604  cp = 0x03fd;
1605  break;
1606  case 0x037c:
1607  cp = 0x03fe;
1608  break;
1609  case 0x037d:
1610  cp = 0x03ff;
1611  break;
1612  case 0x03f3:
1613  cp = 0x037f;
1614  break;
1615  case 0x03ac:
1616  cp = 0x0386;
1617  break;
1618  case 0x03ad:
1619  cp = 0x0388;
1620  break;
1621  case 0x03ae:
1622  cp = 0x0389;
1623  break;
1624  case 0x03af:
1625  cp = 0x038a;
1626  break;
1627  case 0x03cc:
1628  cp = 0x038c;
1629  break;
1630  case 0x03cd:
1631  cp = 0x038e;
1632  break;
1633  case 0x03ce:
1634  cp = 0x038f;
1635  break;
1636  case 0x0371:
1637  cp = 0x0370;
1638  break;
1639  case 0x0373:
1640  cp = 0x0372;
1641  break;
1642  case 0x0377:
1643  cp = 0x0376;
1644  break;
1645  case 0x03d1:
1646  cp = 0x0398;
1647  break;
1648  case 0x03d7:
1649  cp = 0x03cf;
1650  break;
1651  case 0x03f2:
1652  cp = 0x03f9;
1653  break;
1654  case 0x03f8:
1655  cp = 0x03f7;
1656  break;
1657  case 0x03fb:
1658  cp = 0x03fa;
1659  break;
1660  }
1661  }
1662 
1663  return cp;
1664 }
1665 
1666 utf8_constexpr14_impl utf8_int8_t *
1667 utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
1668  utf8_int32_t *utf8_restrict out_codepoint) {
1669  const utf8_int8_t *s = static_cast<const utf8_int8_t *>(str);
1670 
1671  if (0xf0 == (0xf8 & s[0])) {
1672  /* 4 byte utf8 codepoint */
1673  *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
1674  ((0x3f & s[2]) << 6) | (0x3f & s[3]);
1675  } else if (0xe0 == (0xf0 & s[0])) {
1676  /* 3 byte utf8 codepoint */
1677  *out_codepoint =
1678  ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
1679  } else if (0xc0 == (0xe0 & s[0])) {
1680  /* 2 byte utf8 codepoint */
1681  *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
1682  } else {
1683  /* 1 byte utf8 codepoint otherwise */
1684  *out_codepoint = s[0];
1685  }
1686 
1687  do {
1688  s--;
1689  } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
1690 
1691  return const_cast<utf8_int8_t *>(s);
1692 }
1693 
1694 #undef utf8_restrict
1695 #undef utf8_constexpr14
1696 #undef utf8_null
1697 
1698 } // namespace
1699 
1700 #if defined(__clang__)
1701 #pragma clang diagnostic pop
1702 #endif
1703 
1704 #endif /* SHEREDOM_UTF8_H_INCLUDED */