/* Include guard for this header. */
32 #ifndef SHEREDOM_UTF8_H_INCLUDED
33 #define SHEREDOM_UTF8_H_INCLUDED
/* Compiler warning suppressions (warnings 4255, 4668 and 4820). Any
 * conditional that guards them is not visible in this excerpt. */
39 #pragma warning(disable : 4255)
43 #pragma warning(disable : 4668)
46 #pragma warning(disable : 4820)
/* 32-bit code point type: __int32 when _MSC_VER < 1920. The int32_t typedef
 * is presumably the fallback branch (the #else/#endif lines are not visible
 * here -- confirm against the full file). */
56 #if defined(_MSC_VER) && (_MSC_VER < 1920)
57 typedef __int32 utf8_int32_t;
60 typedef int32_t utf8_int32_t;
/* clang only: save the diagnostic state and silence cast-related warnings,
 * plus -Wunsafe-buffer-usage when the compiler knows that warning. */
63 #if defined(__clang__)
64 #pragma clang diagnostic push
65 #pragma clang diagnostic ignored "-Wold-style-cast"
66 #pragma clang diagnostic ignored "-Wcast-qual"
68 #if __has_warning("-Wunsafe-buffer-usage")
69 #pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
/* Qualifier/attribute macros. The first pair belongs to a branch whose
 * opening #if is not visible (presumably the MSVC branch); the clang/GCC
 * branch maps the macros onto __attribute__ spellings. */
78 #define utf8_restrict __restrict
79 #define utf8_weak __inline
80 #elif defined(__clang__) || defined(__GNUC__)
81 #define utf8_nonnull __attribute__((nonnull))
82 #define utf8_pure __attribute__((pure))
83 #define utf8_restrict __restrict__
84 #define utf8_weak __attribute__((weak))
/* Null pointer constant used by the functions below. */
93 #define utf8_null NULL
/* With C++14 or later both the declaration macro and the definition macro
 * expand to constexpr. The other two defines (utf8_weak / empty) are
 * presumably the fallback branch; its #else is not visible here. */
98 #if (defined(__cplusplus) && __cplusplus >= 201402L)
99 #define utf8_constexpr14 constexpr
100 #define utf8_constexpr14_impl constexpr
103 #define utf8_constexpr14 utf8_weak
104 #define utf8_constexpr14_impl
/* Byte type of a UTF-8 string: char8_t under C++20, and (presumably in the
 * fallback branch) plain char. */
107 #if defined(__cplusplus) && __cplusplus >= 202002L
108 using utf8_int8_t = char8_t;
110 typedef char utf8_int8_t;
/* Case-insensitive comparison of the UTF-8 strings src1 and src2. The
 * definition in this file compares lower- and upper-cased code points.
 * Returns an int; presumably <0 / 0 / >0 like strcasecmp -- TODO confirm. */
116 utf8_constexpr14 utf8_nonnull utf8_pure
int
117 utf8casecmp(
const utf8_int8_t *src1,
const utf8_int8_t *src2);
/* Concatenation of src onto dst; both pointers are restrict-qualified and
 * declared non-null. Presumably returns dst (strcat analogue) -- TODO
 * confirm, the return statement is not visible in this excerpt. */
120 utf8_nonnull utf8_weak utf8_int8_t *
121 utf8cat(utf8_int8_t *utf8_restrict dst,
const utf8_int8_t *utf8_restrict src);
/* Searches src for the code point chr. The definition in this file encodes
 * chr as UTF-8 bytes and delegates the search to utf8str. */
124 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
125 utf8chr(
const utf8_int8_t *src, utf8_int32_t chr);
/* Byte-wise comparison of src1 and src2. Returns an int; presumably
 * <0 / 0 / >0 like strcmp -- TODO confirm against the full definition. */
129 utf8_constexpr14 utf8_nonnull utf8_pure
int utf8cmp(
const utf8_int8_t *src1,
130 const utf8_int8_t *src2);
/* Copies src into dst; both pointers are restrict-qualified and declared
 * non-null. Presumably returns dst (strcpy analogue) -- TODO confirm. */
133 utf8_nonnull utf8_weak utf8_int8_t *
134 utf8cpy(utf8_int8_t *utf8_restrict dst,
const utf8_int8_t *utf8_restrict src);
/* Scans src against the code points listed in reject and returns a size_t.
 * Presumably the number of leading code points of src that are not in
 * reject (strcspn analogue) -- TODO confirm against the full definition. */
138 utf8_constexpr14 utf8_nonnull utf8_pure
size_t
139 utf8cspn(
const utf8_int8_t *src,
const utf8_int8_t *reject);
/* Duplicates src. The definition in this file forwards to utf8dup_ex with a
 * null allocator and null user data. */
143 utf8_weak utf8_int8_t *utf8dup(
const utf8_int8_t *src);
/* Length of str. Defined in this file as utf8nlen(str, SIZE_MAX), i.e. the
 * bounded variant with the largest possible bound. */
147 utf8_constexpr14 utf8_nonnull utf8_pure
size_t utf8len(
const utf8_int8_t *str);
/* Bounded length of str.
 * NOTE(review): the remainder of this prototype (the bound parameter, named
 * n in the definition) is not visible in this excerpt. */
150 utf8_constexpr14 utf8_nonnull utf8_pure
size_t utf8nlen(
const utf8_int8_t *str,
/* Bounded, case-insensitive comparison of src1 and src2. The definition
 * subtracts the encoded size of each decoded src1 code point from n, so n
 * is a byte budget. Returns an int. */
156 utf8_constexpr14 utf8_nonnull utf8_pure
int
157 utf8ncasecmp(
const utf8_int8_t *src1,
const utf8_int8_t *src2,
size_t n);
/* Bounded concatenation of src onto dst.
 * NOTE(review): the remainder of this prototype (the bound parameter, named
 * n in the definition) is not visible in this excerpt. */
162 utf8_nonnull utf8_weak utf8_int8_t *
163 utf8ncat(utf8_int8_t *utf8_restrict dst,
const utf8_int8_t *utf8_restrict src,
/* Bounded byte-wise comparison of src1 and src2; the definition decrements
 * n once per loop iteration. Returns an int. */
169 utf8_constexpr14 utf8_nonnull utf8_pure
int
170 utf8ncmp(
const utf8_int8_t *src1,
const utf8_int8_t *src2,
size_t n);
/* Bounded copy of src into dst.
 * NOTE(review): the remainder of this prototype (the bound parameter, named
 * n in the definition) is not visible in this excerpt. */
178 utf8_nonnull utf8_weak utf8_int8_t *
179 utf8ncpy(utf8_int8_t *utf8_restrict dst,
const utf8_int8_t *utf8_restrict src,
/* Bounded duplicate of src. The definition in this file forwards to
 * utf8ndup_ex with a null allocator and null user data. */
186 utf8_weak utf8_int8_t *utf8ndup(
const utf8_int8_t *src,
size_t n);
/* Searches str for a code point that also occurs in accept and returns a
 * pointer into str. The result for "no match" is not visible in this
 * excerpt (presumably null -- TODO confirm). */
190 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
191 utf8pbrk(
const utf8_int8_t *str,
const utf8_int8_t *accept);
/* Searches src for the code point chr; the definition keeps overwriting its
 * match pointer while scanning forward, so presumably the last occurrence
 * is returned (strrchr analogue) -- TODO confirm. Note that chr is a plain
 * int here, whereas utf8chr takes utf8_int32_t. */
194 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
195 utf8rchr(
const utf8_int8_t *src,
int chr);
/* Size of str. Defined in this file as utf8size_lazy(str) + 1; the extra
 * one presumably accounts for the terminating null byte. */
199 utf8_constexpr14 utf8_nonnull utf8_pure
size_t utf8size(
const utf8_int8_t *str);
/* Size of str without the + 1 that utf8size adds. Defined in this file as
 * utf8nsize_lazy(str, SIZE_MAX). */
202 utf8_constexpr14 utf8_nonnull utf8_pure
size_t
203 utf8size_lazy(
const utf8_int8_t *str);
/* Bounded size of str: the definition advances an index while it is below n
 * and str[index] is not the null terminator. */
207 utf8_constexpr14 utf8_nonnull utf8_pure
size_t
208 utf8nsize_lazy(
const utf8_int8_t *str,
size_t n);
/* Scans src against the code points listed in accept and returns a size_t.
 * Presumably the number of leading code points of src that are all in
 * accept (strspn analogue) -- TODO confirm against the full definition. */
212 utf8_constexpr14 utf8_nonnull utf8_pure
size_t
213 utf8spn(
const utf8_int8_t *src,
const utf8_int8_t *accept);
/* Substring search: looks for needle inside haystack and returns a pointer
 * to the start of the match. An empty needle yields haystack itself. The
 * "not found" result is not visible in this excerpt. */
216 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
217 utf8str(
const utf8_int8_t *haystack,
const utf8_int8_t *needle);
/* Case-insensitive substring search: the definition lower-cases the code
 * points of haystack and needle before comparing them. An empty needle
 * yields haystack itself. */
221 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
222 utf8casestr(
const utf8_int8_t *haystack,
const utf8_int8_t *needle);
/* Validates str as UTF-8. Defined in this file as utf8nvalid(str, SIZE_MAX),
 * which returns a pointer to the offending position when a check fails. */
226 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
227 utf8valid(
const utf8_int8_t *str);
/* Validates at most n bytes of str as UTF-8. The definition returns a
 * pointer to the start of the first sequence that fails a check; the
 * success result is not visible in this excerpt (presumably null). */
230 utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
231 utf8nvalid(
const utf8_int8_t *str,
size_t n);
/* Repairs str in place, using replacement where a sequence is malformed.
 * The definition narrows replacement to a single byte and tests it against
 * 0x7f. Returns an int status whose values are not visible here. */
235 utf8_nonnull utf8_weak
int utf8makevalid(utf8_int8_t *str,
236 const utf8_int32_t replacement);
/* Decodes the code point that starts at str into *out_codepoint and returns
 * a utf8_int8_t pointer (presumably to the following code point -- callers
 * in this file reassign their cursor from the result). */
240 utf8_constexpr14 utf8_nonnull utf8_int8_t *
241 utf8codepoint(
const utf8_int8_t *utf8_restrict str,
242 utf8_int32_t *utf8_restrict out_codepoint);
/* Size of the code point that starts at str, derived from the bit pattern
 * of its first byte. */
245 utf8_constexpr14 utf8_nonnull
size_t
246 utf8codepointcalcsize(
const utf8_int8_t *str);
/* Size associated with the code point value chr, chosen by which high bits
 * of chr are set (presumably the encoded length in bytes). */
250 utf8_constexpr14
size_t utf8codepointsize(utf8_int32_t chr);
/* Writes the UTF-8 encoding of chr at str. n is presumably the number of
 * bytes available at str -- TODO confirm; the checks that use n are not
 * visible in this excerpt. */
256 utf8_nonnull utf8_weak utf8_int8_t *
257 utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr,
size_t n);
/* Non-zero when chr differs from its upper-case mapping
 * (utf8uprcodepoint), zero otherwise. */
260 utf8_constexpr14
int utf8islower(utf8_int32_t chr);
/* Non-zero when chr differs from its lower-case mapping
 * (utf8lwrcodepoint), zero otherwise. */
263 utf8_constexpr14
int utf8isupper(utf8_int32_t chr);
/* Lower-cases str in place, code point by code point. */
266 utf8_nonnull utf8_weak
void utf8lwr(utf8_int8_t *utf8_restrict str);
/* Upper-cases str in place, code point by code point. */
269 utf8_nonnull utf8_weak
void utf8upr(utf8_int8_t *utf8_restrict str);
/* Maps the code point cp to a lower-case code point. */
273 utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
/* Maps the code point cp to an upper-case code point. */
276 utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
/* Decodes the code point at str into *out_codepoint, then (per the
 * definition) loops while the cursor sits on a continuation byte.
 * Presumably returns the start of the preceding code point, for reverse
 * iteration -- TODO confirm; the cursor update is not visible here. */
281 utf8_constexpr14 utf8_nonnull utf8_int8_t *
282 utf8rcodepoint(
const utf8_int8_t *utf8_restrict str,
283 utf8_int32_t *utf8_restrict out_codepoint);
/* Duplicates src using a caller-supplied allocator: alloc_func_ptr receives
 * user_data plus a size (see the definition).
 * NOTE(review): the second parameter line of the callback type is not
 * visible in this excerpt. */
288 utf8_weak utf8_int8_t *utf8dup_ex(
const utf8_int8_t *src,
289 utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
291 utf8_int8_t *user_data);
/* Bounded duplicate of src (at most n bytes are inspected) using a
 * caller-supplied allocator that receives user_data plus a size.
 * NOTE(review): the second parameter line of the callback type is not
 * visible in this excerpt. */
297 utf8_weak utf8_int8_t *utf8ndup_ex(
const utf8_int8_t *src,
size_t n,
298 utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
300 utf8_int8_t *user_data);
/* utf8casecmp: compare src1 and src2 code point by code point, ignoring
 * case.
 *
 * Each step decodes one code point from each string and derives both its
 * lower-case and its upper-case form. The pair is tested for "both are the
 * terminator" and for "lower forms equal or upper forms equal"; the visible
 * return yields the difference of the two lower-case code points.
 *
 * NOTE(review): the enclosing loop, the branch bodies and the closing
 * braces are not visible in this excerpt.
 */
308 utf8_constexpr14_impl
int utf8casecmp(
const utf8_int8_t *src1,
309 const utf8_int8_t *src2) {
310 utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
311 src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
/* Decode the next code point of each string and advance both cursors. */
314 src1 = utf8codepoint(src1, &src1_orig_cp);
315 src2 = utf8codepoint(src2, &src2_orig_cp);
/* Lower-case forms. */
318 src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
319 src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
/* Upper-case forms; a match in either case counts as equal below. */
322 src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
323 src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
/* Both strings reached their terminator at the same time. */
326 if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
328 }
else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
334 return src1_lwr_cp - src2_lwr_cp;
/* utf8cat: concatenation of src onto dst.
 *
 * A write cursor d starts at dst; the visible loop runs until the null
 * terminator of src is reached.
 *
 * NOTE(review): the step that finds the end of dst, the loop body, the
 * termination and the return are not visible in this excerpt.
 */
338 utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
339 const utf8_int8_t *utf8_restrict src) {
340 utf8_int8_t *d = dst;
347 while (
'\0' != *src) {
/* utf8chr: find the code point chr inside src.
 *
 * chr is encoded into the zero-initialised 5-byte buffer c as a 1-, 2-, 3-
 * or 4-byte UTF-8 sequence, selected by which high bits of chr are set; the
 * fifth byte stays zero so c is a terminated string. The search itself is
 * delegated to utf8str(src, c).
 *
 * The first branch walks src to its terminator and returns that position;
 * its condition is not visible here (presumably chr == 0 -- TODO confirm).
 *
 * NOTE(review): the second parameter line and several other lines are not
 * visible in this excerpt.
 */
357 utf8_constexpr14_impl utf8_int8_t *utf8chr(
const utf8_int8_t *src,
359 utf8_int8_t c[5] = {
'\0',
'\0',
'\0',
'\0',
'\0'};
364 while (
'\0' != *src) {
367 return (utf8_int8_t *)src;
368 }
else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
/* Fits in 7 bits: single byte. */
371 c[0] = (utf8_int8_t)chr;
372 }
else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
/* Fits in 11 bits: two bytes. */
375 c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
376 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
377 }
else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
/* Fits in 16 bits: three bytes. */
380 c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
381 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
382 c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* Remaining case: four bytes. */
386 c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
387 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
388 c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
389 c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* Search for the encoded sequence as a substring. */
395 return utf8str(src, c);
/* utf8cmp: byte-wise comparison of src1 and src2.
 *
 * The loop keeps running while either string still has a non-terminator
 * byte at its cursor, and compares the current bytes of both strings.
 *
 * NOTE(review): the first comparison branch, the branch bodies, the cursor
 * advance and the final return are not visible in this excerpt.
 */
398 utf8_constexpr14_impl
int utf8cmp(
const utf8_int8_t *src1,
399 const utf8_int8_t *src2) {
400 while ((
'\0' != *src1) || (
'\0' != *src2)) {
403 }
else if (*src1 > *src2) {
/* utf8coll: declaration only -- no definition is visible in this excerpt.
 * Takes two strings and returns an int. */
415 utf8_constexpr14_impl
int utf8coll(
const utf8_int8_t *src1,
416 const utf8_int8_t *src2);
/* utf8cpy: copy of src into dst.
 *
 * A write cursor d starts at dst; the visible loop runs until the null
 * terminator of src is reached.
 *
 * NOTE(review): the loop body, the termination of dst and the return are
 * not visible in this excerpt.
 */
418 utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
419 const utf8_int8_t *utf8_restrict src) {
420 utf8_int8_t *d = dst;
424 while (
'\0' != *src) {
/* utf8cspn: scan src against the set of code points in reject.
 *
 * For each position in src the reject list is walked from its start via r.
 * offset indexes into src while a candidate from reject is being matched
 * byte by byte. A byte b is a UTF-8 continuation byte when
 * 0x80 == (0xc0 & b); the trailing do/while conditions use that test on
 * r and on src.
 *
 * NOTE(review): the declarations of offset and of the result counter, the
 * branch bodies and the return statements are not visible in this excerpt.
 */
434 utf8_constexpr14_impl
size_t utf8cspn(
const utf8_int8_t *src,
435 const utf8_int8_t *reject) {
438 while (
'\0' != *src) {
439 const utf8_int8_t *r = reject;
/* r is on a non-continuation byte and at least one byte has matched. */
446 if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
449 if (*r == src[offset]) {
460 }
while (0x80 == (0xc0 & *r));
478 }
while ((0x80 == (0xc0 & *src)));
/* utf8dup: duplicate src by forwarding to utf8dup_ex with no custom
 * allocator and no user data. */
485 utf8_int8_t *utf8dup(
const utf8_int8_t *src) {
486 return utf8dup_ex(src, utf8_null, utf8_null);
/* utf8dup_ex: duplicate src into newly allocated storage.
 *
 * The allocation size is utf8size(src). When alloc_func_ptr is non-null it
 * is called as alloc_func_ptr(user_data, bytes). Otherwise malloc is used,
 * unless UTF8_NO_STD_MALLOC is defined. The result is checked against null
 * before the byte-by-byte copy loop.
 *
 * NOTE(review): the else branch structure, the failure return, the reset
 * of bytes before the copy loop, the termination of the copy and the final
 * return are not visible in this excerpt.
 */
489 utf8_int8_t *utf8dup_ex(
const utf8_int8_t *src,
490 utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
size_t),
491 utf8_int8_t *user_data) {
492 utf8_int8_t *n = utf8_null;
496 size_t bytes = utf8size(src);
/* Prefer the caller-supplied allocator when one is given. */
498 if (alloc_func_ptr) {
499 n = alloc_func_ptr(user_data, bytes);
501 #if !defined(UTF8_NO_STD_MALLOC)
502 n = (utf8_int8_t *)malloc(bytes);
/* Allocation failed (or no allocator was available). */
508 if (utf8_null == n) {
/* Copy bytes up to, but not including, the terminator of src. */
515 while (
'\0' != src[bytes]) {
516 n[bytes] = src[bytes];
/* utf8fry: declaration only -- no definition is visible in this excerpt.
 * Takes a string and returns a utf8_int8_t pointer. */
526 utf8_constexpr14_impl utf8_int8_t *utf8fry(
const utf8_int8_t *str);
/* utf8len: unbounded length of str, implemented as the bounded variant
 * utf8nlen with SIZE_MAX as the bound. */
528 utf8_constexpr14_impl
size_t utf8len(
const utf8_int8_t *str) {
529 return utf8nlen(str, SIZE_MAX);
/* utf8nlen: length of str, looking at no more than n bytes.
 *
 * t remembers the start of the string so that (str - t) is the number of
 * bytes consumed so far. The loop stops at the bound or at the null
 * terminator and classifies each lead byte as the start of a 4-, 3- or
 * 2-byte sequence by its high bits. After the loop there is a check for the
 * cursor having moved past n, which can only happen when a multi-byte
 * sequence straddles the bound.
 *
 * NOTE(review): the counter declaration, the cursor advances, the body of
 * the overshoot check and the return are not visible in this excerpt.
 */
532 utf8_constexpr14_impl
size_t utf8nlen(
const utf8_int8_t *str,
size_t n) {
533 const utf8_int8_t *t = str;
536 while ((
size_t)(str - t) < n &&
'\0' != *str) {
/* 11110xxx: lead byte of a 4-byte sequence. */
537 if (0xf0 == (0xf8 & *str)) {
540 }
else if (0xe0 == (0xf0 & *str)) {
543 }
else if (0xc0 == (0xe0 & *str)) {
/* Cursor ended beyond the n-byte bound. */
556 if ((
size_t)(str - t) > n) {
/* utf8ncasecmp: case-insensitive comparison of src1 and src2 limited by n.
 *
 * Before decoding, three guards test whether the remaining budget n is too
 * small for the multi-byte sequence that starts at either cursor (2-byte
 * lead with n == 1, 3-byte lead with n <= 2, 4-byte lead with n <= 3); in
 * those cases only the masked lead bytes c1 and c2 are computed. Otherwise
 * one code point is decoded from each string, n is reduced by the encoded
 * size of the src1 code point, and the lower/upper-case forms are compared
 * as in utf8casecmp. The visible return yields the difference of the two
 * lower-case code points.
 *
 * NOTE(review): the enclosing loop, the guard bodies, the branch bodies and
 * the final return are not visible in this excerpt.
 */
562 utf8_constexpr14_impl
int utf8ncasecmp(
const utf8_int8_t *src1,
563 const utf8_int8_t *src2,
size_t n) {
564 utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
565 src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
/* Snapshots of the cursors used by the budget guards below. */
568 const utf8_int8_t *
const s1 = src1;
569 const utf8_int8_t *
const s2 = src2;
/* One byte left, but a 2-byte sequence starts here. */
577 if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
578 const utf8_int32_t c1 = (0xe0 & *s1);
579 const utf8_int32_t c2 = (0xe0 & *s2);
/* At most two bytes left, but a 3-byte sequence starts here. */
588 if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
589 const utf8_int32_t c1 = (0xf0 & *s1);
590 const utf8_int32_t c2 = (0xf0 & *s2);
/* At most three bytes left, but a 4-byte sequence starts here. */
599 if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
600 const utf8_int32_t c1 = (0xf8 & *s1);
601 const utf8_int32_t c2 = (0xf8 & *s2);
/* Decode one code point from each side; the budget is charged with the
 * encoded size of the src1 code point only. */
610 src1 = utf8codepoint(src1, &src1_orig_cp);
611 src2 = utf8codepoint(src2, &src2_orig_cp);
612 n -= utf8codepointsize(src1_orig_cp);
614 src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
615 src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
617 src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
618 src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
/* Both strings reached their terminator at the same time. */
621 if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
623 }
else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
629 return src1_lwr_cp - src2_lwr_cp;
/* utf8ncat: bounded concatenation of src onto dst.
 *
 * A write cursor d starts at dst; the visible loop runs while src has bytes
 * left and the budget n (decremented once per iteration) is not exhausted.
 *
 * NOTE(review): the step that finds the end of dst, the loop body, the
 * termination and the return are not visible in this excerpt.
 */
636 utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
637 const utf8_int8_t *utf8_restrict src,
size_t n) {
638 utf8_int8_t *d = dst;
647 while ((
'\0' != *src) && (0 != n--)) {
/* utf8ncmp: byte-wise comparison of src1 and src2 limited by n.
 *
 * The loop consumes one unit of n per iteration and keeps running while
 * either string still has a non-terminator byte at its cursor.
 *
 * NOTE(review): the first comparison branch, the branch bodies, the cursor
 * advance and the final return are not visible in this excerpt.
 */
657 utf8_constexpr14_impl
int utf8ncmp(
const utf8_int8_t *src1,
658 const utf8_int8_t *src2,
size_t n) {
659 while ((0 != n--) && ((
'\0' != *src1) || (
'\0' != *src2))) {
662 }
else if (*src1 > *src2) {
/* utf8ncpy: bounded copy of src into dst.
 *
 * Phase 1 copies up to n bytes and looks for the terminator of src.
 * Phase 2 walks check_index backwards from the last copied byte over UTF-8
 * continuation bytes (0x80 == (0xc0 & b)) to find the lead byte of the
 * final code point. Phase 3 tests whether that final code point was cut
 * short (fewer bytes copied than utf8codepointcalcsize reports) or fills
 * the whole buffer. Phase 4 iterates over the remaining positions up to n.
 *
 * NOTE(review): index and check_index are size_t. If the copy loop ends
 * with index == 0 (for example n == 0), "index - 1" wraps around and
 * d[check_index] is then read. Confirm whether the elided lines guard
 * against this.
 *
 * NOTE(review): the break out of the copy loop, the bodies of phases 2-4
 * and the return are not visible in this excerpt.
 */
674 utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
675 const utf8_int8_t *utf8_restrict src,
size_t n) {
676 utf8_int8_t *d = dst;
677 size_t index = 0, check_index = 0;
/* Phase 1: raw byte copy. */
685 for (index = 0; index < n; index++) {
686 d[index] = src[index];
687 if (
'\0' == src[index]) {
/* Phase 2: back up to the lead byte of the last code point. */
692 for (check_index = index - 1;
693 check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
/* Phase 3: was the last code point truncated by the bound? */
697 if (check_index < index &&
698 ((index - check_index) < utf8codepointcalcsize(&d[check_index]) ||
699 (index - check_index) == n)) {
/* Phase 4: handle the tail of the destination up to n. */
704 for (; index < n; index++) {
/* utf8ndup: bounded duplicate of src, forwarding to utf8ndup_ex with no
 * custom allocator and no user data. */
711 utf8_int8_t *utf8ndup(
const utf8_int8_t *src,
size_t n) {
712 return utf8ndup_ex(src, n, utf8_null, utf8_null);
/* utf8ndup_ex: duplicate at most n bytes of src into new storage.
 *
 * A first pass measures how many bytes precede the terminator, capped at
 * n. Storage for bytes + 1 is then requested from alloc_func_ptr when it is
 * non-null, otherwise from malloc unless UTF8_NO_STD_MALLOC is defined. The
 * result is checked against null, and a second pass copies the bytes under
 * the same bound.
 *
 * NOTE(review): both loop conditions read src[bytes] before testing
 * bytes < n, so the byte at index n is inspected when no terminator occurs
 * earlier.
 *
 * NOTE(review): the declaration of bytes, the loop increments, the failure
 * return, the termination of the copy and the final return are not visible
 * in this excerpt.
 */
715 utf8_int8_t *utf8ndup_ex(
const utf8_int8_t *src,
size_t n,
716 utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
size_t),
717 utf8_int8_t *user_data) {
718 utf8_int8_t *c = utf8_null;
/* Pass 1: measure. */
722 while (
'\0' != src[bytes] && bytes < n) {
/* Allocate room for the measured bytes plus one more. */
730 if (alloc_func_ptr) {
731 c = alloc_func_ptr(user_data, bytes + 1);
733 #if !defined(UTF8_NO_STD_MALLOC)
734 c = (utf8_int8_t *)malloc(bytes + 1);
740 if (utf8_null == c) {
/* Pass 2: copy. */
748 while (
'\0' != src[bytes] && bytes < n) {
749 c[bytes] = src[bytes];
/* utf8rchr: find the code point chr inside src, remembering matches in
 * match as the scan moves forward.
 *
 * chr is encoded into the zero-initialised 5-byte buffer c as a 1-, 2-, 3-
 * or 4-byte UTF-8 sequence, selected by which high bits of chr are set.
 * The second loop compares the bytes at src against c; when the comparison
 * reaches the terminator of c the current position is stored in match.
 * After each attempt the cursor skips continuation bytes.
 *
 * The first branch walks src to its terminator and returns that position;
 * its condition is not visible here (presumably chr == 0 -- TODO confirm).
 *
 * NOTE(review): the declaration of offset, the cursor advances and the
 * final return are not visible in this excerpt.
 */
758 utf8_constexpr14_impl utf8_int8_t *utf8rchr(
const utf8_int8_t *src,
int chr) {
760 utf8_int8_t *match = utf8_null;
761 utf8_int8_t c[5] = {
'\0',
'\0',
'\0',
'\0',
'\0'};
766 while (
'\0' != *src) {
769 return (utf8_int8_t *)src;
770 }
else if (0 == ((
int)0xffffff80 & chr)) {
/* Fits in 7 bits: single byte. */
773 c[0] = (utf8_int8_t)chr;
774 }
else if (0 == ((
int)0xfffff800 & chr)) {
/* Fits in 11 bits: two bytes. */
777 c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
778 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
779 }
else if (0 == ((
int)0xffff0000 & chr)) {
/* Fits in 16 bits: three bytes. */
782 c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
783 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
784 c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* Remaining case: four bytes. */
788 c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
789 c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
790 c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
791 c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* Scan the whole string, recording each position where c matches. */
798 while (
'\0' != *src) {
801 while ((src[offset] == c[offset]) && (
'\0' != src[offset])) {
/* All bytes of c were matched. */
805 if (
'\0' == c[offset]) {
807 match = (utf8_int8_t *)src;
821 }
while (0x80 == (0xc0 & *src));
/* utf8pbrk: find the first position in str whose code point also appears
 * in accept.
 *
 * For each position in str the accept list is walked from its start via a.
 * offset indexes into str while a candidate from accept is matched byte by
 * byte; when a sits on a non-continuation byte and at least one byte has
 * matched, the current position in str is returned. The do/while
 * conditions skip UTF-8 continuation bytes (0x80 == (0xc0 & b)).
 *
 * NOTE(review): the declaration of offset, the remaining branch bodies and
 * the no-match return are not visible in this excerpt.
 */
830 utf8_constexpr14_impl utf8_int8_t *utf8pbrk(
const utf8_int8_t *str,
831 const utf8_int8_t *accept) {
832 while (
'\0' != *str) {
833 const utf8_int8_t *a = accept;
/* A complete candidate from accept matched at str. */
840 if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
841 return (utf8_int8_t *)str;
843 if (*a == str[offset]) {
854 }
while (0x80 == (0xc0 & *a));
864 return (utf8_int8_t *)str;
872 }
while ((0x80 == (0xc0 & *str)));
/* utf8size: utf8size_lazy(str) plus one (presumably for the null
 * terminator). */
878 utf8_constexpr14_impl
size_t utf8size(
const utf8_int8_t *str) {
879 return utf8size_lazy(str) + 1;
/* utf8size_lazy: unbounded form of utf8nsize_lazy, called with SIZE_MAX as
 * the bound. */
882 utf8_constexpr14_impl
size_t utf8size_lazy(
const utf8_int8_t *str) {
883 return utf8nsize_lazy(str, SIZE_MAX);
/* utf8nsize_lazy: scan str while the running index size is below n and
 * str[size] is not the null terminator.
 *
 * NOTE(review): the declaration and increment of size and the return are
 * not visible in this excerpt.
 */
886 utf8_constexpr14_impl
size_t utf8nsize_lazy(
const utf8_int8_t *str,
size_t n) {
888 while (size < n &&
'\0' != str[size]) {
/* utf8spn: scan src against the set of code points in accept.
 *
 * For each position in src the accept list is walked from its start via a.
 * offset indexes into src while a candidate from accept is matched byte by
 * byte. The trailing do/while condition skips UTF-8 continuation bytes
 * (0x80 == (0xc0 & b)) in accept.
 *
 * NOTE(review): the declarations of offset and of the result counter, the
 * branch bodies and the return statements are not visible in this excerpt.
 */
894 utf8_constexpr14_impl
size_t utf8spn(
const utf8_int8_t *src,
895 const utf8_int8_t *accept) {
898 while (
'\0' != *src) {
899 const utf8_int8_t *a = accept;
/* a is on a non-continuation byte and at least one byte has matched. */
906 if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
915 if (*a == src[offset]) {
923 }
while (0x80 == (0xc0 & *a));
/* utf8str: find needle inside haystack.
 *
 * An empty needle matches immediately and haystack is returned. Otherwise
 * each candidate start maybeMatch is compared byte by byte against needle;
 * on success maybeMatch is returned. After a failed attempt the search
 * resumes one whole code point after maybeMatch (via utf8codepoint), so a
 * match can never begin in the middle of a multi-byte sequence.
 *
 * NOTE(review): the inner loop body, the success condition and the
 * not-found return are not visible in this excerpt.
 */
948 utf8_constexpr14_impl utf8_int8_t *utf8str(
const utf8_int8_t *haystack,
949 const utf8_int8_t *needle) {
/* Receives the decoded value when stepping; the value itself is unused. */
950 utf8_int32_t throwaway_codepoint = 0;
954 if (
'\0' == *needle) {
955 return (utf8_int8_t *)haystack;
958 while (
'\0' != *haystack) {
959 const utf8_int8_t *maybeMatch = haystack;
960 const utf8_int8_t *n = needle;
962 while (*haystack == *n && (*haystack !=
'\0' && *n !=
'\0')) {
970 return (utf8_int8_t *)maybeMatch;
/* Advance to the code point after the failed candidate start. */
975 haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
/* utf8casestr: case-insensitive search for needle inside haystack.
 *
 * An empty needle matches immediately and haystack is returned. For each
 * candidate start maybeMatch, code points are decoded pairwise from
 * haystack and needle, lower-cased with utf8lwrcodepoint, and compared
 * until either side reaches its terminator. nextH records where the
 * haystack cursor stands after the first decoded code point of the
 * attempt (presumably the restart point after a failed attempt).
 *
 * NOTE(review): the outer loop, the comparison of h_cp with n_cp, the
 * success condition and the not-found return are not visible in this
 * excerpt.
 */
983 utf8_constexpr14_impl utf8_int8_t *utf8casestr(
const utf8_int8_t *haystack,
984 const utf8_int8_t *needle) {
987 if (
'\0' == *needle) {
988 return (utf8_int8_t *)haystack;
992 const utf8_int8_t *maybeMatch = haystack;
993 const utf8_int8_t *n = needle;
994 utf8_int32_t h_cp = 0, n_cp = 0;
/* Decode the first code point of each side. */
997 const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
998 n = utf8codepoint(n, &n_cp);
1000 while ((0 != h_cp) && (0 != n_cp)) {
1001 h_cp = utf8lwrcodepoint(h_cp);
1002 n_cp = utf8lwrcodepoint(n_cp);
/* Step both cursors to the next code point. */
1009 haystack = utf8codepoint(haystack, &h_cp);
1010 n = utf8codepoint(n, &n_cp);
1016 return (utf8_int8_t *)maybeMatch;
/* utf8valid: unbounded validation, implemented as utf8nvalid with SIZE_MAX
 * as the byte bound. */
1029 utf8_constexpr14_impl utf8_int8_t *utf8valid(
const utf8_int8_t *str) {
1030 return utf8nvalid(str, SIZE_MAX);
/* utf8nvalid: validate up to n bytes of str as UTF-8.
 *
 * consumed is recomputed from (str - t) in the loop condition each time
 * round; remaining is what is left of the bound. For every lead byte class
 * (4-, 3- and 2-byte sequences) the same checks run in order, and each
 * failure returns a pointer to the start of the offending sequence:
 *   1. the sequence must fit inside the remaining bytes;
 *   2. every trailing byte must be a continuation byte (10xxxxxx);
 *   3. unless the sequence ends exactly at the bound, the byte after it
 *      must not be another continuation byte;
 *   4. the payload bits must not all be zero in the positions that would
 *      make a shorter encoding possible (overlong form).
 * A byte with the top bit clear is a 1-byte sequence; anything else falls
 * through to the final visible return of str.
 *
 * NOTE(review): the parameter line for n, the cursor advances and the
 * success return are not visible in this excerpt.
 */
1033 utf8_constexpr14_impl utf8_int8_t *utf8nvalid(
const utf8_int8_t *str,
1035 const utf8_int8_t *t = str;
1036 size_t consumed = 0;
1038 while ((
void)(consumed = (
size_t)(str - t)), consumed < n &&
'\0' != *str) {
1039 const size_t remaining = n - consumed;
/* 4-byte sequence. */
1041 if (0xf0 == (0xf8 & *str)) {
1043 if (remaining < 4) {
1044 return (utf8_int8_t *)str;
1049 if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
1050 (0x80 != (0xc0 & str[3]))) {
1051 return (utf8_int8_t *)str;
1055 if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
1056 return (utf8_int8_t *)str;
/* Overlong: the value would have fitted in three bytes. */
1062 if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
1063 return (utf8_int8_t *)str;
1068 }
else if (0xe0 == (0xf0 & *str)) {
/* 3-byte sequence. */
1070 if (remaining < 3) {
1071 return (utf8_int8_t *)str;
1076 if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
1077 return (utf8_int8_t *)str;
1081 if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
1082 return (utf8_int8_t *)str;
/* Overlong: the value would have fitted in two bytes. */
1088 if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
1089 return (utf8_int8_t *)str;
1094 }
else if (0xc0 == (0xe0 & *str)) {
/* 2-byte sequence. */
1096 if (remaining < 2) {
1097 return (utf8_int8_t *)str;
1102 if (0x80 != (0xc0 & str[1])) {
1103 return (utf8_int8_t *)str;
1107 if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
1108 return (utf8_int8_t *)str;
/* Overlong: the value would have fitted in one byte. */
1114 if (0 == (0x1e & str[0])) {
1115 return (utf8_int8_t *)str;
1120 }
else if (0x00 == (0x80 & *str)) {
1125 return (utf8_int8_t *)str;
/* utf8makevalid: rewrite str in place so that it is valid UTF-8.
 *
 * Two cursors walk the same buffer: read scans the input and write is the
 * output position. replacement is narrowed to one byte (r) and is first
 * tested against 0x7f (presumably rejecting non-ASCII replacements -- TODO
 * confirm). For each lead byte class the trailing bytes are checked for
 * the continuation pattern 10xxxxxx; a well-formed sequence is decoded
 * with utf8codepoint and re-emitted with utf8catcodepoint, passing the
 * sequence length as the size limit.
 *
 * NOTE(review): the body of the replacement check, the handling of
 * malformed sequences, the final termination and the return values are
 * not visible in this excerpt.
 */
1132 int utf8makevalid(utf8_int8_t *str,
const utf8_int32_t replacement) {
1133 utf8_int8_t *read = str;
1134 utf8_int8_t *write = read;
1135 const utf8_int8_t r = (utf8_int8_t)replacement;
1136 utf8_int32_t codepoint = 0;
1138 if (replacement > 0x7f) {
1142 while (
'\0' != *read) {
/* 4-byte sequence. */
1143 if (0xf0 == (0xf8 & *read)) {
1146 if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
1147 (0x80 != (0xc0 & read[3]))) {
1154 read = utf8codepoint(read, &codepoint);
1155 write = utf8catcodepoint(write, codepoint, 4);
1156 }
else if (0xe0 == (0xf0 & *read)) {
/* 3-byte sequence. */
1159 if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
1166 read = utf8codepoint(read, &codepoint);
1167 write = utf8catcodepoint(write, codepoint, 3);
1168 }
else if (0xc0 == (0xe0 & *read)) {
/* 2-byte sequence. */
1171 if (0x80 != (0xc0 & read[1])) {
1178 read = utf8codepoint(read, &codepoint);
1179 write = utf8catcodepoint(write, codepoint, 2);
1180 }
else if (0x00 == (0x80 & *read)) {
/* 1-byte sequence. */
1182 read = utf8codepoint(read, &codepoint);
1183 write = utf8catcodepoint(write, codepoint, 1);
/* utf8codepoint: decode the code point that starts at str into
 * *out_codepoint.
 *
 * The lead byte selects the sequence length: 11110xxx is 4 bytes, 1110xxxx
 * is 3 bytes, 110xxxxx is 2 bytes, and everything else is taken as a single
 * byte. The payload bits of each byte are shifted into place and OR-ed
 * together. No validation of the trailing bytes happens here.
 *
 * NOTE(review): the cursor advances, the start of the 3-byte assignment
 * and the non-C++ form of the return are not visible in this excerpt. The
 * visible return uses const_cast, which is C++ syntax and is presumably
 * guarded by a __cplusplus conditional.
 */
1198 utf8_constexpr14_impl utf8_int8_t *
1199 utf8codepoint(
const utf8_int8_t *utf8_restrict str,
1200 utf8_int32_t *utf8_restrict out_codepoint) {
1201 if (0xf0 == (0xf8 & str[0])) {
/* 3 + 6 + 6 + 6 payload bits. */
1203 *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
1204 ((0x3f & str[2]) << 6) | (0x3f & str[3]);
1206 }
else if (0xe0 == (0xf0 & str[0])) {
/* 4 + 6 + 6 payload bits. */
1209 ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
1211 }
else if (0xc0 == (0xe0 & str[0])) {
/* 5 + 6 payload bits. */
1213 *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
/* Single byte: the value is the byte itself. */
1217 *out_codepoint = str[0];
1221 return const_cast<utf8_int8_t *
>(str);
/* utf8codepointcalcsize: classify the sequence that starts at str by its
 * lead byte -- 11110xxx, 1110xxxx or 110xxxxx -- using the same masks as
 * utf8codepoint.
 *
 * NOTE(review): the returned values are not visible in this excerpt
 * (presumably 4, 3, 2 and 1 respectively -- TODO confirm).
 */
1224 utf8_constexpr14_impl
size_t utf8codepointcalcsize(
const utf8_int8_t *str) {
1225 if (0xf0 == (0xf8 & str[0])) {
1228 }
else if (0xe0 == (0xf0 & str[0])) {
1231 }
else if (0xc0 == (0xe0 & str[0])) {
/* utf8codepointsize: classify the code point value chr by magnitude -- no
 * bits above bit 6, above bit 10, or above bit 15 -- with the same masks
 * that utf8catcodepoint uses to choose an encoding length.
 *
 * NOTE(review): the returned values are not visible in this excerpt
 * (presumably 1, 2, 3 and otherwise 4 -- TODO confirm).
 */
1241 utf8_constexpr14_impl
size_t utf8codepointsize(utf8_int32_t chr) {
1242 if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1244 }
else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1246 }
else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
/* utf8catcodepoint: write the UTF-8 encoding of chr starting at str.
 *
 * The encoding length is chosen by which high bits of chr are set: 1 byte
 * for values that fit in 7 bits, 2 bytes for 11 bits, 3 bytes for 16 bits,
 * and 4 bytes otherwise. Lead bytes carry the length marker
 * (0xc0 / 0xe0 / 0xf0) and trailing bytes carry 0x80 plus six payload bits.
 *
 * NOTE(review): the checks against n, the cursor advances and the return
 * are not visible in this excerpt.
 */
1253 utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr,
size_t n) {
1254 if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
/* 1 byte. */
1260 str[0] = (utf8_int8_t)chr;
1262 }
else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
/* 2 bytes. */
1268 str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
1269 str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1271 }
else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
/* 3 bytes. */
1277 str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
1278 str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1279 str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* 4 bytes. */
1287 str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
1288 str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
1289 str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1290 str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
/* utf8islower: chr counts as lower case when upper-casing changes it.
 * Returns 1 in that case and 0 otherwise. */
1297 utf8_constexpr14_impl
int utf8islower(utf8_int32_t chr) {
1298 return chr != utf8uprcodepoint(chr);
/* utf8isupper: chr counts as upper case when lower-casing changes it.
 * Returns 1 in that case and 0 otherwise. */
1301 utf8_constexpr14_impl
int utf8isupper(utf8_int32_t chr) {
1302 return chr != utf8lwrcodepoint(chr);
/* utf8lwr: lower-case str in place.
 *
 * Each code point is decoded with utf8codepoint (pn is the pointer it
 * returns), mapped through utf8lwrcodepoint, and written back at str with
 * utf8catcodepoint using the encoded size of the mapped value as the limit.
 *
 * NOTE(review): the loop, any size comparison between the original and the
 * mapped code point, and the cursor update are not visible in this
 * excerpt.
 */
1305 void utf8lwr(utf8_int8_t *utf8_restrict str) {
1306 utf8_int32_t cp = 0;
1307 utf8_int8_t *pn = utf8codepoint(str, &cp);
1310 const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
1311 const size_t size = utf8codepointsize(lwr_cp);
/* Overwrite the current code point with its lower-case form. */
1314 utf8catcodepoint(str, lwr_cp, size);
/* Decode the next code point. */
1318 pn = utf8codepoint(str, &cp);
/* utf8upr: upper-case str in place.
 *
 * Each code point is decoded with utf8codepoint (pn is the pointer it
 * returns), mapped through utf8uprcodepoint, and written back at str with
 * utf8catcodepoint using the encoded size of the mapped value as the limit.
 *
 * NOTE(review): the local is named lwr_cp but holds the UPPER-case code
 * point; the name looks like a leftover from utf8lwr.
 *
 * NOTE(review): the loop, any size comparison between the original and the
 * mapped code point, and the cursor update are not visible in this
 * excerpt.
 */
1322 void utf8upr(utf8_int8_t *utf8_restrict str) {
1323 utf8_int32_t cp = 0;
1324 utf8_int8_t *pn = utf8codepoint(str, &cp);
1327 const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
1328 const size_t size = utf8codepointsize(lwr_cp);
/* Overwrite the current code point with its upper-case form. */
1331 utf8catcodepoint(str, lwr_cp, size);
/* Decode the next code point. */
1335 pn = utf8codepoint(str, &cp);
/* utf8lwrcodepoint: map cp to a lower-case code point.
 *
 * The visible part is a chain of range tests that sort cp into four groups
 * of code point ranges:
 *   group 1: 0x41-0x5a, 0xc0-0xd6, 0xd8-0xde, 0x391-0x3a1, 0x3a3-0x3ab and
 *            0x410-0x42f;
 *   group 2: 0x400-0x40f;
 *   group 3: ranges between 0x100 and 0x4ff such as 0x100-0x12f;
 *   group 4: ranges such as 0x139-0x148 and 0x179-0x17e.
 *
 * NOTE(review): the action taken for each group and every case after the
 * fourth group are not visible in this excerpt. Presumably groups 1 and 2
 * add a fixed offset and groups 3 and 4 depend on whether cp is odd or
 * even -- TODO confirm against the full file.
 */
1340 utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1341 if (((0x0041 <= cp) && (0x005a >= cp)) ||
1342 ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
1343 ((0x00d8 <= cp) && (0x00de >= cp)) ||
1344 ((0x0391 <= cp) && (0x03a1 >= cp)) ||
1345 ((0x03a3 <= cp) && (0x03ab >= cp)) ||
1346 ((0x0410 <= cp) && (0x042f >= cp))) {
1348 }
else if ((0x0400 <= cp) && (0x040f >= cp)) {
1350 }
else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1351 ((0x0132 <= cp) && (0x0137 >= cp)) ||
1352 ((0x014a <= cp) && (0x0177 >= cp)) ||
1353 ((0x0182 <= cp) && (0x0185 >= cp)) ||
1354 ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1355 ((0x01de <= cp) && (0x01ef >= cp)) ||
1356 ((0x01f8 <= cp) && (0x021f >= cp)) ||
1357 ((0x0222 <= cp) && (0x0233 >= cp)) ||
1358 ((0x0246 <= cp) && (0x024f >= cp)) ||
1359 ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1360 ((0x0460 <= cp) && (0x0481 >= cp)) ||
1361 ((0x048a <= cp) && (0x04ff >= cp))) {
1363 }
else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1364 ((0x0179 <= cp) && (0x017e >= cp)) ||
1365 ((0x01af <= cp) && (0x01b0 >= cp)) ||
1366 ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1367 ((0x01cd <= cp) && (0x01dc >= cp))) {
/* utf8uprcodepoint: map cp to an upper-case code point.
 *
 * Mirror image of utf8lwrcodepoint. The first two groups test ranges that
 * sit above the corresponding ranges there (0x61-0x7a instead of
 * 0x41-0x5a, 0x450-0x45f instead of 0x400-0x40f, and so on). The third and
 * fourth groups test exactly the same ranges as in utf8lwrcodepoint.
 *
 * NOTE(review): the action taken for each group and every case after the
 * fourth group are not visible in this excerpt. Presumably groups 1 and 2
 * subtract a fixed offset and groups 3 and 4 depend on whether cp is odd
 * or even -- TODO confirm against the full file.
 */
1503 utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1504 if (((0x0061 <= cp) && (0x007a >= cp)) ||
1505 ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
1506 ((0x00f8 <= cp) && (0x00fe >= cp)) ||
1507 ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
1508 ((0x03c3 <= cp) && (0x03cb >= cp)) ||
1509 ((0x0430 <= cp) && (0x044f >= cp))) {
1511 }
else if ((0x0450 <= cp) && (0x045f >= cp)) {
1513 }
else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1514 ((0x0132 <= cp) && (0x0137 >= cp)) ||
1515 ((0x014a <= cp) && (0x0177 >= cp)) ||
1516 ((0x0182 <= cp) && (0x0185 >= cp)) ||
1517 ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1518 ((0x01de <= cp) && (0x01ef >= cp)) ||
1519 ((0x01f8 <= cp) && (0x021f >= cp)) ||
1520 ((0x0222 <= cp) && (0x0233 >= cp)) ||
1521 ((0x0246 <= cp) && (0x024f >= cp)) ||
1522 ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1523 ((0x0460 <= cp) && (0x0481 >= cp)) ||
1524 ((0x048a <= cp) && (0x04ff >= cp))) {
1526 }
else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1527 ((0x0179 <= cp) && (0x017e >= cp)) ||
1528 ((0x01af <= cp) && (0x01b0 >= cp)) ||
1529 ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1530 ((0x01cd <= cp) && (0x01dc >= cp))) {
/* utf8rcodepoint: decode the code point at str into *out_codepoint, then
 * reposition the cursor s.
 *
 * Decoding follows the same lead-byte classification as utf8codepoint
 * (4-, 3-, 2- and 1-byte forms). Afterwards a do/while loop repeats while
 * s[0] is a continuation byte (top bit set and 10xxxxxx pattern), which
 * stops s on a lead byte or on an ASCII byte.
 *
 * NOTE(review): the statement that moves s inside the do/while loop is not
 * visible in this excerpt (presumably it steps backwards, for reverse
 * iteration -- TODO confirm). The start of the 3-byte assignment is not
 * visible either. static_cast and const_cast are C++ syntax and are
 * presumably guarded by a __cplusplus conditional.
 */
1666 utf8_constexpr14_impl utf8_int8_t *
1667 utf8rcodepoint(
const utf8_int8_t *utf8_restrict str,
1668 utf8_int32_t *utf8_restrict out_codepoint) {
1669 const utf8_int8_t *s =
static_cast<const utf8_int8_t *
>(str);
1671 if (0xf0 == (0xf8 & s[0])) {
/* 3 + 6 + 6 + 6 payload bits. */
1673 *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
1674 ((0x3f & s[2]) << 6) | (0x3f & s[3]);
1675 }
else if (0xe0 == (0xf0 & s[0])) {
/* 4 + 6 + 6 payload bits. */
1678 ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
1679 }
else if (0xc0 == (0xe0 & s[0])) {
/* 5 + 6 payload bits. */
1681 *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
/* Single byte: the value is the byte itself. */
1684 *out_codepoint = s[0];
1689 }
while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
1691 return const_cast<utf8_int8_t *
>(s);
1694 #undef utf8_restrict
1695 #undef utf8_constexpr14
1700 #if defined(__clang__)
1701 #pragma clang diagnostic pop