xs_unicode.h (10837B)
/* copyright (c) 2022 - 2025 grunfink et al. / MIT license */

#ifndef _XS_UNICODE_H

#define _XS_UNICODE_H

/*
 * Unicode / UTF-8 helpers.
 *
 * This is a single-header library: the declarations below are always
 * visible; the function bodies are only compiled when XS_IMPLEMENTATION
 * is defined before inclusion. Functions that use xs_str are only
 * available when _XS_H is defined, and the table-driven functions (case
 * mapping, NFC / NFD, alpha and right-to-left tests) only when
 * _XS_UNICODE_TBL_H is defined (i.e. xs_unicode_tbl.h was included first).
 */

 int xs_utf8_enc(char buf[4], unsigned int cpoint);
 int xs_is_utf8_cont_byte(char c);
 unsigned int xs_utf8_dec(const char **str);
 int xs_unicode_width(unsigned int cpoint);
 int xs_is_surrogate(unsigned int cpoint);
 int xs_is_diacritic(unsigned int cpoint);
 unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
 unsigned int xs_surrogate_enc(unsigned int cpoint);
 unsigned int (*_xs_unicode_upper_search(unsigned int cpoint))[2];
 unsigned int (*_xs_unicode_lower_search(unsigned int cpoint))[2];
 #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
 #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
 unsigned int xs_unicode_to_upper(unsigned int cpoint);
 unsigned int xs_unicode_to_lower(unsigned int cpoint);
 int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
 int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
 int xs_unicode_is_alpha(unsigned int cpoint);
 int xs_unicode_is_right_to_left(unsigned int cpoint);

#ifdef _XS_H
 xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
 xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
 xs_str *xs_utf8_to_upper(const char *str);
 xs_str *xs_utf8_to_lower(const char *str);
 xs_str *xs_utf8_to_nfd(const char *str);
 xs_str *xs_utf8_to_nfc(const char *str);
#endif

#ifdef XS_IMPLEMENTATION

#include <ctype.h>
#include <search.h>
#include <stdlib.h>     /* bsearch() */
#include <string.h>     /* memcmp(), strlen() */

#ifndef xs_countof
#define xs_countof(a) (sizeof((a)) / sizeof(*(a)))
#endif

int xs_utf8_enc(char buf[4], unsigned int cpoint)
/* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes
   (1 to 4); buf is NOT NUL-terminated */
{
    char *p = buf;

    if (cpoint < 0x80) {
        /* 1 byte char */
        *p++ = cpoint & 0xff;
    }
    else
    if (cpoint < 0x800) {
        /* 2 byte char */
        *p++ = 0xc0 | (cpoint >> 6);
        *p++ = 0x80 | (cpoint & 0x3f);
    }
    else
    if (cpoint < 0x10000) {
        /* 3 byte char */
        *p++ = 0xe0 | (cpoint >> 12);
        *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
        *p++ = 0x80 | (cpoint & 0x3f);
    }
    else {
        /* 4 byte char */
        *p++ = 0xf0 | (cpoint >> 18);
        *p++ = 0x80 | ((cpoint >> 12) & 0x3f);
        *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
        *p++ = 0x80 | (cpoint & 0x3f);
    }

    return (int)(p - buf);
}


int xs_is_utf8_cont_byte(char c)
/* returns true if c is an utf8 continuation byte (bit pattern 10xxxxxx) */
{
    return ((c & 0xc0) == 0x80);
}


unsigned int xs_utf8_dec(const char **str)
/* decodes an utf-8 char inside str and updates the pointer.
   Returns the codepoint, 0 when the byte at *str is the NUL terminator,
   or 0xfffd (REPLACEMENT CHARACTER) if the sequence is broken: either
   the lead byte is not a valid utf-8 start byte or there are fewer
   continuation bytes than the lead byte announces. The pointer is
   always advanced past the bytes that were consumed (at least one) */
{
    const char *p = *str;
    unsigned int cpoint = 0;
    unsigned char c = *p++;
    int cb = 0;     /* continuation bytes still expected */

    if ((c & 0x80) == 0) {
        /* 1 byte char */
        cpoint = c;
    }
    else
    if ((c & 0xe0) == 0xc0) {
        /* 2 byte char */
        cpoint = (c & 0x1f) << 6;
        cb = 1;
    }
    else
    if ((c & 0xf0) == 0xe0) {
        /* 3 byte char */
        cpoint = (c & 0x0f) << 12;
        cb = 2;
    }
    else
    if ((c & 0xf8) == 0xf0) {
        /* 4 byte char */
        cpoint = (c & 0x07) << 18;
        cb = 3;
    }
    else {
        /* stray continuation byte or invalid lead byte (0xf8 - 0xff):
           report it as broken instead of returning 0, which callers
           looping until 0 would take as the end of the string */
        cpoint = 0xfffd;
    }

    /* process the continuation bytes */
    while (cb > 0 && *p && xs_is_utf8_cont_byte(*p))
        cpoint |= (*p++ & 0x3f) << (--cb * 6);

    /* incomplete or broken? */
    if (cb)
        cpoint = 0xfffd;

    *str = p;
    return cpoint;
}


/** Unicode character width: intentionally dead simple **/

/* triplets of { first codepoint, last codepoint, width in columns };
   must be kept sorted by range, as it's binary-searched */
static const unsigned int xs_unicode_width_table[] = {
    0x300,      0x36f,      0,      /* diacritics */
    0x1100,     0x11ff,     2,      /* Hangul */
    0x2e80,     0xa4cf,     2,      /* CJK */
    0xac00,     0xd7a3,     2,      /* more Hangul */
    0xe000,     0xf8ff,     0,      /* private use */
    0xf900,     0xfaff,     2,      /* CJK compatibility */
    0xff00,     0xff60,     2,      /* full width things */
    0xffdf,     0xffe6,     2,      /* full width things */
    0x1f200,    0x1ffff,    2,      /* emojis */
    0x20000,    0x2fffd,    2       /* more CJK */
};

int xs_unicode_width(unsigned int cpoint)
/* returns the width in columns of a Unicode codepoint (somewhat simplified):
   the value from xs_unicode_width_table if cpoint falls in one of its
   ranges, or 1 otherwise */
{
    int b = 0;
    int t = xs_countof(xs_unicode_width_table) / 3 - 1;

    /* binary search over the ranges */
    while (t >= b) {
        int n = (b + t) / 2;
        const unsigned int *p = &xs_unicode_width_table[n * 3];

        if (cpoint < p[0])
            t = n - 1;
        else
        if (cpoint > p[1])
            b = n + 1;
        else
            return p[2];
    }

    return 1;
}


int xs_is_diacritic(unsigned int cpoint)
/* returns true if cpoint is in the 0x300 - 0x36f range (the one tagged
   as 'diacritics' in the width table) */
{
    return cpoint >= 0x300 && cpoint <= 0x36f;
}


/** surrogate pairs **/

int xs_is_surrogate(unsigned int cpoint)
/* checks if cpoint is in the surrogate range 0xd800 - 0xdfff (i.e. it's
   either of the two elements of a surrogate pair) */
{
    return cpoint >= 0xd800 && cpoint <= 0xdfff;
}


unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
/* "decodes" a surrogate pair (p1: first element, p2: second) into a codepoint */
{
    /* the low 10 bits of each element form a 20 bit offset that must be
       ADDED to 0x10000; or'ing them loses the carry whenever bit 16 of
       the offset is set (e.g. 0xd840 0xdc00 would yield 0x10000
       instead of 0x20000) */
    return 0x10000 + (((p1 & 0x3ff) << 10) | (p2 & 0x3ff));
}


unsigned int xs_surrogate_enc(unsigned int cpoint)
/* "encodes" a Unicode into a surrogate pair (p1 in the MSB word) */
{
    /* 0xd7c0 is 0xd800 - (0x10000 >> 10): subtracts the 0x10000 bias
       and adds the 0xd800 base in one step */
    unsigned int p1 = 0xd7c0 + (cpoint >> 10);
    unsigned int p2 = 0xdc00 + (cpoint & 0x3ff);

    return (p1 << 16) | p2;
}


#ifdef _XS_H

xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
/* encodes an Unicode codepoint to utf-8 and inserts it into str at
   byte position *offset; *offset is advanced by the number of bytes
   inserted. Returns the (possibly reallocated) string */
{
    char tmp[4];

    int c = xs_utf8_enc(tmp, cpoint);

    str = xs_insert_m(str, *offset, tmp, c);

    *offset += c;

    return str;
}


xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
/* encodes an Unicode codepoint to utf-8 and appends it to str */
{
    int offset = strlen(str);

    return xs_utf8_insert(str, cpoint, &offset);
}

#endif /* _XS_H */


#ifdef _XS_UNICODE_TBL_H

/* include xs_unicode_tbl.h before this one to use these functions */

static int _upper_match(const void *a, const void *b)
/* compares two case fold rows by their first element
   (the one xs_unicode_to_lower() looks up) */
{
    const unsigned int *ai = a;
    const unsigned int *bi = b;

    return (bi[0] < ai[0]) - (ai[0] < bi[0]);
}

static int _lower_match(const void *a, const void *b)
/* compares two case fold rows by their second element
   (the one xs_unicode_to_upper() looks up) */
{
    const unsigned int *ai = a;
    const unsigned int *bi = b;

    return (bi[1] < ai[1]) - (ai[1] < bi[1]);
}

unsigned int (*_xs_unicode_upper_search(unsigned int cpoint))[2]
/* searches for an uppercase codepoint in the case fold table;
   returns a pointer to the matching row or NULL. Uses bsearch(), so
   the table is assumed to be sorted by its first column */
{
    return bsearch((unsigned int[]){ cpoint, 0 }, xs_unicode_case_fold_table,
        xs_countof(xs_unicode_case_fold_table),
        sizeof(*xs_unicode_case_fold_table),
        _upper_match);
}


unsigned int (*_xs_unicode_lower_search(unsigned int cpoint))[2]
/* searches for a lowercase codepoint in the case fold table;
   returns a pointer to the matching row or NULL. This is a linear
   search (lfind()) over the second column */
{
    return lfind((unsigned int[]){ 0, cpoint }, xs_unicode_case_fold_table,
        &(size_t){ xs_countof(xs_unicode_case_fold_table) },
        sizeof(*xs_unicode_case_fold_table), _lower_match);
}


unsigned int xs_unicode_to_lower(unsigned int cpoint)
/* returns the cpoint to lowercase (or unchanged if it has no entry
   in the case fold table) */
{
    /* ASCII is delegated to the C library */
    if (cpoint < 0x80)
        return tolower(cpoint);

    unsigned int (*p)[2] = _xs_unicode_upper_search(cpoint);

    return p == NULL ? cpoint : p[0][1];
}


unsigned int xs_unicode_to_upper(unsigned int cpoint)
/* returns the cpoint to uppercase (or unchanged if it has no entry
   in the case fold table) */
{
    /* ASCII is delegated to the C library */
    if (cpoint < 0x80)
        return toupper(cpoint);

    unsigned int (*p)[2] = _xs_unicode_lower_search(cpoint);

    return p == NULL ? cpoint : p[0][0];
}


static int _nfd_match(const void *a, const void *b)
/* compares two NFD table rows by their first element (the composed codepoint) */
{
    const unsigned int *ai = a;
    const unsigned int *bi = b;

    return (bi[0] < ai[0]) - (ai[0] < bi[0]);
}

int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
/* applies unicode Normalization Form D: if cpoint is found in the NFD
   table, stores its two components in *base and *diac and returns 1;
   otherwise returns 0 and leaves both untouched. Uses bsearch(), so
   the table is assumed to be sorted by its first column */
{
    unsigned int (*p)[3] = bsearch((unsigned int[]){ cpoint, 0, 0 },
        xs_unicode_nfd_table,
        xs_countof(xs_unicode_nfd_table),
        sizeof(*xs_unicode_nfd_table), _nfd_match);

    if (p == NULL)
        return 0;

    *base = p[0][1];
    *diac = p[0][2];

    return 1;
}

static int _nfc_match(const void *a, const void *b)
/* compares two NFD table rows by their second and third elements
   (base + diacritic); only the zero / non-zero result is meaningful */
{
    const unsigned int *ai = a;
    const unsigned int *bi = b;

    return memcmp(ai + 1, bi + 1, 2 * sizeof(*ai));
}


int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
/* applies unicode Normalization Form C: if the base + diac pair is
   found in the NFD table, stores the composed codepoint in *cpoint
   and returns 1; otherwise returns 0 and leaves it untouched.
   This is a linear search (lfind()) */
{
    unsigned int (*p)[3] = lfind((unsigned int[]){ 0, base, diac },
        xs_unicode_nfd_table,
        &(size_t){ xs_countof(xs_unicode_nfd_table) },
        sizeof(*xs_unicode_nfd_table), _nfc_match);

    if (p == NULL)
        return 0;

    *cpoint = p[0][0];

    return 1;
}

static int _uint_match(const void *a, const void *b)
/* compares two unsigned ints (bsearch() callback) */
{
    const unsigned int *ai = a;
    const unsigned int *bi = b;

    return (*bi < *ai) - (*ai < *bi);
}


int xs_unicode_is_alpha(unsigned int cpoint)
/* checks if a codepoint is an alpha (i.e. a letter), by looking it up
   in xs_unicode_alpha_table */
{
    return !!bsearch(&cpoint, xs_unicode_alpha_table,
        xs_countof(xs_unicode_alpha_table),
        sizeof(*xs_unicode_alpha_table), _uint_match);
}


int xs_unicode_is_right_to_left(unsigned int cpoint)
/* checks if a codepoint is a right-to-left letter, by looking it up
   in xs_unicode_right_to_left_table */
{
    return !!bsearch(&cpoint, xs_unicode_right_to_left_table,
        xs_countof(xs_unicode_right_to_left_table),
        sizeof(*xs_unicode_right_to_left_table), _uint_match);
}


#ifdef _XS_H

xs_str *xs_utf8_to_upper(const char *str)
/* returns a new string with all codepoints in the utf-8 string str
   converted to uppercase */
{
    xs_str *s = xs_str_new(NULL);
    unsigned int cpoint;
    int offset = 0;

    while ((cpoint = xs_utf8_dec(&str))) {
        cpoint = xs_unicode_to_upper(cpoint);
        s = xs_utf8_insert(s, cpoint, &offset);
    }

    return s;
}


xs_str *xs_utf8_to_lower(const char *str)
/* returns a new string with all codepoints in the utf-8 string str
   converted to lowercase */
{
    xs_str *s = xs_str_new(NULL);
    unsigned int cpoint;
    int offset = 0;

    while ((cpoint = xs_utf8_dec(&str))) {
        cpoint = xs_unicode_to_lower(cpoint);
        s = xs_utf8_insert(s, cpoint, &offset);
    }

    return s;
}


xs_str *xs_utf8_to_nfd(const char *str)
/* returns a new string with the utf-8 string str converted to NFD:
   every codepoint found in the NFD table is replaced by its base
   followed by its diacritic (one decomposition step per codepoint) */
{
    xs_str *s = xs_str_new(NULL);
    unsigned int cpoint;
    int offset = 0;

    while ((cpoint = xs_utf8_dec(&str))) {
        unsigned int base;

        /* on success, cpoint is overwritten with the diacritic, so the
           insert below emits it right after the base */
        if (xs_unicode_nfd(cpoint, &base, &cpoint))
            s = xs_utf8_insert(s, base, &offset);

        s = xs_utf8_insert(s, cpoint, &offset);
    }

    return s;
}


xs_str *xs_utf8_to_nfc(const char *str)
/* returns a new string with the utf-8 string str converted to NFC:
   every codepoint followed by a diacritic is replaced by its composed
   form if the pair is found in the NFD table */
{
    xs_str *s = xs_str_new(NULL);
    unsigned int cpoint;
    unsigned int base = 0;      /* pending codepoint, not yet stored */
    int offset = 0;

    while ((cpoint = xs_utf8_dec(&str))) {
        if (xs_is_diacritic(cpoint)) {
            /* if it composes, the pending codepoint becomes the composed one */
            if (xs_unicode_nfc(base, cpoint, &base))
                continue;
        }

        /* flush the pending codepoint */
        if (base)
            s = xs_utf8_insert(s, base, &offset);

        base = cpoint;
    }

    /* flush the last one */
    if (base)
        s = xs_utf8_insert(s, base, &offset);

    return s;
}

#endif /* _XS_H */

#endif /* _XS_UNICODE_TBL_H */

#endif /* XS_IMPLEMENTATION */

#endif /* _XS_UNICODE_H */