snac2

Fork of https://codeberg.org/grunfink/snac2
git clone https://git.inz.fi/snac2
Log | Files | Refs | README | LICENSE

xs_unicode.h (10837B)


      1 /* copyright (c) 2022 - 2025 grunfink et al. / MIT license */
      2 
      3 #ifndef _XS_UNICODE_H
      4 
      5 #define _XS_UNICODE_H
      6 
 /* encodes cpoint as utf-8 into buf; returns the number of bytes written */
 int xs_utf8_enc(char buf[4], unsigned int cpoint);
 /* true if c has the 10xxxxxx bit pattern of an utf-8 continuation byte */
 int xs_is_utf8_cont_byte(char c);
 /* decodes one utf-8 char from *str and advances *str past it;
    incomplete sequences yield U+FFFD */
 unsigned int xs_utf8_dec(const char **str);
 /* display width in columns (0, 1 or 2) of a codepoint, from a small table */
 int xs_unicode_width(unsigned int cpoint);
 /* true for any codepoint in the surrogate range U+D800..U+DFFF */
 int xs_is_surrogate(unsigned int cpoint);
 /* true for codepoints in U+0300..U+036F */
 int xs_is_diacritic(unsigned int cpoint);
 /* combines the two halves of a surrogate pair into one codepoint */
 unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
 /* splits a codepoint into a surrogate pair, returned as (p1 << 16) | p2 */
 unsigned int xs_surrogate_enc(unsigned int cpoint);
 /* these two return a pointer to the matching { upper, lower } entry of
    the case fold table, or NULL if cpoint is not found */
 unsigned int (*_xs_unicode_upper_search(unsigned int cpoint))[2];
 unsigned int (*_xs_unicode_lower_search(unsigned int cpoint))[2];
 /* a codepoint "is upper" (or "is lower") when it appears in the
    corresponding column of the case fold table */
 #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
 #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
 /* case conversion; codepoints without a mapping are returned unchanged */
 unsigned int xs_unicode_to_upper(unsigned int cpoint);
 unsigned int xs_unicode_to_lower(unsigned int cpoint);
 /* splits cpoint into *base + *diac; returns 1 on success, 0 if the
    codepoint is not in the decomposition table */
 int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
 /* joins base + diac into *cpoint; returns 1 on success, 0 if the pair
    is not in the decomposition table */
 int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
 /* table lookups: true if cpoint is listed in the alpha / right-to-left table */
 int xs_unicode_is_alpha(unsigned int cpoint);
 int xs_unicode_is_right_to_left(unsigned int cpoint);

#ifdef _XS_H
 /* xs_str helpers (only declared if xs.h was included first); all of them
    return the resulting string */
 /* inserts cpoint as utf-8 at *offset and advances *offset past it */
 xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
 /* appends cpoint as utf-8 at the end of str */
 xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
 /* these build a new string with every character of str converted */
 xs_str *xs_utf8_to_upper(const char *str);
 xs_str *xs_utf8_to_lower(const char *str);
 xs_str *xs_utf8_to_nfd(const char *str);
 xs_str *xs_utf8_to_nfc(const char *str);
#endif
     34 
     35 #ifdef XS_IMPLEMENTATION
     36 
     37 #include <ctype.h>
     38 #include <search.h>
     39 
     40 #ifndef xs_countof
     41 #define xs_countof(a) (sizeof((a)) / sizeof((*a)))
     42 #endif
     43 
     44 int xs_utf8_enc(char buf[4], unsigned int cpoint)
     45 /* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */
     46 {
     47     char *p = buf;
     48 
     49     if (cpoint < 0x80) /* 1 byte char */
     50         *p++ = cpoint & 0xff;
     51     else {
     52         if (cpoint < 0x800) /* 2 byte char */
     53             *p++ = 0xc0 | (cpoint >> 6);
     54         else {
     55             if (cpoint < 0x10000) /* 3 byte char */
     56                 *p++ = 0xe0 | (cpoint >> 12);
     57             else { /* 4 byte char */
     58                 *p++ = 0xf0 | (cpoint >> 18);
     59                 *p++ = 0x80 | ((cpoint >> 12) & 0x3f);
     60             }
     61 
     62             *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
     63         }
     64 
     65         *p++ = 0x80 | (cpoint & 0x3f);
     66     }
     67 
     68     return p - buf;
     69 }
     70 
     71 
     72 int xs_is_utf8_cont_byte(char c)
     73 /* returns true if c is an utf8 continuation byte */
     74 {
     75     return ((c & 0xc0) == 0x80);
     76 }
     77 
     78 
     79 unsigned int xs_utf8_dec(const char **str)
     80 /* decodes an utf-8 char inside str and updates the pointer */
     81 {
     82     const char *p = *str;
     83     unsigned int cpoint = 0;
     84     unsigned char c = *p++;
     85     int cb = 0;
     86 
     87     if ((c & 0x80) == 0) { /* 1 byte char */
     88         cpoint = c;
     89     }
     90     else
     91     if ((c & 0xe0) == 0xc0) { /* 2 byte char */
     92         cpoint = (c & 0x1f) << 6;
     93         cb = 1;
     94     }
     95     else
     96     if ((c & 0xf0) == 0xe0) { /* 3 byte char */
     97         cpoint = (c & 0x0f) << 12;
     98         cb = 2;
     99     }
    100     else
    101     if ((c & 0xf8) == 0xf0) { /* 4 byte char */
    102         cpoint = (c & 0x07) << 18;
    103         cb = 3;
    104     }
    105 
    106     /* process the continuation bytes */
    107     while (cb > 0 && *p && xs_is_utf8_cont_byte(*p))
    108         cpoint |= (*p++ & 0x3f) << (--cb * 6);
    109 
    110     /* incomplete or broken? */
    111     if (cb)
    112         cpoint = 0xfffd;
    113 
    114     *str = p;
    115     return cpoint;
    116 }
    117 
    118 
    119 /** Unicode character width: intentionally dead simple **/
    120 
    121 static unsigned int xs_unicode_width_table[] = {
    122     0x300,      0x36f,      0,      /* diacritics */
    123     0x1100,     0x11ff,     2,      /* Hangul */
    124     0x2e80,     0xa4cf,     2,      /* CJK */
    125     0xac00,     0xd7a3,     2,      /* more Hangul */
    126     0xe000,     0xf8ff,     0,      /* private use */
    127     0xf900,     0xfaff,     2,      /* CJK compatibility */
    128     0xff00,     0xff60,     2,      /* full width things */
    129     0xffdf,     0xffe6,     2,      /* full width things */
    130     0x1f200,    0x1ffff,    2,      /* emojis */
    131     0x20000,    0x2fffd,    2       /* more CJK */
    132 };
    133 
    134 int xs_unicode_width(unsigned int cpoint)
    135 /* returns the width in columns of a Unicode codepoint (somewhat simplified) */
    136 {
    137     int b = 0;
    138     int t = xs_countof(xs_unicode_width_table) / 3 - 1;
    139 
    140     while (t >= b) {
    141         int n = (b + t) / 2;
    142         unsigned int *p = &xs_unicode_width_table[n * 3];
    143 
    144         if (cpoint < p[0])
    145             t = n - 1;
    146         else
    147         if (cpoint > p[1])
    148             b = n + 1;
    149         else
    150             return p[2];
    151     }
    152 
    153     return 1;
    154 }
    155 
    156 
    157 int xs_is_diacritic(unsigned int cpoint)
    158 {
    159     return cpoint >= 0x300 && cpoint <= 0x36f;
    160 }
    161 
    162 
    163 /** surrogate pairs **/
    164 
    165 int xs_is_surrogate(unsigned int cpoint)
    166 /* checks if cpoint is the first element of a Unicode surrogate pair */
    167 {
    168     return cpoint >= 0xd800 && cpoint <= 0xdfff;
    169 }
    170 
    171 
    172 unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
    173 /* "decodes" a surrogate pair into a codepoint */
    174 {
    175     return 0x10000 | ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);
    176 }
    177 
    178 
    179 unsigned int xs_surrogate_enc(unsigned int cpoint)
    180 /* "encodes" a Unicode into a surrogate pair (p1 in the MSB word) */
    181 {
    182     unsigned int p1 = 0xd7c0 + (cpoint >> 10);
    183     unsigned int p2 = 0xdc00 + (cpoint & 0x3ff);
    184 
    185     return (p1 << 16) | p2;
    186 }
    187 
    188 
    189 #ifdef _XS_H
    190 
    191 xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
    192 /* encodes an Unicode codepoint to utf-8 into str */
    193 {
    194     char tmp[4];
    195 
    196     int c = xs_utf8_enc(tmp, cpoint);
    197 
    198     str = xs_insert_m(str, *offset, tmp, c);
    199 
    200     *offset += c;
    201 
    202     return str;
    203 }
    204 
    205 
    206 xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
    207 /* encodes an Unicode codepoint to utf-8 into str */
    208 {
    209     int offset = strlen(str);
    210 
    211     return xs_utf8_insert(str, cpoint, &offset);
    212 }
    213 
    214 #endif /* _XS_H */
    215 
    216 
    217 #ifdef _XS_UNICODE_TBL_H
    218 
    219 /* include xs_unicode_tbl.h before this one to use these functions */
    220 
    221 int _upper_match(const void *a, const void *b)
    222 {
    223     const unsigned int (*ai)[2] = (void *)a;
    224     const unsigned int (*bi)[2] = (void *)b;
    225 
    226     return (**bi < **ai) - (**ai < **bi);
    227 }
    228 
    229 int _lower_match(const void *a, const void *b)
    230 {
    231     const unsigned int (*ai)[2] = (void *)a;
    232     const unsigned int (*bi)[2] = (void *)b;
    233 
    234     return (bi[0][1] < ai[0][1]) - (ai[0][1] < bi[0][1]);
    235 }
    236 
    237 unsigned int (*_xs_unicode_upper_search(unsigned int cpoint))[2]
    238 /* searches for an uppercase codepoint in the case fold table */
    239 {
    240     return bsearch((unsigned int[]){ cpoint, 0}, xs_unicode_case_fold_table,
    241                    xs_countof(xs_unicode_case_fold_table),
    242                    sizeof(*xs_unicode_case_fold_table),
    243 		   _upper_match);
    244 }
    245 
    246 
    247 unsigned int (*_xs_unicode_lower_search(unsigned int cpoint))[2]
    248 /* searches for a lowercase codepoint in the case fold table */
    249 {
    250 
    251     return lfind((unsigned int[]){ 0, cpoint }, xs_unicode_case_fold_table,
    252                             &(size_t){ xs_countof(xs_unicode_case_fold_table) },
    253                             sizeof(*xs_unicode_case_fold_table), _lower_match);
    254 }
    255 
    256 
    257 unsigned int xs_unicode_to_lower(unsigned int cpoint)
    258 /* returns the cpoint to lowercase */
    259 {
    260     if (cpoint < 0x80)
    261         return tolower(cpoint);
    262 
    263     unsigned int (*p)[2] = _xs_unicode_upper_search(cpoint);
    264 
    265     return p == NULL ? cpoint : p[0][1];
    266 }
    267 
    268 
    269 unsigned int xs_unicode_to_upper(unsigned int cpoint)
    270 /* returns the cpoint to uppercase */
    271 {
    272     if (cpoint < 0x80)
    273         return toupper(cpoint);
    274 
    275     unsigned int (*p)[2] = _xs_unicode_lower_search(cpoint);
    276 
    277     return p == NULL ? cpoint : **p;
    278 }
    279 
    280 
    281 int _nfd_match(const void *a, const void *b)
    282 {
    283     const unsigned int (*ai)[3] = (void *)a;
    284     const unsigned int (*bi)[3] = (void *)b;
    285 
    286     return (**bi < **ai) - (**ai < **bi);
    287 }
    288 
    289 int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
    290 /* applies unicode Normalization Form D */
    291 {
    292     unsigned int (*p)[3] = bsearch((unsigned int[]){ cpoint, 0, 0 }, xs_unicode_nfd_table, xs_countof(xs_unicode_nfd_table), sizeof(*xs_unicode_nfd_table), _nfd_match);
    293     if (p == NULL)
    294         return 0;
    295 
    296     *base = p[0][1];
    297     *diac = p[0][2];
    298 
    299     return 1;
    300 }
    301 
    302 int _nfc_match(const void *a, const void *b)
    303 {
    304     const unsigned int (*ai)[3] = (void *)a;
    305     const unsigned int (*bi)[3] = (void *)b;
    306 
    307     return memcmp(*ai + 1, *bi + 1, 2 * sizeof(**ai));
    308 }
    309 
    310 
    311 int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
    312 /* applies unicode Normalization Form C */
    313 {
    314     unsigned int (*p)[3] = lfind((unsigned int[]){ 0, base, diac }, xs_unicode_nfd_table,
    315                                  &(size_t){ xs_countof(xs_unicode_nfd_table) },
    316                                  sizeof(*xs_unicode_nfd_table), _nfc_match);
    317 
    318     if (p == NULL)
    319         return 0;
    320 
    321     *cpoint = p[0][0];
    322 
    323     return 1;
    324 }
    325 
    326 int _uint_match(const void *a, const void *b)
    327 {
    328     const unsigned int *ai = a;
    329     const unsigned int *bi = b;
    330 
    331     return (*bi < *ai) - (*ai < *bi);
    332 }
    333 
    334 
    335 int xs_unicode_is_alpha(unsigned int cpoint)
    336 /* checks if a codepoint is an alpha (i.e. a letter) */
    337 {
    338     return !!bsearch(&cpoint, xs_unicode_alpha_table, xs_countof(xs_unicode_alpha_table), sizeof(*xs_unicode_alpha_table), _uint_match);
    339 }
    340 
    341 
    342 int xs_unicode_is_right_to_left(unsigned int cpoint)
    343 /* checks if a codepoint is a right-to-left letter */
    344 {
    345     return !!bsearch(&cpoint, xs_unicode_right_to_left_table, xs_countof(xs_unicode_right_to_left_table), sizeof(*xs_unicode_right_to_left_table), _uint_match);
    346 }
    347 
    348 
    349 #ifdef _XS_H
    350 
    351 xs_str *xs_utf8_to_upper(const char *str)
    352 {
    353     xs_str *s = xs_str_new(NULL);
    354     unsigned int cpoint;
    355     int offset = 0;
    356 
    357     while ((cpoint = xs_utf8_dec(&str))) {
    358         cpoint = xs_unicode_to_upper(cpoint);
    359         s = xs_utf8_insert(s, cpoint, &offset);
    360     }
    361 
    362     return s;
    363 }
    364 
    365 
    366 xs_str *xs_utf8_to_lower(const char *str)
    367 {
    368     xs_str *s = xs_str_new(NULL);
    369     unsigned int cpoint;
    370     int offset = 0;
    371 
    372     while ((cpoint = xs_utf8_dec(&str))) {
    373         cpoint = xs_unicode_to_lower(cpoint);
    374         s = xs_utf8_insert(s, cpoint, &offset);
    375     }
    376 
    377     return s;
    378 }
    379 
    380 
    381 xs_str *xs_utf8_to_nfd(const char *str)
    382 {
    383     xs_str *s = xs_str_new(NULL);
    384     unsigned int cpoint;
    385     int offset = 0;
    386 
    387     while ((cpoint = xs_utf8_dec(&str))) {
    388         unsigned int base;
    389 
    390         if (xs_unicode_nfd(cpoint, &base, &cpoint))
    391             s = xs_utf8_insert(s, base, &offset);
    392 	s = xs_utf8_insert(s, cpoint, &offset);
    393     }
    394 
    395     return s;
    396 }
    397 
    398 
    399 xs_str *xs_utf8_to_nfc(const char *str)
    400 {
    401     xs_str *s = xs_str_new(NULL);
    402     unsigned int cpoint;
    403     unsigned int base = 0;
    404     int offset = 0;
    405 
    406     while ((cpoint = xs_utf8_dec(&str))) {
    407         if (xs_is_diacritic(cpoint)) {
    408             if (xs_unicode_nfc(base, cpoint, &base))
    409                 continue;
    410         }
    411 
    412         if (base)
    413             s = xs_utf8_insert(s, base, &offset);
    414 
    415         base = cpoint;
    416     }
    417 
    418     if (base)
    419         s = xs_utf8_insert(s, base, &offset);
    420 
    421     return s;
    422 }
    423 
    424 #endif /* _XS_H */
    425 
    426 #endif /* _XS_UNICODE_TBL_H */
    427 
    428 #endif /* XS_IMPLEMENTATION */
    429 
    430 #endif /* _XS_UNICODE_H */