snac2

Fork of https://codeberg.org/grunfink/snac2
git clone https://git.inz.fi/snac2
Log | Files | Refs | README | LICENSE

format.c (14715B)


      1 /* snac - A simple, minimalistic ActivityPub instance */
      2 /* copyright (c) 2022 - 2025 grunfink et al. / MIT license */
      3 
      4 #include "xs.h"
      5 #include "xs_regex.h"
      6 #include "xs_mime.h"
      7 #include "xs_html.h"
      8 #include "xs_json.h"
      9 #include "xs_time.h"
     10 #include "xs_match.h"
     11 #include "xs_unicode.h"
     12 
     13 #include "snac.h"
     14 
     15 /* emoticons, people laughing and such */
     16 const char *smileys[] = {
     17     ":-)",        "🙂",
     18     ":-D",        "😀",
     19     "X-D",        "😆",
     20     ";-)",        "😉",
     21     "B-)",        "😎",
     22     ">:-(",       "😡",
     23     ":-(",        "😞",
     24     ":-*",        "😘",
     25     ":-/",        "😕",
     26     "8-o",        "😲",
     27     "%-)",        "🤪",
     28     ":_(",        "😢",
     29     ":-|",        "😐",
     30     "<3",         "&#10084;&#65039;",
     31     ":facepalm:", "&#129318;",
     32     ":shrug:",    "&#129335;",
     33     ":shrug2:",   "&#175;\\_(&#12484;)_/&#175;",
     34     ":eyeroll:",  "&#128580;",
     35     ":beer:",     "&#127866;",
     36     ":beers:",    "&#127867;",
     37     ":munch:",    "&#128561;",
     38     ":thumb:",    "&#128077;",
     39     NULL,         NULL
     40 };
     41 
     42 
     43 xs_dict *emojis(void)
     44 /* returns a dict with the emojis */
     45 {
     46     xs *fn = xs_fmt("%s/emojis.json", srv_basedir);
     47     FILE *f;
     48 
     49     if (mtime(fn) == 0) {
     50         /* file does not exist; create it with the defaults */
     51         xs *d = xs_dict_new();
     52         const char **emo = smileys;
     53 
     54         while (*emo) {
     55             d = xs_dict_append(d, emo[0], emo[1]);
     56             emo += 2;
     57         }
     58 
     59         if ((f = fopen(fn, "w")) != NULL) {
     60             xs_json_dump(d, 4, f);
     61             fclose(f);
     62         }
     63         else
     64             srv_log(xs_fmt("Error creating '%s'", fn));
     65     }
     66 
     67     xs_dict *d = NULL;
     68 
     69     if ((f = fopen(fn, "r")) != NULL) {
     70         d = xs_json_load(f);
     71         fclose(f);
     72 
     73         if (d == NULL)
     74             srv_log(xs_fmt("JSON parse error in '%s'", fn));
     75     }
     76     else
     77         srv_log(xs_fmt("Error opening '%s'", fn));
     78 
     79     return d;
     80 }
     81 
     82 /* Non-whitespace without trailing comma, period or closing paren */
     83 #define NOSPACE "([^[:space:],.)]+|[,.)]+[^[:space:],.)])+"
     84 
     85 static xs_str *format_line(const char *line, xs_list **attach)
     86 /* formats a line */
     87 {
     88     xs_str_bld b = { 0 };
     89     char *p;
     90     const char *v;
     91 
     92     /* split by markup */
     93     xs *sm = xs_regex_split(line,
     94         "("
     95             "`[^`]+`"                           "|"
     96             "~~[^~]+~~"                         "|"
     97             "\\*\\*?\\*?[^\\*]+\\*?\\*?\\*"     "|"
     98             "__[^_]+__"                         "|" //anzu
     99             "!\\[[^]]+\\]\\([^\\)]+\\)\\)?"     "|"
    100             "\\[[^]]+\\]\\([^\\)]+\\)\\)?"      "|"
    101             "[a-z]+:/" "/" NOSPACE              "|"
    102             "(mailto|xmpp):[^@[:space:]]+@" NOSPACE
    103         ")");
    104     int n = 0;
    105 
    106     p = sm;
    107     while (xs_list_iter(&p, &v)) {
    108         if ((n & 0x1)) {
    109             /* markup */
    110             if (xs_startswith(v, "`")) {
    111                 xs *s1 = xs_strip_chars_i(xs_dup(v), "`");
    112                 xs *e1 = encode_html(s1);
    113                 xs_str_bld_cat_fmt(&b, "<code>%s</code>", e1);
    114             }
    115             else
    116             if (xs_startswith(v, "***")) {
    117                 xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
    118                 xs_str_bld_cat_fmt(&b, "<b><i>%s</i></b>", s1);
    119             }
    120             else
    121             if (xs_startswith(v, "**")) {
    122                 xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
    123                 xs_str_bld_cat_fmt(&b, "<b>%s</b>", s1);
    124             }
    125             else
    126             if (xs_startswith(v, "*")) {
    127                 xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
    128                 xs_str_bld_cat_fmt(&b, "<i>%s</i>", s1);
    129             }
    130             //anzu - begin
    131             else
    132             if (xs_startswith(v, "__")) {
    133                 xs *s1 = xs_strip_chars_i(xs_dup(v), "_");
    134                 xs_str_bld_cat_fmt(&b, "<u>%s</u>", s1);
    135             }
    136             //anzu - end
    137             else
    138             if (xs_startswith(v, "~~")) {
    139                 xs *s1 = xs_strip_chars_i(xs_dup(v), "~");
    140                 xs *e1 = encode_html(s1);
    141                 xs_str_bld_cat_fmt(&b, "<s>%s</s>", e1);
    142             }
    143             else
    144             if (*v == '[') {
    145                 /* markdown-like links [label](url) */
    146                 xs *w = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
    147                 xs *l = xs_split_n(w, "](", 1);
    148 
    149                 if (xs_list_len(l) == 2) {
    150                     xs *name = xs_dup(xs_list_get(l, 0));
    151                     xs *url  = xs_dup(xs_list_get(l, 1));
    152 
    153                     name = xs_crop_i(name, 1, 0);
    154                     url  = xs_crop_i(url, 0, -1);
    155 
    156                     xs_str_bld_cat_fmt(&b, "<a href=\"%s\">%s</a>",
    157                             url, name);
    158                 }
    159                 else
    160                     xs_str_bld_cat(&b, v);
    161             }
    162             else
    163             if (*v == '!') {
    164                 /* markdown-like images ![alt text](url to image) */
    165                 xs *w = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
    166                 xs *l = xs_split_n(w, "](", 1);
    167 
    168                 if (xs_list_len(l) == 2) {
    169                     xs *alt_text = xs_dup(xs_list_get(l, 0));
    170                     xs *img_url  = xs_dup(xs_list_get(l, 1));
    171 
    172                     alt_text = xs_crop_i(alt_text, 2, 0);
    173                     img_url  = xs_crop_i(img_url, 0, -1);
    174 
    175                     const char *mime = xs_mime_by_ext(img_url);
    176 
    177                     if (attach != NULL && xs_startswith(mime, "image/")) {
    178                         const xs_dict *ad;
    179                         int add = 1;
    180 
    181                         xs_list_foreach(*attach, ad) {
    182                             if (strcmp(xs_dict_get_def(ad, "url", ""), img_url) == 0) {
    183                                 add = 0;
    184                                 break;
    185                             }
    186                         }
    187 
    188                         if (add) {
    189                             xs *d = xs_dict_new();
    190 
    191                             d = xs_dict_append(d, "mediaType", mime);
    192                             d = xs_dict_append(d, "url",       img_url);
    193                             d = xs_dict_append(d, "name",      alt_text);
    194                             d = xs_dict_append(d, "type",      "Image");
    195 
    196                             *attach = xs_list_append(*attach, d);
    197                         }
    198                     }
    199                     else {
    200                         xs_str_bld_cat_fmt(&b, "<a href=\"%s\">%s</a>", img_url, alt_text);
    201                     }
    202                 }
    203                 else
    204                     xs_str_bld_cat(&b, v);
    205             }
    206             else
    207             if (xs_str_in(v, ":/" "/") != -1) {
    208                 /* direct URLs in the post body */
    209                 xs *u  = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
    210 
    211                 xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
    212 
    213                 const char *mime = xs_mime_by_ext(v2);
    214 
    215                 if (attach != NULL && xs_startswith(mime, "image/")) {
    216                     /* if it's a link to an image, insert it as an attachment */
    217                     const xs_dict *ad;
    218                     int add = 1;
    219 
    220                     xs_list_foreach(*attach, ad) {
    221                         if (strcmp(xs_dict_get_def(ad, "url", ""), v2) == 0) {
    222                             add = 0;
    223                             break;
    224                         }
    225                     }
    226 
    227                     if (add) {
    228                         xs *d = xs_dict_new();
    229 
    230                         d = xs_dict_append(d, "mediaType", mime);
    231                         d = xs_dict_append(d, "url",       v2);
    232                         d = xs_dict_append(d, "name",      "");
    233                         d = xs_dict_append(d, "type",      "Image");
    234 
    235                         *attach = xs_list_append(*attach, d);
    236                     }
    237                 }
    238                 else {
    239                     xs_str_bld_cat_fmt(&b, "<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
    240                 }
    241             }
    242             else
    243             if (xs_match(v, "mailto*|xmpp*")) {
    244                 xs *u  = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
    245 
    246                 xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
    247 
    248                 xs_str_bld_cat_fmt(&b, "<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
    249             }
    250             else
    251                 xs_str_bld_cat(&b, v);
    252         }
    253         else
    254             /* surrounded text, copy directly */
    255             xs_str_bld_cat(&b, v);
    256 
    257         n++;
    258     }
    259 
    260     return b.data;
    261 }
    262 
    263 
    264 xs_str *not_really_markdown(const char *content, xs_list **attach, xs_list **tag)
    265 /* formats a content using some Markdown rules */
    266 {
    267     xs_str_bld b = { 0 };
    268     int in_pre = 0;
    269     int in_blq = 0;
    270     xs *list;
    271     char *p;
    272     const char *v;
    273 
    274     /* work by lines */
    275     list = xs_split(content, "\n");
    276 
    277     p = list;
    278     while (xs_list_iter(&p, &v)) {
    279         xs *ss = NULL;
    280 
    281         if (strcmp(v, "```") == 0) {
    282             if (!in_pre)
    283                 xs_str_bld_cat(&b, "<pre>");
    284             else
    285                 xs_str_bld_cat(&b, "</pre>");
    286 
    287             in_pre = !in_pre;
    288             continue;
    289         }
    290 
    291         if (in_pre) {
    292             // Encode all HTML characters when we're in pre element until we are out.
    293             ss = encode_html(v);
    294 
    295             xs_str_bld_cat(&b, ss);
    296             xs_str_bld_cat(&b, "<br>");
    297             continue;
    298         }
    299 
    300         else
    301             ss = xs_strip_i(format_line(v, attach));
    302 
    303         if (xs_startswith(ss, "---")) {
    304             /* delete the --- */
    305             ss = xs_strip_i(xs_crop_i(ss, 3, 0));
    306             xs_str_bld_cat(&b, "<hr>");
    307 
    308             xs_str_bld_cat(&b, ss);
    309 
    310             continue;
    311         }
    312 
    313         //anzu - begin
    314         // h1 reserved for snac?
    315         if (xs_startswith(ss, "# ")) {
    316             ss = xs_strip_i(xs_crop_i(ss, 2, 0));
    317             xs_str_bld_cat(&b, "<h2>");
    318             xs_str_bld_cat(&b, ss);
    319             xs_str_bld_cat(&b, "</h2>");
    320             continue;
    321         }
    322         if (xs_startswith(ss, "## ")) {
    323             ss = xs_strip_i(xs_crop_i(ss, 3, 0));
    324             xs_str_bld_cat(&b, "<h2>");
    325             xs_str_bld_cat(&b, ss);
    326             xs_str_bld_cat(&b, "</h2>");
    327             continue;
    328         }
    329         if (xs_startswith(ss, "### ")) {
    330             ss = xs_strip_i(xs_crop_i(ss, 4, 0));
    331             xs_str_bld_cat(&b, "<h3>");
    332             xs_str_bld_cat(&b, ss);
    333             xs_str_bld_cat(&b, "</h3>");
    334             continue;
    335         }
    336         //anzu - end
    337 
    338         if (xs_startswith(ss, ">")) {
    339             /* delete the > and subsequent spaces */
    340             ss = xs_strip_i(xs_crop_i(ss, 1, 0));
    341 
    342             if (!in_blq) {
    343                 xs_str_bld_cat(&b, "<blockquote>");
    344                 in_blq = 1;
    345             }
    346 
    347             xs_str_bld_cat(&b, ss);
    348             xs_str_bld_cat(&b, "<br>");
    349 
    350             continue;
    351         }
    352 
    353         if (in_blq) {
    354             xs_str_bld_cat(&b, "</blockquote>");
    355             in_blq = 0;
    356         }
    357 
    358         xs_str_bld_cat(&b, ss);
    359         xs_str_bld_cat(&b, "<br>");
    360     }
    361 
    362     if (in_blq)
    363         xs_str_bld_cat(&b, "</blockquote>");
    364     if (in_pre)
    365         xs_str_bld_cat(&b, "</pre>");
    366 
    367     xs_str *s = b.data;
    368     /* some beauty fixes */
    369     s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
    370     s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
    371     s = xs_replace_i(s, "</pre><br>", "</pre>");
    372     s = xs_replace_i(s, "</h2><br>", "</h2>"); //anzu ???
    373     s = xs_replace_i(s, "</h3><br>", "</h3>"); //anzu ???
    374 
    375     {
    376         /* traditional emoticons */
    377         xs *d = emojis();
    378         int c = 0;
    379         const char *k, *v;
    380 
    381         while (xs_dict_next(d, &k, &v, &c)) {
    382             const char *t = xs_mime_by_ext(v);
    383 
    384             /* is it an URL to an image? */
    385             if (xs_startswith(v, "https:/" "/") &&
    386                 (xs_startswith(t, "image/") || strcmp(t, "application/octet-stream") == 0)) {
    387                 if (tag && xs_str_in(s, k) != -1) {
    388                     /* add the emoji to the tag list */
    389                     xs *e = xs_dict_new();
    390                     xs *i = xs_dict_new();
    391                     xs *u = xs_str_utctime(0, ISO_DATE_SPEC);
    392 
    393                     e = xs_dict_append(e, "id", v);
    394                     e = xs_dict_append(e, "type", "Emoji");
    395                     e = xs_dict_append(e, "name", k);
    396                     e = xs_dict_append(e, "updated", u);
    397 
    398                     i = xs_dict_append(i, "type", "Image");
    399                     i = xs_dict_append(i, "mediaType", t);
    400                     i = xs_dict_append(i, "url", v);
    401                     e = xs_dict_append(e, "icon", i);
    402 
    403                     *tag = xs_list_append(*tag, e);
    404                 }
    405             }
    406             else
    407                 s = xs_replace_i(s, k, v);
    408         }
    409     }
    410 
    411     return s;
    412 }
    413 
    414 
    415 const char *valid_tags[] = {
    416     "a", "p", "br", "br/", "blockquote", "ul", "ol", "li", "cite", "small",
    417     "span", "i", "b", "u", "s", "pre", "code", "em", "strong", "hr", "img", "del", "bdi",
    418     "h2","h3", //anzu
    419     NULL
    420 };
    421 
    422 xs_str *sanitize(const char *content)
    423 /* cleans dangerous HTML output */
    424 {
    425     xs_str_bld b = { 0 };
    426     xs *sl;
    427     int n = 0;
    428     char *p;
    429     const char *v;
    430 
    431     sl = xs_regex_split(content, "</?[^>]+>");
    432 
    433     p = sl;
    434 
    435     n = 0;
    436     while (xs_list_iter(&p, &v)) {
    437         if (n & 0x1) {
    438             xs *s1  = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
    439             xs *l1  = xs_split_n(s1, " ", 1);
    440             xs *tag = xs_utf8_to_lower(xs_list_get(l1, 0));
    441             xs *s2  = NULL;
    442             int i;
    443 
    444             /* check if it's one of the valid tags */
    445             for (i = 0; valid_tags[i]; i++) {
    446                 if (strcmp(tag, valid_tags[i]) == 0)
    447                     break;
    448             }
    449 
    450             if (valid_tags[i]) {
    451                 /* accepted tag: rebuild it with only the accepted elements */
    452                 xs *el = xs_regex_select(v, "(src|href|rel|class|target)=(\"[^\"]*\"|'[^']*')");
    453                 xs *s3 = xs_join(el, " ");
    454 
    455                 xs_str_bld_cat_fmt(&b, "<%s%s%s%s>",
    456                     v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
    457             } else {
    458                 /* treat end of divs as paragraph breaks */
    459                 if (strcmp(v, "</div>"))
    460                     xs_str_bld_cat(&b, "<p>");
    461             }
    462         }
    463         else {
    464             /* non-tag */
    465             xs_str_bld_cat(&b, v);
    466         }
    467 
    468         n++;
    469     }
    470 
    471     return b.data;
    472 }
    473 
    474 
    475 xs_str *encode_html(const char *str)
    476 /* escapes html characters */
    477 {
    478     xs_str *encoded = xs_html_encode((char *)str);
    479 
    480     /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
    481     encoded = xs_replace_i(encoded, "&lt;br&gt;", "<br>");
    482 
    483     return encoded;
    484 }