format.c (14715B)
1 /* snac - A simple, minimalistic ActivityPub instance */ 2 /* copyright (c) 2022 - 2025 grunfink et al. / MIT license */ 3 4 #include "xs.h" 5 #include "xs_regex.h" 6 #include "xs_mime.h" 7 #include "xs_html.h" 8 #include "xs_json.h" 9 #include "xs_time.h" 10 #include "xs_match.h" 11 #include "xs_unicode.h" 12 13 #include "snac.h" 14 15 /* emoticons, people laughing and such */ 16 const char *smileys[] = { 17 ":-)", "🙂", 18 ":-D", "😀", 19 "X-D", "😆", 20 ";-)", "😉", 21 "B-)", "😎", 22 ">:-(", "😡", 23 ":-(", "😞", 24 ":-*", "😘", 25 ":-/", "😕", 26 "8-o", "😲", 27 "%-)", "🤪", 28 ":_(", "😢", 29 ":-|", "😐", 30 "<3", "❤️", 31 ":facepalm:", "🤦", 32 ":shrug:", "🤷", 33 ":shrug2:", "¯\\_(ツ)_/¯", 34 ":eyeroll:", "🙄", 35 ":beer:", "🍺", 36 ":beers:", "🍻", 37 ":munch:", "😱", 38 ":thumb:", "👍", 39 NULL, NULL 40 }; 41 42 43 xs_dict *emojis(void) 44 /* returns a dict with the emojis */ 45 { 46 xs *fn = xs_fmt("%s/emojis.json", srv_basedir); 47 FILE *f; 48 49 if (mtime(fn) == 0) { 50 /* file does not exist; create it with the defaults */ 51 xs *d = xs_dict_new(); 52 const char **emo = smileys; 53 54 while (*emo) { 55 d = xs_dict_append(d, emo[0], emo[1]); 56 emo += 2; 57 } 58 59 if ((f = fopen(fn, "w")) != NULL) { 60 xs_json_dump(d, 4, f); 61 fclose(f); 62 } 63 else 64 srv_log(xs_fmt("Error creating '%s'", fn)); 65 } 66 67 xs_dict *d = NULL; 68 69 if ((f = fopen(fn, "r")) != NULL) { 70 d = xs_json_load(f); 71 fclose(f); 72 73 if (d == NULL) 74 srv_log(xs_fmt("JSON parse error in '%s'", fn)); 75 } 76 else 77 srv_log(xs_fmt("Error opening '%s'", fn)); 78 79 return d; 80 } 81 82 /* Non-whitespace without trailing comma, period or closing paren */ 83 #define NOSPACE "([^[:space:],.)]+|[,.)]+[^[:space:],.)])+" 84 85 static xs_str *format_line(const char *line, xs_list **attach) 86 /* formats a line */ 87 { 88 xs_str_bld b = { 0 }; 89 char *p; 90 const char *v; 91 92 /* split by markup */ 93 xs *sm = xs_regex_split(line, 94 "(" 95 "`[^`]+`" "|" 96 "~~[^~]+~~" "|" 97 "\\*\\*?\\*?[^\\*]+\\*?\\*?\\*" "|" 98 "__[^_]+__" "|" //anzu 99 "!\\[[^]]+\\]\\([^\\)]+\\)\\)?" "|" 100 "\\[[^]]+\\]\\([^\\)]+\\)\\)?" "|" 101 "[a-z]+:/" "/" NOSPACE "|" 102 "(mailto|xmpp):[^@[:space:]]+@" NOSPACE 103 ")"); 104 int n = 0; 105 106 p = sm; 107 while (xs_list_iter(&p, &v)) { 108 if ((n & 0x1)) { 109 /* markup */ 110 if (xs_startswith(v, "`")) { 111 xs *s1 = xs_strip_chars_i(xs_dup(v), "`"); 112 xs *e1 = encode_html(s1); 113 xs_str_bld_cat_fmt(&b, "<code>%s</code>", e1); 114 } 115 else 116 if (xs_startswith(v, "***")) { 117 xs *s1 = xs_strip_chars_i(xs_dup(v), "*"); 118 xs_str_bld_cat_fmt(&b, "<b><i>%s</i></b>", s1); 119 } 120 else 121 if (xs_startswith(v, "**")) { 122 xs *s1 = xs_strip_chars_i(xs_dup(v), "*"); 123 xs_str_bld_cat_fmt(&b, "<b>%s</b>", s1); 124 } 125 else 126 if (xs_startswith(v, "*")) { 127 xs *s1 = xs_strip_chars_i(xs_dup(v), "*"); 128 xs_str_bld_cat_fmt(&b, "<i>%s</i>", s1); 129 } 130 //anzu - begin 131 else 132 if (xs_startswith(v, "__")) { 133 xs *s1 = xs_strip_chars_i(xs_dup(v), "_"); 134 xs_str_bld_cat_fmt(&b, "<u>%s</u>", s1); 135 } 136 //anzu - end 137 else 138 if (xs_startswith(v, "~~")) { 139 xs *s1 = xs_strip_chars_i(xs_dup(v), "~"); 140 xs *e1 = encode_html(s1); 141 xs_str_bld_cat_fmt(&b, "<s>%s</s>", e1); 142 } 143 else 144 if (*v == '[') { 145 /* markdown-like links [label](url) */ 146 xs *w = xs_replace_i(xs_replace(v, "#", "#"), "@", "@"); 147 xs *l = xs_split_n(w, "](", 1); 148 149 if (xs_list_len(l) == 2) { 150 xs *name = xs_dup(xs_list_get(l, 0)); 151 xs *url = xs_dup(xs_list_get(l, 1)); 152 153 name = xs_crop_i(name, 1, 0); 154 url = xs_crop_i(url, 0, -1); 155 156 xs_str_bld_cat_fmt(&b, "<a href=\"%s\">%s</a>", 157 url, name); 158 } 159 else 160 xs_str_bld_cat(&b, v); 161 } 162 else 163 if (*v == '!') { 164 /* markdown-like images  */ 165 xs *w = xs_replace_i(xs_replace(v, "#", "#"), "@", "@"); 166 xs *l = xs_split_n(w, "](", 1); 167 168 if (xs_list_len(l) == 2) { 169 xs *alt_text = xs_dup(xs_list_get(l, 0)); 170 xs *img_url = xs_dup(xs_list_get(l, 1)); 171 172 alt_text = xs_crop_i(alt_text, 2, 0); 173 img_url = xs_crop_i(img_url, 0, -1); 174 175 const char *mime = xs_mime_by_ext(img_url); 176 177 if (attach != NULL && xs_startswith(mime, "image/")) { 178 const xs_dict *ad; 179 int add = 1; 180 181 xs_list_foreach(*attach, ad) { 182 if (strcmp(xs_dict_get_def(ad, "url", ""), img_url) == 0) { 183 add = 0; 184 break; 185 } 186 } 187 188 if (add) { 189 xs *d = xs_dict_new(); 190 191 d = xs_dict_append(d, "mediaType", mime); 192 d = xs_dict_append(d, "url", img_url); 193 d = xs_dict_append(d, "name", alt_text); 194 d = xs_dict_append(d, "type", "Image"); 195 196 *attach = xs_list_append(*attach, d); 197 } 198 } 199 else { 200 xs_str_bld_cat_fmt(&b, "<a href=\"%s\">%s</a>", img_url, alt_text); 201 } 202 } 203 else 204 xs_str_bld_cat(&b, v); 205 } 206 else 207 if (xs_str_in(v, ":/" "/") != -1) { 208 /* direct URLs in the post body */ 209 xs *u = xs_replace_i(xs_replace(v, "#", "#"), "@", "@"); 210 211 xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)"); 212 213 const char *mime = xs_mime_by_ext(v2); 214 215 if (attach != NULL && xs_startswith(mime, "image/")) { 216 /* if it's a link to an image, insert it as an attachment */ 217 const xs_dict *ad; 218 int add = 1; 219 220 xs_list_foreach(*attach, ad) { 221 if (strcmp(xs_dict_get_def(ad, "url", ""), v2) == 0) { 222 add = 0; 223 break; 224 } 225 } 226 227 if (add) { 228 xs *d = xs_dict_new(); 229 230 d = xs_dict_append(d, "mediaType", mime); 231 d = xs_dict_append(d, "url", v2); 232 d = xs_dict_append(d, "name", ""); 233 d = xs_dict_append(d, "type", "Image"); 234 235 *attach = xs_list_append(*attach, d); 236 } 237 } 238 else { 239 xs_str_bld_cat_fmt(&b, "<a href=\"%s\" target=\"_blank\">%s</a>", v2, u); 240 } 241 } 242 else 243 if (xs_match(v, "mailto*|xmpp*")) { 244 xs *u = xs_replace_i(xs_replace(v, "#", "#"), "@", "@"); 245 246 xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)"); 247 248 xs_str_bld_cat_fmt(&b, "<a href=\"%s\" target=\"_blank\">%s</a>", v2, u); 249 } 250 else 251 xs_str_bld_cat(&b, v); 252 } 253 else 254 /* surrounded text, copy directly */ 255 xs_str_bld_cat(&b, v); 256 257 n++; 258 } 259 260 return b.data; 261 } 262 263 264 xs_str *not_really_markdown(const char *content, xs_list **attach, xs_list **tag) 265 /* formats a content using some Markdown rules */ 266 { 267 xs_str_bld b = { 0 }; 268 int in_pre = 0; 269 int in_blq = 0; 270 xs *list; 271 char *p; 272 const char *v; 273 274 /* work by lines */ 275 list = xs_split(content, "\n"); 276 277 p = list; 278 while (xs_list_iter(&p, &v)) { 279 xs *ss = NULL; 280 281 if (strcmp(v, "```") == 0) { 282 if (!in_pre) 283 xs_str_bld_cat(&b, "<pre>"); 284 else 285 xs_str_bld_cat(&b, "</pre>"); 286 287 in_pre = !in_pre; 288 continue; 289 } 290 291 if (in_pre) { 292 // Encode all HTML characters when we're in pre element until we are out. 293 ss = encode_html(v); 294 295 xs_str_bld_cat(&b, ss); 296 xs_str_bld_cat(&b, "<br>"); 297 continue; 298 } 299 300 else 301 ss = xs_strip_i(format_line(v, attach)); 302 303 if (xs_startswith(ss, "---")) { 304 /* delete the --- */ 305 ss = xs_strip_i(xs_crop_i(ss, 3, 0)); 306 xs_str_bld_cat(&b, "<hr>"); 307 308 xs_str_bld_cat(&b, ss); 309 310 continue; 311 } 312 313 //anzu - begin 314 // h1 reserved for snac? 315 if (xs_startswith(ss, "# ")) { 316 ss = xs_strip_i(xs_crop_i(ss, 2, 0)); 317 xs_str_bld_cat(&b, "<h2>"); 318 xs_str_bld_cat(&b, ss); 319 xs_str_bld_cat(&b, "</h2>"); 320 continue; 321 } 322 if (xs_startswith(ss, "## ")) { 323 ss = xs_strip_i(xs_crop_i(ss, 3, 0)); 324 xs_str_bld_cat(&b, "<h2>"); 325 xs_str_bld_cat(&b, ss); 326 xs_str_bld_cat(&b, "</h2>"); 327 continue; 328 } 329 if (xs_startswith(ss, "### ")) { 330 ss = xs_strip_i(xs_crop_i(ss, 4, 0)); 331 xs_str_bld_cat(&b, "<h3>"); 332 xs_str_bld_cat(&b, ss); 333 xs_str_bld_cat(&b, "</h3>"); 334 continue; 335 } 336 //anzu - end 337 338 if (xs_startswith(ss, ">")) { 339 /* delete the > and subsequent spaces */ 340 ss = xs_strip_i(xs_crop_i(ss, 1, 0)); 341 342 if (!in_blq) { 343 xs_str_bld_cat(&b, "<blockquote>"); 344 in_blq = 1; 345 } 346 347 xs_str_bld_cat(&b, ss); 348 xs_str_bld_cat(&b, "<br>"); 349 350 continue; 351 } 352 353 if (in_blq) { 354 xs_str_bld_cat(&b, "</blockquote>"); 355 in_blq = 0; 356 } 357 358 xs_str_bld_cat(&b, ss); 359 xs_str_bld_cat(&b, "<br>"); 360 } 361 362 if (in_blq) 363 xs_str_bld_cat(&b, "</blockquote>"); 364 if (in_pre) 365 xs_str_bld_cat(&b, "</pre>"); 366 367 xs_str *s = b.data; 368 /* some beauty fixes */ 369 s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>"); 370 s = xs_replace_i(s, "</blockquote><br>", "</blockquote>"); 371 s = xs_replace_i(s, "</pre><br>", "</pre>"); 372 s = xs_replace_i(s, "</h2><br>", "</h2>"); //anzu ??? 373 s = xs_replace_i(s, "</h3><br>", "</h3>"); //anzu ??? 374 375 { 376 /* traditional emoticons */ 377 xs *d = emojis(); 378 int c = 0; 379 const char *k, *v; 380 381 while (xs_dict_next(d, &k, &v, &c)) { 382 const char *t = xs_mime_by_ext(v); 383 384 /* is it an URL to an image? */ 385 if (xs_startswith(v, "https:/" "/") && 386 (xs_startswith(t, "image/") || strcmp(t, "application/octet-stream") == 0)) { 387 if (tag && xs_str_in(s, k) != -1) { 388 /* add the emoji to the tag list */ 389 xs *e = xs_dict_new(); 390 xs *i = xs_dict_new(); 391 xs *u = xs_str_utctime(0, ISO_DATE_SPEC); 392 393 e = xs_dict_append(e, "id", v); 394 e = xs_dict_append(e, "type", "Emoji"); 395 e = xs_dict_append(e, "name", k); 396 e = xs_dict_append(e, "updated", u); 397 398 i = xs_dict_append(i, "type", "Image"); 399 i = xs_dict_append(i, "mediaType", t); 400 i = xs_dict_append(i, "url", v); 401 e = xs_dict_append(e, "icon", i); 402 403 *tag = xs_list_append(*tag, e); 404 } 405 } 406 else 407 s = xs_replace_i(s, k, v); 408 } 409 } 410 411 return s; 412 } 413 414 415 const char *valid_tags[] = { 416 "a", "p", "br", "br/", "blockquote", "ul", "ol", "li", "cite", "small", 417 "span", "i", "b", "u", "s", "pre", "code", "em", "strong", "hr", "img", "del", "bdi", 418 "h2","h3", //anzu 419 NULL 420 }; 421 422 xs_str *sanitize(const char *content) 423 /* cleans dangerous HTML output */ 424 { 425 xs_str_bld b = { 0 }; 426 xs *sl; 427 int n = 0; 428 char *p; 429 const char *v; 430 431 sl = xs_regex_split(content, "</?[^>]+>"); 432 433 p = sl; 434 435 n = 0; 436 while (xs_list_iter(&p, &v)) { 437 if (n & 0x1) { 438 xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1)); 439 xs *l1 = xs_split_n(s1, " ", 1); 440 xs *tag = xs_utf8_to_lower(xs_list_get(l1, 0)); 441 xs *s2 = NULL; 442 int i; 443 444 /* check if it's one of the valid tags */ 445 for (i = 0; valid_tags[i]; i++) { 446 if (strcmp(tag, valid_tags[i]) == 0) 447 break; 448 } 449 450 if (valid_tags[i]) { 451 /* accepted tag: rebuild it with only the accepted elements */ 452 xs *el = xs_regex_select(v, "(src|href|rel|class|target)=(\"[^\"]*\"|'[^']*')"); 453 xs *s3 = xs_join(el, " "); 454 455 xs_str_bld_cat_fmt(&b, "<%s%s%s%s>", 456 v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3); 457 } else { 458 /* treat end of divs as paragraph breaks */ 459 if (strcmp(v, "</div>")) 460 xs_str_bld_cat(&b, "<p>"); 461 } 462 } 463 else { 464 /* non-tag */ 465 xs_str_bld_cat(&b, v); 466 } 467 468 n++; 469 } 470 471 return b.data; 472 } 473 474 475 xs_str *encode_html(const char *str) 476 /* escapes html characters */ 477 { 478 xs_str *encoded = xs_html_encode((char *)str); 479 480 /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */ 481 encoded = xs_replace_i(encoded, "<br>", "<br>"); 482 483 return encoded; 484 }