From: Stefan Sperling Subject: allow UTF-8 in gotsys.conf site owner and repository description To: gameoftrees@openbsd.org Date: Fri, 10 Apr 2026 12:36:54 +0200 The gotsys.conf parser and related string validation routines currently reject non-ASCII bytes. This limitation was put in place during early development of gotsysd, with an intention to lift this limitation eventually. I would like to allow UTF-8 in quoted strings, such that repository owner names will not have to be mangled and repository descriptions can be written in arbitrary languages. gotwebd.conf already accepts non-ASCII bytes without doing any validation, which is fine since this configuration file is considered trusted input provided by the root user. The diff below combines a sequence of commits which is as follows, starting from the current tip of main: 2026-03-21 main typo fix 2026-04-10 80e8ce5 move utf8d.h to lib/ 2026-04-10 bbfbab7 rename decode() to utf8_decode() for namespacing reasons 2026-04-10 0846d39 validate UTF-8 in quoted strings in gotsys.conf 2026-04-10 18af3a0 allow UTF-8 in gotsys string values 2026-04-10 52e22e7 add a test case for UTF-8 repository owner names in gotsys.conf 2026-04-10 gotsys-utf8 document that site owner and repository description may use UTF-8 ok? 
M gotd/libexec/got-notify-http/got-notify-http.c | 2+ 2- D gotd/libexec/got-notify-http/utf8d.h | 0+ 55- M gotsys/gotsys.conf.5 | 16+ 0- M gotsys/parse.y | 7+ 0- M lib/gotsys_conf.c | 13+ 3- A lib/utf8d.h | 55+ 0- M regress/gotsysd/test_gotwebd.sh | 234+ 0- 7 files changed, 327 insertions(+), 60 deletions(-) commit - dd1f33d3b4d0d6303a96755e5a73ea539136e64c commit + 1667f4f7b4577ba6b72b96b20e2b2ba6694a91dc blob - d9499c2daaac15867a5911f267ea5a8b669ca96a blob + ef4d323107bcbd311475045114b4c58e010c5a8b --- gotd/libexec/got-notify-http/got-notify-http.c +++ gotd/libexec/got-notify-http/got-notify-http.c @@ -109,9 +109,9 @@ escape(FILE *fp, const uint8_t *s) uint32_t codepoint, state; const uint8_t *start = s; - state = 0; + state = UTF8_ACCEPT; for (; *s; ++s) { - switch (decode(&state, &codepoint, *s)) { + switch (utf8_decode(&state, &codepoint, *s)) { case UTF8_ACCEPT: switch (codepoint) { case '"': blob - 480e9c55ac4a6aa58414ec65e0053dd620b5b01d (mode 644) blob + /dev/null --- gotd/libexec/got-notify-http/utf8d.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2008-2009 Bjoern Hoehrmann - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 1 - -static const uint8_t utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - -static uint32_t inline -decode(uint32_t* state, uint32_t* codep, uint32_t byte) { - uint32_t type = utf8d[byte]; - - *codep = (*state != UTF8_ACCEPT) ? 
- (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - *state = utf8d[256 + *state*16 + type]; - return *state; -} blob - b769c8da569ed6be844d2e2ef103e02b1178eedc blob + 25fce29e9304bd5a74cb81666364f2613ffe5048 --- gotsys/gotsys.conf.5 +++ gotsys/gotsys.conf.5 @@ -280,6 +280,16 @@ Additionally, the name may not contain the two-charact .Pp The available repository configuration directives are as follows: .Bl -tag -width Ds +.It Ic description Ar string +Sets the repository description shown on the +.Xr gotwebd 8 +repository listing page. +The +.Ar string +parameter can only contain one line of text in ASCII encoding, +or in UTF-8 encoding if +.Ar string +is wrapped in quotes. .It Ic head Ar branch Point the repository's symbolic .Pa HEAD @@ -705,6 +715,12 @@ repository is hidden. Set the displayed site owner. If not set then no site owner will be displayed by .Xr gotwebd 8 . +The +.Ar name +parameter can only contain one line of text in ASCII encoding, +or in UTF-8 encoding if +.Ar name +is wrapped in quotes. .It Ic repositories url path Ar url-path Sets the URL path under which Git repositories will be displayed by .Xr gotwebd 8 . 
blob - 7d29858771ec205754bead3726797efbb55eb0ab blob + 693f694844dc8414c699da48f55a275e458fa780 --- gotsys/parse.y +++ gotsys/parse.y @@ -54,6 +54,7 @@ #include "media.h" #include "gotwebd.h" #include "gotsys.h" +#include "utf8d.h" #ifndef nitems #define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) @@ -1392,6 +1393,7 @@ yylex(void) unsigned char *p; int quotec, next, c; int token; + uint32_t cp, state = UTF8_ACCEPT; p = buf; c = lgetc(0); @@ -1438,6 +1440,11 @@ yylex(void) yyerror("string too long"); return (findeol()); } + if (utf8_decode(&state, &cp, + (unsigned char)c) == UTF8_REJECT) { + yyerror("invalid UTF-8 string"); + return (findeol()); + } *p++ = c; } yylval.v.string = strdup(buf); blob - 55d9518f08f86547a7c0bdec2b855400321a4676 blob + c2cdaefb29324dbdd1917da2f86c01d603014117 --- lib/gotsys_conf.c +++ lib/gotsys_conf.c @@ -37,6 +37,7 @@ #include "media.h" #include "gotwebd.h" #include "gotsys.h" +#include "utf8d.h" #ifndef nitems #define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) @@ -1335,16 +1336,24 @@ gotsys_conf_validate_mediatype(const char *s) const struct got_error * gotsys_conf_validate_string(const char *s) { + uint32_t cp, state = UTF8_ACCEPT; int i; for (i = 0; s[i] != '\0'; ++i) { char x = s[i]; + if (utf8_decode(&state, &cp, (unsigned char)x) == UTF8_REJECT) { + return got_error_msg(GOT_ERR_PARSE_CONFIG, + "invalid UTF-8 string"); + } + /* * Similar to gotwebd/parse.y allowed_in_string() while - * allowing for spaces and tabs in quoted strings. + * allowing for UTF-8, spaces, and tabs in quoted strings. */ - if (isalnum((unsigned char)x) || x == ' ' || x == '\t' || + if (isalnum((unsigned char)x) || + ((unsigned char)x & 0x80) == 0x80 || + x == ' ' || x == '\t' || (ispunct((unsigned char)x) && x != '(' && x != ')' && x != '{' && x != '}' && x != '!' 
&& x != '=' && x != '#' && @@ -1352,7 +1361,8 @@ gotsys_conf_validate_string(const char *s) continue; return got_error_fmt(GOT_ERR_PARSE_CONFIG, - "character '%c' (0x%.2x) is not allowed in %s", x, x, s); + "character '%c' (0x%x) is not allowed in %s", x, + (unsigned char)x, s); } return NULL; blob - /dev/null blob + 76457222ed8d2d5366ea06b99224f1d89137ae68 (mode 644) --- /dev/null +++ lib/utf8d.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 
+ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static uint32_t inline +utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? 
+ (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} blob - d9b41a3b9a330c9e5ec48b32471c5d0ab8e86361 blob + 74d98394522484f6d2749c8a686246dff5e3f61d --- regress/gotsysd/test_gotwebd.sh +++ regress/gotsysd/test_gotwebd.sh @@ -1104,8 +1104,242 @@ EOF test_done "$testroot" "$ret" } +test_utf8_site_owner() { + local testroot=`test_init utf8_site_owner 1` + + GOTSYS_ECDSA_HOST_FP=$(ssh -i ${GOTSYSD_SSH_KEY} \ + ${GOTSYSD_TEST_USER}@${VMIP} \ + ssh-keygen -lf /etc/ssh/ssh_host_ecdsa_key.pub | \ + cut -d' ' -f2) + GOTSYS_ED25519_HOST_FP=$(ssh -i ${GOTSYSD_SSH_KEY} \ + ${GOTSYSD_TEST_USER}@${VMIP} \ + ssh-keygen -lf /etc/ssh/ssh_host_ed25519_key.pub | \ + cut -d' ' -f2) + GOTSYS_RSA_HOST_FP=$(ssh -i ${GOTSYSD_SSH_KEY} \ + ${GOTSYSD_TEST_USER}@${VMIP} \ + ssh-keygen -lf /etc/ssh/ssh_host_rsa_key.pub | \ + cut -d' ' -f2) + + got checkout -q $testroot/${GOTSYS_REPO} $testroot/wt >/dev/null + ret=$? + if [ $ret -ne 0 ]; then + echo "got checkout failed unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + crypted_vm_pw=`echo ${GOTSYSD_VM_PASSWORD} | encrypt | tr -d '\n'` + crypted_pw=`echo ${GOTSYSD_DEV_PASSWORD} | encrypt | tr -d '\n'` + sshkey=`cat ${GOTSYSD_SSH_PUBKEY}` + cat > ${testroot}/wt/gotsys.conf <