~sircmpwn/hare-dev

hare: use simpler and faster utf8 decoder v1 PROPOSED

Bor Grošelj Simić: 3
 use simpler and faster utf8 decoder
 use improved forward decode in backward decode
 rewrite encoding::utf8::valid using the new decoder table

 6 files changed, 188 insertions(+), 90 deletions(-)
#933550 alpine.yml success
#933551 freebsd.yml success
hare/patches: SUCCESS in 1m42s

[use simpler and faster utf8 decoder][0] from [Bor Grošelj Simić][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/38680
[1]: mailto:bgs@turminal.net

✓ #933551 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/933551
✓ #933550 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/933550
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/hare-dev/patches/38680/mbox | git am -3
Learn more about email & git

[PATCH hare 1/3] use simpler and faster utf8 decoder Export this patch

Fixes: https://todo.sr.ht/~sircmpwn/hare/798
Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>
---
 encoding/utf8/decode.ha      |  48 ++++++-------
 encoding/utf8/decodetable.ha | 130 +++++++++++++++++++++++++++++++++++
 scripts/gen-stdlib           |   1 +
 stdlib.mk                    |   2 +
 4 files changed, 155 insertions(+), 26 deletions(-)
 create mode 100644 encoding/utf8/decodetable.ha

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
index 513a3655..2d8a31f7 100644
--- a/encoding/utf8/decode.ha
+++ b/encoding/utf8/decode.ha
@@ -27,6 +27,11 @@ export type invalid = !void;

const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];

// Payload-bit masks that next() applies to each input byte before merging it
// into the rune being assembled.
//
// next() selects the row with (state - 1): uint >> 31. With a 32-bit int/uint
// this is 1 when the current state is 0 (the byte starts a new sequence) and 0
// for every other state. The column is next & 7, the low three bits of the
// state the table transitions to.
//
// Row 0: bytes consumed in a non-zero state keep their low six bits.
// Row 1: bytes consumed in state 0 keep seven bits (column 0), five bits
// (column 1), four bits (columns 2-4) or three bits (columns 5-7).
const masks: [2][8]u8 = [
	[0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f],
        [0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07],
];

fn decode_leader(c: u8) ((size, u8) | invalid) = {
	for (let i = 0z; i < len(sizes); i += 1) {
		if (c & sizes[i].mask == sizes[i].result) {
@@ -40,38 +45,26 @@ fn decode_leader(c: u8) ((size, u8) | invalid) = {
// Returns the next rune from a decoder. void is returned when there are no
// remaining codepoints.
export fn next(d: *decoder) (rune | void | more | invalid) = {
	assert(d.offs <= len(d.src));
	if (d.offs == len(d.src)) {
		return;
	};

	const (n, leader) = decode_leader(d.src[d.offs])?;
	if (d.offs + n > len(d.src)) {
		return more;
	};

	let r = leader: u32;
	if (n > 1) {
		for (let i = 1z; i < n; i += 1) {
			let byte = d.src[d.offs + i];
			if ((byte & 0xC0) != 0x80) {
				// Bad continuation byte
				return invalid;
			};
			r <<= 6;
			r |= byte & 0x3F;
		};
		if (r >= 0xD800 && r <= 0xDFFF) {
			// UTF-16 surrogates
			return invalid;
		};
		if (runesz(r: rune) != n) {
			// Overlong encoding
			return invalid;
	// from https://github.com/skeeto/scratch/blob/master/parsers/utf8_decode.c
	// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
	// and https://nullprogram.com/blog/2020/12/31/ for an explanation of
	// the algorithm.
	let next = 0, state = 0;
	let r = 0u32;
	for (d.offs < len(d.src); d.offs += 1) {
		next = table[state][d.src[d.offs]];
		r = r << 6 | d.src[d.offs] & masks[(state - 1): uint >> 31][next & 7];
		if (next <= 0) {
			d.offs += 1;
			return if (next == 0) r: rune else invalid;
		};
		state = next;
	};
	d.offs += n;
	return r: rune;
	return more;
};

// Returns the previous rune from a decoder. void is returned when there are no
@@ -180,6 +173,9 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
	decoder = decode(extracont);
	decoder.offs = 3;
	assert(prev(&decoder) is invalid);

	const regression: []u8 = [0xf5, 0x94, 0x80, 0x80];
	assert(!valid(regression));
};

// Returns true if a given string or byte slice contains only valid UTF-8
diff --git a/encoding/utf8/decodetable.ha b/encoding/utf8/decodetable.ha
new file mode 100644
index 00000000..bf4c2ba1
--- /dev/null
+++ b/encoding/utf8/decodetable.ha
@@ -0,0 +1,130 @@
// State transition table for the UTF-8 decoding automaton. It is indexed as
// table[state][byte] and yields the next state:
//
//	 0	a complete, valid sequence has been consumed (also the start state)
//	-1	the byte is not permitted in the current state
//	1-7	more bytes are required; the value is the row to use next
//
// The table is read-only, so it is declared const like the other lookup tables
// in this module.
const table: [8][256]i8 = [
	// State 0: start of a sequence. 0x00-0x7F complete immediately,
	// 0xC2-0xDF go to state 1, 0xE0 to 3, 0xE1-0xEC and 0xEE-0xEF to 2,
	// 0xED to 4, 0xF0 to 5, 0xF1-0xF3 to 6 and 0xF4 to 7. Everything
	// else (0x80-0xC1, 0xF5-0xFF) is rejected.
	[+0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +3, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +4, +2, +2,
	 +5, +6, +6, +6, +7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 1: one byte remaining. 0x80-0xBF completes the sequence.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0, +0,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 2: two bytes remaining. 0x80-0xBF moves to state 1.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 3: second byte after 0xE0. Only 0xA0-0xBF is accepted (moves
	// to state 1); 0x80-0x9F would be an overlong encoding.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 4: second byte after 0xED. Only 0x80-0x9F is accepted (moves
	// to state 1); 0xA0-0xBF would encode a UTF-16 surrogate.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 5: second byte after 0xF0. Only 0x90-0xBF is accepted (moves
	// to state 2); 0x80-0x8F would be an overlong encoding.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 6: second byte after 0xF1-0xF3. 0x80-0xBF moves to state 2.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
	// State 7: second byte after 0xF4. Only 0x80-0x8F is accepted (moves
	// to state 2); anything higher would exceed U+10FFFF.
	[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2, +2,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
];
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
index 66f0bafa..00dac1a7 100755
--- a/scripts/gen-stdlib
+++ b/scripts/gen-stdlib
@@ -587,6 +587,7 @@ encoding_pem() {
encoding_utf8() {
	gen_srcs encoding::utf8 \
		decode.ha \
		decodetable.ha \
		encode.ha \
		rune.ha
	gen_ssa encoding::utf8 types
diff --git a/stdlib.mk b/stdlib.mk
index 2a4490d9..175b2088 100644
--- a/stdlib.mk
+++ b/stdlib.mk
@@ -1149,6 +1149,7 @@ $(HARECACHE)/encoding/pem/encoding_pem-any.ssa: $(stdlib_encoding_pem_any_srcs)
# encoding::utf8 (+any)
stdlib_encoding_utf8_any_srcs = \
	$(STDLIB)/encoding/utf8/decode.ha \
	$(STDLIB)/encoding/utf8/decodetable.ha \
	$(STDLIB)/encoding/utf8/encode.ha \
	$(STDLIB)/encoding/utf8/rune.ha

@@ -3383,6 +3384,7 @@ $(TESTCACHE)/encoding/pem/encoding_pem-any.ssa: $(testlib_encoding_pem_any_srcs)
# encoding::utf8 (+any)
testlib_encoding_utf8_any_srcs = \
	$(STDLIB)/encoding/utf8/decode.ha \
	$(STDLIB)/encoding/utf8/decodetable.ha \
	$(STDLIB)/encoding/utf8/encode.ha \
	$(STDLIB)/encoding/utf8/rune.ha

-- 
2.36.4

[PATCH hare 2/3] use improved forward decode in backward decode Export this patch

Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>
---
 encoding/utf8/decode.ha | 58 ++++++++---------------------------------
 1 file changed, 11 insertions(+), 47 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
index 2d8a31f7..4cee4ac5 100644
--- a/encoding/utf8/decode.ha
+++ b/encoding/utf8/decode.ha
@@ -25,23 +25,11 @@ export type more = void;
// Returned when an invalid UTF-8 sequence was found.
export type invalid = !void;

const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];

// Payload-bit masks that next() applies to each input byte before merging it
// into the rune being assembled.
//
// next() selects the row with (state - 1): uint >> 31. With a 32-bit int/uint
// this is 1 when the current state is 0 (the byte starts a new sequence) and 0
// for every other state. The column is next & 7, the low three bits of the
// state the table transitions to.
//
// Row 0: bytes consumed in a non-zero state keep their low six bits.
// Row 1: bytes consumed in state 0 keep seven bits (column 0), five bits
// (column 1), four bits (columns 2-4) or three bits (columns 5-7).
const masks: [2][8]u8 = [
	[0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f],
        [0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07],
];

fn decode_leader(c: u8) ((size, u8) | invalid) = {
	for (let i = 0z; i < len(sizes); i += 1) {
		if (c & sizes[i].mask == sizes[i].result) {
			return (sizes[i].octets, c & leader_masks[i]);
		};
	};
	// Bad leading byte
	return invalid;
};

// Returns the next rune from a decoder. void is returned when there are no
// remaining codepoints.
export fn next(d: *decoder) (rune | void | more | invalid) = {
@@ -73,45 +61,21 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
	if (d.offs == 0) {
		return;
	};

	let r = 0u32;
	let n = 0z;
	for (let i = 0z; i < d.offs; i += 1) {
		let byte = d.src[d.offs - i - 1];
		if ((byte & 0xC0) == 0x80) {
			if (i == 3) {
				// Too many continuation bytes in a row
				return invalid;
			};
			byte &= 0x3F;
			r |= byte << (i * 6): u32;
		} else {
			const nl = decode_leader(byte)?;
			n = nl.0;
			if (i + 1 != n) {
				// Trailing continuation bytes
				return invalid;
			};
			r |= nl.1 << (i * 6): u32;
			break;
	let n = d.offs;
	d.offs -= 1;
	for (d.offs < len(d.src); d.offs -= 1) {
		if (table[0][d.src[d.offs]] != -1) {
			let t = d.offs;
			defer d.offs = t;
			let r = next(d);
			return if (n != d.offs || r is more) invalid else r;
		};
	};

	if (n == 0) {
		return more;
	} else if (n > 1) {
		if (r >= 0xD800 && r <= 0xDFFF) {
			// UTF-16 surrogates
			return invalid;
		};
		if (runesz(r: rune) != n) {
			// Overlong encoding
		if (n - d.offs == 4) {
			// Too many continuation bytes in a row
			return invalid;
		};
	};

	d.offs -= n;
	return r: rune;
	return more;
};

@test fn decode() void = {
-- 
2.36.4

[PATCH hare 3/3] rewrite encoding::utf8::valid using the new decoder table Export this patch

Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>
---
 encoding/utf8/decode.ha | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
index 4cee4ac5..12ee01e9 100644
--- a/encoding/utf8/decode.ha
+++ b/encoding/utf8/decode.ha
@@ -5,17 +5,22 @@

// Reinterprets a str as a byte slice by casting a pointer to it to *[]u8 and
// dereferencing the result; the string's bytes themselves are not copied.
// NOTE(review): this relies on str having the same in-memory representation as
// []u8 - confirm against the language specification.
fn toutf8(in: str) []u8 = *(&in: *[]u8);

// Unwraps a (str | []u8) tagged union into plain bytes: a byte slice is passed
// through unchanged, and a str is converted with toutf8.
fn fromtagged(in: (str | []u8)) []u8 = match (in) {
case let bytes: []u8 =>
	yield bytes;
case let text: str =>
	yield toutf8(text);
};

// Decoding state over a byte slice; created with decode and advanced with next
// or prev.
export type decoder = struct {
	// Byte offset into src of the current position. next() reads forward
	// from here and asserts that it does not exceed len(src); prev() reads
	// the bytes immediately before it.
	offs: size,
	// The bytes being decoded.
	src: []u8,
};

// Initializes a new UTF-8 decoder.
export fn decode(src: (str | []u8)) decoder = match (src) {
case let s: str =>
	yield decoder { src = toutf8(s), ...  };
case let b: []u8 =>
	yield decoder { src = b, ...  };
export fn decode(src: (str | []u8)) decoder = decoder {
	src = fromtagged(src),
	offs = 0,
};

// Returned when more data is needed, i.e. when an incomplete UTF-8 sequence is
@@ -83,6 +88,7 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
		0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
		0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
	];
	assert(valid(input));
	const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0'];
	let decoder = decode(input);
	for (let i = 0z; i < len(expected); i += 1) {
@@ -110,33 +116,39 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
	assert(next(&decoder) is invalid);
	decoder.offs = 2;
	assert(prev(&decoder) is more);
	assert(!valid(inv));

	const incomplete: [_]u8 = [0xE3, 0x81];
	decoder = decode(incomplete);
	assert(next(&decoder) is more);
	decoder.offs = 2;
	assert(prev(&decoder) is invalid);
	assert(!valid(incomplete));

	const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
	decoder = decode(surrogate);
	assert(next(&decoder) is invalid);
	decoder.offs = 3;
	assert(prev(&decoder) is invalid);
	assert(!valid(surrogate));

	const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
	decoder = decode(overlong);
	assert(next(&decoder) is invalid);
	decoder.offs = 4;
	assert(prev(&decoder) is invalid);
	assert(!valid(overlong));

	const badcont: [_]u8 = [0xC2, 0xFF];
	decoder = decode(badcont);
	assert(next(&decoder) is invalid);
	assert(!valid(badcont));

	const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
	decoder = decode(extracont);
	decoder.offs = 3;
	assert(prev(&decoder) is invalid);
	assert(!valid(extracont));

	const regression: []u8 = [0xf5, 0x94, 0x80, 0x80];
	assert(!valid(regression));
@@ -146,17 +158,10 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
// sequences. Note that Hare strings (str) are always valid UTF-8 - if this
// returns false for a str type, something funny is going on.
export fn valid(src: (str | []u8)) bool = {
	let decoder = decode(src);
	for (true) {
		match (next(&decoder)) {
		case void =>
			return true;
		case invalid =>
			return false;
		case more =>
			return false;
		case rune => void;
		};
	let src = fromtagged(src);
	let state = 0;
	for (let i = 0z; i < len(src) && state >= 0; i += 1) {
		state = table[state][src[i]];
	};
	abort();
	return state == 0;
};
-- 
2.36.4
hare/patches: SUCCESS in 1m42s

[use simpler and faster utf8 decoder][0] from [Bor Grošelj Simić][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/38680
[1]: mailto:bgs@turminal.net

✓ #933551 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/933551
✓ #933550 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/933550