Signed-off-by: Kirill Primak <vyivel@eclair.cafe>
---
encoding/utf8/decode.ha | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
index 513a3655..94b52450 100644
--- a/encoding/utf8/decode.ha
+++ b/encoding/utf8/decode.ha
@@ -28,13 +28,13 @@ export type invalid = !void;
const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];
fn decode_leader(c: u8) ((size, u8) | invalid) = {
- for (let i = 0z; i < len(sizes); i += 1) {
- if (c & sizes[i].mask == sizes[i].result) {
- return (sizes[i].octets, c & leader_masks[i]);
- };
+ match (utf8sz(c)) {
+ case let sz: size =>
+ return (sz, c & leader_masks[sz - 1]);
+ case void =>
+ // Bad leading byte
+ return invalid;
};
- // Bad leading byte
- return invalid;
};
// Returns the next rune from a decoder. void is returned when there are no
--
2.39.1
[PATCH hare 2/2] encoding/utf8: fix utf8sz() for 0b11110101-0b11110111 range
Fixes: https://todo.sr.ht/~sircmpwn/hare/798
Signed-off-by: Kirill Primak <vyivel@eclair.cafe>
---
Alternatively, this can be fixed by adding another rsize.
encoding/utf8/rune.ha | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/encoding/utf8/rune.ha b/encoding/utf8/rune.ha
index ca5337c6..61fd586d 100644
--- a/encoding/utf8/rune.ha
+++ b/encoding/utf8/rune.ha
@@ -26,6 +26,11 @@ export fn runesz(r: rune) size = {
// Returns the expected length of a UTF-8 codepoint in bytes given its first
// byte, or void if the given byte doesn't begin a valid UTF-8 sequence.
export fn utf8sz(c: u8) (size | void) = {
+ if (c > 0xF4) {
+ // While 21 bits allow to represent numbers up to 0x1FFFFF,
+ // the highest possible codepoint is U+10FFFF
+ return void;
+ };
for (let i = 0z; i < len(sizes); i += 1) {
if (c & sizes[i].mask == sizes[i].result) {
return sizes[i].octets;
--
2.39.1