~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
2 2

[PATCH hare 1/2] strings: add runelen

Details
Message ID
<20240718160816.33784-1-apreiml@strohwolke.at>
DKIM signature
pass
Download raw message
Patch: +16 -0
Signed-off-by: Armin Preiml <apreiml@strohwolke.at>
---
 strings/runes.ha | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/strings/runes.ha b/strings/runes.ha
index 58da3b02..b225f37f 100644
--- a/strings/runes.ha
+++ b/strings/runes.ha
@@ -51,3 +51,19 @@ export fn fromrunes(runes: []rune) str = {
		};
	};
};

// Counts the number of runes in 's'.
export fn runelen(s: str) size = {
	let it = iter(s);
	let l = 0z;

	for (let r => next(&it)) {
		l += 1;
	};

	return l;
};

@test fn runelen() void = {
	assert(runelen("Čmrlj") == 5);
};
-- 
2.45.2

[PATCH hare 2/2] strings: add multibyte tokenizer as fallback

Details
Message ID
<20240718160816.33784-2-apreiml@strohwolke.at>
In-Reply-To
<20240718160816.33784-1-apreiml@strohwolke.at> (view parent)
DKIM signature
pass
Download raw message
Patch: +180 -19
The implementation is simple but slow. It may be improved by working
directly with the string's bytes.

Signed-off-by: Armin Preiml <apreiml@strohwolke.at>
---
 strings/tokenize.ha | 199 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 180 insertions(+), 19 deletions(-)

diff --git a/strings/tokenize.ha b/strings/tokenize.ha
index 03b76f77..df88c67c 100644
--- a/strings/tokenize.ha
+++ b/strings/tokenize.ha
@@ -4,7 +4,39 @@
use bytes;
use types;

export type tokenizer = bytes::tokenizer;

export type tokenizer_vtable = struct {
	next: *fn (t: *tokenizer) (str | done),
	peek: *fn (t: *tokenizer) (str | done),
	remaining: *fn (t: *tokenizer) str,
};

export type tokenizer = struct {
	vtable: *tokenizer_vtable,
	union {
		bt: bytes::tokenizer,
		mb: mbtokenizer,
	},
};

export type mbtokenizer = struct {
	in: str,
	delim: str,
	reverse: bool,
	rlen: size,
};

export const bytetokenizervt = tokenizer_vtable {
	next = &byte_next_token,
	peek = &byte_peek_token,
	remaining = &byte_remaining_token,
};

export const mbtokenizervt = tokenizer_vtable {
	next = &mb_next_token,
	peek = &mb_peek_token,
	remaining = &mb_remaining_token,
};

// Tokenizes a string, returning an iterator that yields substrings separated by
// one or more delimiters, such that the string will be split along any of the
@@ -31,11 +63,27 @@ export type tokenizer = bytes::tokenizer;
// 	assert(next_token(&tok) is done);
export fn tokenize(s: str, delim: str) tokenizer = {
	const in = toutf8(s);
	const delim = toutf8(delim);
	for (let ch .. delim) {
		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
	const delimb = toutf8(delim);
	for (let ch .. delimb) {
		if (ch & 0x80 == 0) {
			continue;
		};

		// multibyte fallback
		return tokenizer {
			vtable = &mbtokenizervt,
			mb = mbtokenizer {
				in = s,
				delim = delim,
				reverse = false,
				rlen = runelen(s),
			},
		};
	};
	return tokenizer {
		vtable = &bytetokenizervt,
		bt = bytes::tokenize(in, delimb...),
	};
	return bytes::tokenize(in, delim...);
};

// Like [[tokenize]], but tokenizes the string in reverse, such that the first
@@ -43,16 +91,41 @@ export fn tokenize(s: str, delim: str) tokenizer = {
// first token.
export fn rtokenize(s: str, delim: str) tokenizer = {
	const in = toutf8(s);
	const delim = toutf8(delim);
	for (let ch .. delim) {
		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
	const delimb = toutf8(delim);
	for (let ch .. delimb) {
		if (ch & 0x80 == 0) {
			continue;
		};

		// multibyte fallback
		return tokenizer {
			vtable = &mbtokenizervt,
			mb = mbtokenizer {
				in = s,
				delim = delim,
				reverse = true,
				rlen = runelen(s),
			},
		};
	};
	return tokenizer {
		vtable = &bytetokenizervt,
		bt = bytes::rtokenize(in, delimb...),
	};
	return bytes::rtokenize(in, delim...);
};

// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) (str | done) = {
	let s = s: *bytes::tokenizer;
export fn next_token(s: *tokenizer) (str | done) = s.vtable.next(s);

// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) (str | done) = s.vtable.peek(s);

// Returns the remainder of the input string from a [[tokenizer]] ahead of the
// token cursor.
export fn remaining_tokens(s: *tokenizer) str = s.vtable.remaining(s);

fn byte_next_token(s: *tokenizer) (str | done) = {
	let s = &s.bt;
	match (bytes::next_token(s)) {
	case let b: []u8 =>
		return fromutf8_unsafe(b);
@@ -60,9 +133,8 @@ export fn next_token(s: *tokenizer) (str | done) = {
	};
};

// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) (str | done) = {
	let s = s: *bytes::tokenizer;
fn byte_peek_token(s: *tokenizer) (str | done) = {
	let s = &s.bt;
	return match (bytes::peek_token(s)) {
	case let b: []u8 =>
		yield fromutf8_unsafe(b);
@@ -71,13 +143,73 @@ export fn peek_token(s: *tokenizer) (str | done) = {
	};
};

// Returns the remainder of the input string from a [[tokenizer]] ahead of the
// token cursor.
export fn remaining_tokens(s: *tokenizer) str = {
	let s = s: *bytes::tokenizer;
	return fromutf8_unsafe(bytes::remaining_tokens(s));
fn byte_remaining_token(s: *tokenizer) str =
	fromutf8_unsafe(bytes::remaining_tokens(&s.bt));

fn mb_next_token(s: *tokenizer) (str | done) = {
	let idx = match (mb_next_delim(s)) {
	case let idx: size =>
		yield idx;
	case void =>
		s.mb.delim = "";
		return s.mb.in;
	case done =>
		return done;
	};

	if (s.mb.reverse) {
		let t = sub(s.mb.in, s.mb.rlen - idx);
		s.mb.rlen -= idx + 1;
		s.mb.in = sub(s.mb.in, 0, s.mb.rlen);
		return t;
	};

	let t = sub(s.mb.in, 0, idx);
	s.mb.rlen -= idx + 1;
	s.mb.in = sub(s.mb.in, idx + 1);
	return t;
};

fn mb_peek_token(s: *tokenizer) (str | done) = {
	match (mb_next_delim(s)) {
	case let i: size =>
		if (s.mb.reverse) {
			return sub(s.mb.in, s.mb.rlen - i);
		} else {
			return sub(s.mb.in, 0, i);
		};
	case void =>
		return s.mb.in;
	case done =>
		return done;
	};
};

fn mb_next_delim(s: *tokenizer) (size | done | void) = {
	if (s.mb.delim == "") {
		return done;
	};

	let iterfunc = if (s.mb.reverse) &riter else &iter;

	let i = 0z;
	let it = iterfunc(s.mb.in);
	for (let r => next(&it)) {
		let dit = iter(s.mb.delim);
		for (let d => next(&dit)) {
			if (r == d) {
				return i;
			};
		};

		i += 1;
	};

	return void;
};

fn mb_remaining_token(s: *tokenizer) str = s.mb.in;

fn tokenize_test(
	testcase: str,
	in: str,
@@ -150,6 +282,26 @@ fn tokenize_test(
			"",
		]);

	tokenize_test("multibyte simple case",
		"multibyte\u00b6simple\u00b6!", "\u00b6",
		[
			"multibyte",
			"simple",
			"!",
		]);

	tokenize_test("multibyte complex",
		"\u00b6multibyte\u00b6\u00b6\u00b6a\u00b6!\u00b6", "\u00b6",
		[
			"",
			"multibyte",
			"",
			"",
			"a",
			"!",
			"",
		]);

	const tok = tokenize_test("remaining_tokens",
		"Hello world! My name is Harriet.", " ",
		[
@@ -157,6 +309,15 @@ fn tokenize_test(
			"world!",
		], 2);
	assert(remaining_tokens(&tok) == "My name is Harriet.");

	const tok = tokenize_test("remaining_tokens",
		"Hello\u006bworld!\u006bMy\u006bname\u006bis\u006bHarriet.",
		"\u006b",
		[
			"Hello",
			"world!",
		], 2);
	assert(remaining_tokens(&tok) == "My\u006bname\u006bis\u006bHarriet.");
};

// Splits a string into tokens delimited by 'delim', starting at the beginning
-- 
2.45.2

[hare/patches] build success

builds.sr.ht <builds@sr.ht>
Details
Message ID
<D2SSP0L0KGBK.1IKMSJU6SJ9H4@fra02>
In-Reply-To
<20240718160816.33784-2-apreiml@strohwolke.at> (view parent)
DKIM signature
missing
Download raw message
hare/patches: SUCCESS in 1m11s

[strings: add runelen][0] from [Armin Preiml][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/53964
[1]: apreiml@strohwolke.at

✓ #1279207 SUCCESS hare/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1279207
✓ #1279205 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1279205
✓ #1279206 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1279206
✓ #1279208 SUCCESS hare/patches/openbsd.yml https://builds.sr.ht/~sircmpwn/job/1279208
Reply to thread Export thread (mbox)