The implementation is simple but slow: mb_next_delim scans the
remaining input rune by rune, comparing each rune against every
delimiter rune. It could be improved by working directly with the
string's bytes.
Signed-off-by: Armin Preiml <apreiml@strohwolke.at>
---
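For reviewers, a quick sketch of the new behavior (not part of the
change); it mirrors the "multibyte simple case" test added below and
can be dropped into any module as a test (test name arbitrary):

	use strings;

	@test fn tokenize_multibyte_usage() void = {
		// A non-ASCII delimiter now takes the rune-based fallback
		// path instead of tripping the old assert.
		let tok = strings::tokenize("multibyte\u00b6simple\u00b6!", "\u00b6");
		assert(strings::next_token(&tok) as str == "multibyte");
		assert(strings::next_token(&tok) as str == "simple");
		assert(strings::next_token(&tok) as str == "!");
		assert(strings::next_token(&tok) is done);
	};

The fallback compares every remaining rune against every delimiter rune
per token, which is the "slow" part noted in the commit message.
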
 strings/tokenize.ha | 210 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 191 insertions(+), 19 deletions(-)
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
index 03b76f77..df88c67c 100644
--- a/strings/tokenize.ha
+++ b/strings/tokenize.ha
@@ -4,7 +4,39 @@
use bytes;
use types;
-export type tokenizer = bytes::tokenizer;
+
+export type tokenizer_vtable = struct {
+ next: *fn (t: *tokenizer) (str | done),
+ peek: *fn (t: *tokenizer) (str | done),
+ remaining: *fn (t: *tokenizer) str,
+};
+
+export type tokenizer = struct {
+ vtable: *tokenizer_vtable,
+ union {
+ bt: bytes::tokenizer,
+ mb: mbtokenizer,
+ },
+};
+
+export type mbtokenizer = struct {
+ in: str,
+ delim: str,
+ reverse: bool,
+ rlen: size,
+};
+
+export const bytetokenizervt = tokenizer_vtable {
+ next = &byte_next_token,
+ peek = &byte_peek_token,
+ remaining = &byte_remaining_token,
+};
+
+export const mbtokenizervt = tokenizer_vtable {
+ next = &mb_next_token,
+ peek = &mb_peek_token,
+ remaining = &mb_remaining_token,
+};
// Tokenizes a string, returning an iterator that yields substrings separated by
// one or more delimiters, such that the string will be split along any of the
@@ -31,11 +63,27 @@ export type tokenizer = bytes::tokenizer;
// assert(next_token(&tok) is done);
export fn tokenize(s: str, delim: str) tokenizer = {
const in = toutf8(s);
- const delim = toutf8(delim);
- for (let ch .. delim) {
- assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ const delimb = toutf8(delim);
+ for (let ch .. delimb) {
+ if (ch & 0x80 == 0) {
+ continue;
+ };
+
+ // multibyte fallback
+ return tokenizer {
+ vtable = &mbtokenizervt,
+ mb = mbtokenizer {
+ in = s,
+ delim = delim,
+ reverse = false,
+ rlen = runelen(s),
+ },
+ };
+ };
+ return tokenizer {
+ vtable = &bytetokenizervt,
+ bt = bytes::tokenize(in, delimb...),
};
- return bytes::tokenize(in, delim...);
};
// Like [[tokenize]], but tokenizes the string in reverse, such that the first
@@ -43,16 +91,41 @@ export fn tokenize(s: str, delim: str) tokenizer = {
// first token.
export fn rtokenize(s: str, delim: str) tokenizer = {
const in = toutf8(s);
- const delim = toutf8(delim);
- for (let ch .. delim) {
- assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ const delimb = toutf8(delim);
+ for (let ch .. delimb) {
+ if (ch & 0x80 == 0) {
+ continue;
+ };
+
+ // multibyte fallback
+ return tokenizer {
+ vtable = &mbtokenizervt,
+ mb = mbtokenizer {
+ in = s,
+ delim = delim,
+ reverse = true,
+ rlen = runelen(s),
+ },
+ };
+ };
+ return tokenizer {
+ vtable = &bytetokenizervt,
+ bt = bytes::rtokenize(in, delimb...),
};
- return bytes::rtokenize(in, delim...);
};
// Returns the next token from a [[tokenizer]] and advances the cursor.
-export fn next_token(s: *tokenizer) (str | done) = {
- let s = s: *bytes::tokenizer;
+export fn next_token(s: *tokenizer) (str | done) = s.vtable.next(s);
+
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
+export fn peek_token(s: *tokenizer) (str | done) = s.vtable.peek(s);
+
+// Returns the remainder of the input string from a [[tokenizer]] ahead of the
+// token cursor.
+export fn remaining_tokens(s: *tokenizer) str = s.vtable.remaining(s);
+
+fn byte_next_token(s: *tokenizer) (str | done) = {
+ let s = &s.bt;
match (bytes::next_token(s)) {
case let b: []u8 =>
return fromutf8_unsafe(b);
@@ -60,9 +133,8 @@ export fn next_token(s: *tokenizer) (str | done) = {
};
};
-// Returns the next token from a [[tokenizer]] without advancing the cursor.
-export fn peek_token(s: *tokenizer) (str | done) = {
- let s = s: *bytes::tokenizer;
+fn byte_peek_token(s: *tokenizer) (str | done) = {
+ let s = &s.bt;
return match (bytes::peek_token(s)) {
case let b: []u8 =>
yield fromutf8_unsafe(b);
@@ -71,13 +143,84 @@ export fn peek_token(s: *tokenizer) (str | done) = {
};
};
-// Returns the remainder of the input string from a [[tokenizer]] ahead of the
-// token cursor.
-export fn remaining_tokens(s: *tokenizer) str = {
- let s = s: *bytes::tokenizer;
- return fromutf8_unsafe(bytes::remaining_tokens(s));
+fn byte_remaining_token(s: *tokenizer) str =
+ fromutf8_unsafe(bytes::remaining_tokens(&s.bt));
+
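+// Returns the next token and advances the tokenizer, consuming the token and
+// one adjacent delimiter rune.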
+fn mb_next_token(s: *tokenizer) (str | done) = {
+ let idx = match (mb_next_delim(s)) {
+ case let idx: size =>
+ yield idx;
+	case void =>
+		// final token: mark the tokenizer exhausted
+		s.mb.delim = "";
+		const t = s.mb.in;
+		s.mb.in = "";
+		return t;
+ case done =>
+ return done;
+ };
+
+ if (s.mb.reverse) {
+ let t = sub(s.mb.in, s.mb.rlen - idx);
+ s.mb.rlen -= idx + 1;
+ s.mb.in = sub(s.mb.in, 0, s.mb.rlen);
+ return t;
+ };
+
+ let t = sub(s.mb.in, 0, idx);
+ s.mb.rlen -= idx + 1;
+ s.mb.in = sub(s.mb.in, idx + 1);
+ return t;
+};
+
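+// Returns the next token without consuming any input.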
+fn mb_peek_token(s: *tokenizer) (str | done) = {
+ match (mb_next_delim(s)) {
+ case let i: size =>
+ if (s.mb.reverse) {
+ return sub(s.mb.in, s.mb.rlen - i);
+ } else {
+ return sub(s.mb.in, 0, i);
+ };
+ case void =>
+ return s.mb.in;
+ case done =>
+ return done;
+ };
+};
+
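+// Finds the rune index of the next delimiter rune, scanning forward (or
+// backward for a reverse tokenizer). Returns void if no delimiter remains, or
+// done once the tokenizer is exhausted.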
+fn mb_next_delim(s: *tokenizer) (size | done | void) = {
+ if (s.mb.delim == "") {
+ return done;
+ };
+
+ let iterfunc = if (s.mb.reverse) &riter else &iter;
+
+ let i = 0z;
+ let it = iterfunc(s.mb.in);
+ for (let r => next(&it)) {
+ let dit = iter(s.mb.delim);
+ for (let d => next(&dit)) {
+ if (r == d) {
+ return i;
+ };
+ };
+
+ i += 1;
+ };
+
+ return void;
};
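+
+// Returns the remaining input ahead of the token cursor.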
+fn mb_remaining_token(s: *tokenizer) str = s.mb.in;
+
fn tokenize_test(
testcase: str,
in: str,
@@ -150,6 +293,26 @@ fn tokenize_test(
"",
]);
+ tokenize_test("multibyte simple case",
+ "multibyte\u00b6simple\u00b6!", "\u00b6",
+ [
+ "multibyte",
+ "simple",
+ "!",
+ ]);
+
+ tokenize_test("multibyte complex",
+ "\u00b6multibyte\u00b6\u00b6\u00b6a\u00b6!\u00b6", "\u00b6",
+ [
+ "",
+ "multibyte",
+ "",
+ "",
+ "a",
+ "!",
+ "",
+ ]);
+
const tok = tokenize_test("remaining_tokens",
"Hello world! My name is Harriet.", " ",
[
@@ -157,6 +320,15 @@ fn tokenize_test(
"world!",
], 2);
assert(remaining_tokens(&tok) == "My name is Harriet.");
+
+	const tok = tokenize_test("remaining_tokens multibyte",
+		"Hello\u00b6world!\u00b6My\u00b6name\u00b6is\u00b6Harriet.",
+		"\u00b6",
+		[
+		"Hello",
+		"world!",
+	], 2);
+	assert(remaining_tokens(&tok) == "My\u00b6name\u00b6is\u00b6Harriet.");
};
// Splits a string into tokens delimited by 'delim', starting at the beginning
--
2.45.2