Range expressions now store their bounds as u32 code points instead of
single UTF-8 bytes (u8), so ranges support multibyte code points and no
longer assert on them.
Also added range-expression tests for several alphabets: Cyrillic,
Polish and Thai. In addition, the test code now computes the default
match end as a length in runes instead of a length in bytes.
Signed-off-by: Andrey Kolchin <kaathewise@protonmail.com>
---
regex/+test.ha | 30 +++++++++++++++++++++++++++++-
regex/regex.ha | 16 +++++-----------
2 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
index b1292752..40a44b3b 100644
--- a/regex/+test.ha
+++ b/regex/+test.ha
@@ -1,6 +1,7 @@
// License: MPL-2.0
// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
use fmt;
+use strings;
type matchres = enum { MATCH, NOMATCH, ERROR };
@@ -287,6 +288,30 @@ fn run_findall_case(
(`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1),
(`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1),
(`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1),
+ // range expressions
+ (`[a-z]+`, "onlylatinletters", matchres::MATCH, 0, -1),
+ (`[x-z]+`, "xyz", matchres::MATCH, 0, -1),
+ (`[x-z]+`, "wxyz", matchres::MATCH, 1, 4),
+ (`[a-e]+`, "-abcdefg", matchres::MATCH, 1, 6),
+ (`[a-z]`, "-1234567890@#$%^&*(!)-+=", matchres::NOMATCH, 0, -1),
+ (`[0-9]+`, "9246", matchres::MATCH, 0, -1),
+ // # Cyrillic
+ (`[а-я]+`, "кирилица", matchres::MATCH, 0, -1),
+ (`[а-д]`, "е", matchres::NOMATCH, 0, -1),
+ (`[я-ф]`, "-", matchres::ERROR, 0, -1),
+ (`[А-Я]+`, "АБВГд", matchres::MATCH, 0, 4),
+ // Macedonian is written in Cyrillic, but its additional letters
+ // lie outside the basic а-ш code point range, so they do not match
+ (`[а-ш]+`, "ѓљњќ", matchres::NOMATCH, 0, -1),
+ // # Polish Alphabet
+ (`[a-ż]+`, "polskialfabet", matchres::MATCH, 0, -1),
+ (`[a-ż]+`, "źśółęćą", matchres::MATCH, 0, -1),
+ // the range a-ż covers every code point from 'a' through 'ż', so
+ // plain Latin letters and symbols such as {|}~ are accepted too
+ (`[a-ż]+`, "englishspeak", matchres::MATCH, 0, -1),
+ (`[a-ż]+`, "{|}~", matchres::MATCH, 0, -1),
+ // # Thai Alphabet
+ (`[ก-ฮ]+`, "ศอผจข", matchres::MATCH, 0, -1),
// [:alpha:] etc. plus extra characters
(`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1),
(`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1),
@@ -537,7 +562,10 @@ fn run_findall_case(
const should_match = cases[i].2;
const start = cases[i].3;
const end = if (cases[i].4 == -1) {
- yield len(string): int;
+ // workaround to get the length in codepoints
+ let runes = strings::runes(string);
+ defer free(runes);
+ yield len(runes): int;
} else {
yield cases[i].4;
};
diff --git a/regex/regex.ha b/regex/regex.ha
index f30fd18a..be628f1f 100644
--- a/regex/regex.ha
+++ b/regex/regex.ha
@@ -61,7 +61,7 @@ export type charclass = enum {
export type charset = [](charset_lit_item | charset_range_item |
charset_class_item),
charset_lit_item = rune,
- charset_range_item = (u8, u8),
+ charset_range_item = (u32, u32),
charset_class_item = *fn(c: rune) bool;
const charclass_map: [](str, *fn(c: rune) bool) = [
@@ -173,13 +173,8 @@ fn handle_bracket(
return `No character class after '[:'`: error;
};
} else if (is_range) {
- const start_enc = utf8::encoderune(r);
- assert(len(start_enc) == 1, "Character ranges do not currently support characters larger than one byte");
- const start_b = start_enc[0];
-
- const end_enc = utf8::encoderune(range_end as rune);
- assert(len(end_enc) == 1, "Character ranges do not currently support characters larger than one byte");
- const end_b = end_enc[0];
+ const start_b = r: u32;
+ const end_b = range_end as rune: u32;
if (end_b < start_b) {
return `Decending bracket expression range '[z-a]'`: error;
@@ -600,9 +595,8 @@ fn run_thread(
break;
};
case let range: charset_range_item =>
- const r_enc = utf8::encoderune(r);
- assert(len(r_enc) == 1, "Character ranges do not currently support characters larger than one byte");
- const r_b = r_enc[0];
+ const r_b = r: u32;
+
if (r_b >= range.0 && r_b <= range.1) {
// Succeeded if positive match
// Failed if negative match
--
2.38.3