~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
2 2

[PATCH hare] regex: Convert ranges to use u32 representation of runes

Details
Message ID
<UnEmSR7gsan1V4w8w2MoN99cT06c7vS4Kj4RJ-I-aRCZAsFk4-JNmYwOiW2iDqRfgNEk8n_f9gqaf12ZqLdOzGpRcOY9TzoST7MlM1M0c2k=@protonmail.com>
DKIM signature
pass
Download raw message
Patch: +34 -12
Changed range expressions to cast runes to u32 instead of u8.  This
means that range expressions now support multibyte codepoints.

Also added tests for different alphabets: Cyrillic, Polish and Thai.
In addition, edited the testing code so that the default match end is
measured in runes (codepoints) instead of bytes.

Signed-off-by: Andrey Kolchin <kaathewise@protonmail.com>
---
 regex/+test.ha | 30 +++++++++++++++++++++++++++++-
 regex/regex.ha | 16 +++++-----------
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha
index b1292752..40a44b3b 100644
--- a/regex/+test.ha
+++ b/regex/+test.ha
@@ -1,6 +1,7 @@
// License: MPL-2.0
// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
use fmt;
use strings;

type matchres = enum { MATCH, NOMATCH, ERROR };

@@ -287,6 +288,30 @@ fn run_findall_case(
		(`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1),
		(`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1),
		(`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1),
		// range expressions
		(`[a-z]+`, "onlylatinletters", matchres::MATCH, 0, -1),
		(`[x-z]+`, "xyz", matchres::MATCH, 0, -1),
		(`[x-z]+`, "wxyz", matchres::MATCH, 1, 4),
		(`[a-e]+`, "-abcdefg", matchres::MATCH, 1, 6),
		(`[a-z]`, "-1234567890@#$%^&*(!)-+=", matchres::NOMATCH, 0, -1),
		(`[0-9]+`, "9246", matchres::MATCH, 0, -1),
		// # Cyrillic
		(`[а-я]+`, "кирилица", matchres::MATCH, 0, -1),
		(`[а-д]`, "е", matchres::NOMATCH, 0, -1),
		(`[я-ф]`, "-", matchres::ERROR, 0, -1),
		(`[А-Я]+`, "АБВГд", matchres::MATCH, 0, 4),
		// Macedonian uses Cyrillic, but its special letters fall
		// outside the broad а-ш range, so they do not match
		(`[а-ш]+`, "ѓљњќ", matchres::NOMATCH, 0, -1),
		// # Polish Alphabet
		(`[a-ż]+`, "polskialfabet", matchres::MATCH, 0, -1),
		(`[a-ż]+`, "źśółęćą", matchres::MATCH, 0, -1),
		// because the Polish alphabet uses Latin with special characters,
		// other characters that fall inside the range are also accepted
		(`[a-ż]+`, "englishspeak", matchres::MATCH, 0, -1),
		(`[a-ż]+`, "{|}~", matchres::MATCH, 0, -1),
		// # Thai Alphabet
		(`[ก-ฮ]+`, "ศอผจข", matchres::MATCH, 0, -1),
		// [:alpha:] etc. plus extra characters
		(`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1),
		(`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1),
@@ -537,7 +562,10 @@ fn run_findall_case(
		const should_match = cases[i].2;
		const start = cases[i].3;
		const end = if (cases[i].4 == -1) {
			yield len(string): int;
			// workaround to get the length in codepoints
			let runes = strings::runes(string);
			defer free(runes);
			yield len(runes): int;
		} else {
			yield cases[i].4;
		};
diff --git a/regex/regex.ha b/regex/regex.ha
index f30fd18a..be628f1f 100644
--- a/regex/regex.ha
+++ b/regex/regex.ha
@@ -61,7 +61,7 @@ export type charclass = enum {
export type charset = [](charset_lit_item | charset_range_item |
	charset_class_item),
	charset_lit_item = rune,
	charset_range_item = (u8, u8),
	charset_range_item = (u32, u32),
	charset_class_item = *fn(c: rune) bool;

const charclass_map: [](str, *fn(c: rune) bool) = [
@@ -173,13 +173,8 @@ fn handle_bracket(
			return `No character class after '[:'`: error;
		};
	} else if (is_range) {
		const start_enc = utf8::encoderune(r);
		assert(len(start_enc) == 1, "Character ranges do not currently support characters larger than one byte");
		const start_b = start_enc[0];

		const end_enc = utf8::encoderune(range_end as rune);
		assert(len(end_enc) == 1, "Character ranges do not currently support characters larger than one byte");
		const end_b = end_enc[0];
		const start_b = r: u32;
		const end_b = range_end as rune: u32;

		if (end_b < start_b) {
			return `Decending bracket expression range '[z-a]'`: error;
@@ -600,9 +595,8 @@ fn run_thread(
				break;
			};
		case let range: charset_range_item =>
			const r_enc = utf8::encoderune(r);
			assert(len(r_enc) == 1, "Character ranges do not currently support characters larger than one byte");
			const r_b = r_enc[0];
			const r_b = r: u32;

			if (r_b >= range.0 && r_b <= range.1) {
				// Succeeded if positive match
				// Failed if negative match
-- 
2.38.3

[hare/patches] build success

builds.sr.ht <builds@sr.ht>
Details
Message ID
<CQ5JZMZ1XDTE.3G5O8Z041TF8K@cirno2>
In-Reply-To
<UnEmSR7gsan1V4w8w2MoN99cT06c7vS4Kj4RJ-I-aRCZAsFk4-JNmYwOiW2iDqRfgNEk8n_f9gqaf12ZqLdOzGpRcOY9TzoST7MlM1M0c2k=@protonmail.com> (view parent)
DKIM signature
missing
Download raw message
hare/patches: SUCCESS in 1m39s

[regex: Convert ranges to use u32 representation of runes][0] from [KAAtheWise][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/38550
[1]: KAAtheWise@protonmail.com

✓ #931074 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/931074
✓ #931073 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/931073
Details
Message ID
<CQ73RWL91GYE.2LYWVLMEU59K0@taiga>
In-Reply-To
<UnEmSR7gsan1V4w8w2MoN99cT06c7vS4Kj4RJ-I-aRCZAsFk4-JNmYwOiW2iDqRfgNEk8n_f9gqaf12ZqLdOzGpRcOY9TzoST7MlM1M0c2k=@protonmail.com> (view parent)
DKIM signature
pass
Download raw message
Thanks!

To git@git.sr.ht:~sircmpwn/hare
   266ed58e..e74074f9  master -> master
Reply to thread Export thread (mbox)