~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
4 3

[PATCH hare v2 1/3] regex: make find and findall return [][]capture

Details
Message ID
<20220623232306.31040-1-sebastian@sebsite.pw>
DKIM signature
pass
Download raw message
Patch: +69 -81
Previously, find and findall would return void if no matches were found.
This commit changes this behavior, so an empty slice is returned
instead.

Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
Since v1: dropped free_{captures,matches} -> {captures,matches}_free
patch, since it's still unclear how these functions should be named.
Either the style guide needs to be updated and these functions changed,
or other stdlib functions should be changed.

 regex/+test.ha | 106 +++++++++++++++++++++++--------------------------
 regex/README   |  30 ++++++--------
 regex/regex.ha |  14 ++++---
 3 files changed, 69 insertions(+), 81 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha
index 48549d70..181cf0ec 100644
--- a/regex/+test.ha
+++ b/regex/+test.ha
@@ -36,31 +36,30 @@ fn run_find_case(
	};
	defer finish(&re);

	match (find(&re, string)) {
	case void =>
	const captures = find(&re, string);
	defer free_captures(captures);
	if (len(captures) == 0) {
		if (expected == matchres::MATCH) {
			fmt::errorfln("Expected expression /{}/ to match string \"{}\", but it did not",
				expr, string)!;
			abort();
		};
		return;
	} else if (expected == matchres::NOMATCH) {
		fmt::errorfln("Expected expression /{}/ to not match string \"{}\", but it did",
			expr, string)!;
		abort();
	};

	case let captures: []capture =>
		defer free_captures(captures);
		if (expected == matchres::NOMATCH) {
			fmt::errorfln("Expected expression /{}/ to not match string \"{}\", but it did",
				expr, string)!;
			abort();
		};
		if (start: size != captures[0].start) {
			fmt::errorfln("Expected start of main capture to be {} but it was {}",
				start, captures[0].start)!;
			abort();
		};
		if (end: size != captures[0].end) {
			fmt::errorfln("Expected end of main capture to be {} but it was {}",
				end, captures[0].end)!;
			abort();
		};
	if (start: size != captures[0].start) {
		fmt::errorfln("Expected start of main capture to be {} but it was {}",
			start, captures[0].start)!;
		abort();
	};
	if (end: size != captures[0].end) {
		fmt::errorfln("Expected end of main capture to be {} but it was {}",
			end, captures[0].end)!;
		abort();
	};
};

@@ -68,15 +67,14 @@ fn run_submatch_case(
	expr: str,
	string: str,
	expected: matchres,
	count: size,
	targets: []str
) void = {
	const re = compile(expr)!;
	defer finish(&re);

	const captures = find(&re, string) as []capture;
	const captures = find(&re, string);
	defer free_captures(captures);
	assert(len(captures) == count, "Invalid number of captures");
	assert(len(captures) == len(targets), "Invalid number of captures");
	for (let i = 0z; i < len(targets); i += 1) {
		assert(targets[i] == captures[i].content, "Invalid capture");
	};
@@ -86,7 +84,6 @@ fn run_findall_case(
	expr: str,
	string: str,
	expected: matchres,
	count: size,
	targets: []str
) void = {
	const re = match (compile(expr)) {
@@ -108,33 +105,30 @@ fn run_findall_case(
		abort();
	};

	match (findall(&re, string)) {
	case void =>
		if (expected == matchres::MATCH) {
			fmt::errorfln("Expected expression /{}/ to match string \"{}\", but it did not",
				expr, string)!;
			abort();
		};
	const matches = findall(&re, string);
	if (len(matches) == 0 && expected == matchres::MATCH) {
		fmt::errorfln("Expected expression /{}/ to match string \"{}\", but it did not",
			expr, string)!;
		abort();
	};
	defer free_matches(matches);

	case let matches: [][]capture =>
		defer free_matches(matches);
		if (expected == matchres::NOMATCH) {
			fmt::errorfln("Expected expression /{}/ to not match string \"{}\", but it did",
				expr, string)!;
			abort();
		};
		if (count != len(matches)) {
			fmt::errorfln("Expected to find {} matches but found {}",
				count, len(matches))!;
	if (expected == matchres::NOMATCH) {
		fmt::errorfln("Expected expression /{}/ to not match string \"{}\", but it did",
			expr, string)!;
		abort();
	};
	if (len(targets) != len(matches)) {
		fmt::errorfln("Expected expression /{}/ to find {} matches but found {}",
			expr, len(targets), len(matches))!;
		abort();
	};
	for (let i = 0z; i < len(matches); i += 1) {
		if (matches[i][0].content != targets[i]) {
			fmt::errorfln("Expected submatch of expression /{}/ to be {} but it was {}",
				expr, targets[i], matches[i][0].content)!;
			abort();
		};
		for (let i = 0z; i < len(matches); i += 1) {
			if (matches[i][0].content != targets[i]) {
				fmt::errorfln("Expected submatch to be {} but it was {}",
					targets[i], matches[i][0].content)!;
				abort();
			};
		};
	};
};

@@ -553,32 +547,30 @@ fn run_findall_case(

	const submatch_cases = [
		// literals
		(`aaa ([^ ]*) (...)`, "aaa bbb ccc", matchres::MATCH, 3z,
			["aaa bbb ccc", "bbb", "ccc"]),
		(`aaa ([^ ]*) (...)`, "aaa bbb ccc", matchres::MATCH,
			["aaa bbb ccc", "bbb", "ccc"]: []str),
	];

	for (let i = 0z; i < len(submatch_cases); i += 1) {
		const expr = submatch_cases[i].0;
		const string = submatch_cases[i].1;
		const should_match = submatch_cases[i].2;
		const count = submatch_cases[i].3;
		const targets = submatch_cases[i].4;
		run_submatch_case(expr, string, should_match, count, targets);
		const targets = submatch_cases[i].3;
		run_submatch_case(expr, string, should_match, targets);
	};
};

@test fn findall() void = {
	const cases = [
		(`ab.`, "hello abc and abあ test abq thanks", matchres::MATCH, 3z,
			["abc", "abあ", "abq"]),
		(`ab.`, "hello abc and abあ test abq thanks", matchres::MATCH,
			["abc", "abあ", "abq"]: []str),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const expr = cases[i].0;
		const string = cases[i].1;
		const should_match = cases[i].2;
		const count = cases[i].3;
		const targets = cases[i].4;
		run_findall_case(expr, string, should_match, count, targets);
		const targets = cases[i].3;
		run_findall_case(expr, string, should_match, targets);
	};
};
diff --git a/regex/README b/regex/README
index ea0a3513..a32197b7 100644
--- a/regex/README
+++ b/regex/README
@@ -23,10 +23,8 @@ the longest match among the leftmost matches.
	const does_match = regex::test(&re, "Hello Hare, hello Hare.");
	fmt::printfln("matched? {}", does_match)!;

	const first_match = regex::find(&re, "Hello Hare, hello Hare.");
	match (first_match) {
	case void => void;
	case let captures: []regex::capture =>
	const captures = regex::find(&re, "Hello Hare, hello Hare.");
	if (len(captures) != 0) {
		defer regex::free_captures(captures);
		// captures[0]: The full matching string.
		// captures[1...]: A capture for every capture group.
@@ -35,20 +33,16 @@ the longest match among the leftmost matches.
			captures[0].end)!;
	};

	const all_matches = regex::findall(&re, "Hello Hare, hello Hare.");
	match (all_matches) {
	case void => void;
	case let matches: [][]regex::capture =>
		defer regex::free_matches(matches);
		// matches[0]: All captures for the first match.
		// matches[0][0]: The full matching string for the first match.
		// matches[0][1...]: A capture for every capture group in the
		//     first match.
		for (let i = 0z; i < len(matches); i += 1) {
			fmt::printfln("{} ({}, {})", matches[i][0].content,
				matches[i][0].start,
				matches[i][0].end)!;
		};
	const matches = regex::findall(&re, "Hello Hare, hello Hare.");
	defer regex::free_matches(matches);
	// matches[0]: All captures for the first match.
	// matches[0][0]: The full matching string for the first match.
	// matches[0][1...]: A capture for every capture group in the
	//     first match.
	for (let i = 0z; i < len(matches); i += 1) {
		fmt::printfln("{} ({}, {})", matches[i][0].content,
			matches[i][0].start,
			matches[i][0].end)!;
	};

[0]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04
diff --git a/regex/regex.ha b/regex/regex.ha
index eb29e1ad..71308136 100644
--- a/regex/regex.ha
+++ b/regex/regex.ha
@@ -773,16 +773,21 @@ export fn test(re: *regex, string: str) bool = {

// Attempts to match a regular expression against a string and returns the
// longest leftmost match, or void if there is no match.
export fn find(re: *regex, string: str) (void | []capture) = {
export fn find(re: *regex, string: str) []capture = {
	let str_idx = -1;
	let str_iter = strings::iter(string);
	let str_bytesize = 0z;
	return search(re, string, &str_iter, &str_idx, &str_bytesize, true);
	match (search(re, string, &str_iter, &str_idx, &str_bytesize, true)) {
	case let m: []capture =>
		return m;
	case void =>
		return [];
	};
};

// Attempts to match a regular expression against a string and returns all
// non-overlapping matches, or void if there are no matches.
export fn findall(re: *regex, string: str) (void | [][]capture) = {
export fn findall(re: *regex, string: str) [][]capture = {
	let res: [][]capture = alloc([]);
	let str_idx = -1;
	let str_iter = strings::iter(string);
@@ -804,9 +809,6 @@ export fn findall(re: *regex, string: str) (void | [][]capture) = {
		case void => break;
		};
	};
	if (len(res) == 0) {
		return void;
	};
	return res;
};

-- 
2.36.1

[PATCH hare v2 3/3] regex: add replace and rawreplace

Details
Message ID
<20220623232306.31040-3-sebastian@sebsite.pw>
In-Reply-To
<20220623232306.31040-1-sebastian@sebsite.pw> (view parent)
DKIM signature
pass
Download raw message
Patch: +235 -0
Implements: https://todo.sr.ht/~sircmpwn/hare/710
Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 regex/+test.ha | 112 ++++++++++++++++++++++++++++++++++++++++++++
 regex/regex.ha | 123 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 235 insertions(+)

diff --git a/regex/+test.ha b/regex/+test.ha
index 02e838e7..4e7e5882 100644
--- a/regex/+test.ha
+++ b/regex/+test.ha
@@ -132,6 +132,71 @@ fn run_findall_case(
	};
};

// Test helper: compiles 'expr', runs replace on 'string' with 'target', and
// aborts with a diagnostic if the outcome disagrees with 'expected'. A str
// 'expected' is the exact result replace must return; a void 'expected' means
// replace must return an error.
fn run_replace_case(
	expr: str,
	string: str,
	target: str,
	expected: (str | void),
) void = {
	const re = match (compile(expr)) {
	case let err: error =>
		fmt::errorln(err)!;
		fmt::errorfln("Expected expression /{}/ to compile, but it errored",
			expr)!;
		abort();
	case let compiled: regex =>
		yield compiled;
	};
	defer finish(&re);

	match (replace(&re, string, target)) {
	case let got: str =>
		defer free(got);
		match (expected) {
		case void =>
			// An error was expected, but replace succeeded.
			fmt::errorln("Expected replace to fail, but it did not")!;
			fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" return=\"{}\"",
				expr, string, target, got)!;
			abort();
		case let want: str =>
			if (want != got) {
				fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"",
					expr, string, target, want, got)!;
				abort();
			};
		};
	case let err: error =>
		match (expected) {
		case let want: str =>
			// A result was expected, but replace failed.
			fmt::errorln(err)!;
			fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\"",
				expr, string, target, want)!;
			abort();
		case void => void;
		};
	};
};

// Test helper: compiles 'expr', runs rawreplace on 'string' with 'target', and
// aborts with a diagnostic unless the result equals 'expected'.
fn run_rawreplace_case(
	expr: str,
	string: str,
	target: str,
	expected: str,
) void = {
	const re = match (compile(expr)) {
	case let err: error =>
		fmt::errorln(err)!;
		fmt::errorfln("Expected expression /{}/ to compile, but it errored",
			expr)!;
		abort();
	case let compiled: regex =>
		yield compiled;
	};
	defer finish(&re);

	const got = rawreplace(&re, string, target);
	defer free(got);
	if (got == expected) {
		return;
	};
	fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"",
		expr, string, target, expected, got)!;
	abort();
};

@test fn find() void = {
	const cases = [
		// literals
@@ -578,3 +643,50 @@ fn run_findall_case(
		run_findall_case(expr, string, should_match, targets);
	};
};

// Table-driven test for replace. Each row is (expression, input string,
// replacement target, expected outcome); a void outcome means replace is
// expected to return an error.
@test fn replace() void = {
	const cases: [_](str, str, str, (str | void)) = [
		(`ab.`, "hello abc and abあ test abq thanks", `xyz`,
			"hello xyz and xyz test xyz thanks"),
		(`([Hh])ello`, "Hello world and hello Hare.", `\1owdy`,
			"Howdy world and howdy Hare."),
		(`fo{2,}`, "fo foo fooofoof oofoo", `&bar`,
			"fo foobar fooobarfoobarf oofoobar"),
		(`simple`, "Hare is a simple language.", `& \& robust`,
			"Hare is a simple & robust language."),
		(`(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)`, "12345678910", `\10`,
			"10"),
		(`...?`, "abcdefgh", `\7&\8`,
			"abcdefgh"),
		(`...?`, "abcdefgh", `\7&\`, void),
	];

	for (let n = 0z; n < len(cases); n += 1) {
		run_replace_case(cases[n].0, cases[n].1, cases[n].2,
			cases[n].3);
	};
};

// Table-driven test for rawreplace. Each row is (expression, input string,
// replacement target, expected result).
@test fn rawreplace() void = {
	const cases = [
		(`ab.`, "hello abc and abあ test abq thanks", "xyz",
			"hello xyz and xyz test xyz thanks"),
		(`([Hh])ello`, "Hello world and hello Hare.", `\howdy\`,
			`\howdy\ world and \howdy\ Hare.`),
		(`fo{2,}`, "fo foo fooofoof oofoo", "&bar",
			"fo &bar &bar&barf oo&bar"),
		(`\\\\`, `\\\\\\\\`, `\&&\1`,
			`\&&\1\&&\1\&&\1\&&\1`),
	];

	for (let n = 0z; n < len(cases); n += 1) {
		run_rawreplace_case(cases[n].0, cases[n].1, cases[n].2,
			cases[n].3);
	};
};
diff --git a/regex/regex.ha b/regex/regex.ha
index 4eb07b81..c77ea626 100644
--- a/regex/regex.ha
+++ b/regex/regex.ha
@@ -807,6 +807,129 @@ export fn findall(re: *regex, string: str) [][]capture = {
	return res;
};

// Replaces all non-overlapping matches of a regular expression against a string
// with 'targetstr'. The caller must free the return value.
//
// An ampersand ('&') within 'targetstr' is substituted with the substring that
// was matched. A backslash followed by a single decimal digit from 1 to 9 is
// replaced by the capture at that index (starting at 1), or an empty string if
// no such capture exists. For example, `\1` is replaced with the first capture,
// `\2` with the second, etc. Only one digit is consumed, so `\10` is the first
// capture followed by a literal '0'. A literal ampersand or backslash must have
// a backslash before it, i.e. `\&` and `\\`.
//
// An error is only returned if 'targetstr' isn't formatted correctly. The
// format is checked even if the expression doesn't match 'string'.
export fn replace(re: *regex, string: str, targetstr: str) (str | error) = {
	// Parse the target before searching, so that a malformed 'targetstr' is
	// reported regardless of whether anything matches.
	const target = parse_replace_target(targetstr)?;
	defer free(target);

	const matches = findall(re, string);
	defer free_matches(matches);
	if (len(matches) == 0) {
		return strings::dup(string);
	};

	const bytes = strings::toutf8(string);
	// Start with everything that precedes the first match.
	let buf = alloc(bytes[..matches[0][0].start_bytesize]...);

	for (let i = 0z; i < len(matches); i += 1) {
		// Expand the parsed target for this match.
		for (let j = 0z; j < len(target); j += 1) {
			match (target[j]) {
			case let b: []u8 =>
				append(buf, b...);
			case let z: size =>
				// Index 0 is the whole match. References to
				// captures which don't exist expand to nothing.
				if (z < len(matches[i])) {
					const b = strings::toutf8(
						matches[i][z].content);
					append(buf, b...);
				};
			};
		};

		// Copy the unmatched text up to the next match, or up to the
		// end of the string after the last match.
		const start = matches[i][0].end_bytesize;
		const end = if (i == len(matches) - 1) len(bytes)
			else matches[i + 1][0].start_bytesize;
		append(buf, bytes[start..end]...);
	};

	return strings::fromutf8(buf);
};

// Parses the replacement target accepted by replace into a list of parts. A
// []u8 part is literal text to copy verbatim; a size part is the index of the
// capture to substitute, where 0 (produced by '&') is the whole match.
//
// The literal parts borrow from 'targetstr'; only the returned slice itself is
// allocated, and the caller must free it.
//
// Returns an error if 'targetstr' ends with an unpaired backslash, or if a
// backslash is followed by anything other than '&', '\', or a digit from 1 to
// 9.
fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = {
	const bytes = strings::toutf8(targetstr);
	let target: []([]u8 | size) = alloc([], 1);
	let iter = strings::iter(targetstr);
	// bytes[start..end] is the pending run of literal text. 'end' is the
	// byte offset of the rune currently being examined.
	let start = 0z, end = 0z;
	for (true) match (strings::next(&iter)) {
	case void =>
		if (start != end) {
			append(target, bytes[start..]);
		};
		break;
	case let r: rune =>
		switch (r) {
		case '&' =>
			if (start != end) {
				append(target, bytes[start..end]);
			};
			append(target, 0z);
			end += 1;
			start = end;
		case '\\' =>
			if (start != end) {
				append(target, bytes[start..end]);
			};

			const r = match (strings::next(&iter)) {
			case void =>
				// Don't leak the partial result on failure.
				free(target);
				return "Trailing backslash": error;
			case let r: rune =>
				yield r;
			};

			switch (r) {
			case '&', '\\' =>
				// Both are single-byte runes, so the escaped
				// character is the byte after the backslash.
				// Borrow it from 'targetstr' rather than from a
				// temporary array, so that the part remains
				// valid after this function returns.
				append(target, bytes[end + 1..end + 2]);
			case =>
				if (!ascii::isdigit(r) || r == '0') {
					free(target);
					return "Backslash must be followed by positive decimal number, an ampersand, or a backslash": error;
				};
				// Convert the ASCII digit to a capture index.
				append(target, r: u32: size - 0x30);
			};

			// Skip the backslash and the one-byte rune after it.
			end += 2;
			start = end;
		case =>
			end += utf8::runesz(r);
		};
	};

	return target;
};

// Replaces all non-overlapping matches of a regular expression against a string
// with 'targetstr'. 'targetstr' isn't interpreted in any special way;
// backslashes and ampersands are treated literally. The caller must free the
// return value.
export fn rawreplace(re: *regex, string: str, targetstr: str) str = {
	const matches = findall(re, string);
	if (len(matches) == 0) {
		return strings::dup(string);
	};
	defer free_matches(matches);

	const replacement = strings::toutf8(targetstr);
	const input = strings::toutf8(string);
	let out: []u8 = [];

	// 'cursor' is the byte offset just past the previous match, i.e. where
	// the next run of unmatched text begins.
	let cursor = 0z;
	for (let i = 0z; i < len(matches); i += 1) {
		const whole = matches[i][0];
		append(out, input[cursor..whole.start_bytesize]...);
		append(out, replacement...);
		cursor = whole.end_bytesize;
	};
	// Whatever follows the last match is copied unchanged.
	append(out, input[cursor..]...);

	return strings::fromutf8(out);
};

// Frees a slice of captures.
export fn free_captures(s: []capture) void = {
	free(s);
-- 
2.36.1

[PATCH hare v2 2/3] regex: find consecutive matches in findall

Details
Message ID
<20220623232306.31040-2-sebastian@sebsite.pw>
In-Reply-To
<20220623232306.31040-1-sebastian@sebsite.pw> (view parent)
DKIM signature
pass
Download raw message
Patch: +43 -44
Previously, findall had a bug where consecutive matches wouldn't be
found. Minimal reproduction is given below:

	let re = regex::compile(`a`)!;
	defer regex::finish(&re);
	const matches = regex::findall(&re, "aa");
	defer regex::free_matches(matches);
	assert(len(matches) == 2);

The assertion failed prior to this commit, since the second "a" was
skipped over. If the string were instead "a a", this would've succeeded.

Although this commit is large, the fix is relatively simple. However,
this commit also does some significant refactoring in the process of
fixing the bug. For one, strings::iterator is now no longer used.
Instead, a bufio fixed stream is used, which simplifies the code here.
Furthermore, search() no longer takes in as many parameters. Most of
these parameters were pointers which were only used by findall(), and
the implementation required that str_idx was stored as an int and cast
to size when appropriate. This logic is now all handled outside of
search() and within findall() itself.

Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 regex/+test.ha     |  4 +++
 regex/regex.ha     | 75 ++++++++++++++++++++++------------------------
 scripts/gen-stdlib |  4 +--
 stdlib.mk          |  4 +--
 4 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha
index 181cf0ec..02e838e7 100644
--- a/regex/+test.ha
+++ b/regex/+test.ha
@@ -564,6 +564,10 @@ fn run_findall_case(
	const cases = [
		(`ab.`, "hello abc and abあ test abq thanks", matchres::MATCH,
			["abc", "abあ", "abq"]: []str),
		(`a`, "aa", matchres::MATCH,
			["a", "a"]: []str),
		(`fo{2,}`, "fo foo fooofoof oofoo", matchres::MATCH,
			["foo", "fooo", "foo", "foo"]: []str),
	];

	for (let i = 0z; i < len(cases); i += 1) {
diff --git a/regex/regex.ha b/regex/regex.ha
index 71308136..4eb07b81 100644
--- a/regex/regex.ha
+++ b/regex/regex.ha
@@ -1,8 +1,10 @@
// License: MPL-2.0
// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
use ascii;
use bufio;
use encoding::utf8;
use errors;
use io;
use strconv;
use strings;

@@ -491,8 +493,8 @@ fn run_thread(
	re: *regex,
	string: str,
	threads: *[]thread,
	r_or_end: (rune | void),
	str_idx: int,
	r_or_end: (rune | io::EOF),
	str_idx: size,
	str_bytesize: size
) (void | newmatch) = {
	const str_bytes = strings::toutf8(string);
@@ -511,14 +513,14 @@ fn run_thread(
			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
		case inst_skip =>
			const new_pc = threads[i].pc + 1;
			threads[i].start_idx = str_idx: size;
			threads[i].start_idx = str_idx;
			threads[i].start_bytesize = str_bytesize;
			add_thread(threads, i, new_pc);
			break;
		case let anchored: inst_match =>
			// Do not match if we need an end-anchored match, but we
			// have not exhausted our string
			if (anchored && !(r_or_end is void)) {
			if (anchored && !(r_or_end is io::EOF)) {
				threads[i].failed = true;
				return;
			};
@@ -527,7 +529,7 @@ fn run_thread(
			threads[i].root_capture = capture {
				start = threads[i].start_idx,
				start_bytesize = threads[i].start_bytesize,
				end = str_idx: size,
				end = str_idx,
				end_bytesize = str_bytesize,
				content = content,
			};
@@ -535,13 +537,13 @@ fn run_thread(
			return newmatch;
		case inst_groupstart =>
			assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported");
			threads[i].curr_capture.start = str_idx: size;
			threads[i].curr_capture.start = str_idx;
			threads[i].curr_capture.start_bytesize = str_bytesize;
			threads[i].curr_capture_inited = true;
			threads[i].pc += 1;
		case inst_groupend =>
			assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
			threads[i].curr_capture.end = str_idx: size;
			threads[i].curr_capture.end = str_idx;
			threads[i].curr_capture.end_bytesize = str_bytesize;
			threads[i].curr_capture.content =
				strings::fromutf8_unsafe(str_bytes[
@@ -572,7 +574,7 @@ fn run_thread(

	// From now on, we're only matching consuming instructions, and these
	// can't do anything without another rune.
	if (r_or_end is void) {
	if (r_or_end is io::EOF) {
		threads[i].failed = true;
		return;
	};
@@ -630,9 +632,7 @@ fn run_thread(
fn search(
	re: *regex,
	string: str,
	str_iter: *strings::iterator,
	str_idx: *int,
	str_bytesize: *size,
	handle: io::handle,
	need_captures: bool
) (void | []capture) = {
	let threads: []thread = alloc([
@@ -649,11 +649,13 @@ fn search(
		free(threads);
	};

	let str_idx = 0z;
	let first_match_idx: (void | size) = void;
	let str_bytesize = 0z;
	let last_bytesize = 0z;

	for (true) {
		*str_bytesize += last_bytesize;
		str_bytesize += last_bytesize;

		if (len(threads) == 0) {
			return void;
@@ -691,15 +693,14 @@ fn search(
			return res;
		};

		const r_or_end = strings::next(str_iter);
		*str_idx += 1;
		const r_or_end = bufio::scanrune(handle)!;
		if (r_or_end is rune) {
			last_bytesize = utf8::runesz(r_or_end as rune);
		};

		for (let i = 0z; i < len(threads); i += 1) {
			const res = run_thread(i, re, string, &threads,
				r_or_end, *str_idx, *str_bytesize);
				r_or_end, str_idx, str_bytesize);
			const matchlen = threads[i].root_capture.end
				- threads[i].root_capture.start;
			if (res is newmatch && matchlen > 0 && !need_captures) {
@@ -713,6 +714,7 @@ fn search(
				first_match_idx = threads[i].start_idx;
			};
		};
		str_idx += 1;

		// When we only want the leftmost match, delete all threads that
		// start after the earliest non-zero-length matched thread
@@ -761,23 +763,16 @@ fn search(

// Returns whether or not a regex matches a string.
export fn test(re: *regex, string: str) bool = {
	let str_idx = -1;
	let str_iter = strings::iter(string);
	let str_bytesize = 0z;
	match (search(re, string, &str_iter, &str_idx, &str_bytesize, false)) {
	case void => return false;
	case []capture => return true;
	};
	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
	return search(re, string, &strm, false) is []capture;
};


// Attempts to match a regular expression against a string and returns the
// longest leftmost match, or void if there is no match.
export fn find(re: *regex, string: str) []capture = {
	let str_idx = -1;
	let str_iter = strings::iter(string);
	let str_bytesize = 0z;
	match (search(re, string, &str_iter, &str_idx, &str_bytesize, true)) {
	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
	match (search(re, string, &strm, true)) {
	case let m: []capture =>
		return m;
	case void =>
@@ -789,23 +784,23 @@ export fn find(re: *regex, string: str) []capture = {
// non-overlapping matches, or void if there are no matches.
export fn findall(re: *regex, string: str) [][]capture = {
	let res: [][]capture = alloc([]);
	let str_idx = -1;
	let str_iter = strings::iter(string);
	let str_bytesize = 0z;
	let str_idx = 0z, str_bytesize = 0z;
	let substring = string;
	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
	const str_bytes = strings::toutf8(string);
	for (true) {
		const findres = search(re, string, &str_iter, &str_idx,
			&str_bytesize, true);
		match (findres) {
		match (search(re, substring, &strm, true)) {
		case let m: []capture =>
			append(res, m);
			assert(str_idx: size >= m[0].end);
			for (str_idx: size > m[0].end) {
				strings::prev(&str_iter);
				str_idx -= 1;
			};
			if (str_idx: size >= len(string)) {
				break;
			};
			m[0].start += str_idx;
			m[0].end += str_idx;
			m[0].start_bytesize += str_bytesize;
			m[0].end_bytesize += str_bytesize;
			str_idx = m[0].end;
			str_bytesize = m[0].end_bytesize;
			substring = strings::fromutf8(str_bytes[str_bytesize..]);
			io::seek(&strm, str_bytesize: io::off,
				io::whence::SET)!;
		case void => break;
		};
	};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
index 896ac743..41fa40c0 100755
--- a/scripts/gen-stdlib
+++ b/scripts/gen-stdlib
@@ -1139,10 +1139,10 @@ path() {
regex() {
	if [ $testing -eq 0 ]; then
		gen_srcs regex regex.ha
		gen_ssa regex encoding::utf8 errors strconv strings
		gen_ssa regex ascii bufio encoding::utf8 errors io strconv strings
	else
		gen_srcs regex regex.ha +test.ha
		gen_ssa regex encoding::utf8 errors strconv strings fmt io os
		gen_ssa regex encoding::utf8 errors strconv strings fmt io os bufio
	fi
}

diff --git a/stdlib.mk b/stdlib.mk
index 62c51840..192f62e8 100644
--- a/stdlib.mk
+++ b/stdlib.mk
@@ -1773,7 +1773,7 @@ $(HARECACHE)/path/path-any.ssa: $(stdlib_path_any_srcs) $(stdlib_rt) $(stdlib_st
stdlib_regex_any_srcs = \
	$(STDLIB)/regex/regex.ha

$(HARECACHE)/regex/regex-any.ssa: $(stdlib_regex_any_srcs) $(stdlib_rt) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM))
$(HARECACHE)/regex/regex-any.ssa: $(stdlib_regex_any_srcs) $(stdlib_rt) $(stdlib_ascii_$(PLATFORM)) $(stdlib_bufio_$(PLATFORM)) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM))
	@printf 'HAREC \t$@\n'
	@mkdir -p $(HARECACHE)/regex
	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nregex \
@@ -3938,7 +3938,7 @@ testlib_regex_any_srcs = \
	$(STDLIB)/regex/regex.ha \
	$(STDLIB)/regex/+test.ha

$(TESTCACHE)/regex/regex-any.ssa: $(testlib_regex_any_srcs) $(testlib_rt) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_os_$(PLATFORM))
$(TESTCACHE)/regex/regex-any.ssa: $(testlib_regex_any_srcs) $(testlib_rt) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_bufio_$(PLATFORM))
	@printf 'HAREC \t$@\n'
	@mkdir -p $(TESTCACHE)/regex
	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nregex \
-- 
2.36.1

[hare/patches] build failed

builds.sr.ht <builds@sr.ht>
Details
Message ID
<CKXWFEZMHX53.3MRYPX1PYOUHP@cirno>
In-Reply-To
<20220623232306.31040-2-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
hare/patches: FAILED in 46s

[regex: make find and findall return [][]capture][0] v2 from [Sebastian][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/33236
[1]: sebastian@sebsite.pw

✗ #786649 FAILED hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/786649
✗ #786650 FAILED hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/786650

Re: [PATCH hare v2 3/3] regex: add replace and rawreplace

Details
Message ID
<CL9DQOKAP6SI.1XKO65Q51F4W0@taiga>
In-Reply-To
<20220623232306.31040-3-sebastian@sebsite.pw> (view parent)
DKIM signature
pass
Download raw message
Can we use \0 instead of &?
Reply to thread Export thread (mbox)