~sircmpwn/hare-dev

hare: bufio: remove scanner inefficiency v2 PROPOSED

It looks like v1 of this patch (#39292) was never applied.

I have rebased it to master and extended it to the later added function
scan_unread.

These are the results for Jose's mini benchmark:

master branch:

time ./newlines-master l.txt
20.01 secs

time ./newlines-master l8k.txt
59.58 secs

With this patch (v2) applied:

time ./newlines-patch l.txt
230.16 millis

time ./newlines-patch l8k.txt
238.67 millis


Jose Lombera (1):
  bufio: remove scanner inefficiency

 bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
 1 file changed, 67 insertions(+), 80 deletions(-)

-- 
2.44.0
#1188412 alpine.yml success
#1188413 freebsd.yml success
#1188414 openbsd.yml success
hare/patches: SUCCESS in 1m5s

[bufio: remove scanner inefficiency][0] v2 from [Max Schillinger][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/50792
[1]: mailto:max@mxsr.de

✓ #1188413 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1188413
✓ #1188414 SUCCESS hare/patches/openbsd.yml https://builds.sr.ht/~sircmpwn/job/1188414
✓ #1188412 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1188412
Thank your for his hint. I think it's the C way trying not to call the 
same [str]len twice. :-)
Next
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/hare-dev/patches/50792/mbox | git am -3
Learn more about email & git

[PATCH hare v2 1/1] bufio: remove scanner inefficiency Export this patch

From: Jose Lombera <jose@lombera.dev>

The new scanner with an internal read-ahead buffer contains an
inefficiency in which the underlying buffer is shifted left every time a
token is consumed.

Fix this by delaying the shift until an actual read-ahead from the
source IO handle is made, and only if the shift is required.

Co-authored-by: Max Schillinger <max@mxsr.de>
Signed-off-by: Jose Lombera <jose@lombera.dev>
Signed-off-by: Max Schillinger <max@mxsr.de>
---
 bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
 1 file changed, 67 insertions(+), 80 deletions(-)

diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index 1f156050..30f96d07 100644
--- a/bufio/scanner.ha
@@ -19,10 +19,10 @@ export type scanner = struct {
	stream: io::stream,
	src: io::handle,
	buffer: []u8,
	// Number of bytes available in buffer
	pending: size,
	// Number of bytes returned to the user
	readout: size,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
};
@@ -44,8 +44,8 @@ export fn newscanner(
		src = src,
		buffer = alloc([0...], BUFSZ),
		maxread = maxread,
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

@@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
		src = src,
		buffer = buffer,
		maxread = len(buffer),
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

// Frees resources associated associated with a [[scanner]]. Does not close the
// underlying I/O handle.
// Frees resources associated with a [[scanner]]. Does not close the underlying
// I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};
@@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
		};
	};

	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};
@@ -95,52 +92,50 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	if (scan.pending >= len(scan.buffer)) {
		let readahead = scan.pending + BUFSZ;
		if (readahead > scan.maxread) {
			readahead = scan.maxread;
		};
		if (scan.pending >= readahead) {
			return errors::overflow;
	let start = scan.start;
	const pending = len(scan.pending);
	const buf_len = len(scan.buffer);

	if ((start + pending) >= buf_len) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..buf_len - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
		} else {
			// Buffer is full, expand it
			let readahead = pending + BUFSZ;
			if (readahead > scan.maxread) {
				readahead = scan.maxread;
			};
			if (pending >= readahead) {
				return errors::overflow;
			};
			append(scan.buffer, [0...], readahead);
		};
		append(scan.buffer, [0...], readahead);
	};

	const prev = scan.pending;
	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
	match (io::read(scan.src, scan.buffer[start + pending..])?) {
	case let z: size =>
		scan.pending += z;
		return prev;
		scan.pending = scan.buffer[start..start + pending + z];
		return pending;
	case io::EOF =>
		return io::EOF;
	};
};

// Shifts the buffer towards the start, discarding bytes which were read out.
fn scan_shift(scan: *scanner) void = {
	const n = scan.readout;
	if (n == 0) {
		return;
	};
	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
	scan.readout = 0;
	scan.pending -= n;
};

// Consumes N bytes from the buffer, updating scan.readout. User must call
// [[scan_shift]] before calling scan_consume again.
// Consumes N bytes from the buffer.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.buffer) >= n && scan.readout == 0);
	scan.readout = n;
	return scan.buffer[..n];
	assert(len(scan.pending) >= n);
	scan.start += n;
	defer scan.pending = scan.pending[n..];
	return scan.pending[..n];
};

// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -159,26 +154,24 @@ export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	scan_shift(scan);

	let i = 0z, nread = 0z;
	let i = 0z;
	for (true) {
		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
		match (bytes::index(scan.pending[i..], delim)) {
		case let ix: size =>
			i = ix;
			i += ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
			return scan_consume(scan, scan.pending);
			return scan_consume(scan, len(scan.pending));
		case let prevpending: size =>
			// No need to re-index the earlier part of the buffer
			nread = prevpending;
			i = prevpending;
		};
	};

@@ -188,36 +181,27 @@ export fn scan_bytes(
	case let u: []u8 =>
		yield len(u);
	};
	const nuser = nread + i, nconsume = nuser + ndelim;
	return scan_consume(scan, nconsume)[..nuser];
	const nconsume = i + ndelim;
	return scan_consume(scan, nconsume)[..i];
};

// Reads one rune from a [[scanner]].
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) < 4) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
		case size => void;
		};
	};
	const sz = utf8::utf8sz(scan.buffer[0])?;

	for (scan.pending < sz) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return utf8::invalid;
		case size => void;
		};
	const sz = utf8::utf8sz(scan.pending[0])?;
	if (len(scan.pending) < sz) {
		return utf8::invalid;
	};

	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
@@ -259,25 +243,28 @@ export fn scan_line(
// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
	scan_shift(scan);
	return scan.buffer[..scan.pending];
	return scan.pending[..];
};

fn scan_unread(scan: *scanner, buf: []u8) void = {
	if (len(buf) == 0) {
		return;
	};
	if (len(buf) <= scan.readout) {
		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
		scan.readout -= len(buf);
	if (len(buf) <= scan.start) {
		const pending_end = scan.start + len(scan.pending);
		scan.buffer[scan.start - len(buf)..scan.start] = buf;
		scan.start -= len(buf);
		scan.pending = scan.buffer[scan.start..pending_end];
	} else {
		const n = len(buf) - scan.readout;
		assert(n < scan.maxread - scan.pending,
		const n = len(buf) - scan.start;
		assert(n < scan.maxread - len(scan.pending),
			"Attempted to unread more data than buffer has available");
		// Shift buffer to the right to free space at the beginning
		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
		scan.pending += n;
		scan.buffer[..len(buf)] = buf;
		scan.readout = 0;
		const pending_end = len(scan.pending) + len(buf);
		scan.pending = scan.buffer[..pending_end];
		scan.start = 0;
	};
};

-- 
2.44.0
hare/patches: SUCCESS in 1m5s

[bufio: remove scanner inefficiency][0] v2 from [Max Schillinger][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/50792
[1]: mailto:max@mxsr.de

✓ #1188413 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1188413
✓ #1188414 SUCCESS hare/patches/openbsd.yml https://builds.sr.ht/~sircmpwn/job/1188414
✓ #1188412 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1188412

Re: [PATCH hare v2 1/1] bufio: remove scanner inefficiency Export this patch

On Sat, Apr 06, 2024 at 08:12:35AM +0200, Max Schillinger wrote:
> From: Jose Lombera <jose@lombera.dev>
> 
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
> 
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
> 
> Co-authored-by: Max Schillinger <max@mxsr.de>
> Signed-off-by: Jose Lombera <jose@lombera.dev>
> Signed-off-by: Max Schillinger <max@mxsr.de>

thanks for this patch, it's a huge performance improvement, which
is really nice. it took a couple of hours for me to dig through
this patch/logic and figure it out; most of this is fine, but there
are a couple of nits and one pretty serious bug.

can you please send a v3 with this addresseed?

you can find a proposed patch from me below.

> ---
>  bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
>  1 file changed, 67 insertions(+), 80 deletions(-)
> 
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 1f156050..30f96d07 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -95,52 +92,50 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  // is updated accordingly. Returns the number of bytes which had been available
>  // prior to the call.
>  fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> -	if (scan.pending >= len(scan.buffer)) {
> -		let readahead = scan.pending + BUFSZ;
> -		if (readahead > scan.maxread) {
> -			readahead = scan.maxread;
> -		};
> -		if (scan.pending >= readahead) {
> -			return errors::overflow;
> +	let start = scan.start;
> +	const pending = len(scan.pending);
> +	const buf_len = len(scan.buffer);

i would remove "buf_len"; it's confusing because there is (1) a
variable "buf" in this scope and (2) obscures code, len(scan.buffer)
is much more readable in the two places it's used anyways.

> +
> +	if ((start + pending) >= buf_len) {

the "bigger than" part of this check does not make sense, it's
logically not possible. this should use "==". also, the extra
parentheses are not needed.

> +		if (start > 0) {
> +			// Shift buffer to the left to free space at the end
> +			scan.buffer[..buf_len - start] = scan.buffer[start..];
> +			scan.pending = scan.buffer[..pending];
> +			start = 0;
> +			scan.start = 0;
> +		} else {
> +			// Buffer is full, expand it
> +			let readahead = pending + BUFSZ;
> +			if (readahead > scan.maxread) {
> +				readahead = scan.maxread;
> +			};
> +			if (pending >= readahead) {
> +				return errors::overflow;
> +			};
> +			append(scan.buffer, [0...], readahead);
>  		};
> -		append(scan.buffer, [0...], readahead);
>  	};
>  
> -	const prev = scan.pending;
> -	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> +	match (io::read(scan.src, scan.buffer[start + pending..])?) {
>  	case let z: size =>
> -		scan.pending += z;
> -		return prev;
> +		scan.pending = scan.buffer[start..start + pending + z];
> +		return pending;
>  	case io::EOF =>
>  		return io::EOF;
>  	};
>  };
>  
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> -	const n = scan.readout;
> -	if (n == 0) {
> -		return;
> -	};
> -	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> -	scan.readout = 0;
> -	scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
>  fn scan_consume(scan: *scanner, n: size) []u8 = {
> -	assert(len(scan.buffer) >= n && scan.readout == 0);
> -	scan.readout = n;
> -	return scan.buffer[..n];
> +	assert(len(scan.pending) >= n);
> +	scan.start += n;
> +	defer scan.pending = scan.pending[n..];
> +	return scan.pending[..n];
>  };
>  
>  // Reads one byte from a [[scanner]].
>  export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) == 0) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
>  			return io::EOF;
> @@ -259,25 +243,28 @@ export fn scan_line(
>  // Returns the internal scanner buffer, which contains all bytes read ahead by
>  // the scanner up to this point.
>  export fn scan_buffer(scan: *scanner) []u8 = {
> -	scan_shift(scan);
> -	return scan.buffer[..scan.pending];
> +	return scan.pending[..];
>  };
>  
>  fn scan_unread(scan: *scanner, buf: []u8) void = {
>  	if (len(buf) == 0) {
>  		return;
>  	};
> -	if (len(buf) <= scan.readout) {
> -		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> -		scan.readout -= len(buf);
> +	if (len(buf) <= scan.start) {
> +		const pending_end = scan.start + len(scan.pending);
> +		scan.buffer[scan.start - len(buf)..scan.start] = buf;
> +		scan.start -= len(buf);
> +		scan.pending = scan.buffer[scan.start..pending_end];
>  	} else {
> -		const n = len(buf) - scan.readout;
> -		assert(n < scan.maxread - scan.pending,
> +		const n = len(buf) - scan.start;
> +		assert(n < scan.maxread - len(scan.pending),
>  			"Attempted to unread more data than buffer has available");
> +		// Shift buffer to the right to free space at the beginning
>  		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> -		scan.pending += n;
>  		scan.buffer[..len(buf)] = buf;
> -		scan.readout = 0;
> +		const pending_end = len(scan.pending) + len(buf);
> +		scan.pending = scan.buffer[..pending_end];
> +		scan.start = 0;

the logic in this else branch is completely broken; here's an example:
suppose we have a buf where len(buf) is 9. now, scan.start is 5,
scan.maxread is 15 and len(scan.pending) is 10. the first thing
that we are doing is:

		const n = len(buf) - scan.start;

now n is 4. let's see what the assert is doing:

		assert(n < scan.maxread - len(scan.pending),
			"Attempted to unread more data than buffer has available");

the calculation is: `4 < 15 - 10` or `4 < 5`.

NOW: this passes the assert even though len(buf) is 9 and we only
have 5 spare bytes that we can unread; this will cause an overwrite of
the existing data further down below in the code!

suppose len(buf) is 5, scan.start is 0, scan.maxread is 15 and
len(scan.pending) is 10. this should theoretically pass the assert,
right? no, because:

		const n = len(buf) - scan.start;

here n is 5.

		assert(n < scan.maxread - len(scan.pending),
			"Attempted to unread more data than buffer has available");

the calculation is: `5 < 15 - 10`, or `5 < 5`

NOW: this will cause the assertion to fail, even though the inputs to the
function are logically correct and should not cause it to fail.

apart from these issues, another issue here is that this code assumes
that len(scan.buffer) == scan.maxread, which is wrong and will also cause
an out-of-bounds write below in some cases (in almost all of them with
the default value of scan.maxread).

>  	};
>  };
>  
> -- 
> 2.44.0
> 

diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index ff63784b..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -94,12 +94,11 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	let start = scan.start;
	const pending = len(scan.pending);
	const buf_len = len(scan.buffer);

	if ((start + pending) >= buf_len) {
	if (start + pending == len(scan.buffer)) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..buf_len - start] = scan.buffer[start..];
			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
@@ -256,14 +255,13 @@ fn scan_unread(scan: *scanner, buf: []u8) void = {
		scan.start -= len(buf);
		scan.pending = scan.buffer[scan.start..pending_end];
	} else {
		const n = len(buf) - scan.start;
		assert(n < scan.maxread - len(scan.pending),
		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
			"Attempted to unread more data than buffer has available");
		// Shift buffer to the right to free space at the beginning
		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
			scan.buffer[scan.start..scan.start + len(scan.pending)];
		scan.buffer[..len(buf)] = buf;
		const pending_end = len(scan.pending) + len(buf);
		scan.pending = scan.buffer[..pending_end];
		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
		scan.start = 0;
	};
};
Thank your for his hint. I think it's the C way trying not to call the 
same [str]len twice. :-)