~sircmpwn/hare-dev

11 5

[PATCH hare v3] bufio: remove scanner inefficiency

Details
Message ID
<20240524192543.12627-1-max@mxsr.de>
DKIM signature
pass
Download raw message
[PATCH hare v3] bufio: remove scanner inefficiency

v3 includes Lorenz' fix as provided as comment on v2:

https://lists.sr.ht/~sircmpwn/hare-dev/patches/50792
Details
Message ID
<20240524192543.12627-2-max@mxsr.de>
In-Reply-To
<20240524192543.12627-1-max@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
Patch: +66 -81
From: Jose Lombera <jose@lombera.dev>

The new scanner with an internal read-ahead buffer contains an
inefficiency in which the underlying buffer is shifted left every time a
token is consumed.

Fix this by delaying the shift until an actual read-ahead from the
source IO handle is made, and only if the shift is required.

Co-authored-by: Max Schillinger <max@mxsr.de>
Co-authored-by: Lorenz (xha) <me@xha.li>
Signed-off-by: Jose Lombera <jose@lombera.dev>
Signed-off-by: Max Schillinger <max@mxsr.de>
---
 bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
 1 file changed, 66 insertions(+), 81 deletions(-)

diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index 4d6690c8..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -19,10 +19,10 @@ export type scanner = struct {
	stream: io::stream,
	src: io::handle,
	buffer: []u8,
	// Number of bytes available in buffer
	pending: size,
	// Number of bytes returned to the user
	readout: size,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
};
@@ -44,8 +44,8 @@ export fn newscanner(
		src = src,
		buffer = alloc([0...], BUFSZ),
		maxread = maxread,
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

@@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
		src = src,
		buffer = buffer,
		maxread = len(buffer),
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

// Frees resources associated associated with a [[scanner]]. Does not close the
// underlying I/O handle.
// Frees resources associated with a [[scanner]]. Does not close the underlying
// I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};
@@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
		};
	};

	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};
@@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	if (scan.pending >= len(scan.buffer)) {
		let readahead = scan.pending + BUFSZ;
		if (readahead > scan.maxread) {
			readahead = scan.maxread;
		};
		if (scan.pending >= readahead) {
			return errors::overflow;
	let start = scan.start;
	const pending = len(scan.pending);

	if (start + pending == len(scan.buffer)) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
		} else {
			// Buffer is full, expand it
			let readahead = pending + BUFSZ;
			if (readahead > scan.maxread) {
				readahead = scan.maxread;
			};
			if (pending >= readahead) {
				return errors::overflow;
			};
			append(scan.buffer, [0...], readahead);
		};
		append(scan.buffer, [0...], readahead);
	};

	const prev = scan.pending;
	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
	match (io::read(scan.src, scan.buffer[start + pending..])?) {
	case let z: size =>
		scan.pending += z;
		return prev;
		scan.pending = scan.buffer[start..start + pending + z];
		return pending;
	case io::EOF =>
		return io::EOF;
	};
};

// Shifts the buffer towards the start, discarding bytes which were read out.
fn scan_shift(scan: *scanner) void = {
	const n = scan.readout;
	if (n == 0) {
		return;
	};
	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
	scan.readout = 0;
	scan.pending -= n;
};

// Consumes N bytes from the buffer, updating scan.readout. User must call
// [[scan_shift]] before calling scan_consume again.
// Consumes N bytes from the buffer.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.buffer) >= n && scan.readout == 0);
	scan.readout = n;
	return scan.buffer[..n];
	assert(len(scan.pending) >= n);
	scan.start += n;
	defer scan.pending = scan.pending[n..];
	return scan.pending[..n];
};

// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -159,26 +153,24 @@ export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	scan_shift(scan);

	let i = 0z, nread = 0z;
	let i = 0z;
	for (true) {
		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
		match (bytes::index(scan.pending[i..], delim)) {
		case let ix: size =>
			i = ix;
			i += ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
			return scan_consume(scan, scan.pending);
			return scan_consume(scan, len(scan.pending));
		case let prevpending: size =>
			// No need to re-index the earlier part of the buffer
			nread = prevpending;
			i = prevpending;
		};
	};

@@ -188,36 +180,27 @@ export fn scan_bytes(
	case let u: []u8 =>
		yield len(u);
	};
	const nuser = nread + i, nconsume = nuser + ndelim;
	return scan_consume(scan, nconsume)[..nuser];
	const nconsume = i + ndelim;
	return scan_consume(scan, nconsume)[..i];
};

// Reads one rune from a [[scanner]].
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) < 4) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
		case size => void;
		};
	};
	const sz = utf8::utf8sz(scan.buffer[0])?;

	for (scan.pending < sz) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return utf8::invalid;
		case size => void;
		};
	const sz = utf8::utf8sz(scan.pending[0])?;
	if (len(scan.pending) < sz) {
		return utf8::invalid;
	};

	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
@@ -259,25 +242,27 @@ export fn scan_line(
// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
	scan_shift(scan);
	return scan.buffer[..scan.pending];
	return scan.pending[..];
};

fn scan_unread(scan: *scanner, buf: []u8) void = {
	if (len(buf) == 0) {
		return;
	};
	if (len(buf) <= scan.readout) {
		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
		scan.readout -= len(buf);
	if (len(buf) <= scan.start) {
		const pending_end = scan.start + len(scan.pending);
		scan.buffer[scan.start - len(buf)..scan.start] = buf;
		scan.start -= len(buf);
		scan.pending = scan.buffer[scan.start..pending_end];
	} else {
		const n = len(buf) - scan.readout;
		assert(n < scan.maxread - scan.pending,
		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
			"Attempted to unread more data than buffer has available");
		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
		scan.pending += n;
		// Shift buffer to the right to free space at the beginning
		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
			scan.buffer[scan.start..scan.start + len(scan.pending)];
		scan.buffer[..len(buf)] = buf;
		scan.readout = 0;
		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
		scan.start = 0;
	};
};

-- 
2.45.1

[hare/patches] build success

builds.sr.ht <builds@sr.ht>
Details
Message ID
<D1I4F6HX6X2U.W6C9FZ23L2E9@fra01>
In-Reply-To
<20240524192543.12627-2-max@mxsr.de> (view parent)
DKIM signature
missing
Download raw message
hare/patches: SUCCESS in 1m45s

[bufio: remove scanner inefficiency][0] v3 from [Max Schillinger][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/52917
[1]: max@mxsr.de

✓ #1230246 SUCCESS hare/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1230246
✓ #1230245 SUCCESS hare/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1230245
✓ #1230247 SUCCESS hare/patches/openbsd.yml https://builds.sr.ht/~sircmpwn/job/1230247
Lorenz (xha) <me@xha.li>
Details
Message ID
<ZlFnm9PGiQqGI-7L@xha.li>
In-Reply-To
<20240524192543.12627-2-max@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
On Fri, May 24, 2024 at 09:21:41PM +0200, Max Schillinger wrote:
> From: Jose Lombera <jose@lombera.dev>
> 
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
> 
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
> 
> Co-authored-by: Max Schillinger <max@mxsr.de>
> Co-authored-by: Lorenz (xha) <me@xha.li>
> Signed-off-by: Jose Lombera <jose@lombera.dev>
> Signed-off-by: Max Schillinger <max@mxsr.de>

this version is OK. thank you!

> ---
>  bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
>  1 file changed, 66 insertions(+), 81 deletions(-)
> 
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 4d6690c8..8b8b6b68 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -19,10 +19,10 @@ export type scanner = struct {
>  	stream: io::stream,
>  	src: io::handle,
>  	buffer: []u8,
> -	// Number of bytes available in buffer
> -	pending: size,
> -	// Number of bytes returned to the user
> -	readout: size,
> +	// Index of start of pending bytes in buffer
> +	start: size,
> +	// Sub-slice with pending bytes in buffer
> +	pending: []u8,
>  	// User-confirmed maximum size of read buffer
>  	maxread: size,
>  };
> @@ -44,8 +44,8 @@ export fn newscanner(
>  		src = src,
>  		buffer = alloc([0...], BUFSZ),
>  		maxread = maxread,
> -		pending = 0,
> -		readout = 0,
> +		start = 0,
> +		pending = [],
>  	};
>  };
>  
> @@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
>  		src = src,
>  		buffer = buffer,
>  		maxread = len(buffer),
> -		pending = 0,
> -		readout = 0,
> +		start = 0,
> +		pending = [],
>  	};
>  };
>  
> -// Frees resources associated associated with a [[scanner]]. Does not close the
> -// underlying I/O handle.
> +// Frees resources associated with a [[scanner]]. Does not close the underlying
> +// I/O handle.
>  export fn finish(scan: *scanner) void = {
>  	free(scan.buffer);
>  };
> @@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
>  fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  	let scan = s: *scanner;
>  
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) == 0) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
>  			return io::EOF;
> @@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  		};
>  	};
>  
> -	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
> +	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
>  	buf[..n] = scan_consume(scan, n)[..];
>  	return n;
>  };
> @@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  // is updated accordingly. Returns the number of bytes which had been available
>  // prior to the call.
>  fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> -	if (scan.pending >= len(scan.buffer)) {
> -		let readahead = scan.pending + BUFSZ;
> -		if (readahead > scan.maxread) {
> -			readahead = scan.maxread;
> -		};
> -		if (scan.pending >= readahead) {
> -			return errors::overflow;
> +	let start = scan.start;
> +	const pending = len(scan.pending);
> +
> +	if (start + pending == len(scan.buffer)) {
> +		if (start > 0) {
> +			// Shift buffer to the left to free space at the end
> +			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
> +			scan.pending = scan.buffer[..pending];
> +			start = 0;
> +			scan.start = 0;
> +		} else {
> +			// Buffer is full, expand it
> +			let readahead = pending + BUFSZ;
> +			if (readahead > scan.maxread) {
> +				readahead = scan.maxread;
> +			};
> +			if (pending >= readahead) {
> +				return errors::overflow;
> +			};
> +			append(scan.buffer, [0...], readahead);
>  		};
> -		append(scan.buffer, [0...], readahead);
>  	};
>  
> -	const prev = scan.pending;
> -	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> +	match (io::read(scan.src, scan.buffer[start + pending..])?) {
>  	case let z: size =>
> -		scan.pending += z;
> -		return prev;
> +		scan.pending = scan.buffer[start..start + pending + z];
> +		return pending;
>  	case io::EOF =>
>  		return io::EOF;
>  	};
>  };
>  
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> -	const n = scan.readout;
> -	if (n == 0) {
> -		return;
> -	};
> -	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> -	scan.readout = 0;
> -	scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
>  fn scan_consume(scan: *scanner, n: size) []u8 = {
> -	assert(len(scan.buffer) >= n && scan.readout == 0);
> -	scan.readout = n;
> -	return scan.buffer[..n];
> +	assert(len(scan.pending) >= n);
> +	scan.start += n;
> +	defer scan.pending = scan.pending[n..];
> +	return scan.pending[..n];
>  };
>  
>  // Reads one byte from a [[scanner]].
>  export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) == 0) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
>  			return io::EOF;
> @@ -159,26 +153,24 @@ export fn scan_bytes(
>  	scan: *scanner,
>  	delim: (u8 | []u8),
>  ) ([]u8 | io::EOF | io::error) = {
> -	scan_shift(scan);
> -
> -	let i = 0z, nread = 0z;
> +	let i = 0z;
>  	for (true) {
> -		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
> +		match (bytes::index(scan.pending[i..], delim)) {
>  		case let ix: size =>
> -			i = ix;
> +			i += ix;
>  			break;
>  		case void => void;
>  		};
>  
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
> -			if (scan.pending == 0) {
> +			if (len(scan.pending) == 0) {
>  				return io::EOF;
>  			};
> -			return scan_consume(scan, scan.pending);
> +			return scan_consume(scan, len(scan.pending));
>  		case let prevpending: size =>
>  			// No need to re-index the earlier part of the buffer
> -			nread = prevpending;
> +			i = prevpending;
>  		};
>  	};
>  
> @@ -188,36 +180,27 @@ export fn scan_bytes(
>  	case let u: []u8 =>
>  		yield len(u);
>  	};
> -	const nuser = nread + i, nconsume = nuser + ndelim;
> -	return scan_consume(scan, nconsume)[..nuser];
> +	const nconsume = i + ndelim;
> +	return scan_consume(scan, nconsume)[..i];
>  };
>  
>  // Reads one rune from a [[scanner]].
>  export fn scan_rune(
>  	scan: *scanner,
>  ) (rune | io::EOF | io::error | utf8::invalid) = {
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) < 4) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
> -			if (scan.pending == 0) {
> +			if (len(scan.pending) == 0) {
>  				return io::EOF;
>  			};
>  		case size => void;
>  		};
>  	};
> -	const sz = utf8::utf8sz(scan.buffer[0])?;
> -
> -	for (scan.pending < sz) {
> -		match (scan_readahead(scan)?) {
> -		case io::EOF =>
> -			return utf8::invalid;
> -		case size => void;
> -		};
> +	const sz = utf8::utf8sz(scan.pending[0])?;
> +	if (len(scan.pending) < sz) {
> +		return utf8::invalid;
>  	};
> -
>  	const buf = scan_consume(scan, sz);
>  	const dec = utf8::decode(buf[..sz]);
>  	match (utf8::next(&dec)?) {
> @@ -259,25 +242,27 @@ export fn scan_line(
>  // Returns the internal scanner buffer, which contains all bytes read ahead by
>  // the scanner up to this point.
>  export fn scan_buffer(scan: *scanner) []u8 = {
> -	scan_shift(scan);
> -	return scan.buffer[..scan.pending];
> +	return scan.pending[..];
>  };
>  
>  fn scan_unread(scan: *scanner, buf: []u8) void = {
>  	if (len(buf) == 0) {
>  		return;
>  	};
> -	if (len(buf) <= scan.readout) {
> -		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> -		scan.readout -= len(buf);
> +	if (len(buf) <= scan.start) {
> +		const pending_end = scan.start + len(scan.pending);
> +		scan.buffer[scan.start - len(buf)..scan.start] = buf;
> +		scan.start -= len(buf);
> +		scan.pending = scan.buffer[scan.start..pending_end];
>  	} else {
> -		const n = len(buf) - scan.readout;
> -		assert(n < scan.maxread - scan.pending,
> +		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
>  			"Attempted to unread more data than buffer has available");
> -		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> -		scan.pending += n;
> +		// Shift buffer to the right to free space at the beginning
> +		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
> +			scan.buffer[scan.start..scan.start + len(scan.pending)];
>  		scan.buffer[..len(buf)] = buf;
> -		scan.readout = 0;
> +		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
> +		scan.start = 0;
>  	};
>  };
>  
> -- 
> 2.45.1
> 
Details
Message ID
<D1IJXSBNDGOC.3GMH0ASBDMMQ3@mxsr.de>
In-Reply-To
<ZlFnm9PGiQqGI-7L@xha.li> (view parent)
DKIM signature
pass
Download raw message
On Sat May 25, 2024 at 6:22 AM CEST, Lorenz (xha) wrote:
> this version is OK. thank you!

Thank you for your fix!


Apart from that: I still don't understand how sourcehut works. My e-mail 
with v3 of this patch shows up on 
<https://lists.sr.ht/~sircmpwn/hare-dev> between `[PATCH hare v5] 
NetBSD: port Hare to NetBSD/amd64` and `[PATCH hare v6] NetBSD: port 
Hare to NetBSD/amd64`. But it's missing on the patches page:
<https://lists.sr.ht/~sircmpwn/hare-dev/patches>

Is this a bug in sourcehut? Or is something missing my e-mail header?
Lorenz (xha) <me@xha.li>
Details
Message ID
<ZlH77S7Fk3oXKRvd@xha.li>
In-Reply-To
<D1IJXSBNDGOC.3GMH0ASBDMMQ3@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
On Sat, May 25, 2024 at 09:37:16AM +0200, Max Schillinger wrote:
> Apart from that: I still don't understand how sourcehut works. My e-mail 
> with v3 of this patch shows up on 
> <https://lists.sr.ht/~sircmpwn/hare-dev> between `[PATCH hare v5] 
> NetBSD: port Hare to NetBSD/amd64` and `[PATCH hare v6] NetBSD: port 
> Hare to NetBSD/amd64`. But it's missing on the patches page:
> <https://lists.sr.ht/~sircmpwn/hare-dev/patches>
> 
> Is this a bug in sourcehut? Or is something missing my e-mail header?

i'd guess that this is a bug in sourcehut. prehaps using a coverletter
with only a signle commit confused it.
Details
Message ID
<D1IYTD04IAUN.17KI70GBEEVH@lombera.dev>
In-Reply-To
<20240524192543.12627-2-max@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
Hi Max,

Thanks for taking this over.

I'm no longer following hare-dev, nor have time to review this patch nor
the changes you made on top of my original patch.  As such, I don't feel
comfortable appearing as the author of this version (which I presume is
what will finally be merged upstream).  Please add yourself as author
instead and remove my "Signed-off-by:".  Maybe just mention me as
"Co-authored-by:" and reference my original patch[1] with "Link:" or
something.

Thanks,
Lombera

On 2024-05-24 21:21 UTC, Max Schillinger wrote:

> From: Jose Lombera <jose@lombera.dev>
>
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
>
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
>
> Co-authored-by: Max Schillinger <max@mxsr.de>
> Co-authored-by: Lorenz (xha) <me@xha.li>
> Signed-off-by: Jose Lombera <jose@lombera.dev>
> Signed-off-by: Max Schillinger <max@mxsr.de>
> ---
>  bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
>  1 file changed, 66 insertions(+), 81 deletions(-)
>
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 4d6690c8..8b8b6b68 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -19,10 +19,10 @@ export type scanner = struct {
>  	stream: io::stream,
>  	src: io::handle,
>  	buffer: []u8,
> -	// Number of bytes available in buffer
> -	pending: size,
> -	// Number of bytes returned to the user
> -	readout: size,
> +	// Index of start of pending bytes in buffer
> +	start: size,
> +	// Sub-slice with pending bytes in buffer
> +	pending: []u8,
>  	// User-confirmed maximum size of read buffer
>  	maxread: size,
>  };
> @@ -44,8 +44,8 @@ export fn newscanner(
>  		src = src,
>  		buffer = alloc([0...], BUFSZ),
>  		maxread = maxread,
> -		pending = 0,
> -		readout = 0,
> +		start = 0,
> +		pending = [],
>  	};
>  };
>  
> @@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
>  		src = src,
>  		buffer = buffer,
>  		maxread = len(buffer),
> -		pending = 0,
> -		readout = 0,
> +		start = 0,
> +		pending = [],
>  	};
>  };
>  
> -// Frees resources associated associated with a [[scanner]]. Does not close the
> -// underlying I/O handle.
> +// Frees resources associated with a [[scanner]]. Does not close the underlying
> +// I/O handle.
>  export fn finish(scan: *scanner) void = {
>  	free(scan.buffer);
>  };
> @@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
>  fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  	let scan = s: *scanner;
>  
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) == 0) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
>  			return io::EOF;
> @@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  		};
>  	};
>  
> -	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
> +	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
>  	buf[..n] = scan_consume(scan, n)[..];
>  	return n;
>  };
> @@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
>  // is updated accordingly. Returns the number of bytes which had been available
>  // prior to the call.
>  fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> -	if (scan.pending >= len(scan.buffer)) {
> -		let readahead = scan.pending + BUFSZ;
> -		if (readahead > scan.maxread) {
> -			readahead = scan.maxread;
> -		};
> -		if (scan.pending >= readahead) {
> -			return errors::overflow;
> +	let start = scan.start;
> +	const pending = len(scan.pending);
> +
> +	if (start + pending == len(scan.buffer)) {
> +		if (start > 0) {
> +			// Shift buffer to the left to free space at the end
> +			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
> +			scan.pending = scan.buffer[..pending];
> +			start = 0;
> +			scan.start = 0;
> +		} else {
> +			// Buffer is full, expand it
> +			let readahead = pending + BUFSZ;
> +			if (readahead > scan.maxread) {
> +				readahead = scan.maxread;
> +			};
> +			if (pending >= readahead) {
> +				return errors::overflow;
> +			};
> +			append(scan.buffer, [0...], readahead);
>  		};
> -		append(scan.buffer, [0...], readahead);
>  	};
>  
> -	const prev = scan.pending;
> -	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> +	match (io::read(scan.src, scan.buffer[start + pending..])?) {
>  	case let z: size =>
> -		scan.pending += z;
> -		return prev;
> +		scan.pending = scan.buffer[start..start + pending + z];
> +		return pending;
>  	case io::EOF =>
>  		return io::EOF;
>  	};
>  };
>  
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> -	const n = scan.readout;
> -	if (n == 0) {
> -		return;
> -	};
> -	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> -	scan.readout = 0;
> -	scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
>  fn scan_consume(scan: *scanner, n: size) []u8 = {
> -	assert(len(scan.buffer) >= n && scan.readout == 0);
> -	scan.readout = n;
> -	return scan.buffer[..n];
> +	assert(len(scan.pending) >= n);
> +	scan.start += n;
> +	defer scan.pending = scan.pending[n..];
> +	return scan.pending[..n];
>  };
>  
>  // Reads one byte from a [[scanner]].
>  export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) == 0) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
>  			return io::EOF;
> @@ -159,26 +153,24 @@ export fn scan_bytes(
>  	scan: *scanner,
>  	delim: (u8 | []u8),
>  ) ([]u8 | io::EOF | io::error) = {
> -	scan_shift(scan);
> -
> -	let i = 0z, nread = 0z;
> +	let i = 0z;
>  	for (true) {
> -		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
> +		match (bytes::index(scan.pending[i..], delim)) {
>  		case let ix: size =>
> -			i = ix;
> +			i += ix;
>  			break;
>  		case void => void;
>  		};
>  
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
> -			if (scan.pending == 0) {
> +			if (len(scan.pending) == 0) {
>  				return io::EOF;
>  			};
> -			return scan_consume(scan, scan.pending);
> +			return scan_consume(scan, len(scan.pending));
>  		case let prevpending: size =>
>  			// No need to re-index the earlier part of the buffer
> -			nread = prevpending;
> +			i = prevpending;
>  		};
>  	};
>  
> @@ -188,36 +180,27 @@ export fn scan_bytes(
>  	case let u: []u8 =>
>  		yield len(u);
>  	};
> -	const nuser = nread + i, nconsume = nuser + ndelim;
> -	return scan_consume(scan, nconsume)[..nuser];
> +	const nconsume = i + ndelim;
> +	return scan_consume(scan, nconsume)[..i];
>  };
>  
>  // Reads one rune from a [[scanner]].
>  export fn scan_rune(
>  	scan: *scanner,
>  ) (rune | io::EOF | io::error | utf8::invalid) = {
> -	// Consume previous read, if any
> -	scan_shift(scan);
> -
> -	if (scan.pending == 0) {
> +	if (len(scan.pending) < 4) {
>  		match (scan_readahead(scan)?) {
>  		case io::EOF =>
> -			if (scan.pending == 0) {
> +			if (len(scan.pending) == 0) {
>  				return io::EOF;
>  			};
>  		case size => void;
>  		};
>  	};
> -	const sz = utf8::utf8sz(scan.buffer[0])?;
> -
> -	for (scan.pending < sz) {
> -		match (scan_readahead(scan)?) {
> -		case io::EOF =>
> -			return utf8::invalid;
> -		case size => void;
> -		};
> +	const sz = utf8::utf8sz(scan.pending[0])?;
> +	if (len(scan.pending) < sz) {
> +		return utf8::invalid;
>  	};
> -
>  	const buf = scan_consume(scan, sz);
>  	const dec = utf8::decode(buf[..sz]);
>  	match (utf8::next(&dec)?) {
> @@ -259,25 +242,27 @@ export fn scan_line(
>  // Returns the internal scanner buffer, which contains all bytes read ahead by
>  // the scanner up to this point.
>  export fn scan_buffer(scan: *scanner) []u8 = {
> -	scan_shift(scan);
> -	return scan.buffer[..scan.pending];
> +	return scan.pending[..];
>  };
>  
>  fn scan_unread(scan: *scanner, buf: []u8) void = {
>  	if (len(buf) == 0) {
>  		return;
>  	};
> -	if (len(buf) <= scan.readout) {
> -		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> -		scan.readout -= len(buf);
> +	if (len(buf) <= scan.start) {
> +		const pending_end = scan.start + len(scan.pending);
> +		scan.buffer[scan.start - len(buf)..scan.start] = buf;
> +		scan.start -= len(buf);
> +		scan.pending = scan.buffer[scan.start..pending_end];
>  	} else {
> -		const n = len(buf) - scan.readout;
> -		assert(n < scan.maxread - scan.pending,
> +		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
>  			"Attempted to unread more data than buffer has available");
> -		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> -		scan.pending += n;
> +		// Shift buffer to the right to free space at the beginning
> +		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
> +			scan.buffer[scan.start..scan.start + len(scan.pending)];
>  		scan.buffer[..len(buf)] = buf;
> -		scan.readout = 0;
> +		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
> +		scan.start = 0;
>  	};
>  };
>  
> -- 
> 2.45.1
Lorenz (xha) <me@xha.li>
Details
Message ID
<ZlLFOH1_77l7b38X@xha.li>
In-Reply-To
<D1IYTD04IAUN.17KI70GBEEVH@lombera.dev> (view parent)
DKIM signature
pass
Download raw message
On Sat, May 25, 2024 at 07:16:46PM +0000, Jose Lombera wrote:
> Hi Max,
> 
> Thanks for taking this over.
> 
> I'm no longer following hare-dev, nor have time to review this patch nor
> the changes you made on top of my original patch.  As such, I don't feel
> comfortable appearing as the author of this version (which I presume is
> what will finally be merged upstream).  Please add yourself as author
> instead and remove my "Signed-off-by:".  Maybe just mention me as
> "Co-authored-by:" and reference my original patch[1] with "Link:" or
> something.

i can't obviously talk you into your doing, but you are making it
much more complicated without any good reason imo. your sign-off
is needed because *you are the author of that patch*, it's all about
copyright and not about responsibility or something.

can the v3 please be kept the way it is? it's a very good improvement.

btw, just saying, *legally*, with singing off the original patch,
you've distributed it under the MPL 2.0, which means it's fine for
Max to use your patch.
Details
Message ID
<D1JQSCYKCR03.16L52VGAHK7CK@lombera.dev>
In-Reply-To
<ZlLFOH1_77l7b38X@xha.li> (view parent)
DKIM signature
pass
Download raw message
On 2024-05-26 07:14 UTC, Lorenz (xha) wrote:

> On Sat, May 25, 2024 at 07:16:46PM +0000, Jose Lombera wrote:
> > Hi Max,
> > 
> > Thanks for taking this over.
> > 
> > I'm no longer following hare-dev, nor have time to review this patch nor
> > the changes you made on top of my original patch.  As such, I don't feel
> > comfortable appearing as the author of this version (which I presume is
> > what will finally be merged upstream).  Please add yourself as author
> > instead and remove my "Signed-off-by:".  Maybe just mention me as
> > "Co-authored-by:" and reference my original patch[1] with "Link:" or
> > something.
>
> i can't obviously talk you into your doing, but you are making it
> much more complicated without any good reason imo. your sign-off
> is needed because *you are the author of that patch*, it's all about
> copyright and not about responsibility or something.
>
> can the v3 please be kept the way it is? it's a very good improvement.
>
> btw, just saying, *legally*, with singing off the original patch,
> you've distributed it under the MPL 2.0, which means it's fine for
> Max to use your patch.

I don't mean anything legal by this request.  I'm giving away the
ownership of the patch.  You are free to use it however you want, change
it however you want and distribute it under whatever license you want.
I just don't want to appear as the author of this modified version
because it's not the version I wrote nor have time to review it and say
I'm ok with whatever changes were required on top of my patch.  For the
same reason, I don't want my sign-off to appear in something I didn't
review.  The only recognition I ask for, and that's optional too, is
that my original patch[1] is mentioned/referenced.


[1] https://lists.sr.ht/~sircmpwn/hare-dev/patches/39292
Details
Message ID
<D1JSVTQDM50R.XRJF95ZLWAO5@mxsr.de>
In-Reply-To
<D1JQSCYKCR03.16L52VGAHK7CK@lombera.dev> (view parent)
DKIM signature
pass
Download raw message
On Sun May 26, 2024 at 7:11 PM CEST, Jose Lombera wrote:
> I just don't want to appear as the author of this modified version 
> because it's not the version I wrote nor have time to review it and 
> say I'm ok with whatever changes were required on top of my patch.

No problem. I can put myself as author of this patch. (I never wanted 
you to take over responsibility for my changes. I just didn't want to 
take credits for your work.)

> For the same reason, I don't want my sign-off to appear in something I 
> didn't review.

This still sounds like signing off means to you giving an approval. I 
also understand it like Lorenz wrote: It only means giving away the 
copyright.

@Lorenz: Is it a problem if somebody is listed as co-author but not 
signing off? Btw, for me it's also okay if you modify the commit message 
in a way that it fits the project's needs.

[PATCH hare v4] bufio: remove scanner inefficiency

Lorenz (xha) <me@xha.li>
Details
Message ID
<20240527041904.92739-1-me@xha.li>
In-Reply-To
<D1JSVTQDM50R.XRJF95ZLWAO5@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
Patch: +66 -81
From: Max Schillinger <max@mxsr.de>

The new scanner with an internal read-ahead buffer contains an
inefficiency in which the underlying buffer is shifted left every time a
token is consumed.

Fix this by delaying the shift until an actual read-ahead from the
source IO handle is made, and only if the shift is required.

Co-authored-by: Jose Lombera <jose@lombera.dev>
Co-authored-by: Lorenz (xha) <me@xha.li>
Signed-off-by: Max Schillinger <max@mxsr.de>
Signed-off-by: Lorenz (xha) <me@xha.li>
---
sure, here you go. i still don't quite understand the situation
given the signoff is just required for agreeing to the "Developer
Certificate of Origin" as described in README.md, but well, we
can also do it this way.

 bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
 1 file changed, 66 insertions(+), 81 deletions(-)

diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index 4d6690c8..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -19,10 +19,10 @@ export type scanner = struct {
	stream: io::stream,
	src: io::handle,
	buffer: []u8,
	// Number of bytes available in buffer
	pending: size,
	// Number of bytes returned to the user
	readout: size,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
};
@@ -44,8 +44,8 @@ export fn newscanner(
		src = src,
		buffer = alloc([0...], BUFSZ),
		maxread = maxread,
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

@@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
		src = src,
		buffer = buffer,
		maxread = len(buffer),
		pending = 0,
		readout = 0,
		start = 0,
		pending = [],
	};
};

// Frees resources associated associated with a [[scanner]]. Does not close the
// underlying I/O handle.
// Frees resources associated with a [[scanner]]. Does not close the underlying
// I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};
@@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
		};
	};

	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};
@@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	if (scan.pending >= len(scan.buffer)) {
		let readahead = scan.pending + BUFSZ;
		if (readahead > scan.maxread) {
			readahead = scan.maxread;
		};
		if (scan.pending >= readahead) {
			return errors::overflow;
	let start = scan.start;
	const pending = len(scan.pending);

	if (start + pending == len(scan.buffer)) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
		} else {
			// Buffer is full, expand it
			let readahead = pending + BUFSZ;
			if (readahead > scan.maxread) {
				readahead = scan.maxread;
			};
			if (pending >= readahead) {
				return errors::overflow;
			};
			append(scan.buffer, [0...], readahead);
		};
		append(scan.buffer, [0...], readahead);
	};

	const prev = scan.pending;
	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
	match (io::read(scan.src, scan.buffer[start + pending..])?) {
	case let z: size =>
		scan.pending += z;
		return prev;
		scan.pending = scan.buffer[start..start + pending + z];
		return pending;
	case io::EOF =>
		return io::EOF;
	};
};

// Shifts the buffer towards the start, discarding bytes which were read out.
fn scan_shift(scan: *scanner) void = {
	const n = scan.readout;
	if (n == 0) {
		return;
	};
	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
	scan.readout = 0;
	scan.pending -= n;
};

// Consumes N bytes from the buffer, updating scan.readout. User must call
// [[scan_shift]] before calling scan_consume again.
// Consumes N bytes from the buffer.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.buffer) >= n && scan.readout == 0);
	scan.readout = n;
	return scan.buffer[..n];
	assert(len(scan.pending) >= n);
	scan.start += n;
	defer scan.pending = scan.pending[n..];
	return scan.pending[..n];
};

// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
@@ -159,26 +153,24 @@ export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	scan_shift(scan);

	let i = 0z, nread = 0z;
	let i = 0z;
	for (true) {
		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
		match (bytes::index(scan.pending[i..], delim)) {
		case let ix: size =>
			i = ix;
			i += ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
			return scan_consume(scan, scan.pending);
			return scan_consume(scan, len(scan.pending));
		case let prevpending: size =>
			// No need to re-index the earlier part of the buffer
			nread = prevpending;
			i = prevpending;
		};
	};

@@ -188,36 +180,27 @@ export fn scan_bytes(
	case let u: []u8 =>
		yield len(u);
	};
	const nuser = nread + i, nconsume = nuser + ndelim;
	return scan_consume(scan, nconsume)[..nuser];
	const nconsume = i + ndelim;
	return scan_consume(scan, nconsume)[..i];
};

// Reads one rune from a [[scanner]].
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
	if (len(scan.pending) < 4) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
		case size => void;
		};
	};
	const sz = utf8::utf8sz(scan.buffer[0])?;

	for (scan.pending < sz) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return utf8::invalid;
		case size => void;
		};
	const sz = utf8::utf8sz(scan.pending[0])?;
	if (len(scan.pending) < sz) {
		return utf8::invalid;
	};

	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
@@ -259,25 +242,27 @@ export fn scan_line(
// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
	scan_shift(scan);
	return scan.buffer[..scan.pending];
	return scan.pending[..];
};

fn scan_unread(scan: *scanner, buf: []u8) void = {
	if (len(buf) == 0) {
		return;
	};
	if (len(buf) <= scan.readout) {
		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
		scan.readout -= len(buf);
	if (len(buf) <= scan.start) {
		const pending_end = scan.start + len(scan.pending);
		scan.buffer[scan.start - len(buf)..scan.start] = buf;
		scan.start -= len(buf);
		scan.pending = scan.buffer[scan.start..pending_end];
	} else {
		const n = len(buf) - scan.readout;
		assert(n < scan.maxread - scan.pending,
		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
			"Attempted to unread more data than buffer has available");
		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
		scan.pending += n;
		// Shift buffer to the right to free space at the beginning
		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
			scan.buffer[scan.start..scan.start + len(scan.pending)];
		scan.buffer[..len(buf)] = buf;
		scan.readout = 0;
		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
		scan.start = 0;
	};
};

-- 
2.45.1

Re: [PATCH hare v4] bufio: remove scanner inefficiency

Details
Message ID
<D1KCI5PZ56HQ.1B226QZBGHEMH@cmpwn.com>
In-Reply-To
<20240527041904.92739-1-me@xha.li> (view parent)
DKIM signature
pass
Download raw message
This thread is pretty weird but as far as I can tell everything is above
board with this patch.

Thanks!

To git@git.sr.ht:~sircmpwn/hare
   3ac88260..5a9d8e7d  master -> master
Reply to thread Export thread (mbox)