11
5
[PATCH hare v3] bufio: remove scanner inefficiency
From: Jose Lombera <jose@lombera.dev>
The new scanner with an internal read-ahead buffer contains an
inefficiency in which the underlying buffer is shifted left every time a
token is consumed.
Fix this by delaying the shift until an actual read-ahead from the
source IO handle is made, and only if the shift is required.
Co-authored-by: Max Schillinger <max@mxsr.de>
Co-authored-by: Lorenz (xha) <me@xha.li>
Signed-off-by: Jose Lombera <jose@lombera.dev>
Signed-off-by: Max Schillinger <max@mxsr.de>
---
bufio/scanner.ha | 147 +++++++++++++++++++++ --------------------------
1 file changed, 66 insertions(+), 81 deletions(-)
diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index 4d6690c8..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -19,10 +19,10 @@ export type scanner = struct {
stream: io::stream,
src: io::handle,
buffer: []u8,
- // Number of bytes available in buffer
- pending: size,
- // Number of bytes returned to the user
- readout: size,
+ // Index of start of pending bytes in buffer
+ start: size,
+ // Sub-slice with pending bytes in buffer
+ pending: []u8,
// User-confirmed maximum size of read buffer
maxread: size,
};
@@ -44,8 +44,8 @@ export fn newscanner(
src = src,
buffer = alloc([0...], BUFSZ),
maxread = maxread,
- pending = 0,
- readout = 0,
+ start = 0,
+ pending = [],
};
};
@@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
src = src,
buffer = buffer,
maxread = len(buffer),
- pending = 0,
- readout = 0,
+ start = 0,
+ pending = [],
};
};
- // Frees resources associated associated with a [[scanner]]. Does not close the
- // underlying I/O handle.
+ // Frees resources associated with a [[scanner]]. Does not close the underlying
+ // I/O handle.
export fn finish(scan: *scanner) void = {
free(scan.buffer);
};
@@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
let scan = s: *scanner;
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
match (scan_readahead(scan)?) {
case io::EOF =>
return io::EOF;
@@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
};
};
- const n = if (len(buf) > scan.pending) scan.pending else len(buf);
+ const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
buf[..n] = scan_consume(scan, n)[..];
return n;
};
@@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
- if (scan.pending >= len(scan.buffer)) {
- let readahead = scan.pending + BUFSZ;
- if (readahead > scan.maxread) {
- readahead = scan.maxread;
- };
- if (scan.pending >= readahead) {
- return errors::overflow;
+ let start = scan.start;
+ const pending = len(scan.pending);
+
+ if (start + pending == len(scan.buffer)) {
+ if (start > 0) {
+ // Shift buffer to the left to free space at the end
+ scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
+ scan.pending = scan.buffer[..pending];
+ start = 0;
+ scan.start = 0;
+ } else {
+ // Buffer is full, expand it
+ let readahead = pending + BUFSZ;
+ if (readahead > scan.maxread) {
+ readahead = scan.maxread;
+ };
+ if (pending >= readahead) {
+ return errors::overflow;
+ };
+ append(scan.buffer, [0...], readahead);
};
- append(scan.buffer, [0...], readahead);
};
- const prev = scan.pending;
- match (io::read(scan.src, scan.buffer[scan.pending..])?) {
+ match (io::read(scan.src, scan.buffer[start + pending..])?) {
case let z: size =>
- scan.pending += z;
- return prev;
+ scan.pending = scan.buffer[start..start + pending + z];
+ return pending;
case io::EOF =>
return io::EOF;
};
};
- // Shifts the buffer towards the start, discarding bytes which were read out.
- fn scan_shift(scan: *scanner) void = {
- const n = scan.readout;
- if (n == 0) {
- return;
- };
- scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
- scan.readout = 0;
- scan.pending -= n;
- };
-
- // Consumes N bytes from the buffer, updating scan.readout. User must call
- // [[scan_shift]] before calling scan_consume again.
+ // Consumes N bytes from the buffer.
fn scan_consume(scan: *scanner, n: size) []u8 = {
- assert(len(scan.buffer) >= n && scan.readout == 0);
- scan.readout = n;
- return scan.buffer[..n];
+ assert(len(scan.pending) >= n);
+ scan.start += n;
+ defer scan.pending = scan.pending[n..];
+ return scan.pending[..n];
};
// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
match (scan_readahead(scan)?) {
case io::EOF =>
return io::EOF;
@@ -159,26 +153,24 @@ export fn scan_bytes(
scan: *scanner,
delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
- scan_shift(scan);
-
- let i = 0z, nread = 0z;
+ let i = 0z;
for (true) {
- match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
+ match (bytes::index(scan.pending[i..], delim)) {
case let ix: size =>
- i = ix;
+ i += ix;
break;
case void => void;
};
match (scan_readahead(scan)?) {
case io::EOF =>
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
return io::EOF;
};
- return scan_consume(scan, scan.pending);
+ return scan_consume(scan, len(scan.pending));
case let prevpending: size =>
// No need to re-index the earlier part of the buffer
- nread = prevpending;
+ i = prevpending;
};
};
@@ -188,36 +180,27 @@ export fn scan_bytes(
case let u: []u8 =>
yield len(u);
};
- const nuser = nread + i, nconsume = nuser + ndelim;
- return scan_consume(scan, nconsume)[..nuser];
+ const nconsume = i + ndelim;
+ return scan_consume(scan, nconsume)[..i];
};
// Reads one rune from a [[scanner]].
export fn scan_rune(
scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) < 4) {
match (scan_readahead(scan)?) {
case io::EOF =>
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
return io::EOF;
};
case size => void;
};
};
- const sz = utf8::utf8sz(scan.buffer[0])?;
-
- for (scan.pending < sz) {
- match (scan_readahead(scan)?) {
- case io::EOF =>
- return utf8::invalid;
- case size => void;
- };
+ const sz = utf8::utf8sz(scan.pending[0])?;
+ if (len(scan.pending) < sz) {
+ return utf8::invalid;
};
-
const buf = scan_consume(scan, sz);
const dec = utf8::decode(buf[..sz]);
match (utf8::next(&dec)?) {
@@ -259,25 +242,27 @@ export fn scan_line(
// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
- scan_shift(scan);
- return scan.buffer[..scan.pending];
+ return scan.pending[..];
};
fn scan_unread(scan: *scanner, buf: []u8) void = {
if (len(buf) == 0) {
return;
};
- if (len(buf) <= scan.readout) {
- scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
- scan.readout -= len(buf);
+ if (len(buf) <= scan.start) {
+ const pending_end = scan.start + len(scan.pending);
+ scan.buffer[scan.start - len(buf)..scan.start] = buf;
+ scan.start -= len(buf);
+ scan.pending = scan.buffer[scan.start..pending_end];
} else {
- const n = len(buf) - scan.readout;
- assert(n < scan.maxread - scan.pending,
+ assert(len(buf) <= len(scan.buffer) - len(scan.pending),
"Attempted to unread more data than buffer has available");
- scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
- scan.pending += n;
+ // Shift buffer to the right to free space at the beginning
+ scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
+ scan.buffer[scan.start..scan.start + len(scan.pending)];
scan.buffer[..len(buf)] = buf;
- scan.readout = 0;
+ scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
+ scan.start = 0;
};
};
--
2.45.1
[hare/patches] build success
On Fri, May 24, 2024 at 09:21:41PM +0200, Max Schillinger wrote:
> From: Jose Lombera <jose@lombera.dev >
>
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
>
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
>
> Co-authored-by: Max Schillinger <max@mxsr.de >
> Co-authored-by: Lorenz (xha) <me@xha.li >
> Signed-off-by: Jose Lombera <jose@lombera.dev >
> Signed-off-by: Max Schillinger <max@mxsr.de >
this version is OK. thank you!
> ---
> bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
> 1 file changed, 66 insertions(+), 81 deletions(-)
>
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 4d6690c8..8b8b6b68 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -19,10 +19,10 @@ export type scanner = struct {
> stream: io::stream,
> src: io::handle,
> buffer: []u8,
> - // Number of bytes available in buffer
> - pending: size,
> - // Number of bytes returned to the user
> - readout: size,
> + // Index of start of pending bytes in buffer
> + start: size,
> + // Sub-slice with pending bytes in buffer
> + pending: []u8,
> // User-confirmed maximum size of read buffer
> maxread: size,
> };
> @@ -44,8 +44,8 @@ export fn newscanner(
> src = src,
> buffer = alloc([0...], BUFSZ),
> maxread = maxread,
> - pending = 0,
> - readout = 0,
> + start = 0,
> + pending = [],
> };
> };
>
> @@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
> src = src,
> buffer = buffer,
> maxread = len(buffer),
> - pending = 0,
> - readout = 0,
> + start = 0,
> + pending = [],
> };
> };
>
> -// Frees resources associated associated with a [[scanner]]. Does not close the
> -// underlying I/O handle.
> +// Frees resources associated with a [[scanner]]. Does not close the underlying
> +// I/O handle.
> export fn finish(scan: *scanner) void = {
> free(scan.buffer);
> };
> @@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
> fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> let scan = s: *scanner;
>
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> return io::EOF;
> @@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> };
> };
>
> - const n = if (len(buf) > scan.pending) scan.pending else len(buf);
> + const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
> buf[..n] = scan_consume(scan, n)[..];
> return n;
> };
> @@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> // is updated accordingly. Returns the number of bytes which had been available
> // prior to the call.
> fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> - if (scan.pending >= len(scan.buffer)) {
> - let readahead = scan.pending + BUFSZ;
> - if (readahead > scan.maxread) {
> - readahead = scan.maxread;
> - };
> - if (scan.pending >= readahead) {
> - return errors::overflow;
> + let start = scan.start;
> + const pending = len(scan.pending);
> +
> + if (start + pending == len(scan.buffer)) {
> + if (start > 0) {
> + // Shift buffer to the left to free space at the end
> + scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
> + scan.pending = scan.buffer[..pending];
> + start = 0;
> + scan.start = 0;
> + } else {
> + // Buffer is full, expand it
> + let readahead = pending + BUFSZ;
> + if (readahead > scan.maxread) {
> + readahead = scan.maxread;
> + };
> + if (pending >= readahead) {
> + return errors::overflow;
> + };
> + append(scan.buffer, [0...], readahead);
> };
> - append(scan.buffer, [0...], readahead);
> };
>
> - const prev = scan.pending;
> - match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> + match (io::read(scan.src, scan.buffer[start + pending..])?) {
> case let z: size =>
> - scan.pending += z;
> - return prev;
> + scan.pending = scan.buffer[start..start + pending + z];
> + return pending;
> case io::EOF =>
> return io::EOF;
> };
> };
>
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> - const n = scan.readout;
> - if (n == 0) {
> - return;
> - };
> - scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> - scan.readout = 0;
> - scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
> fn scan_consume(scan: *scanner, n: size) []u8 = {
> - assert(len(scan.buffer) >= n && scan.readout == 0);
> - scan.readout = n;
> - return scan.buffer[..n];
> + assert(len(scan.pending) >= n);
> + scan.start += n;
> + defer scan.pending = scan.pending[n..];
> + return scan.pending[..n];
> };
>
> // Reads one byte from a [[scanner]].
> export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> return io::EOF;
> @@ -159,26 +153,24 @@ export fn scan_bytes(
> scan: *scanner,
> delim: (u8 | []u8),
> ) ([]u8 | io::EOF | io::error) = {
> - scan_shift(scan);
> -
> - let i = 0z, nread = 0z;
> + let i = 0z;
> for (true) {
> - match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
> + match (bytes::index(scan.pending[i..], delim)) {
> case let ix: size =>
> - i = ix;
> + i += ix;
> break;
> case void => void;
> };
>
> match (scan_readahead(scan)?) {
> case io::EOF =>
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> return io::EOF;
> };
> - return scan_consume(scan, scan.pending);
> + return scan_consume(scan, len(scan.pending));
> case let prevpending: size =>
> // No need to re-index the earlier part of the buffer
> - nread = prevpending;
> + i = prevpending;
> };
> };
>
> @@ -188,36 +180,27 @@ export fn scan_bytes(
> case let u: []u8 =>
> yield len(u);
> };
> - const nuser = nread + i, nconsume = nuser + ndelim;
> - return scan_consume(scan, nconsume)[..nuser];
> + const nconsume = i + ndelim;
> + return scan_consume(scan, nconsume)[..i];
> };
>
> // Reads one rune from a [[scanner]].
> export fn scan_rune(
> scan: *scanner,
> ) (rune | io::EOF | io::error | utf8::invalid) = {
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) < 4) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> return io::EOF;
> };
> case size => void;
> };
> };
> - const sz = utf8::utf8sz(scan.buffer[0])?;
> -
> - for (scan.pending < sz) {
> - match (scan_readahead(scan)?) {
> - case io::EOF =>
> - return utf8::invalid;
> - case size => void;
> - };
> + const sz = utf8::utf8sz(scan.pending[0])?;
> + if (len(scan.pending) < sz) {
> + return utf8::invalid;
> };
> -
> const buf = scan_consume(scan, sz);
> const dec = utf8::decode(buf[..sz]);
> match (utf8::next(&dec)?) {
> @@ -259,25 +242,27 @@ export fn scan_line(
> // Returns the internal scanner buffer, which contains all bytes read ahead by
> // the scanner up to this point.
> export fn scan_buffer(scan: *scanner) []u8 = {
> - scan_shift(scan);
> - return scan.buffer[..scan.pending];
> + return scan.pending[..];
> };
>
> fn scan_unread(scan: *scanner, buf: []u8) void = {
> if (len(buf) == 0) {
> return;
> };
> - if (len(buf) <= scan.readout) {
> - scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> - scan.readout -= len(buf);
> + if (len(buf) <= scan.start) {
> + const pending_end = scan.start + len(scan.pending);
> + scan.buffer[scan.start - len(buf)..scan.start] = buf;
> + scan.start -= len(buf);
> + scan.pending = scan.buffer[scan.start..pending_end];
> } else {
> - const n = len(buf) - scan.readout;
> - assert(n < scan.maxread - scan.pending,
> + assert(len(buf) <= len(scan.buffer) - len(scan.pending),
> "Attempted to unread more data than buffer has available");
> - scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> - scan.pending += n;
> + // Shift buffer to the right to free space at the beginning
> + scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
> + scan.buffer[scan.start..scan.start + len(scan.pending)];
> scan.buffer[..len(buf)] = buf;
> - scan.readout = 0;
> + scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
> + scan.start = 0;
> };
> };
>
> --
> 2.45.1
>
On Sat May 25, 2024 at 6:22 AM CEST, Lorenz (xha) wrote:
> this version is OK. thank you!
Thank you for your fix!
Apart from that: I still don't understand how sourcehut works. My e-mail
with v3 of this patch shows up on
<https://lists.sr.ht/~sircmpwn/hare-dev > between `[PATCH hare v5]
NetBSD: port Hare to NetBSD/amd64` and `[PATCH hare v6] NetBSD: port
Hare to NetBSD/amd64`. But it's missing on the patches page:
<https://lists.sr.ht/~sircmpwn/hare-dev/patches >
Is this a bug in sourcehut? Or is something missing my e-mail header?
On Sat, May 25, 2024 at 09:37:16AM +0200, Max Schillinger wrote:
> Apart from that: I still don't understand how sourcehut works. My e-mail
> with v3 of this patch shows up on
> <https://lists.sr.ht/~sircmpwn/hare-dev > between `[PATCH hare v5]
> NetBSD: port Hare to NetBSD/amd64` and `[PATCH hare v6] NetBSD: port
> Hare to NetBSD/amd64`. But it's missing on the patches page:
> <https://lists.sr.ht/~sircmpwn/hare-dev/patches >
>
> Is this a bug in sourcehut? Or is something missing my e-mail header?
i'd guess that this is a bug in sourcehut. prehaps using a coverletter
with only a signle commit confused it.
Hi Max,
Thanks for taking this over.
I'm no longer following hare-dev, nor have time to review this patch nor
the changes you made on top of my original patch. As such, I don't feel
comfortable appearing as the author of this version (which I presume is
what will finally be merged upstream). Please add yourself as author
instead and remove my "Signed-off-by:". Maybe just mention me as
"Co-authored-by:" and reference my original patch[1] with "Link:" or
something.
Thanks,
Lombera
On 2024-05-24 21:21 UTC, Max Schillinger wrote:
> From: Jose Lombera <jose@lombera.dev >
>
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
>
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
>
> Co-authored-by: Max Schillinger <max@mxsr.de >
> Co-authored-by: Lorenz (xha) <me@xha.li >
> Signed-off-by: Jose Lombera <jose@lombera.dev >
> Signed-off-by: Max Schillinger <max@mxsr.de >
> ---
> bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
> 1 file changed, 66 insertions(+), 81 deletions(-)
>
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 4d6690c8..8b8b6b68 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -19,10 +19,10 @@ export type scanner = struct {
> stream: io::stream,
> src: io::handle,
> buffer: []u8,
> - // Number of bytes available in buffer
> - pending: size,
> - // Number of bytes returned to the user
> - readout: size,
> + // Index of start of pending bytes in buffer
> + start: size,
> + // Sub-slice with pending bytes in buffer
> + pending: []u8,
> // User-confirmed maximum size of read buffer
> maxread: size,
> };
> @@ -44,8 +44,8 @@ export fn newscanner(
> src = src,
> buffer = alloc([0...], BUFSZ),
> maxread = maxread,
> - pending = 0,
> - readout = 0,
> + start = 0,
> + pending = [],
> };
> };
>
> @@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
> src = src,
> buffer = buffer,
> maxread = len(buffer),
> - pending = 0,
> - readout = 0,
> + start = 0,
> + pending = [],
> };
> };
>
> -// Frees resources associated associated with a [[scanner]]. Does not close the
> -// underlying I/O handle.
> +// Frees resources associated with a [[scanner]]. Does not close the underlying
> +// I/O handle.
> export fn finish(scan: *scanner) void = {
> free(scan.buffer);
> };
> @@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
> fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> let scan = s: *scanner;
>
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> return io::EOF;
> @@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> };
> };
>
> - const n = if (len(buf) > scan.pending) scan.pending else len(buf);
> + const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
> buf[..n] = scan_consume(scan, n)[..];
> return n;
> };
> @@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> // is updated accordingly. Returns the number of bytes which had been available
> // prior to the call.
> fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> - if (scan.pending >= len(scan.buffer)) {
> - let readahead = scan.pending + BUFSZ;
> - if (readahead > scan.maxread) {
> - readahead = scan.maxread;
> - };
> - if (scan.pending >= readahead) {
> - return errors::overflow;
> + let start = scan.start;
> + const pending = len(scan.pending);
> +
> + if (start + pending == len(scan.buffer)) {
> + if (start > 0) {
> + // Shift buffer to the left to free space at the end
> + scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
> + scan.pending = scan.buffer[..pending];
> + start = 0;
> + scan.start = 0;
> + } else {
> + // Buffer is full, expand it
> + let readahead = pending + BUFSZ;
> + if (readahead > scan.maxread) {
> + readahead = scan.maxread;
> + };
> + if (pending >= readahead) {
> + return errors::overflow;
> + };
> + append(scan.buffer, [0...], readahead);
> };
> - append(scan.buffer, [0...], readahead);
> };
>
> - const prev = scan.pending;
> - match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> + match (io::read(scan.src, scan.buffer[start + pending..])?) {
> case let z: size =>
> - scan.pending += z;
> - return prev;
> + scan.pending = scan.buffer[start..start + pending + z];
> + return pending;
> case io::EOF =>
> return io::EOF;
> };
> };
>
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> - const n = scan.readout;
> - if (n == 0) {
> - return;
> - };
> - scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> - scan.readout = 0;
> - scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
> fn scan_consume(scan: *scanner, n: size) []u8 = {
> - assert(len(scan.buffer) >= n && scan.readout == 0);
> - scan.readout = n;
> - return scan.buffer[..n];
> + assert(len(scan.pending) >= n);
> + scan.start += n;
> + defer scan.pending = scan.pending[n..];
> + return scan.pending[..n];
> };
>
> // Reads one byte from a [[scanner]].
> export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> return io::EOF;
> @@ -159,26 +153,24 @@ export fn scan_bytes(
> scan: *scanner,
> delim: (u8 | []u8),
> ) ([]u8 | io::EOF | io::error) = {
> - scan_shift(scan);
> -
> - let i = 0z, nread = 0z;
> + let i = 0z;
> for (true) {
> - match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
> + match (bytes::index(scan.pending[i..], delim)) {
> case let ix: size =>
> - i = ix;
> + i += ix;
> break;
> case void => void;
> };
>
> match (scan_readahead(scan)?) {
> case io::EOF =>
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> return io::EOF;
> };
> - return scan_consume(scan, scan.pending);
> + return scan_consume(scan, len(scan.pending));
> case let prevpending: size =>
> // No need to re-index the earlier part of the buffer
> - nread = prevpending;
> + i = prevpending;
> };
> };
>
> @@ -188,36 +180,27 @@ export fn scan_bytes(
> case let u: []u8 =>
> yield len(u);
> };
> - const nuser = nread + i, nconsume = nuser + ndelim;
> - return scan_consume(scan, nconsume)[..nuser];
> + const nconsume = i + ndelim;
> + return scan_consume(scan, nconsume)[..i];
> };
>
> // Reads one rune from a [[scanner]].
> export fn scan_rune(
> scan: *scanner,
> ) (rune | io::EOF | io::error | utf8::invalid) = {
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) < 4) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> return io::EOF;
> };
> case size => void;
> };
> };
> - const sz = utf8::utf8sz(scan.buffer[0])?;
> -
> - for (scan.pending < sz) {
> - match (scan_readahead(scan)?) {
> - case io::EOF =>
> - return utf8::invalid;
> - case size => void;
> - };
> + const sz = utf8::utf8sz(scan.pending[0])?;
> + if (len(scan.pending) < sz) {
> + return utf8::invalid;
> };
> -
> const buf = scan_consume(scan, sz);
> const dec = utf8::decode(buf[..sz]);
> match (utf8::next(&dec)?) {
> @@ -259,25 +242,27 @@ export fn scan_line(
> // Returns the internal scanner buffer, which contains all bytes read ahead by
> // the scanner up to this point.
> export fn scan_buffer(scan: *scanner) []u8 = {
> - scan_shift(scan);
> - return scan.buffer[..scan.pending];
> + return scan.pending[..];
> };
>
> fn scan_unread(scan: *scanner, buf: []u8) void = {
> if (len(buf) == 0) {
> return;
> };
> - if (len(buf) <= scan.readout) {
> - scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> - scan.readout -= len(buf);
> + if (len(buf) <= scan.start) {
> + const pending_end = scan.start + len(scan.pending);
> + scan.buffer[scan.start - len(buf)..scan.start] = buf;
> + scan.start -= len(buf);
> + scan.pending = scan.buffer[scan.start..pending_end];
> } else {
> - const n = len(buf) - scan.readout;
> - assert(n < scan.maxread - scan.pending,
> + assert(len(buf) <= len(scan.buffer) - len(scan.pending),
> "Attempted to unread more data than buffer has available");
> - scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> - scan.pending += n;
> + // Shift buffer to the right to free space at the beginning
> + scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
> + scan.buffer[scan.start..scan.start + len(scan.pending)];
> scan.buffer[..len(buf)] = buf;
> - scan.readout = 0;
> + scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
> + scan.start = 0;
> };
> };
>
> --
> 2.45.1
On Sat, May 25, 2024 at 07:16:46PM +0000, Jose Lombera wrote:
> Hi Max,
>
> Thanks for taking this over.
>
> I'm no longer following hare-dev, nor have time to review this patch nor
> the changes you made on top of my original patch. As such, I don't feel
> comfortable appearing as the author of this version (which I presume is
> what will finally be merged upstream). Please add yourself as author
> instead and remove my "Signed-off-by:". Maybe just mention me as
> "Co-authored-by:" and reference my original patch[1] with "Link:" or
> something.
i can't obviously talk you into your doing, but you are making it
much more complicated without any good reason imo. your sign-off
is needed because *you are the author of that patch*, it's all about
copyright and not about responsibility or something.
can the v3 please be kept the way it is? it's a very good improvement.
btw, just saying, *legally*, with singing off the original patch,
you've distributed it under the MPL 2.0, which means it's fine for
Max to use your patch.
On 2024-05-26 07:14 UTC, Lorenz (xha) wrote:
> On Sat, May 25, 2024 at 07:16:46PM +0000, Jose Lombera wrote:
> > Hi Max,
> >
> > Thanks for taking this over.
> >
> > I'm no longer following hare-dev, nor have time to review this patch nor
> > the changes you made on top of my original patch. As such, I don't feel
> > comfortable appearing as the author of this version (which I presume is
> > what will finally be merged upstream). Please add yourself as author
> > instead and remove my "Signed-off-by:". Maybe just mention me as
> > "Co-authored-by:" and reference my original patch[1] with "Link:" or
> > something.
>
> i can't obviously talk you into your doing, but you are making it
> much more complicated without any good reason imo. your sign-off
> is needed because *you are the author of that patch*, it's all about
> copyright and not about responsibility or something.
>
> can the v3 please be kept the way it is? it's a very good improvement.
>
> btw, just saying, *legally*, with singing off the original patch,
> you've distributed it under the MPL 2.0, which means it's fine for
> Max to use your patch.
I don't mean anything legal by this request. I'm giving away the
ownership of the patch. You are free to use it however you want, change
it however you want and distribute it under whatever license you want.
I just don't want to appear as the author of this modified version
because it's not the version I wrote nor have time to review it and say
I'm ok with whatever changes were required on top of my patch. For the
same reason, I don't want my sign-off to appear in something I didn't
review. The only recognition I ask for, and that's optional too, is
that my original patch[1] is mentioned/referenced.
[1] https://lists.sr.ht/~sircmpwn/hare-dev/patches/39292
On Sun May 26, 2024 at 7:11 PM CEST, Jose Lombera wrote:
> I just don't want to appear as the author of this modified version
> because it's not the version I wrote nor have time to review it and
> say I'm ok with whatever changes were required on top of my patch.
No problem. I can put myself as author of this patch. (I never wanted
you to take over responsibility for my changes. I just didn't want to
take credits for your work.)
> For the same reason, I don't want my sign-off to appear in something I
> didn't review.
This still sounds like signing off means to you giving an approval. I
also understand it like Lorenz wrote: It only means giving away the
copyright.
@Lorenz: Is it a problem if somebody is listed as co-author but not
signing off? Btw, for me it's also okay if you modify the commit message
in a way that it fits the project's needs.
[PATCH hare v4] bufio: remove scanner inefficiency
From: Max Schillinger <max@mxsr.de>
The new scanner with an internal read-ahead buffer contains an
inefficiency in which the underlying buffer is shifted left every time a
token is consumed.
Fix this by delaying the shift until an actual read-ahead from the
source IO handle is made, and only if the shift is required.
Co-authored-by: Jose Lombera <jose@lombera.dev>
Co-authored-by: Lorenz (xha) <me@xha.li>
Signed-off-by: Max Schillinger <max@mxsr.de>
Signed-off-by: Lorenz (xha) <me@xha.li>
---
sure, here you go. i still don't quite understand the situation
given the signoff is just required for agreeing to the "Developer
Certificate of Origin" as described in README.md, but well, we
can also do it this way.
bufio/scanner.ha | 147 +++++++++++++++++++++ --------------------------
1 file changed, 66 insertions(+), 81 deletions(-)
diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index 4d6690c8..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -19,10 +19,10 @@ export type scanner = struct {
stream: io::stream,
src: io::handle,
buffer: []u8,
- // Number of bytes available in buffer
- pending: size,
- // Number of bytes returned to the user
- readout: size,
+ // Index of start of pending bytes in buffer
+ start: size,
+ // Sub-slice with pending bytes in buffer
+ pending: []u8,
// User-confirmed maximum size of read buffer
maxread: size,
};
@@ -44,8 +44,8 @@ export fn newscanner(
src = src,
buffer = alloc([0...], BUFSZ),
maxread = maxread,
- pending = 0,
- readout = 0,
+ start = 0,
+ pending = [],
};
};
@@ -59,13 +59,13 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
src = src,
buffer = buffer,
maxread = len(buffer),
- pending = 0,
- readout = 0,
+ start = 0,
+ pending = [],
};
};
- // Frees resources associated associated with a [[scanner]]. Does not close the
- // underlying I/O handle.
+ // Frees resources associated with a [[scanner]]. Does not close the underlying
+ // I/O handle.
export fn finish(scan: *scanner) void = {
free(scan.buffer);
};
@@ -73,10 +73,7 @@ export fn finish(scan: *scanner) void = {
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
let scan = s: *scanner;
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
match (scan_readahead(scan)?) {
case io::EOF =>
return io::EOF;
@@ -84,7 +81,7 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
};
};
- const n = if (len(buf) > scan.pending) scan.pending else len(buf);
+ const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
buf[..n] = scan_consume(scan, n)[..];
return n;
};
@@ -95,52 +92,49 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
- if (scan.pending >= len(scan.buffer)) {
- let readahead = scan.pending + BUFSZ;
- if (readahead > scan.maxread) {
- readahead = scan.maxread;
- };
- if (scan.pending >= readahead) {
- return errors::overflow;
+ let start = scan.start;
+ const pending = len(scan.pending);
+
+ if (start + pending == len(scan.buffer)) {
+ if (start > 0) {
+ // Shift buffer to the left to free space at the end
+ scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
+ scan.pending = scan.buffer[..pending];
+ start = 0;
+ scan.start = 0;
+ } else {
+ // Buffer is full, expand it
+ let readahead = pending + BUFSZ;
+ if (readahead > scan.maxread) {
+ readahead = scan.maxread;
+ };
+ if (pending >= readahead) {
+ return errors::overflow;
+ };
+ append(scan.buffer, [0...], readahead);
};
- append(scan.buffer, [0...], readahead);
};
- const prev = scan.pending;
- match (io::read(scan.src, scan.buffer[scan.pending..])?) {
+ match (io::read(scan.src, scan.buffer[start + pending..])?) {
case let z: size =>
- scan.pending += z;
- return prev;
+ scan.pending = scan.buffer[start..start + pending + z];
+ return pending;
case io::EOF =>
return io::EOF;
};
};
- // Shifts the buffer towards the start, discarding bytes which were read out.
- fn scan_shift(scan: *scanner) void = {
- const n = scan.readout;
- if (n == 0) {
- return;
- };
- scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
- scan.readout = 0;
- scan.pending -= n;
- };
-
- // Consumes N bytes from the buffer, updating scan.readout. User must call
- // [[scan_shift]] before calling scan_consume again.
+ // Consumes N bytes from the buffer.
fn scan_consume(scan: *scanner, n: size) []u8 = {
- assert(len(scan.buffer) >= n && scan.readout == 0);
- scan.readout = n;
- return scan.buffer[..n];
+ assert(len(scan.pending) >= n);
+ scan.start += n;
+ defer scan.pending = scan.pending[n..];
+ return scan.pending[..n];
};
// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
match (scan_readahead(scan)?) {
case io::EOF =>
return io::EOF;
@@ -159,26 +153,24 @@ export fn scan_bytes(
scan: *scanner,
delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
- scan_shift(scan);
-
- let i = 0z, nread = 0z;
+ let i = 0z;
for (true) {
- match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
+ match (bytes::index(scan.pending[i..], delim)) {
case let ix: size =>
- i = ix;
+ i += ix;
break;
case void => void;
};
match (scan_readahead(scan)?) {
case io::EOF =>
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
return io::EOF;
};
- return scan_consume(scan, scan.pending);
+ return scan_consume(scan, len(scan.pending));
case let prevpending: size =>
// No need to re-index the earlier part of the buffer
- nread = prevpending;
+ i = prevpending;
};
};
@@ -188,36 +180,27 @@ export fn scan_bytes(
case let u: []u8 =>
yield len(u);
};
- const nuser = nread + i, nconsume = nuser + ndelim;
- return scan_consume(scan, nconsume)[..nuser];
+ const nconsume = i + ndelim;
+ return scan_consume(scan, nconsume)[..i];
};
// Reads one rune from a [[scanner]].
export fn scan_rune(
scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
- // Consume previous read, if any
- scan_shift(scan);
-
- if (scan.pending == 0) {
+ if (len(scan.pending) < 4) {
match (scan_readahead(scan)?) {
case io::EOF =>
- if (scan.pending == 0) {
+ if (len(scan.pending) == 0) {
return io::EOF;
};
case size => void;
};
};
- const sz = utf8::utf8sz(scan.buffer[0])?;
-
- for (scan.pending < sz) {
- match (scan_readahead(scan)?) {
- case io::EOF =>
- return utf8::invalid;
- case size => void;
- };
+ const sz = utf8::utf8sz(scan.pending[0])?;
+ if (len(scan.pending) < sz) {
+ return utf8::invalid;
};
-
const buf = scan_consume(scan, sz);
const dec = utf8::decode(buf[..sz]);
match (utf8::next(&dec)?) {
@@ -259,25 +242,27 @@ export fn scan_line(
// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
- scan_shift(scan);
- return scan.buffer[..scan.pending];
+ return scan.pending[..];
};
fn scan_unread(scan: *scanner, buf: []u8) void = {
if (len(buf) == 0) {
return;
};
- if (len(buf) <= scan.readout) {
- scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
- scan.readout -= len(buf);
+ if (len(buf) <= scan.start) {
+ const pending_end = scan.start + len(scan.pending);
+ scan.buffer[scan.start - len(buf)..scan.start] = buf;
+ scan.start -= len(buf);
+ scan.pending = scan.buffer[scan.start..pending_end];
} else {
- const n = len(buf) - scan.readout;
- assert(n < scan.maxread - scan.pending,
+ assert(len(buf) <= len(scan.buffer) - len(scan.pending),
"Attempted to unread more data than buffer has available");
- scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
- scan.pending += n;
+ // Shift buffer to the right to free space at the beginning
+ scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
+ scan.buffer[scan.start..scan.start + len(scan.pending)];
scan.buffer[..len(buf)] = buf;
- scan.readout = 0;
+ scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
+ scan.start = 0;
};
};
--
2.45.1
Re: [PATCH hare v4] bufio: remove scanner inefficiency
This thread is pretty weird but as far as I can tell everything is above
board with this patch.
Thanks!
To git@git.sr.ht:~sircmpwn/hare
3ac88260..5a9d8e7d master -> master