It looks like v1 of this patch (#39292) was never applied.
I have rebased it to master and extended it to the later added function
scan_unread.
These are the results for Jose's mini benchmark:
master branch:
time ./newlines-master l.txt
20.01 secs
time ./newlines-master l8k.txt
59.58 secs
With this patch (v2) applied:
time ./newlines-patch l.txt
230.16 millis
time ./newlines-patch l8k.txt
238.67 millis
Jose Lombera (1):
bufio: remove scanner inefficiency
bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
1 file changed, 67 insertions(+), 80 deletions(-)
--
2.44.0
[PATCH hare v2 1/1] bufio: remove scanner inefficiency
On Sat, Apr 06, 2024 at 08:12:35AM +0200, Max Schillinger wrote:
> From: Jose Lombera <jose@lombera.dev>
>
> The new scanner with an internal read-ahead buffer contains an
> inefficiency in which the underlying buffer is shifted left every time a
> token is consumed.
>
> Fix this by delaying the shift until an actual read-ahead from the
> source IO handle is made, and only if the shift is required.
>
> Co-authored-by: Max Schillinger <max@mxsr.de>
> Signed-off-by: Jose Lombera <jose@lombera.dev>
> Signed-off-by: Max Schillinger <max@mxsr.de>
thanks for this patch, it's a huge performance improvement, which
is really nice. it took a couple of hours for me to dig through
this patch/logic and figure it out; most of this is fine, but there
are a couple of nits and one pretty serious bug.
can you please send a v3 with this addresseed?
you can find a proposed patch from me below.
> ---
> bufio/scanner.ha | 147 +++++++++++++++++++++--------------------------
> 1 file changed, 67 insertions(+), 80 deletions(-)
>
> diff --git a/bufio/scanner.ha b/bufio/scanner.ha
> index 1f156050..30f96d07 100644
> --- a/bufio/scanner.ha
> +++ b/bufio/scanner.ha
> @@ -95,52 +92,50 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
> // is updated accordingly. Returns the number of bytes which had been available
> // prior to the call.
> fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
> - if (scan.pending >= len(scan.buffer)) {
> - let readahead = scan.pending + BUFSZ;
> - if (readahead > scan.maxread) {
> - readahead = scan.maxread;
> - };
> - if (scan.pending >= readahead) {
> - return errors::overflow;
> + let start = scan.start;
> + const pending = len(scan.pending);
> + const buf_len = len(scan.buffer);
i would remove "buf_len"; it's confusing because there is (1) a
variable "buf" in this scope and (2) obscures code, len(scan.buffer)
is much more readable in the two places it's used anyways.
> +
> + if ((start + pending) >= buf_len) {
the "bigger than" part of this check does not make sense, it's
logically not possible. this should use "==". also, the extra
parentheses are not needed.
> + if (start > 0) {
> + // Shift buffer to the left to free space at the end
> + scan.buffer[..buf_len - start] = scan.buffer[start..];
> + scan.pending = scan.buffer[..pending];
> + start = 0;
> + scan.start = 0;
> + } else {
> + // Buffer is full, expand it
> + let readahead = pending + BUFSZ;
> + if (readahead > scan.maxread) {
> + readahead = scan.maxread;
> + };
> + if (pending >= readahead) {
> + return errors::overflow;
> + };
> + append(scan.buffer, [0...], readahead);
> };
> - append(scan.buffer, [0...], readahead);
> };
>
> - const prev = scan.pending;
> - match (io::read(scan.src, scan.buffer[scan.pending..])?) {
> + match (io::read(scan.src, scan.buffer[start + pending..])?) {
> case let z: size =>
> - scan.pending += z;
> - return prev;
> + scan.pending = scan.buffer[start..start + pending + z];
> + return pending;
> case io::EOF =>
> return io::EOF;
> };
> };
>
> -// Shifts the buffer towards the start, discarding bytes which were read out.
> -fn scan_shift(scan: *scanner) void = {
> - const n = scan.readout;
> - if (n == 0) {
> - return;
> - };
> - scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
> - scan.readout = 0;
> - scan.pending -= n;
> -};
> -
> -// Consumes N bytes from the buffer, updating scan.readout. User must call
> -// [[scan_shift]] before calling scan_consume again.
> +// Consumes N bytes from the buffer.
> fn scan_consume(scan: *scanner, n: size) []u8 = {
> - assert(len(scan.buffer) >= n && scan.readout == 0);
> - scan.readout = n;
> - return scan.buffer[..n];
> + assert(len(scan.pending) >= n);
> + scan.start += n;
> + defer scan.pending = scan.pending[n..];
> + return scan.pending[..n];
> };
>
> // Reads one byte from a [[scanner]].
> export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
> - // Consume previous read, if any
> - scan_shift(scan);
> -
> - if (scan.pending == 0) {
> + if (len(scan.pending) == 0) {
> match (scan_readahead(scan)?) {
> case io::EOF =>
> return io::EOF;
> @@ -259,25 +243,28 @@ export fn scan_line(
> // Returns the internal scanner buffer, which contains all bytes read ahead by
> // the scanner up to this point.
> export fn scan_buffer(scan: *scanner) []u8 = {
> - scan_shift(scan);
> - return scan.buffer[..scan.pending];
> + return scan.pending[..];
> };
>
> fn scan_unread(scan: *scanner, buf: []u8) void = {
> if (len(buf) == 0) {
> return;
> };
> - if (len(buf) <= scan.readout) {
> - scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
> - scan.readout -= len(buf);
> + if (len(buf) <= scan.start) {
> + const pending_end = scan.start + len(scan.pending);
> + scan.buffer[scan.start - len(buf)..scan.start] = buf;
> + scan.start -= len(buf);
> + scan.pending = scan.buffer[scan.start..pending_end];
> } else {
> - const n = len(buf) - scan.readout;
> - assert(n < scan.maxread - scan.pending,
> + const n = len(buf) - scan.start;
> + assert(n < scan.maxread - len(scan.pending),
> "Attempted to unread more data than buffer has available");
> + // Shift buffer to the right to free space at the beginning
> scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
> - scan.pending += n;
> scan.buffer[..len(buf)] = buf;
> - scan.readout = 0;
> + const pending_end = len(scan.pending) + len(buf);
> + scan.pending = scan.buffer[..pending_end];
> + scan.start = 0;
the logic in this else branch is completely broken; here's an example:
suppose we have a buf where len(buf) is 9. now, scan.start is 5,
scan.maxread is 15 and len(scan.pending) is 10. the first thing
that we are doing is:
const n = len(buf) - scan.start;
now n is 4. let's see what the assert is doing:
assert(n < scan.maxread - len(scan.pending),
"Attempted to unread more data than buffer has available");
the calculation is: `4 < 15 - 10` or `4 < 5`.
NOW: this passes the assert even though len(buf) is 9 and we only
have 5 spare bytes that we can unread; this will cause an overwrite of
the existing data further down below in the code!
suppose len(buf) is 5, scan.start is 0, scan.maxread is 15 and
len(scan.pending) is 10. this should theoretically pass the assert,
right? no, because:
const n = len(buf) - scan.start;
here n is 5.
assert(n < scan.maxread - len(scan.pending),
"Attempted to unread more data than buffer has available");
the calculation is: `5 < 15 - 10`, or `5 < 5`
NOW: this will cause the assertion to fail, even though the inputs to the
function are logically correct and should not cause it to fail.
apart from these issues, another issue here is that this code assumes
that len(scan.buffer) == scan.maxread, which is wrong and will also cause
an out-of-bounds write below in some cases (in almost all of them with
the default value of scan.maxread).
> };
> };
>
> --
> 2.44.0
>
diff --git a/bufio/scanner.ha b/bufio/scanner.ha
index ff63784b..8b8b6b68 100644
--- a/bufio/scanner.ha
@@ -94,12 +94,11 @@ fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
let start = scan.start;
const pending = len(scan.pending);
- const buf_len = len(scan.buffer);- if ((start + pending) >= buf_len) {+ if (start + pending == len(scan.buffer)) { if (start > 0) {
// Shift buffer to the left to free space at the end
- scan.buffer[..buf_len - start] = scan.buffer[start..];+ scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..]; scan.pending = scan.buffer[..pending];
start = 0;
scan.start = 0;
@@ -256,14 +255,13 @@ fn scan_unread(scan: *scanner, buf: []u8) void = {
scan.start -= len(buf);
scan.pending = scan.buffer[scan.start..pending_end];
} else {
- const n = len(buf) - scan.start;- assert(n < scan.maxread - len(scan.pending),+ assert(len(buf) <= len(scan.buffer) - len(scan.pending), "Attempted to unread more data than buffer has available");
// Shift buffer to the right to free space at the beginning
- scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];+ scan.buffer[len(buf)..len(buf) + len(scan.pending)] =+ scan.buffer[scan.start..scan.start + len(scan.pending)]; scan.buffer[..len(buf)] = buf;
- const pending_end = len(scan.pending) + len(buf);- scan.pending = scan.buffer[..pending_end];+ scan.pending = scan.buffer[..len(scan.pending) + len(buf)]; scan.start = 0;
};
};
Re: [PATCH hare v2 1/1] bufio: remove scanner inefficiency
On Thu May 23, 2024 at 7:59 PM CEST, Lorenz (xha) wrote:
> i would remove "buf_len"; it's confusing because there is (1) a> variable "buf" in this scope and (2) obscures code, len(scan.buffer)> is much more readable in the two places it's used anyways.>> > +> > + if ((start + pending) >= buf_len) {
Thank your for his hint. I think it's the C way trying not to call the
same [str]len twice. :-)
> the logic in this else branch is completely broken; here's an example:
Yesterday I forgot to say sorry for this bug and thank you for your fix!
Re: [PATCH hare v2 1/1] bufio: remove scanner inefficiency
On Sat, May 25, 2024 at 09:26:53AM +0200, Max Schillinger wrote:
> > the logic in this else branch is completely broken; here's an example:> > Yesterday I forgot to say sorry for this bug and thank you for your fix!
no problem, everyone makes mistakes and that is also what code review
is for! :)