~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
3 3

[PATCH harec] lex: allow digit separators

Details
Message ID
<20231113000544.29011-1-jturtl@pm.me>
DKIM signature
missing
Download raw message
Patch: +111 -9
Long number literals can be broken up by `_`. For example: `1_000_000`.
Added some tests too. This isn't covered by the spec yet.
---
 src/lex.c              | 42 ++++++++++++++++++-----
 tests/37-separators.ha | 75 ++++++++++++++++++++++++++++++++++++++++++
 tests/configure        |  3 +-
 3 files changed, 111 insertions(+), 9 deletions(-)
 create mode 100644 tests/37-separators.ha

diff --git a/src/lex.c b/src/lex.c
index 86ca9ac..fbdd6b1 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -236,6 +236,18 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
	return c;
}

/*
 * Reads one character of a numeric literal through next(), passing false for
 * its `buffer` argument so that this function decides what gets buffered.
 *
 * A digit separator ('_') or C_EOF is returned without being appended; any
 * other character is UTF-8 encoded and appended to the lexer's buffer with
 * append_buffer() before being returned. The caller always receives the
 * character that was read, including '_' and C_EOF.
 */
static uint32_t
next_literal(struct lexer *lexer)
{
	const uint32_t ch = next(lexer, NULL, false);

	/* Separators and end-of-input never reach the literal buffer. */
	if (ch == '_' || ch == C_EOF) {
		return ch;
	}

	char encoded[UTF8_MAX_SIZE];
	const size_t len = utf8_encode(encoded, ch);
	append_buffer(lexer, encoded, len);
	return ch;
}

static bool
isharespace(uint32_t c)
{
@@ -342,10 +354,10 @@ lex_literal(struct lexer *lexer, struct token *out)
	};

	static const char chrs[][24] = {
		[BIN] = "01",
		[OCT] = "01234567",
		[DEC] = "0123456789",
		[HEX] = "0123456789abcdefABCDEF",
		[BIN] = "_01",
		[OCT] = "_01234567",
		[DEC] = "_0123456789",
		[HEX] = "_0123456789abcdefABCDEF",
	};

	static const char matching_states[0x80][6] = {
@@ -365,8 +377,8 @@ lex_literal(struct lexer *lexer, struct token *out)
	uint32_t c = next(lexer, &out->loc, true), last = 0;
	assert(c != C_EOF && c <= 0x7F && isdigit(c));
	if (c == '0') {
		c = next(lexer, NULL, true);
		if (c <= 0x7F && isdigit(c)) {
		c = next_literal(lexer);
		if ((c <= 0x7F && isdigit(c)) || c == '_') {
			error(out->loc, "Leading zero in base 10 literal");
		} else if (c == 'b') {
			state = BIN | 1 << DIG;
@@ -381,10 +393,19 @@ lex_literal(struct lexer *lexer, struct token *out)
	}
	if (state != DEC) {
		last = c;
		c = next(lexer, NULL, true);
		c = next_literal(lexer);
		if (c == '_') {
			error(out->loc, "Separator immediately following integer base prefix");
		}
	}
	size_t exp = 0, suff = 0;
	do {
		if (last == '_' && c == '_') {
			error(out->loc, "Repeated digit separator");
		}
		if (c == '_' && state & (1 << SUFF)) {
			error(out->loc, "Separator in number suffix");
		}
		if (strchr(chrs[state & MASK], c)) {
			state &= ~(1 << DIG);
			last = c;
@@ -417,6 +438,9 @@ lex_literal(struct lexer *lexer, struct token *out)
		case 'i':
		case 'u':
		case 'z':
			if (last == '_') {
				error(out->loc, "Separator before suffix");
			}
			state |= DEC | 1 << SUFF;
			suff = lexer->buflen - 1;
			break;
@@ -428,13 +452,15 @@ lex_literal(struct lexer *lexer, struct token *out)
		}
		last = c;
		state |= 1 << DIG;
	} while ((c = next(lexer, NULL, true)) != C_EOF);
	} while ((c = next_literal(lexer)) != C_EOF);
	last = 0;
end:
	if (last && !strchr("iuz", last) && !strchr(chrs[state & MASK], last)) {
		state = oldstate;
		push(lexer, c, true);
		push(lexer, last, true);
	} else if (last == '_') {
		error(out->loc, "Trailing digit separator");
	} else if (c != C_EOF) {
want_int:
		push(lexer, c, true);
diff --git a/tests/37-separators.ha b/tests/37-separators.ha
new file mode 100644
index 0000000..117a2ed
--- /dev/null
+++ b/tests/37-separators.ha
@@ -0,0 +1,75 @@
use rt::{compile, exited, EXIT_SUCCESS};

// Checks that integer literals written with `_` digit separators compare
// equal to the same literals written without separators.
fn integers() void = {
	// Base-10 literals, with the separated form on either side of `==`.
	assert(1_000 == 1000);
	assert(1000 == 1_000);
	assert(1_000_000 == 1000000);
	// Separators between every digit, and at irregular intervals.
	assert(2_4_6_8 == 2468);
	// Hexadecimal (0x) literal with a separator between digit groups.
	assert(0xAC_AB == 0xACAB);
	assert(1_1_1_1_42069 == 111142069);
	// The unseparated operand carries a u64 suffix; the separated one does not.
	assert(0x616d6f6e677573u64 == 0x61_6d_6f_6e_67_75_73);
	// Binary (0b) literal grouped in nibbles.
	assert(0b1111_1010_0000 == 0b111110100000);
};

// Checks that floating-point literals written with `_` digit separators
// compare equal to the same literals written without separators.
fn floats() void = {
	// Separator inside the digits that precede an f32 suffix.
	assert(20f32 == 2_0f32);
	assert(1_000_000f32 == 1000000f32);
	// Separator inside the fractional digits, after the decimal point.
	assert(1.000_001 == 1.000001);
};

// Checks that misplaced digit separators are rejected. Each case passes a
// small program to compile() and asserts that the result, taken as `exited`,
// is not EXIT_SUCCESS. The comment above each case names the lexer error the
// snippet is meant to provoke.
fn bad() void = {
	// Repeated digit separator
	assert(compile("
		const problem = 1____0;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Leading zero in base 10 literal
	assert(compile("
		const problem = 0_1;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Separator immediately following integer base prefix.
	// The literal deliberately has no trailing separator, so the only thing
	// wrong with it is the `_` directly after `0x`; otherwise the case would
	// still be rejected for a different reason if the prefix check regressed.
	assert(compile("
		const problem = 0x_000F;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Trailing digit separator
	assert(compile("
		const problem = 1_;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Trailing digit separator, directly followed by an operator
	assert(compile("
		static assert(1_==1);
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Separator in number suffix
	assert(compile("
		const problem = 42069u6_4;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Separator before suffix (integer literal)
	assert(compile("
		const problem = 42069_u64;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);

	// Separator before suffix (floating-point literal)
	assert(compile("
		const problem = 1000.000_f64;
		export fn main() void = void;
	") as exited != EXIT_SUCCESS);
};

// Test entry point: runs the accepted-integer, accepted-float and
// rejected-literal checks in that order.
export fn main() void = {
	integers();
	floats();
	bad();
};

diff --git a/tests/configure b/tests/configure
index 3133fb2..4f8c47e 100644
--- a/tests/configure
+++ b/tests/configure
@@ -51,7 +51,8 @@ tests() {
		33-yield \
		34-declarations \
		35-floats \
		36-defines
		36-defines \
		37-separators
	do
		cat <<EOF
tests/$t: libhart.a testmod.a tests/$t.ha
--
2.42.0

[harec/patches] build success

builds.sr.ht <builds@sr.ht>
Details
Message ID
<CWX8WPYY5G0D.3B2RQ1F0FOBSU@cirno2>
In-Reply-To
<20231113000544.29011-1-jturtl@pm.me> (view parent)
DKIM signature
missing
Download raw message
harec/patches: SUCCESS in 1m0s

[lex: allow digit separators][0] from [jturtle][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/46648
[1]: jturtl@pm.me

✓ #1093162 SUCCESS harec/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1093162
✓ #1093164 SUCCESS harec/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1093164
✓ #1093163 SUCCESS harec/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1093163
Details
Message ID
<CWXBAV2U98CN.3KDS3XKY9SKFX@attila>
In-Reply-To
<20231113000544.29011-1-jturtl@pm.me> (view parent)
DKIM signature
missing
Download raw message
Instead of adding _ to the valid digit list, the state machine in lex_literal
should be modified to handle all the cases. Unfortunately that state machine is
not documented anywhere, but generally when modifying it, there are three
things to do:

* a new entry in the `flags` enum that denotes we just consumed a '_' should be
  added
* a new entry in the `matching_states` for '_' that describes in which states
  '_' is an acceptable character should be added
* the transition logic should be updated to do the right transitions when '_'
  is encountered

The tests belong to 00-constants, where the rest of literal parsing tests are.
The existing tests are quite thorough and I'd like to keep it that way with new
functionality, so it's great that you wrote some. I can see some more things to
be considered however - what happens with underscores in the exponent, or before
the base specifier, or around the decimal point?
Details
Message ID
<2e3906a7-a16b-4891-9e64-ac38ffa3f6ee@pm.me>
In-Reply-To
<CWXBAV2U98CN.3KDS3XKY9SKFX@attila> (view parent)
DKIM signature
missing
Download raw message
Thank you, I'll try again the suggested way and move the tests.

I didn't consider the exponents, and this patch allows separators before
and immediately after the exponent marker in base-10 and base-16, which
is probably not desired.

Underscores before the base specifier, like `0_xFF` would be caught as
"Leading zero in base 10 literal", which isn't the ideal error message
but fixing that seems like a headache.

I should have waited for more discussion around the specifics before
submitting a patch :P
Reply to thread Export thread (mbox)