~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
2 2

[PATCH harec v2] lex: allow digit separators

Details
Message ID
<20231122063628.123449-1-jturtl@pm.me>
DKIM signature
missing
Download raw message
Patch: +67 -7
Allows the separator byte '_' to be placed between digits.

Signed-off-by: jturtle <jturtl@pm.me>
---
Implements suggested changes from v1, plus more tests to handle all the edge
cases I could think of. Separators may be placed between two valid
digits, except in type suffixes (like '1u3_2').

 src/lex.c             | 38 ++++++++++++++++++++++++++++++++------
 tests/00-constants.ha | 36 +++++++++++++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/src/lex.c b/src/lex.c
index 86ca9ac..ee5611d 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -236,6 +236,19 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
	return c;
}

/*
 * Reads the next character of a numeric literal via next() with buffering
 * disabled. C_EOF and the digit separator '_' are returned as-is without
 * being handed to append_buffer(); any other character is UTF-8 encoded and
 * appended before it is returned.
 */
static uint32_t
next_literal(struct lexer *lexer)
{
	const uint32_t ch = next(lexer, NULL, false);
	if (ch != C_EOF && ch != '_') {
		// Separators are left out so they are not included when the
		// number is parsed.
		char encoded[UTF8_MAX_SIZE];
		const size_t len = utf8_encode(encoded, ch);
		append_buffer(lexer, encoded, len);
	}
	return ch;
}

static bool
isharespace(uint32_t c)
{
@@ -338,7 +351,7 @@ lex_literal(struct lexer *lexer, struct token *out)
	};
	static_assert((BIN | OCT | HEX | DEC) == DEC, "DEC bits must be a superset of all other bases");
	enum flags {
		FLT = 3, EXP, SUFF, DIG,
		FLT = 3, EXP, SUFF, DIG, SEP,
	};

	static const char chrs[][24] = {
@@ -348,7 +361,7 @@ lex_literal(struct lexer *lexer, struct token *out)
		[HEX] = "0123456789abcdefABCDEF",
	};

	static const char matching_states[0x80][6] = {
	static const char matching_states[0x80][10] = {
		['.'] = {DEC, HEX, 0},
		['e'] = {DEC, DEC | 1<<FLT, 0},
		['E'] = {DEC, DEC | 1<<FLT, 0},
@@ -360,13 +373,15 @@ lex_literal(struct lexer *lexer, struct token *out)
		['u'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0},
		['z'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0},
		['f'] = {DEC, DEC | 1<<FLT, DEC | 1<<EXP, DEC | 1<<FLT | 1<<EXP, 0},
		['_'] = {BIN, OCT, HEX, DEC, DEC | 1<<FLT, HEX | 1<<FLT,
			DEC | 1<<EXP, HEX | 1<<EXP, DEC | 1<<FLT | 1<<EXP, 0},
	};
	int state = DEC, base = 10, oldstate = DEC;
	uint32_t c = next(lexer, &out->loc, true), last = 0;
	assert(c != C_EOF && c <= 0x7F && isdigit(c));
	if (c == '0') {
		c = next(lexer, NULL, true);
		if (c <= 0x7F && isdigit(c)) {
		if ((c <= 0x7F && isdigit(c)) || c == '_') {
			error(out->loc, "Leading zero in base 10 literal");
		} else if (c == 'b') {
			state = BIN | 1 << DIG;
@@ -381,14 +396,16 @@ lex_literal(struct lexer *lexer, struct token *out)
	}
	if (state != DEC) {
		last = c;
		c = next(lexer, NULL, true);
		c = next_literal(lexer);
	}
	size_t exp = 0, suff = 0;
	do {
		if (strchr(chrs[state & MASK], c)) {
			state &= ~(1 << DIG);
			state &= ~(1 << DIG | 1 << SEP);
			last = c;
			continue;
		} else if (state & 1 << SEP) {
			error(out->loc, "Expected digit after separator");
		} else if (c > 0x7f || !strchr(matching_states[c], state)) {
			goto end;
		}
@@ -420,15 +437,24 @@ lex_literal(struct lexer *lexer, struct token *out)
			state |= DEC | 1 << SUFF;
			suff = lexer->buflen - 1;
			break;
		case '_':
			if (!strchr(chrs[state & MASK], last)) {
				error(out->loc, "Expected digit before separator");
			}
			state |= 1 << SEP;
			break;
		default:
			goto end;
		}
		if (c != '_') {
			state &= ~(1 << SEP);
		}
		if (state & 1 << FLT && lexer->require_int) {
			error(out->loc, "Expected integer literal");
		}
		last = c;
		state |= 1 << DIG;
	} while ((c = next(lexer, NULL, true)) != C_EOF);
	} while ((c = next_literal(lexer)) != C_EOF);
	last = 0;
end:
	if (last && !strchr("iuz", last) && !strchr(chrs[state & MASK], last)) {
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index 1712155..cfe7af2 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -260,7 +260,8 @@ fn numeric() void = {
		0.0e01, 0.0e+01, 0.0e+00, 0.0e-00, 0e-0, 0e-00, 0e-1, 0e-01,
		0x0p0, 0x0p1, 0x0p-1, 0x0p+1,
		0x0.0p0, 0x0.00p0, 0x0.0p1, 0x0.00p1, 0x0.0p+0, 0x0.0p+1, 0x0.0p-0, 0x0.0p00,
		0x0.0p01, 0x0.0p+01, 0x0.0p+00, 0x0.0p-00, 0x0p-0, 0x0p-00, 0x0p-1, 0x0p-01];
		0x0.0p01, 0x0.0p+01, 0x0.0p+00, 0x0.0p-00, 0x0p-0, 0x0p-00, 0x0p-1, 0x0p-01,
		0.00_00];
	for (let j = 0z; j < len(f); j+= 1) {
		assert(f[j] == 0.0);
	};
@@ -295,6 +296,25 @@ fn numeric() void = {
	assert(0x0P0 == 0.0);
	assert(0E0 == 0);

	// separators
	assert(1_000 == 1000);
	assert(1_000_000 == 1000000);
	assert(1_0 == 10);
	assert(0xAB_CD == 0xABCD);
	assert(0b1_0_0_1 == 0b1001);
	assert(0o542_11 == 0o54211);
	assert(1_6e2 == 16e2);
	assert(2e1_6 == 2e16);
	assert(0x2p1_0 == 0x2p10);
	assert(1_000u32 == 1000u32);
	assert(0x1B_AD_C0_DEu32 == 0x1BADC0DE);
	assert(1_000.0f32 == 1000f32);
	assert(0.00_01 == 0.0001);
	assert(1_00.00_1 == 100.001);
	assert(1_6.0e2 == 16.0e2);
	assert(1_6e-2 == 16e-2);
	assert(2e-1_0 == 2e-10);

	// double tuple subscript special case
	let tup = (('a', 'b'), 'c');
	assert(tup.0.0 == 'a');
@@ -330,6 +350,10 @@ fn numeric() void = {
	assert(tup.0.0x0 == 'a');
	assert(tup.0x0.0x0 == 'a');

	// tuple with separator
	let tup = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k');
	assert(tup.1_0 == 'k');

	// zero with large exponent
	assert(0e10000000 == 0);
	assert(0e010000000 == 0);
@@ -397,6 +421,7 @@ fn numeric() void = {
		"05p3", "00000010p3", "00.0p3", "01.0p3",
		"05p+3", "00000010p+3", "00.0p+3", "01.0p+3",
		"05p-3", "00000010p-3", "00.0p-3", "01.0p-3",
		"0_10",

		// invalid sequences of special characters
		"1.",
@@ -479,6 +504,11 @@ fn numeric() void = {
		"1.p-1", "1p-.1",
		"1.1p-", "1p-1.",
		"1p-1.1",

		"1_", "100_", "1_000_",
		"1__0", "1__000_0", "1_000__0", "1___0",
		"2e_8", "2_e8", "2e8_", "3e1__1", "2e+_5", "2e_+5",
		"0x_FFFF", "0b_1010", "0b1111_0000_", "0o6__6",
	];
	let extra: [_]str = [
		"let t = 4e-0i;", "let t = 4e-1i;",
@@ -493,6 +523,10 @@ fn numeric() void = {

		// exponent overflow
		"let t: u64 = 1e1000;",

		"let t = 100u3_2;",
		"let t = 100u32_;",
		"let t = 100u_32;"
	];
	let suffix = [";", "i;", "i8;", "f32;"];
	let buf: [256]u8 = [0...];
--
2.42.1

[harec/patches] build success

builds.sr.ht <builds@sr.ht>
Details
Message ID
<CX54TZXTSPRY.22TU914NRWFX6@cirno2>
In-Reply-To
<20231122063628.123449-1-jturtl@pm.me> (view parent)
DKIM signature
missing
Download raw message
harec/patches: SUCCESS in 1m17s

[lex: allow digit separators][0] v2 from [jturtle][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/46898
[1]: jturtl@pm.me

✓ #1098599 SUCCESS harec/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1098599
✓ #1098597 SUCCESS harec/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1098597
✓ #1098598 SUCCESS harec/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1098598

Re: [harec/patches] build success

Details
Message ID
<d1bab212-b1e7-44a7-8b63-9151ae6656bf@fentker.eu>
In-Reply-To
<CX54TZXTSPRY.22TU914NRWFX6@cirno2> (view parent)
DKIM signature
missing
Download raw message
nice,

 >(c <= 0x7F && isdigit(c)) || c == '_'
should be `c <= 0x7F && (isdigit(c) || c == '_')`

 >if (!strchr(chrs[state & MASK], last))
this is redundant because `_` doesn't match after non-digits (DIG)

 >state &= ~(1 << SEP);
this is redundant as well; it's already done in the
`if (strchr(chrs[state & MASK], c))` clause

 >next_literal
this is probably a bad idea because of code duplication? you can just 
use `consume(lexer, 1);` to discard the underscore in the appropriate 
switch clause

 >HEX | 1<<EXP
exponents are always decimal, so this one can be removed

if this makes it, hare::lex and the spec need to be updated as well
Reply to thread Export thread (mbox)