Allows the separator byte '_' to be placed between digits.
Signed-off-by: jturtle <jturtl@pm.me>
---
Implements the changes suggested for v1, plus more tests covering all the edge
cases I could think of. Separators may be placed between any two valid
digits, but not within type suffixes (like '1u3_2').
src/lex.c | 38 ++++++++++++++++++++++++++++++++------
tests/00-constants.ha | 36 +++++++++++++++++++++++++++++++++++-
2 files changed, 67 insertions(+), 7 deletions(-)
diff --git a/src/lex.c b/src/lex.c
index 86ca9ac..ee5611d 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -236,6 +236,19 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
return c;
}
+static uint32_t
+next_literal(struct lexer *lexer)
+{
+ uint32_t c = next(lexer, NULL, false);
+ // Read unbuffered: C_EOF and '_' separators are returned as-is and never appended; any other character is encoded via utf8_encode and passed to append_buffer
+ if (c == C_EOF || c == '_')
+ return c;
+ char buf[UTF8_MAX_SIZE];
+ size_t sz = utf8_encode(&buf[0], c);
+ append_buffer(lexer, buf, sz);
+ return c;
+}
+
static bool
isharespace(uint32_t c)
{
@@ -338,7 +351,7 @@ lex_literal(struct lexer *lexer, struct token *out)
};
static_assert((BIN | OCT | HEX | DEC) == DEC, "DEC bits must be a superset of all other bases");
enum flags {
- FLT = 3, EXP, SUFF, DIG,
+ FLT = 3, EXP, SUFF, DIG, SEP,
};
static const char chrs[][24] = {
@@ -348,7 +361,7 @@ lex_literal(struct lexer *lexer, struct token *out)
[HEX] = "0123456789abcdefABCDEF",
};
- static const char matching_states[0x80][6] = {
+ static const char matching_states[0x80][10] = {
['.'] = {DEC, HEX, 0},
['e'] = {DEC, DEC | 1<<FLT, 0},
['E'] = {DEC, DEC | 1<<FLT, 0},
@@ -360,13 +373,15 @@ lex_literal(struct lexer *lexer, struct token *out)
['u'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0},
['z'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0},
['f'] = {DEC, DEC | 1<<FLT, DEC | 1<<EXP, DEC | 1<<FLT | 1<<EXP, 0},
+ ['_'] = {BIN, OCT, HEX, DEC, DEC | 1<<FLT, HEX | 1<<FLT,
+ DEC | 1<<EXP, HEX | 1<<EXP, DEC | 1<<FLT | 1<<EXP, 0},
};
int state = DEC, base = 10, oldstate = DEC;
uint32_t c = next(lexer, &out->loc, true), last = 0;
assert(c != C_EOF && c <= 0x7F && isdigit(c));
if (c == '0') {
c = next(lexer, NULL, true);
- if (c <= 0x7F && isdigit(c)) {
+ if ((c <= 0x7F && isdigit(c)) || c == '_') {
error(out->loc, "Leading zero in base 10 literal");
} else if (c == 'b') {
state = BIN | 1 << DIG;
@@ -381,14 +396,16 @@ lex_literal(struct lexer *lexer, struct token *out)
}
if (state != DEC) {
last = c;
- c = next(lexer, NULL, true);
+ c = next_literal(lexer);
}
size_t exp = 0, suff = 0;
do {
if (strchr(chrs[state & MASK], c)) {
- state &= ~(1 << DIG);
+ state &= ~(1 << DIG | 1 << SEP);
last = c;
continue;
+ } else if (state & 1 << SEP) {
+ error(out->loc, "Expected digit after separator");
} else if (c > 0x7f || !strchr(matching_states[c], state)) {
goto end;
}
@@ -420,15 +437,24 @@ lex_literal(struct lexer *lexer, struct token *out)
state |= DEC | 1 << SUFF;
suff = lexer->buflen - 1;
break;
+ case '_':
+ if (!strchr(chrs[state & MASK], last)) {
+ error(out->loc, "Expected digit before separator");
+ }
+ state |= 1 << SEP;
+ break;
default:
goto end;
}
+ if (c != '_') {
+ state &= ~(1 << SEP);
+ }
if (state & 1 << FLT && lexer->require_int) {
error(out->loc, "Expected integer literal");
}
last = c;
state |= 1 << DIG;
- } while ((c = next(lexer, NULL, true)) != C_EOF);
+ } while ((c = next_literal(lexer)) != C_EOF);
last = 0;
end:
if (last && !strchr("iuz", last) && !strchr(chrs[state & MASK], last)) {
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index 1712155..cfe7af2 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -260,7 +260,8 @@ fn numeric() void = {
0.0e01, 0.0e+01, 0.0e+00, 0.0e-00, 0e-0, 0e-00, 0e-1, 0e-01,
0x0p0, 0x0p1, 0x0p-1, 0x0p+1,
0x0.0p0, 0x0.00p0, 0x0.0p1, 0x0.00p1, 0x0.0p+0, 0x0.0p+1, 0x0.0p-0, 0x0.0p00,
- 0x0.0p01, 0x0.0p+01, 0x0.0p+00, 0x0.0p-00, 0x0p-0, 0x0p-00, 0x0p-1, 0x0p-01];
+ 0x0.0p01, 0x0.0p+01, 0x0.0p+00, 0x0.0p-00, 0x0p-0, 0x0p-00, 0x0p-1, 0x0p-01,
+ 0.00_00];
for (let j = 0z; j < len(f); j+= 1) {
assert(f[j] == 0.0);
};
@@ -295,6 +296,25 @@ fn numeric() void = {
assert(0x0P0 == 0.0);
assert(0E0 == 0);
+ // separators
+ assert(1_000 == 1000);
+ assert(1_000_000 == 1000000);
+ assert(1_0 == 10);
+ assert(0xAB_CD == 0xABCD);
+ assert(0b1_0_0_1 == 0b1001);
+ assert(0o542_11 == 0o54211);
+ assert(1_6e2 == 16e2);
+ assert(2e1_6 == 2e16);
+ assert(0x2p1_0 == 0x2p10);
+ assert(1_000u32 == 1000u32);
+ assert(0x1B_AD_C0_DEu32 == 0x1BADC0DE);
+ assert(1_000.0f32 == 1000f32);
+ assert(0.00_01 == 0.0001);
+ assert(1_00.00_1 == 100.001);
+ assert(1_6.0e2 == 16.0e2);
+ assert(1_6e-2 == 16e-2);
+ assert(2e-1_0 == 2e-10);
+
// double tuple subscript special case
let tup = (('a', 'b'), 'c');
assert(tup.0.0 == 'a');
@@ -330,6 +350,10 @@ fn numeric() void = {
assert(tup.0.0x0 == 'a');
assert(tup.0x0.0x0 == 'a');
+ // tuple with separator
+ let tup = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k');
+ assert(tup.1_0 == 'k');
+
// zero with large exponent
assert(0e10000000 == 0);
assert(0e010000000 == 0);
@@ -397,6 +421,7 @@ fn numeric() void = {
"05p3", "00000010p3", "00.0p3", "01.0p3",
"05p+3", "00000010p+3", "00.0p+3", "01.0p+3",
"05p-3", "00000010p-3", "00.0p-3", "01.0p-3",
+ "0_10",
// invalid sequences of special characters
"1.",
@@ -479,6 +504,11 @@ fn numeric() void = {
"1.p-1", "1p-.1",
"1.1p-", "1p-1.",
"1p-1.1",
+
+ "1_", "100_", "1_000_",
+ "1__0", "1__000_0", "1_000__0", "1___0",
+ "2e_8", "2_e8", "2e8_", "3e1__1", "2e+_5", "2e_+5",
+ "0x_FFFF", "0b_1010", "0b1111_0000_", "0o6__6",
];
let extra: [_]str = [
"let t = 4e-0i;", "let t = 4e-1i;",
@@ -493,6 +523,10 @@ fn numeric() void = {
// exponent overflow
"let t: u64 = 1e1000;",
+
+ "let t = 100u3_2;",
+ "let t = 100u32_;",
+ "let t = 100u_32;"
];
let suffix = [";", "i;", "i8;", "f32;"];
let buf: [256]u8 = [0...];
--
2.42.1