Long number literals can be broken up by `_`. For example: `1_000_000`.
Added some tests too. This isn't covered by the spec yet.
---
src/lex.c | 42 ++++++++++++++++++-----
tests/37-separators.ha | 75 ++++++++++++++++++++++++++++++++++++++++++
tests/configure | 3 +-
3 files changed, 111 insertions(+), 9 deletions(-)
create mode 100644 tests/37-separators.ha
diff --git a/src/lex.c b/src/lex.c
index 86ca9ac..fbdd6b1 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -236,6 +236,18 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
return c;
}
+static uint32_t
+next_literal(struct lexer *lexer)
+{
+ uint32_t c = next(lexer, NULL, false);
+ if (c != '_' && c != C_EOF) {
+ char buf[UTF8_MAX_SIZE];
+ size_t sz = utf8_encode(&buf[0], c);
+ append_buffer(lexer, buf, sz);
+ }
+ return c;
+}
+
static bool
isharespace(uint32_t c)
{
@@ -342,10 +354,10 @@ lex_literal(struct lexer *lexer, struct token *out)
};
static const char chrs[][24] = {
- [BIN] = "01",
- [OCT] = "01234567",
- [DEC] = "0123456789",
- [HEX] = "0123456789abcdefABCDEF",
+ [BIN] = "_01",
+ [OCT] = "_01234567",
+ [DEC] = "_0123456789",
+ [HEX] = "_0123456789abcdefABCDEF",
};
static const char matching_states[0x80][6] = {
@@ -365,8 +377,8 @@ lex_literal(struct lexer *lexer, struct token *out)
uint32_t c = next(lexer, &out->loc, true), last = 0;
assert(c != C_EOF && c <= 0x7F && isdigit(c));
if (c == '0') {
- c = next(lexer, NULL, true);
- if (c <= 0x7F && isdigit(c)) {
+ c = next_literal(lexer);
+ if ((c <= 0x7F && isdigit(c)) || c == '_') {
error(out->loc, "Leading zero in base 10 literal");
} else if (c == 'b') {
state = BIN | 1 << DIG;
@@ -381,10 +393,19 @@ lex_literal(struct lexer *lexer, struct token *out)
}
if (state != DEC) {
last = c;
- c = next(lexer, NULL, true);
+ c = next_literal(lexer);
+ if (c == '_') {
+ error(out->loc, "Separator immediately following integer base prefix");
+ }
}
size_t exp = 0, suff = 0;
do {
+ if (last == '_' && c == '_') {
+ error(out->loc, "Repeated digit separator");
+ }
+ if (c == '_' && state & (1 << SUFF)) {
+ error(out->loc, "Separator in number suffix");
+ }
if (strchr(chrs[state & MASK], c)) {
state &= ~(1 << DIG);
last = c;
@@ -417,6 +438,9 @@ lex_literal(struct lexer *lexer, struct token *out)
case 'i':
case 'u':
case 'z':
+ if (last == '_') {
+ error(out->loc, "Separator before suffix");
+ }
state |= DEC | 1 << SUFF;
suff = lexer->buflen - 1;
break;
@@ -428,13 +452,15 @@ lex_literal(struct lexer *lexer, struct token *out)
}
last = c;
state |= 1 << DIG;
- } while ((c = next(lexer, NULL, true)) != C_EOF);
+ } while ((c = next_literal(lexer)) != C_EOF);
last = 0;
end:
if (last && !strchr("iuz", last) && !strchr(chrs[state & MASK], last)) {
state = oldstate;
push(lexer, c, true);
push(lexer, last, true);
+ } else if (last == '_') {
+ error(out->loc, "Trailing digit separator");
} else if (c != C_EOF) {
want_int:
push(lexer, c, true);
diff --git a/tests/37-separators.ha b/tests/37-separators.ha
new file mode 100644
index 0000000..117a2ed
--- /dev/null
+++ b/tests/37-separators.ha
@@ -0,0 +1,75 @@
+use rt::{compile, exited, EXIT_SUCCESS};
+
+fn integers() void = {
+ assert(1_000 == 1000);
+ assert(1000 == 1_000);
+ assert(1_000_000 == 1000000);
+ assert(2_4_6_8 == 2468);
+ assert(0xAC_AB == 0xACAB);
+ assert(1_1_1_1_42069 == 111142069);
+ assert(0x616d6f6e677573u64 == 0x61_6d_6f_6e_67_75_73);
+ assert(0b1111_1010_0000 == 0b111110100000);
+};
+
+fn floats() void = {
+ assert(20f32 == 2_0f32);
+ assert(1_000_000f32 == 1000000f32);
+ assert(1.000_001 == 1.000001);
+};
+
+fn bad() void = {
+ // Repeated digit separator
+ assert(compile("
+ const problem = 1____0;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Leading zero in base 10 literal
+ assert(compile("
+ const problem = 0_1;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Separator immediately following integer base prefix
+ assert(compile("
+ const problem = 0x_000F_;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Trailing digit separator
+ assert(compile("
+ const problem = 1_;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Trailing digit separator
+ assert(compile("
+ static assert(1_==1);
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Separator in number suffix
+ assert(compile("
+ const problem = 42069u6_4;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Separator before suffix
+ assert(compile("
+ const problem = 42069_u64;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+
+ // Separator before suffix
+ assert(compile("
+ const problem = 1000.000_f64;
+ export fn main() void = void;
+ ") as exited != EXIT_SUCCESS);
+};
+
+export fn main() void = {
+ integers();
+ floats();
+ bad();
+};
+
diff --git a/tests/configure b/tests/configure
index 3133fb2..4f8c47e 100644
--- a/tests/configure
+++ b/tests/configure
@@ -51,7 +51,8 @@ tests() {
33-yield \
34-declarations \
35-floats \
- 36-defines
+ 36-defines \
+ 37-separators
do
cat <<EOF
tests/$t: libhart.a testmod.a tests/$t.ha
--
2.42.0
Instead of adding `_` to the valid digit lists, the state machine in lex_literal
should be modified to handle all the cases. Unfortunately that state machine is
not documented anywhere, but generally when modifying it, there are three
things to do:
* add a new entry to the `flags` enum that denotes we just consumed a '_'
* add a new entry to `matching_states` for '_' that describes in which states
'_' is an acceptable character
* update the transition logic to do the right transitions when '_' is
encountered
The tests belong in 00-constants, where the rest of the literal parsing tests
are. The existing tests are quite thorough and I'd like to keep it that way with
new functionality, so it's great that you wrote some. I can see some more things
to be considered, however: what happens with underscores in the exponent, or
before the base specifier, or around the decimal point?
Thank you, I'll try again the suggested way and move the tests.
I didn't consider the exponents, and this patch allows separators before
and immediately after the exponent marker in base-10 and base-16, which
is probably not desired.
Underscores before the base specifier, like `0_xFF`, would be caught as
"Leading zero in base 10 literal", which isn't the ideal error message
but fixing that seems like a headache.
I should have waited for more discussion around the specifics before
submitting a patch :P