~sircmpwn/hare-dev

harec: lex: improve location in invalid escape errors v1 APPLIED

Sebastian: 3
 lex: improve location in invalid escape errors
 Improve handling of invalid UTF-8 in rune/str literals
 lex: disallow sign in hex literal

 9 files changed, 145 insertions(+), 43 deletions(-)
#1057384 alpine.yml failed
#1057385 freebsd.yml success
#1057386 netbsd.yml success
harec/patches: FAILED in 1m39s

[lex: improve location in invalid escape errors][0] from [Sebastian][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/44669
[1]: mailto:sebastian@sebsite.pw

✗ #1057384 FAILED  harec/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1057384
✓ #1057386 SUCCESS harec/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1057386
✓ #1057385 SUCCESS harec/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1057385
'\u  aa' was also considered valid before this patch (strto* functions allow
whitespace before the number), so the commit message, comments and tests should
also reflect that.



          
          
          
        
      

      
      
      
      
      
      

      

      
      
      
      

      

      
      
      
      

      

      
      
      
      

      

      
      
      
      
      
      
      
      
      
      
    
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/hare-dev/patches/44669/mbox | git am -3
Learn more about email & git

[PATCH harec 1/3] lex: improve location in invalid escape errors Export this patch

Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 src/lex.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lex.c b/src/lex.c
index 1779e9f6..71bd416b 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -511,11 +511,13 @@ lex_rune(struct lexer *lexer)
{
	char buf[9];
	char *endptr;
	struct location loc;
	uint32_t c = next(lexer, NULL, false);
	assert(c != C_EOF);

	switch (c) {
	case '\\':
		loc = lexer->loc;
		c = next(lexer, NULL, false);
		switch (c) {
		case '0':
@@ -546,7 +548,7 @@ lex_rune(struct lexer *lexer)
			buf[2] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case 'u':
@@ -557,7 +559,7 @@ lex_rune(struct lexer *lexer)
			buf[4] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case 'U':
@@ -572,13 +574,13 @@ lex_rune(struct lexer *lexer)
			buf[8] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case C_EOF:
			error(lexer->loc, "Unexpected end of file");
		default:
			error(lexer->loc, "Invalid escape '\\%c'", c);
			error(loc, "Invalid escape '\\%c'", c);
		}
		assert(0);
	default:
-- 
2.40.1
thanks!

to git@git.sr.ht:~sircmpwn/harec
  6151bd6..a3687cb  master -> master

took 1/3, sounds like 2/3 and 3/3 need a v2

[PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals Export this patch

References: https://todo.sr.ht/~sircmpwn/hare/632
Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
Hold off on merging this until I send a spec update, I just realized right now I
forgot to do that. Specifically, I imagine the spec will need to be more
explicit about what kind of stuff is allowed in rune/string constants, and the
behavior of \x in rune/string constants will need to be changed. Actually I
should probably talk about that:

So, some context: \u is kinda weird. It behaves differently depending on where
it's used. In rune literals, it denotes the value the rune should hold, so
'\u1234' is represented as 0x1234. But in string literals, it's converted to
UTF-8. This is weird, but I feel like it's still correct.
\x is also weird. Prior to this commit, it behaved pretty much identically to
\u: in rune literals, it was used literally as the value; in string literals, it
was converted to UTF-8. Which is different from how \x behaves in other similar
languages: C, Go, Rust, and Zig all have \x denote a literal byte, whereas \u
denotes a Unicode codepoint. So I've changed \x to denote literal bytes in
strings, and, within runes, everything above \x7f is disallowed, since it's
invalid UTF-8.

 include/utf8.h        |  3 +-
 src/lex.c             | 88 ++++++++++++++++++++++++++++---------------
 src/parse.c           | 15 ++++++++
 tests/00-constants.ha | 18 ++++++++-
 tests/04-strings.ha   | 27 ++++++++++++-
 5 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/include/utf8.h b/include/utf8.h
index 2e7e4afd..727b3031 100644
--- a/include/utf8.h
+++ b/include/utf8.h
@@ -1,10 +1,11 @@
#ifndef HAREC_UTF8_H
#define HAREC_UTF8_H
#include <limits.h>
#include <stdio.h>

#define UTF8_MAX_SIZE 4

#define UTF8_INVALID 0x80
#define UTF8_INVALID UINT32_MAX

/**
 * Grabs the next UTF-8 codepoint and advances the string pointer
diff --git a/src/lex.c b/src/lex.c
index 71bd416b..336d9906 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -194,6 +194,18 @@ update_lineno(struct location *loc, uint32_t c)
	}
}

static void
append_buffer(struct lexer *lexer, const char *buf, size_t sz)
{
	if (lexer->buflen + sz >= lexer->bufsz) {
		lexer->bufsz *= 2;
		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
	}
	memcpy(lexer->buf + lexer->buflen, buf, sz);
	lexer->buflen += sz;
	lexer->buf[lexer->buflen] = '\0';
}

static uint32_t
next(struct lexer *lexer, struct location *loc, bool buffer)
{
@@ -220,13 +232,7 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
	}
	char buf[UTF8_MAX_SIZE];
	size_t sz = utf8_encode(&buf[0], c);
	if (lexer->buflen + sz >= lexer->bufsz) {
		lexer->bufsz *= 2;
		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
	}
	memcpy(lexer->buf + lexer->buflen, buf, sz);
	lexer->buflen += sz;
	lexer->buf[lexer->buflen] = '\0';
	append_buffer(lexer, buf, sz);
	return c;
}

@@ -506,8 +512,8 @@ want_int:
	consume(lexer, -1);
}

static uint32_t
lex_rune(struct lexer *lexer)
static size_t
lex_rune(struct lexer *lexer, char *out)
{
	char buf[9];
	char *endptr;
@@ -521,27 +527,38 @@ lex_rune(struct lexer *lexer)
		c = next(lexer, NULL, false);
		switch (c) {
		case '0':
			return '\0';
			out[0] = '\0';
			return 1;
		case 'a':
			return '\a';
			out[0] = '\a';
			return 1;
		case 'b':
			return '\b';
			out[0] = '\b';
			return 1;
		case 'f':
			return '\f';
			out[0] = '\f';
			return 1;
		case 'n':
			return '\n';
			out[0] = '\n';
			return 1;
		case 'r':
			return '\r';
			out[0] = '\r';
			return 1;
		case 't':
			return '\t';
			out[0] = '\t';
			return 1;
		case 'v':
			return '\v';
			out[0] = '\v';
			return 1;
		case '\\':
			return '\\';
			out[0] = '\\';
			return 1;
		case '\'':
			return '\'';
			out[0] = '\'';
			return 1;
		case '"':
			return '\"';
			out[0] = '\"';
			return 1;
		case 'x':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -550,7 +567,8 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			out[0] = c;
			return 1;
		case 'u':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -561,7 +579,7 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			return utf8_encode(out, c);
		case 'U':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -576,7 +594,7 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			return utf8_encode(out, c);
		case C_EOF:
			error(lexer->loc, "Unexpected end of file");
		default:
@@ -584,7 +602,7 @@ lex_rune(struct lexer *lexer)
		}
		assert(0);
	default:
		return c;
		return utf8_encode(out, c);
	}
	assert(0);
}
@@ -594,6 +612,7 @@ lex_string(struct lexer *lexer, struct token *out)
{
	uint32_t c = next(lexer, &out->loc, false);
	uint32_t delim;
	char buf[UTF8_MAX_SIZE + 1];

	switch (c) {
	case '"':
@@ -605,16 +624,18 @@ lex_string(struct lexer *lexer, struct token *out)
			}
			push(lexer, c, false);
			if (delim == '"') {
				push(lexer, lex_rune(lexer), false);
				size_t sz = lex_rune(lexer, buf);
				append_buffer(lexer, buf, sz);
			} else {
				next(lexer, NULL, true);
			}
			next(lexer, NULL, true);
		}
		char *buf = xcalloc(lexer->buflen + 1, 1);
		memcpy(buf, lexer->buf, lexer->buflen);
		char *s = xcalloc(lexer->buflen + 1, 1);
		memcpy(s, lexer->buf, lexer->buflen);
		out->token = T_LITERAL;
		out->storage = STORAGE_STRING;
		out->string.len = lexer->buflen;
		out->string.value = buf;
		out->string.value = s;
		consume(lexer, -1);
		return out->token;
	case '\'':
@@ -624,7 +645,14 @@ lex_string(struct lexer *lexer, struct token *out)
			error(out->loc, "Expected rune before trailing single quote");
		case '\\':
			push(lexer, c, false);
			out->rune = lex_rune(lexer);
			struct location loc = lexer->loc;
			size_t sz = lex_rune(lexer, buf);
			buf[sz] = '\0';
			const char *s = buf;
			out->rune = utf8_decode(&s);
			if (out->rune == UTF8_INVALID) {
				error(loc, "invalid UTF-8 in rune literal");
			}
			break;
		default:
			out->rune = c;
diff --git a/src/parse.c b/src/parse.c
index 4f91c8a5..5f44db85 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -12,6 +12,7 @@
#include "lex.h"
#include "parse.h"
#include "types.h"
#include "utf8.h"
#include "util.h"

static void
@@ -823,6 +824,7 @@ parse_constant(struct lexer *lexer)
		break;
	}

	struct location loc = tok.loc;
	switch (tok.storage) {
	case STORAGE_U8:
	case STORAGE_U16:
@@ -863,6 +865,19 @@ parse_constant(struct lexer *lexer)
			exp->constant.string.len += tok.string.len;
		}
		unlex(lexer, &tok);

		// check for invalid UTF-8 (possible when \x is used)
		const char *s = exp->constant.string.value;
		while (*s != '\0') {
			if (utf8_decode(&s) != UTF8_INVALID) {
				continue;
			}
			xfprintf(stderr,
				"%s:%d%d: invalid UTF-8 in string literal\n",
				sources[loc.file], loc.lineno, loc.colno);
			errline(loc);
			exit(EXIT_FAILURE);
		}
		break;
	case STORAGE_BOOL:
	case STORAGE_NULL:
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index cebd4114..e2fa9e28 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -508,7 +508,23 @@ fn basics() void = {
	let b1 = true, b2 = false;
	let p1: nullable *int = null;
	let r1 = ['x', '\x0A', '\u1234', '\0', '\a', '\b', '\f', '\n', '\r', '\t',
		'\v', '\\', '\'', '\"', '\U12345678'];
		'\v', '\\', '\'', '\"', '\U00123456', '\u0080', '\U00000080'];
	static assert('a' == '\x61');
	static assert('a' == '\u0061');
	static assert('a' == '\U00000061');
	static assert('a' == 0x61u32);
	static assert('à' == '\u00e0');
	static assert('à' == '\U000000e0');
	static assert('à' == 0xe0u32);
	assert(compile(`let r = 'abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\033';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\xc3';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x123';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u69';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U1234567';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\xah';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\uahij';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\Uahijklmn';`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
diff --git a/tests/04-strings.ha b/tests/04-strings.ha
index 6dfca7d5..a4b2bfd1 100644
--- a/tests/04-strings.ha
+++ b/tests/04-strings.ha
@@ -1,4 +1,4 @@
use rt::{compile, exited, EXIT_SUCCESS};
use rt::{compile, exited, EXIT_SUCCESS, toutf8};

fn measurements() void = {
	const x = "Hello!";
@@ -65,6 +65,19 @@ fn equality() void = {
	static assert("foo\0bar" != "foo\0foo");
};

fn escapes() void = {
	const s = "à";
	assert(s == "\xc3\xa0");
	assert(s == "\xc3" "" "\xa0");
	assert(s == "\u00e0");
	assert(s == "\U000000e0");
	const s = toutf8(s);
	assert(len(s) == 2 && s[0] == 0xc3 && s[1] == 0xa0);

	assert("\x345" == "45");
	assert("\033" == "\x0033");
};

fn raw() void = {
	assert(`hello \" world` == "hello \\\" world");
};
@@ -81,6 +94,17 @@ fn reject() void = {
			x += "fdsa";
		};
	`) as exited != EXIT_SUCCESS);

	assert(compile(`let s = "\xc3";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc3\x00";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc30";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc3" "\xc3";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xa";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u69";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U1234567";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xah";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\uahij";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\Uahijklmn";`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
@@ -88,6 +112,7 @@ export fn main() void = {
	storage();
	concat();
	equality();
	escapes();
	raw();
	reject();
};
-- 
2.40.1

[PATCH harec 3/3] lex: disallow sign in hex literal Export this patch

Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 src/lex.c             | 15 +++++++++------
 tests/00-constants.ha |  6 ++++++
 tests/04-strings.ha   |  6 ++++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/lex.c b/src/lex.c
index 336d9906..93f5f509 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -563,8 +563,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
			buf[2] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			out[0] = c;
@@ -575,8 +576,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[2] = next(lexer, NULL, false);
			buf[3] = next(lexer, NULL, false);
			buf[4] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			return utf8_encode(out, c);
@@ -590,8 +592,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[6] = next(lexer, NULL, false);
			buf[7] = next(lexer, NULL, false);
			buf[8] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			return utf8_encode(out, c);
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index e2fa9e28..5e43937c 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -525,6 +525,12 @@ fn basics() void = {
	assert(compile(`let r = '\xah';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\uahij';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\Uahijklmn';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x-a';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x+a';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u-abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u+abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U-abcdefg';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U+abcdefg';`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
diff --git a/tests/04-strings.ha b/tests/04-strings.ha
index a4b2bfd1..51b86612 100644
--- a/tests/04-strings.ha
+++ b/tests/04-strings.ha
@@ -105,6 +105,12 @@ fn reject() void = {
	assert(compile(`let s = "\xah";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\uahij";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\Uahijklmn";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\x-a";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\x+a";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u-abc";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u+abc";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U-abcdefg";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U+abcdefg";`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
-- 
2.40.1
harec/patches: FAILED in 1m39s

[lex: improve location in invalid escape errors][0] from [Sebastian][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/44669
[1]: mailto:sebastian@sebsite.pw

✗ #1057384 FAILED  harec/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1057384
✓ #1057386 SUCCESS harec/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1057386
✓ #1057385 SUCCESS harec/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1057385
'\u  aa' was also considered valid before this patch (strto* functions allow
whitespace before the number), so the commit message, comments and tests should
also reflect that.