~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
11 4

[PATCH harec 1/3] lex: improve location in invalid escape errors

Details
Message ID
<20230914040426.30728-1-sebastian@sebsite.pw>
DKIM signature
missing
Download raw message
Patch: +6 -4
Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 src/lex.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lex.c b/src/lex.c
index 1779e9f6..71bd416b 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -511,11 +511,13 @@ lex_rune(struct lexer *lexer)
{
	char buf[9];
	char *endptr;
	struct location loc;
	uint32_t c = next(lexer, NULL, false);
	assert(c != C_EOF);

	switch (c) {
	case '\\':
		loc = lexer->loc;
		c = next(lexer, NULL, false);
		switch (c) {
		case '0':
@@ -546,7 +548,7 @@ lex_rune(struct lexer *lexer)
			buf[2] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case 'u':
@@ -557,7 +559,7 @@ lex_rune(struct lexer *lexer)
			buf[4] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case 'U':
@@ -572,13 +574,13 @@ lex_rune(struct lexer *lexer)
			buf[8] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
				error(lexer->loc, "Invalid hex literal");
				error(loc, "Invalid hex literal");
			}
			return c;
		case C_EOF:
			error(lexer->loc, "Unexpected end of file");
		default:
			error(lexer->loc, "Invalid escape '\\%c'", c);
			error(loc, "Invalid escape '\\%c'", c);
		}
		assert(0);
	default:
-- 
2.40.1

[PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals

Details
Message ID
<20230914040426.30728-2-sebastian@sebsite.pw>
In-Reply-To
<20230914040426.30728-1-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
Patch: +118 -33
References: https://todo.sr.ht/~sircmpwn/hare/632
Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
Hold off on merging this until I send a spec update, I just realized right now I
forgot to do that. Specifically, I imagine the spec will need to be more
explicit about what kind of stuff is allowed in rune/string constants, and the
behavior of \x in rune/string constants will need to be changed. Actually I
should probably talk about that:

So, some context: \u is kinda weird. It behaves differently depending on where
it's used. In rune literals, it denotes the value the rune should hold, so
'\u1234' is represented as 0x1234. But in string literals, it's converted to
UTF-8. This is weird, but I feel like it's still correct.
\x is also weird. Prior to this commit, it behaved pretty much identically to
\u: in rune literals, it was used literally as the value; in string literals, it
was converted to UTF-8. Which is different from how \x behaves in other similar
languages: C, Go, Rust, and Zig all have \x denote a literal byte, whereas \u
denotes a Unicode codepoint. So I've changed \x to denote literal bytes in
strings, and, within runes, everything above \x7f is disallowed, since it's
invalid UTF-8.

 include/utf8.h        |  3 +-
 src/lex.c             | 88 ++++++++++++++++++++++++++++---------------
 src/parse.c           | 15 ++++++++
 tests/00-constants.ha | 18 ++++++++-
 tests/04-strings.ha   | 27 ++++++++++++-
 5 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/include/utf8.h b/include/utf8.h
index 2e7e4afd..727b3031 100644
--- a/include/utf8.h
+++ b/include/utf8.h
@@ -1,10 +1,11 @@
#ifndef HAREC_UTF8_H
#define HAREC_UTF8_H
#include <limits.h>
#include <stdio.h>

#define UTF8_MAX_SIZE 4

#define UTF8_INVALID 0x80
#define UTF8_INVALID UINT32_MAX

/**
 * Grabs the next UTF-8 codepoint and advances the string pointer
diff --git a/src/lex.c b/src/lex.c
index 71bd416b..336d9906 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -194,6 +194,18 @@ update_lineno(struct location *loc, uint32_t c)
	}
}

static void
append_buffer(struct lexer *lexer, const char *buf, size_t sz)
{
	if (lexer->buflen + sz >= lexer->bufsz) {
		lexer->bufsz *= 2;
		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
	}
	memcpy(lexer->buf + lexer->buflen, buf, sz);
	lexer->buflen += sz;
	lexer->buf[lexer->buflen] = '\0';
}

static uint32_t
next(struct lexer *lexer, struct location *loc, bool buffer)
{
@@ -220,13 +232,7 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
	}
	char buf[UTF8_MAX_SIZE];
	size_t sz = utf8_encode(&buf[0], c);
	if (lexer->buflen + sz >= lexer->bufsz) {
		lexer->bufsz *= 2;
		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
	}
	memcpy(lexer->buf + lexer->buflen, buf, sz);
	lexer->buflen += sz;
	lexer->buf[lexer->buflen] = '\0';
	append_buffer(lexer, buf, sz);
	return c;
}

@@ -506,8 +512,8 @@ want_int:
	consume(lexer, -1);
}

static uint32_t
lex_rune(struct lexer *lexer)
static size_t
lex_rune(struct lexer *lexer, char *out)
{
	char buf[9];
	char *endptr;
@@ -521,27 +527,38 @@ lex_rune(struct lexer *lexer)
		c = next(lexer, NULL, false);
		switch (c) {
		case '0':
			return '\0';
			out[0] = '\0';
			return 1;
		case 'a':
			return '\a';
			out[0] = '\a';
			return 1;
		case 'b':
			return '\b';
			out[0] = '\b';
			return 1;
		case 'f':
			return '\f';
			out[0] = '\f';
			return 1;
		case 'n':
			return '\n';
			out[0] = '\n';
			return 1;
		case 'r':
			return '\r';
			out[0] = '\r';
			return 1;
		case 't':
			return '\t';
			out[0] = '\t';
			return 1;
		case 'v':
			return '\v';
			out[0] = '\v';
			return 1;
		case '\\':
			return '\\';
			out[0] = '\\';
			return 1;
		case '\'':
			return '\'';
			out[0] = '\'';
			return 1;
		case '"':
			return '\"';
			out[0] = '\"';
			return 1;
		case 'x':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -550,7 +567,8 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			out[0] = c;
			return 1;
		case 'u':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -561,7 +579,7 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			return utf8_encode(out, c);
		case 'U':
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
@@ -576,7 +594,7 @@ lex_rune(struct lexer *lexer)
			if (*endptr != '\0') {
				error(loc, "Invalid hex literal");
			}
			return c;
			return utf8_encode(out, c);
		case C_EOF:
			error(lexer->loc, "Unexpected end of file");
		default:
@@ -584,7 +602,7 @@ lex_rune(struct lexer *lexer)
		}
		assert(0);
	default:
		return c;
		return utf8_encode(out, c);
	}
	assert(0);
}
@@ -594,6 +612,7 @@ lex_string(struct lexer *lexer, struct token *out)
{
	uint32_t c = next(lexer, &out->loc, false);
	uint32_t delim;
	char buf[UTF8_MAX_SIZE + 1];

	switch (c) {
	case '"':
@@ -605,16 +624,18 @@ lex_string(struct lexer *lexer, struct token *out)
			}
			push(lexer, c, false);
			if (delim == '"') {
				push(lexer, lex_rune(lexer), false);
				size_t sz = lex_rune(lexer, buf);
				append_buffer(lexer, buf, sz);
			} else {
				next(lexer, NULL, true);
			}
			next(lexer, NULL, true);
		}
		char *buf = xcalloc(lexer->buflen + 1, 1);
		memcpy(buf, lexer->buf, lexer->buflen);
		char *s = xcalloc(lexer->buflen + 1, 1);
		memcpy(s, lexer->buf, lexer->buflen);
		out->token = T_LITERAL;
		out->storage = STORAGE_STRING;
		out->string.len = lexer->buflen;
		out->string.value = buf;
		out->string.value = s;
		consume(lexer, -1);
		return out->token;
	case '\'':
@@ -624,7 +645,14 @@ lex_string(struct lexer *lexer, struct token *out)
			error(out->loc, "Expected rune before trailing single quote");
		case '\\':
			push(lexer, c, false);
			out->rune = lex_rune(lexer);
			struct location loc = lexer->loc;
			size_t sz = lex_rune(lexer, buf);
			buf[sz] = '\0';
			const char *s = buf;
			out->rune = utf8_decode(&s);
			if (out->rune == UTF8_INVALID) {
				error(loc, "invalid UTF-8 in rune literal");
			}
			break;
		default:
			out->rune = c;
diff --git a/src/parse.c b/src/parse.c
index 4f91c8a5..5f44db85 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -12,6 +12,7 @@
#include "lex.h"
#include "parse.h"
#include "types.h"
#include "utf8.h"
#include "util.h"

static void
@@ -823,6 +824,7 @@ parse_constant(struct lexer *lexer)
		break;
	}

	struct location loc = tok.loc;
	switch (tok.storage) {
	case STORAGE_U8:
	case STORAGE_U16:
@@ -863,6 +865,19 @@ parse_constant(struct lexer *lexer)
			exp->constant.string.len += tok.string.len;
		}
		unlex(lexer, &tok);

		// check for invalid UTF-8 (possible when \x is used)
		const char *s = exp->constant.string.value;
		while (*s != '\0') {
			if (utf8_decode(&s) != UTF8_INVALID) {
				continue;
			}
			xfprintf(stderr,
				"%s:%d%d: invalid UTF-8 in string literal\n",
				sources[loc.file], loc.lineno, loc.colno);
			errline(loc);
			exit(EXIT_FAILURE);
		}
		break;
	case STORAGE_BOOL:
	case STORAGE_NULL:
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index cebd4114..e2fa9e28 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -508,7 +508,23 @@ fn basics() void = {
	let b1 = true, b2 = false;
	let p1: nullable *int = null;
	let r1 = ['x', '\x0A', '\u1234', '\0', '\a', '\b', '\f', '\n', '\r', '\t',
		'\v', '\\', '\'', '\"', '\U12345678'];
		'\v', '\\', '\'', '\"', '\U00123456', '\u0080', '\U00000080'];
	static assert('a' == '\x61');
	static assert('a' == '\u0061');
	static assert('a' == '\U00000061');
	static assert('a' == 0x61u32);
	static assert('à' == '\u00e0');
	static assert('à' == '\U000000e0');
	static assert('à' == 0xe0u32);
	assert(compile(`let r = 'abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\033';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\xc3';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x123';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u69';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U1234567';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\xah';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\uahij';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\Uahijklmn';`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
diff --git a/tests/04-strings.ha b/tests/04-strings.ha
index 6dfca7d5..a4b2bfd1 100644
--- a/tests/04-strings.ha
+++ b/tests/04-strings.ha
@@ -1,4 +1,4 @@
use rt::{compile, exited, EXIT_SUCCESS};
use rt::{compile, exited, EXIT_SUCCESS, toutf8};

fn measurements() void = {
	const x = "Hello!";
@@ -65,6 +65,19 @@ fn equality() void = {
	static assert("foo\0bar" != "foo\0foo");
};

fn escapes() void = {
	const s = "à";
	assert(s == "\xc3\xa0");
	assert(s == "\xc3" "" "\xa0");
	assert(s == "\u00e0");
	assert(s == "\U000000e0");
	const s = toutf8(s);
	assert(len(s) == 2 && s[0] == 0xc3 && s[1] == 0xa0);

	assert("\x345" == "45");
	assert("\033" == "\x0033");
};

fn raw() void = {
	assert(`hello \" world` == "hello \\\" world");
};
@@ -81,6 +94,17 @@ fn reject() void = {
			x += "fdsa";
		};
	`) as exited != EXIT_SUCCESS);

	assert(compile(`let s = "\xc3";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc3\x00";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc30";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xc3" "\xc3";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xa";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u69";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U1234567";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\xah";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\uahij";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\Uahijklmn";`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
@@ -88,6 +112,7 @@ export fn main() void = {
	storage();
	concat();
	equality();
	escapes();
	raw();
	reject();
};
-- 
2.40.1

[PATCH harec 3/3] lex: disallow sign in hex literal

Details
Message ID
<20230914040426.30728-3-sebastian@sebsite.pw>
In-Reply-To
<20230914040426.30728-1-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
Patch: +21 -6
Signed-off-by: Sebastian <sebastian@sebsite.pw>
---
 src/lex.c             | 15 +++++++++------
 tests/00-constants.ha |  6 ++++++
 tests/04-strings.ha   |  6 ++++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/lex.c b/src/lex.c
index 336d9906..93f5f509 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -563,8 +563,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[0] = next(lexer, NULL, false);
			buf[1] = next(lexer, NULL, false);
			buf[2] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			out[0] = c;
@@ -575,8 +576,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[2] = next(lexer, NULL, false);
			buf[3] = next(lexer, NULL, false);
			buf[4] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			return utf8_encode(out, c);
@@ -590,8 +592,9 @@ lex_rune(struct lexer *lexer, char *out)
			buf[6] = next(lexer, NULL, false);
			buf[7] = next(lexer, NULL, false);
			buf[8] = '\0';
			c = strtoul(&buf[0], &endptr, 16);
			if (*endptr != '\0') {
			c = strtoul(buf, &endptr, 16);
			// need isxdigit check to disallow sign
			if (*endptr != '\0' || !isxdigit(buf[0])) {
				error(loc, "Invalid hex literal");
			}
			return utf8_encode(out, c);
diff --git a/tests/00-constants.ha b/tests/00-constants.ha
index e2fa9e28..5e43937c 100644
--- a/tests/00-constants.ha
+++ b/tests/00-constants.ha
@@ -525,6 +525,12 @@ fn basics() void = {
	assert(compile(`let r = '\xah';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\uahij';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\Uahijklmn';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x-a';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\x+a';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u-abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\u+abc';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U-abcdefg';`) as exited != EXIT_SUCCESS);
	assert(compile(`let r = '\U+abcdefg';`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
diff --git a/tests/04-strings.ha b/tests/04-strings.ha
index a4b2bfd1..51b86612 100644
--- a/tests/04-strings.ha
+++ b/tests/04-strings.ha
@@ -105,6 +105,12 @@ fn reject() void = {
	assert(compile(`let s = "\xah";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\uahij";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\Uahijklmn";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\x-a";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\x+a";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u-abc";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\u+abc";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U-abcdefg";`) as exited != EXIT_SUCCESS);
	assert(compile(`let s = "\U+abcdefg";`) as exited != EXIT_SUCCESS);
};

export fn main() void = {
-- 
2.40.1

[harec/patches] build failed

builds.sr.ht <builds@sr.ht>
Details
Message ID
<CVICEA1PKQCZ.38F3D9LN7MU7M@cirno2>
In-Reply-To
<20230914040426.30728-3-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
harec/patches: FAILED in 1m39s

[lex: improve location in invalid escape errors][0] from [Sebastian][1]

[0]: https://lists.sr.ht/~sircmpwn/hare-dev/patches/44669
[1]: sebastian@sebsite.pw

✗ #1057384 FAILED  harec/patches/alpine.yml  https://builds.sr.ht/~sircmpwn/job/1057384
✓ #1057386 SUCCESS harec/patches/netbsd.yml  https://builds.sr.ht/~sircmpwn/job/1057386
✓ #1057385 SUCCESS harec/patches/freebsd.yml https://builds.sr.ht/~sircmpwn/job/1057385

Re: [PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals

Details
Message ID
<CVN7RBX7829W.2J01OINDW3AE7@attila>
In-Reply-To
<20230914040426.30728-2-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
> Hold off on merging this until I send a spec update, I just realized right now I
> forgot to do that. Specifically, I imagine the spec will need to be more
> explicit about what kind of stuff is allowed in rune/string constants, and the
> behavior of \x in rune/string constants will need to be changed. Actually I
> should probably talk about that:
>
> So, some context: \u is kinda weird. It behaves differently depending on where
> it's used. In rune literals, it denotes the value the rune should hold, so
> '\u1234' is represented as 0x1234. But in string literals, it's converted to
> UTF-8. This is weird, but I feel like it's still correct.

I don't particularly like this. The name \u implies it has something to do with
unicode in all contexts. Perhaps a less weird alternative would be to have \u
behave identically in str and rune and instead allow arbitrary \x sequences up
to 4 bytes in lenght in runes, consistently with how \x allows one to disregard
the rules in string literals.

> \x is also weird. Prior to this commit, it behaved pretty much identically to
> \u: in rune literals, it was used literally as the value; in string literals, it
> was converted to UTF-8. Which is different from how \x behaves in other similar
> languages: C, Go, Rust, and Zig all have \x denote a literal byte, whereas \u
> denotes a Unicode codepoint. So I've changed \x to denote literal bytes in
> strings

This is a huge can of worms. Right now just about everything and everyone
assumes string literals are valid UTF-8. I have stated before that I do not
agree with the current situation and I'm happy you're opening this discussion,
but moving in this direction also amplifies some problems that I see around
Hare's current str type (and I used to like Hare's way of doing string types, I
even argued with Zig people about the awesomeness of our design at some point).

Right now `str` serves a double purpose, as a type for string literals and as a
type for constant byte sequences of valid UTF-8. We already sometimes employ
various hacks to get around the second intended purpose, but before this
proposed change at least the two purposes had a non-empty intersection because
the literals were supposed to be valid UTF-8.

Now we're explicitly adding a way to make the string literals disobey UTF-8
rules, making the two purposes independent of each other and that makes me
question this whole idea of having a single type doing both of those things.

So there are plenty of options what to do, we could dedicate the str type just
to string literals and use []u8 for all byte data at runtime no matter whether
it's valid UTF-8 or not, we could do it the other way around, we could also
have a separate builtin type for each of those things I suppose. All of those
are quite disruptive unfortunately. We could also do nothing. I personally
prefer the first one.

(Sorry, this got a bit longer than expected and also not neccesarily immediately
relevant to merging or not merging this patch.)

Re: [PATCH harec 3/3] lex: disallow sign in hex literal

Details
Message ID
<CVN7RZT8WXRS.7FVXUZG1UUX7@attila>
In-Reply-To
<20230914040426.30728-3-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
'\u  aa' was also considered valid before this patch (strto* functions allow
whitespace before the number), so the commit message, comments and tests should
also reflect that.

Re: [PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals

Details
Message ID
<CVN7PP5GC9MW.34IQCM6IODH8H@notmylaptop>
In-Reply-To
<CVN7RBX7829W.2J01OINDW3AE7@attila> (view parent)
DKIM signature
missing
Download raw message
On Tue Sep 19, 2023 at 5:32 PM EDT, Bor Grošelj Simić wrote:
> I don't particularly like this. The name \u implies it has something to do with
> unicode in all contexts. Perhaps a less weird alternative would be to have \u
> behave identically in str and rune and instead allow arbitrary \x sequences up
> to 4 bytes in lenght in runes, consistently with how \x allows one to disregard
> the rules in string literals.

\u does have to do with Unicode in all contexts though. It's just that
codepoints are stored differently in rune than in str.

> Right now `str` serves a double purpose, as a type for string literals and as a
> type for constant byte sequences of valid UTF-8. We already sometimes employ
> various hacks to get around the second intended purpose, but before this
> proposed change at least the two purposes had a non-empty intersection because
> the literals were supposed to be valid UTF-8.

I don't really understand the issue here? I don't think str really has a
double purpose: its purpose is to store UTF-8 encoded text.

> Now we're explicitly adding a way to make the string literals disobey UTF-8
> rules, making the two purposes independent of each other and that makes me
> question this whole idea of having a single type doing both of those things.

String literals still can't disobey UTF-8 rules. e.g. "\xaa" is still
invalid, since it doesn't form a valid UTF-8 codepoint. The only
difference is that \x now denotes bytes rather than codepoints, but the
compiler still enforces that those bytes form valid UTF-8.

Re: [PATCH harec 3/3] lex: disallow sign in hex literal

Details
Message ID
<CVN7UWLUG13E.258HDG7X6FJC2@monch>
In-Reply-To
<CVN7RZT8WXRS.7FVXUZG1UUX7@attila> (view parent)
DKIM signature
missing
Download raw message
On Tue Sep 19, 2023 at 9:32 PM UTC, Bor Grošelj Simić wrote:
> '\u  aa' was also considered valid before this patch (strto* functions allow
> whitespace before the number), so the commit message, comments and tests should
> also reflect that.

does the spec allow this? if so, it should be disallowed there as well

Re: [PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals

Details
Message ID
<CVN7VP1EL3OP.1LFB3O7V12VVA@monch>
In-Reply-To
<CVN7PP5GC9MW.34IQCM6IODH8H@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Tue Sep 19, 2023 at 9:29 PM UTC, Sebastian wrote:
> String literals still can't disobey UTF-8 rules. e.g. "\xaa" is still
> invalid, since it doesn't form a valid UTF-8 codepoint. The only
> difference is that \x now denotes bytes rather than codepoints, but the
> compiler still enforces that those bytes form valid UTF-8.

+1 to doing this. if someone needs invalid utf8, they should use a u8
array literal instead

Re: [PATCH harec 3/3] lex: disallow sign in hex literal

Details
Message ID
<CVN7WOUQF6E1.YIRWLDN0VQH6@notmylaptop>
In-Reply-To
<CVN7UWLUG13E.258HDG7X6FJC2@monch> (view parent)
DKIM signature
missing
Download raw message
On Tue Sep 19, 2023 at 5:36 PM EDT, Ember Sawady wrote:
> On Tue Sep 19, 2023 at 9:32 PM UTC, Bor Grošelj Simić wrote:
> > '\u  aa' was also considered valid before this patch (strto* functions allow
> > whitespace before the number), so the commit message, comments and tests should
> > also reflect that.
>
> does the spec allow this? if so, it should be disallowed there as well

No, it was a bug caused by strto* allowing the string to begin with
arbitrary whitespace. Similar to how strto* allows a sign at the
beginning.

Re: [PATCH harec 2/3] Improve handling of invalid UTF-8 in rune/str literals

Details
Message ID
<CVNACFKSAPRT.2H47S35HFN4EH@attila>
In-Reply-To
<CVN7PP5GC9MW.34IQCM6IODH8H@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Tue Sep 19, 2023 at 11:29 PM CEST, Sebastian wrote:
> On Tue Sep 19, 2023 at 5:32 PM EDT, Bor Grošelj Simić wrote:
> > I don't particularly like this. The name \u implies it has something to do with
> > unicode in all contexts. Perhaps a less weird alternative would be to have \u
> > behave identically in str and rune and instead allow arbitrary \x sequences up
> > to 4 bytes in lenght in runes, consistently with how \x allows one to disregard
> > the rules in string literals.
>
> \u does have to do with Unicode in all contexts though. It's just that
> codepoints are stored differently in rune than in str.

Right. +1 to this part then.

>
> > Right now `str` serves a double purpose, as a type for string literals and as a
> > type for constant byte sequences of valid UTF-8. We already sometimes employ
> > various hacks to get around the second intended purpose, but before this
> > proposed change at least the two purposes had a non-empty intersection because
> > the literals were supposed to be valid UTF-8.
>
> I don't really understand the issue here? I don't think str really has a
> double purpose: its purpose is to store UTF-8 encoded text.
>
> > Now we're explicitly adding a way to make the string literals disobey UTF-8
> > rules, making the two purposes independent of each other and that makes me
> > question this whole idea of having a single type doing both of those things.
>
> String literals still can't disobey UTF-8 rules. e.g. "\xaa" is still
> invalid, since it doesn't form a valid UTF-8 codepoint. The only
> difference is that \x now denotes bytes rather than codepoints, but the
> compiler still enforces that those bytes form valid UTF-8.

Sorry, I can't read. Disregard what I said. I still have mixed feelings
regarding the str type and the UTF-8 invariant, but that truly has no relevance
for this patch then.
Details
Message ID
<CVRME6H0T9XO.3LKREZH1FMSQD@monch>
In-Reply-To
<20230914040426.30728-1-sebastian@sebsite.pw> (view parent)
DKIM signature
missing
Download raw message
thanks!

to git@git.sr.ht:~sircmpwn/harec
  6151bd6..a3687cb  master -> master

took 1/3, sounds like 2/3 and 3/3 need a v2
Reply to thread Export thread (mbox)