~mcf/cproc

qbe.c: always output strings as byte type v1 PROPOSED

Nihal Jere
Nihal Jere: 8
 qbe.c: always output strings as byte type
 add utf.*: a subset of sbase's libutf
 expr.c: handle prefixed string literals
 qbe.c: put explicit 0 at the end of strings
 main.c: use system locale
 allow for different width strings in string expressions
 expr.c: handle prefixed string literals
 add test for prefixed literals

 14 files changed, 496 insertions(+), 42 deletions(-)
Thanks for the v2!

On 2021-04-04, Nihal Jere <nihal@nihaljere.xyz> wrote:
Next
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~mcf/cproc/patches/21543/mbox | git am -3
Learn more about email & git
View this thread in the archives

[PATCH 1/3] qbe.c: always output strings as byte type Export this patch

Nihal Jere
qbe only understands byte-width string literals, so this lets us give it
byte-width string literals even if the underlying data represents a
string literal with a larger width.
---
In an attempt to learn something about compilers, I tried to implement
prefixed string literals for cproc. I believe it to be correct, as it
seems to generate the correct qbe and asm, and st works properly without
the config.h patch. However, I wouldn't be surprised if there was a more
concise way to do it. This is the best that I could think of.

I would have also written a test for it, but the wide character types
require headers. I suppose they could be typedefed to the right types in
the test, but that might be unportable.
Nihal

 qbe.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/qbe.c b/qbe.c
index 799d56b..0a3b73b 100644
--- a/qbe.c
+++ b/qbe.c
@@ -1326,7 +1326,11 @@ emitdata(struct decl *d, struct init *init)
			*/
			bits &= 0x7f >> (cur->bits.after + 7) % 8;
		} else {
			printf("%c ", cur->expr->type->kind == TYPEARRAY ? cur->expr->type->base->repr->ext : cur->expr->type->repr->ext);
			/* qbe does not support string literals with widths larger than 'b' */
			if (cur->expr->kind == EXPRSTRING)
				printf("b ");
			else
				printf("%c ", cur->expr->type->kind == TYPEARRAY ? cur->expr->type->base->repr->ext : cur->expr->type->repr->ext);
			dataitem(cur->expr, cur->end - cur->start);
			fputs(", ", stdout);
		}
-- 
2.31.0
Thanks so much for the patches! This is something I have put off for
too long, but really needs to be done. Your patches look like a really
good first attempt.

On 2021-03-27, Nihal Jere <nihal@nihaljere.xyz> wrote:

[PATCH v2 1/5] add utf.*: a subset of sbase's libutf Export this patch

Nihal Jere
Also I wrote some functions for UTF-16.
---
 Makefile |   1 +
 utf.c    | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf.h    |  44 ++++++++++
 3 files changed, 284 insertions(+)
 create mode 100644 utf.c
 create mode 100644 utf.h

diff --git a/Makefile b/Makefile
index f53225d..4cf0b0d 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ SRC=\
	token.c\
	tree.c\
	type.c\
	utf.c\
	util.c\
	$(BACKEND).c
OBJ=$(SRC:%.c=$(objdir)/%.o)
diff --git a/utf.c b/utf.c
new file mode 100644
index 0000000..ba9c6a6
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,239 @@
/* MIT/X Consortium Copyright (c) 2012 Connor Lane Smith <cls@lubutu.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include <string.h>
#include "utf.h"

/* Smaller of x and y. Either argument may be evaluated twice. */
#define MIN(x,y)  ((x) < (y) ? (x) : (y))

/*
 * Length in bytes of the sequence introduced by lead byte x, decided
 * by its high-bit pattern. Evaluates to 0 when x is a continuation
 * byte (10xxxxxx) or matches none of the lead-byte patterns.
 */
#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \
                 : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \
                 : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \
                 : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \
                 : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \
                 : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \
                 : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \
                                          : 0 )

/*
 * True when x is rejected as a rune: negative, above Runemax, low 16
 * bits equal to 0xFFFE or 0xFFFF, inside the surrogate range
 * 0xD800..0xDFFF, or inside 0xFDD0..0xFDEF.
 */
#define BADRUNE(x) ((x) < 0 || (x) > Runemax \
                || ((x) & 0xFFFE) == 0xFFFE \
                || ((x) >= 0xD800 && (x) <= 0xDFFF) \
                || ((x) >= 0xFDD0 && (x) <= 0xFDEF))

/*
 * Encode the rune *p as UTF-8 into s.
 *
 * Returns the number of bytes stored (1 to 4). Returns 0 and stores
 * nothing when runelen() reports the rune as unencodable.
 */
int
runetochar(char *s, const Rune *p)
{
	const Rune r = *p;
	const int len = runelen(r);

	if (len == 1) {
		/* 0aaaaaaa */
		s[0] = r;
	} else if (len == 2) {
		/* 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
	} else if (len == 3) {
		/* 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
	} else if (len == 4) {
		/* 11110aaa 10bbbbbb 10cccccc 10dddddd */
		s[0] = 0xF0 | ((r >> 18) & 0x07);
		s[1] = 0x80 | ((r >> 12) & 0x3F);
		s[2] = 0x80 | ((r >> 6) & 0x3F);
		s[3] = 0x80 | (r & 0x3F);
	} else {
		return 0; /* error */
	}
	return len;
}

/*
 * Encode the rune *p as UTF-16 code units into s.
 *
 * Returns 1 when a single unit was stored, 2 when a surrogate pair was
 * stored, and 0 (nothing stored) when runelen16() rejects the rune.
 */
int
runetochar16(unsigned short *s, const Rune *p)
{
	const Rune r = *p;
	const int units = runelen16(r);

	if (units == 1) {
		s[0] = r;
		return 1;
	}
	if (units == 2) {
		s[0] = 0xD800 | ((r - 0x10000) >> 10); /* high surrogate */
		s[1] = 0xDC00 | (r & 0x3FF);           /* low surrogate */
		return 2;
	}
	return 0; /* error */
}

/*
 * Encode the n runes starting at p as UTF-8 into s.
 *
 * Returns the total number of bytes stored, or -1 as soon as
 * runetochar() fails on a rune (bytes for the earlier runes have
 * already been stored at that point). No bound on s is checked here;
 * the caller is presumably expected to size it beforehand, e.g. with
 * runenlen().
 */
int
runestochars(char *s, const Rune *p, size_t n)
{
	int ret;
	char *ptr = s;

	/* index with size_t so the comparison against n is not signed/unsigned */
	for (size_t i = 0; i < n; i++) {
		if ((ret = runetochar(ptr, p+i)) == 0)
			return -1;
		ptr += ret;
	}

	return ptr - s;
}

/*
 * Encode the n runes starting at p as UTF-16 code units into s.
 *
 * Returns the total number of code units stored, or -1 as soon as
 * runetochar16() fails on a rune (units for the earlier runes have
 * already been stored at that point). No bound on s is checked here;
 * the caller is presumably expected to size it beforehand, e.g. with
 * runenlen16().
 */
int
runestochar16s(unsigned short *s, const Rune *p, size_t n)
{
	int ret;
	unsigned short *ptr = s;

	/* index with size_t so the comparison against n is not signed/unsigned */
	for (size_t i = 0; i < n; i++) {
		if ((ret = runetochar16(ptr, p+i)) == 0)
			return -1;
		ptr += ret;
	}

	return ptr - s;
}

/*
 * Decode one rune from s into *p, looking at no more than UTFmax
 * bytes. Returns whatever charntorune() returns for that limit.
 */
int
chartorune(Rune *p, const char *s)
{
	const size_t limit = UTFmax;

	return charntorune(p, s, limit);
}

/*
 * Decode one rune from the first len bytes of s into *p.
 *
 * Return value:
 *   0  len is 0, or every available byte was a valid part of the
 *      sequence but len ran out before it was complete; *p is not
 *      written in either case.
 *   1  s[0] is not a valid lead byte; *p is set to Runeerror.
 *   i  the byte at index i was not a continuation byte; *p is set to
 *      Runeerror and i bytes are reported as consumed.
 *   n  a full n-byte sequence was read; *p is the decoded value, or
 *      Runeerror if that value is a BADRUNE or was encoded in more
 *      bytes than runelen() says it needs (overlong).
 */
int
charntorune(Rune *p, const char *s, size_t len)
{
	unsigned int i, n;
	Rune r;

	if(len == 0) /* can't even look at s[0] */
		return 0;

	/* the lead byte gives the sequence length and the top payload bits */
	switch((n = UTFSEQ(s[0]))) {
	case 1: r = s[0];        break; /* 0xxxxxxx */
	case 2: r = s[0] & 0x1F; break; /* 110xxxxx */
	case 3: r = s[0] & 0x0F; break; /* 1110xxxx */
	case 4: r = s[0] & 0x07; break; /* 11110xxx */
	case 5: r = s[0] & 0x03; break; /* 111110xx */
	case 6: r = s[0] & 0x01; break; /* 1111110x */
	default: /* invalid sequence */
		*p = Runeerror;
		return 1;
	}
	/* add values from continuation bytes */
	for(i = 1; i < MIN(n, len); i++)
		if((s[i] & 0xC0) == 0x80) {
			/* add bits from continuation byte to rune value
			 * cannot overflow: 6 byte sequences contain 31 bits */
			r = (r << 6) | (s[i] & 0x3F); /* 10xxxxxx */
		}
		else { /* expected continuation */
			*p = Runeerror;
			return i;
		}

	/* the loop stops at min(n, len), so i < n means the input was cut short */
	if(i < n) /* must have reached len limit */
		return 0;

	/* reject invalid or overlong sequences */
	if(BADRUNE(r) || runelen(r) < (int)n)
		r = Runeerror;

	*p = r;
	return n;
}

/*
 * Number of bytes needed to encode r as UTF-8 (1 to 4), or 0 when r
 * is a BADRUNE.
 */
int
runelen(Rune r)
{
	if (BADRUNE(r))
		return 0; /* error */
	if (r > 0xFFFF)
		return 4;
	if (r > 0x07FF)
		return 3;
	if (r > 0x7F)
		return 2;
	return 1;
}

/*
 * Number of UTF-16 code units needed to encode r: 1 for values up to
 * 0xFFFF, 2 above that, or 0 when r is a BADRUNE.
 */
int
runelen16(Rune r)
{
	if (BADRUNE(r))
		return 0; /* error */
	return r > 0xFFFF ? 2 : 1;
}

/*
 * Sum of runelen() over the len runes starting at p. Runes for which
 * runelen() returns 0 add nothing to the total.
 */
size_t
runenlen(const Rune *p, size_t len)
{
	size_t total = 0;

	while (len-- > 0)
		total += runelen(*p++);
	return total;
}

/*
 * Sum of runelen16() over the len runes starting at p. Runes for
 * which runelen16() returns 0 add nothing to the total.
 */
size_t
runenlen16(const Rune *p, size_t len)
{
	size_t total = 0;

	while (len-- > 0)
		total += runelen16(*p++);
	return total;
}

/*
 * Returns 1 when charntorune() consumes at least one byte from the
 * first len bytes of s, and 0 otherwise. The decoded value is
 * discarded.
 */
int
fullrune(const char *s, size_t len)
{
	Rune scratch;
	int consumed = charntorune(&scratch, s, len);

	return consumed > 0;
}

/*
 * Count the runes in the NUL-terminated string s by repeatedly
 * decoding with chartorune() until the terminator is reached.
 */
size_t
utflen(const char *s)
{
	size_t count = 0;
	Rune r;

	while (*s != '\0') {
		s += chartorune(&r, s);
		count++;
	}
	return count;
}

/*
 * Count the runes in the first len bytes of s. Counting stops when
 * charntorune() returns 0 or when a NUL rune is decoded.
 */
size_t
utfnlen(const char *s, size_t len)
{
	size_t count = 0;
	size_t used = 0;
	Rune r;

	for (;;) {
		int step = charntorune(&r, s + used, len - used);

		/* r is only examined when a rune was actually decoded */
		if (step == 0 || r == '\0')
			break;
		used += step;
		count++;
	}
	return count;
}
diff --git a/utf.h b/utf.h
new file mode 100644
index 0000000..c69d3cd
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,44 @@
/* MIT/X Consortium Copyright (c) 2012 Connor Lane Smith <cls@lubutu.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include <stdio.h>

/* Integer type holding one decoded code point. */
typedef int Rune;

/* Limits and sentinel values shared by the UTF routines. */
enum {
	UTFmax    = 6,       /* maximum bytes per rune */
	Runeself  = 0x80,    /* rune and utf are equal (<) */
	Runeerror = 0xFFFD,  /* decoding error in utf */
	Runemax   = 0x10FFFF /* maximum rune value */
};

int runetochar(char *, const Rune *);
int runetochar16(unsigned short *, const Rune *);
int runestochars(char *, const Rune *, size_t);
int runestochar16s(unsigned short *, const Rune *, size_t);
int chartorune(Rune *, const char *);
int charntorune(Rune *, const char *, size_t);
int runelen(Rune);
int runelen16(Rune);
size_t runenlen(const Rune *, size_t);
size_t runenlen16(const Rune *, size_t);
int fullrune(const char *, size_t);
size_t utflen(const char *);
size_t utfnlen(const char *, size_t);
-- 
2.31.1

[PATCH 2/3] expr.c: handle prefixed string literals Export this patch

Nihal Jere
---
 expr.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 qbe.c  |   2 +-
 2 files changed, 101 insertions(+), 11 deletions(-)

diff --git a/expr.c b/expr.c
index 8b10bf4..d20e8c2 100644
--- a/expr.c
+++ b/expr.c
@@ -7,9 +7,51 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <wchar.h>
#include <uchar.h>
#include "util.h"
#include "cc.h"

/*
 * Convert at most n bytes of the multibyte string src (in the current
 * locale's encoding) to UTF-16 code units.
 *
 * When dest is non-NULL the code units are stored there; when it is
 * NULL they are only counted, so a first call can size the buffer.
 * Conversion stops at a null character (which is not counted) or once
 * n bytes have been consumed.
 *
 * Returns the number of code units, or (size_t)-1 if src contains an
 * invalid or truncated multibyte sequence.
 */
static size_t
mbstoc16s(char16_t *dest, const char *src, size_t n)
{
	/* private state, so both the counting and the storing pass start clean */
	mbstate_t st = {0};
	char16_t c;
	size_t ret, w = 0;

	if (n == 0)
		return 0;
	for (;;) {
		ret = mbrtoc16(&c, src, n, &st);
		/*
		 * The error results are huge when viewed as size_t, so they
		 * must be tested before ret is treated as a byte count.
		 */
		if (ret == (size_t)-1 || ret == (size_t)-2)
			return -1;
		if (ret == 0)
			break;
		/* (size_t)-3: a pending code unit was produced, no input used */
		if (ret != (size_t)-3) {
			src += ret;
			n -= ret;
		}
		if (dest)
			dest[w] = c;
		++w;
		/*
		 * After a high surrogate the low one is still pending and
		 * consumes no input, so keep going even when n is exhausted.
		 */
		if (n == 0 && !(c >= 0xD800 && c <= 0xDBFF))
			break;
	}

	return w;
}

/*
 * Convert at most n bytes of the multibyte string src (in the current
 * locale's encoding) to UTF-32 code units.
 *
 * When dest is non-NULL the code units are stored there; when it is
 * NULL they are only counted, so a first call can size the buffer.
 * Conversion stops at a null character (which is not counted) or once
 * n bytes have been consumed.
 *
 * Returns the number of code units, or (size_t)-1 if src contains an
 * invalid or truncated multibyte sequence.
 */
static size_t
mbstoc32s(char32_t *dest, const char *src, size_t n)
{
	/* private state, so both the counting and the storing pass start clean */
	mbstate_t st = {0};
	char32_t c;
	size_t ret, w = 0;

	while (n > 0) {
		ret = mbrtoc32(&c, src, n, &st);
		/*
		 * The error results are huge when viewed as size_t, so they
		 * must be tested before ret is treated as a byte count.
		 */
		if (ret == (size_t)-1 || ret == (size_t)-2)
			return -1;
		if (ret == 0)
			break;
		/* (size_t)-3: a pending code unit was produced, no input used */
		if (ret != (size_t)-3) {
			src += ret;
			n -= ret;
		}
		if (dest)
			dest[w] = c;
		++w;
	}

	return w;
}

static struct expr *
mkexpr(enum exprkind k, struct type *t)
{
@@ -456,8 +498,9 @@ primaryexpr(struct scope *s)
	struct expr *e;
	struct decl *d;
	struct type *t;
	char *src, *dst, *end;
	char *src, *dst, *end, *tmp, p = 0, np;
	int base;
	size_t tmpsize, count;

	switch (tok.kind) {
	case TIDENT:
@@ -473,21 +516,68 @@ primaryexpr(struct scope *s)
		next();
		break;
	case TSTRINGLIT:
		e = mkexpr(EXPRSTRING, mkarraytype(&typechar, QUALNONE, 0));
		e->lvalue = true;
		e->string.size = 0;
		e->string.data = NULL;
		tmp = NULL;
		tmpsize = 0;
		do {
			e->string.data = xreallocarray(e->string.data, e->string.size + strlen(tok.lit), 1);
			dst = e->string.data + e->string.size;
			src = tok.lit;
			if (*src != '"')
				fatal("wide string literal not yet implemented");
			switch (*src) {
			default: error(&tok.loc, "invalid prefix for string literal");
			case 'L': ++src; np = 'w'; goto typeknown;
			case 'U': ++src; np = 'U'; goto typeknown;
			case '"':        np = 'c'; goto typeknown;
			case 'u': ++src;
			}
			switch (*src) {
			default: error(&tok.loc, "invalid prefix for string literal");
			case '"':        np = 'u'; break;
			case '8': ++src; np = '8';
			}
typeknown:
			if (p && p != np && !(p == 'c' || np == 'c'))
				error(&tok.loc, "cannot have adjacent string literals with different prefixes");
			if (!p || np != 'c')
				p = np;
			tmp = xreallocarray(tmp, tmpsize + strlen(tok.lit), 1);
			/* mbstowcs might miscalculate length without this */
			memset(tmp + tmpsize, 0, strlen(tok.lit));
			dst = tmp + tmpsize;
			for (++src; *src != '"'; ++dst)
				*dst = unescape(&src);
			e->string.size = dst - e->string.data;
			tmpsize = dst - tmp;
			next();
		} while (tok.kind == TSTRINGLIT);
		switch (p) {
			case 'c':
			case '8': t = &typechar;       break;
			case 'w': t = targ->typewchar; break;
			case 'u': t = &typeushort;     break;
			case 'U': t = &typeuint;       break;
		}
		e = mkexpr(EXPRSTRING, mkarraytype(t, QUALNONE, 0));
		e->lvalue = true;
		e->string.data  = NULL;
		switch (p) {
			case 'c':
			case '8': e->string.size = tmpsize; e->string.data = tmp;     break;
			case 'w': e->string.size =  mbstowcs(NULL, tmp, tmpsize);     break;
			case 'u': e->string.size = mbstoc16s(NULL, tmp, tmpsize);     break;
			case 'U': e->string.size = mbstoc32s(NULL, tmp, tmpsize);     break;
			default: assert(0);
		}
		e->string.data = xreallocarray(e->string.data, e->string.size, t->size);
		switch (p) {
			case 'c':
			case '8': goto postconvert;
			case 'w': count =   mbstowcs((wchar_t *)e->string.data, tmp, tmpsize); break;
			case 'u': count = mbstoc16s((char16_t *)e->string.data, tmp, tmpsize); break;
			case 'U': count = mbstoc32s((char32_t *)e->string.data, tmp, tmpsize); break;
			default: assert(0);
		}
		if (count == -1)
			error(&tok.loc, "string literal contains invalid multibyte sequence");
		assert(count == e->string.size);
		free(tmp);
postconvert:
		e->type->array.length = e->string.size + 1;
		e->type->size = e->type->array.length * e->type->base->size;
		e->type->incomplete = false;
diff --git a/qbe.c b/qbe.c
index 0a3b73b..1957c8a 100644
--- a/qbe.c
+++ b/qbe.c
@@ -1257,7 +1257,7 @@ dataitem(struct expr *expr, uint64_t size)
		break;
	case EXPRSTRING:
		fputc('"', stdout);
		for (i = 0; i < expr->string.size && i < size; ++i) {
		for (i = 0; i < expr->string.size * expr->type->base->size && i < size; ++i) {
			c = expr->string.data[i];
			if (isprint(c) && c != '"' && c != '\\')
				putchar(c);
-- 
2.31.0

[PATCH v2 2/5] qbe.c: put explicit 0 at the end of strings Export this patch

Nihal Jere
Rather than using "z" to fill in zeros, an explicit 0 is used. This is a
simpler way to deal with the empty string edge case, which outputs a
type without any values. For example:

    void *a = L"";

might output

    data $.Lstring.1 = align 4 { w , z 4, }

which is invalid qbe. Now it will output

    data $.Lstring.1 = align 4 { w 0 , }

which is valid.
---
I would imagine there's probably a good reason that the 'z' is used to
fill out the data field, but this is the simplest way I found to deal
with an empty string if there isn't.
Also, I think the commit message got messed up, but it is too late to fix it.
This commit also makes the characters in the string output as a series
of integer literals rather than a string literal. This fixes the
endianness issue in the last patchset, but decreases readability in the
qbe as byte-string literals are no longer output as string literals.
 qbe.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/qbe.c b/qbe.c
index 01dd2d3..b299ba6 100644
--- a/qbe.c
+++ b/qbe.c
@@ -1274,8 +1274,7 @@ static void
dataitem(struct expr *expr, uint64_t size)
{
	struct decl *decl;
	size_t i;
	char c;
	size_t i, w;

	switch (expr->kind) {
	case EXPRUNARY:
@@ -1303,17 +1302,16 @@ dataitem(struct expr *expr, uint64_t size)
			printf("%" PRIu64, expr->constant.i);
		break;
	case EXPRSTRING:
		fputc('"', stdout);
		for (i = 0; i < expr->string.size && i < size; ++i) {
			c = expr->string.data[i];
			if (isprint(c) && c != '"' && c != '\\')
				putchar(c);
			else
				printf("\\%03hho", c);
		}
		fputc('"', stdout);
		if (i < size)
			printf(", z %" PRIu64, size - i);
		/* not sure if this is the right way to find the width */
		w = size / expr->type->array.length;
		for (i = 0; i < expr->string.size && i*w < size; ++i)
			switch (w) {
			case 1: printf("%hhu ", expr->string.data8[i]); break;
			case 2: printf("%hu ", expr->string.data16[i]); break;
			case 4: printf("%u ", expr->string.data32[i]);  break;
			default: assert(0);
			}
		printf("0 ");
		break;
	default:
		error(&tok.loc, "initializer is not a constant expression");
-- 
2.31.1
Thanks for the v2!

On 2021-04-04, Nihal Jere <nihal@nihaljere.xyz> wrote:

[PATCH 3/3] main.c: use system locale Export this patch

Nihal Jere
---
 main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/main.c b/main.c
index 004df85..41353e2 100644
--- a/main.c
+++ b/main.c
@@ -1,3 +1,4 @@
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
@@ -20,6 +21,7 @@ main(int argc, char *argv[])
	bool pponly = false;
	char *output = NULL, *target = NULL;

	setlocale(LC_ALL, "");
	argv0 = progname(argv[0], "cproc-qbe");
	ARGBEGIN {
	case 'E':
-- 
2.31.0

[PATCH v2 3/5] allow for different width strings in string expressions Export this patch

Nihal Jere
---
 cc.h   | 8 +++++++-
 decl.c | 4 ++--
 qbe.c  | 4 ++--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/cc.h b/cc.h
index 482b2a2..1763894 100644
--- a/cc.h
+++ b/cc.h
@@ -1,4 +1,6 @@
#include <stdio.h>
#include <wchar.h>
#include <uchar.h>

struct func;

@@ -332,7 +334,11 @@ struct expr {
			double f;
		} constant;
		struct {
			char *data;
			union {
				char *data8;
				char16_t *data16;
				char32_t *data32;
			};
			size_t size;
		} string;
		struct {
diff --git a/decl.c b/decl.c
index 94e4b15..bba4ddc 100644
--- a/decl.c
+++ b/decl.c
@@ -873,7 +873,7 @@ staticassert(struct scope *s)
		if (!e->decayed || e->base->kind != EXPRSTRING)
			error(&tok.loc, "expected string literal after static assertion expression");
		if (!c)
			error(&tok.loc, "static assertion failed: %.*s", (int)e->base->string.size, e->base->string.data);
			error(&tok.loc, "static assertion failed: %.*s", (int)e->base->string.size, e->base->string.data8);
	} else if (!c) {
		error(&tok.loc, "static assertion failed");
	}
@@ -1027,7 +1027,7 @@ struct decl *stringdecl(struct expr *expr)
	if (!strings)
		strings = mkmap(64);
	assert(expr->kind == EXPRSTRING);
	mapkey(&key, expr->string.data, expr->string.size);
	mapkey(&key, expr->string.data8, expr->string.size);
	entry = mapput(strings, &key);
	d = *entry;
	if (!d) {
diff --git a/qbe.c b/qbe.c
index b299ba6..21b1758 100644
--- a/qbe.c
+++ b/qbe.c
@@ -985,7 +985,7 @@ funcinit(struct func *func, struct decl *d, struct init *init)
		if (init->expr->kind == EXPRSTRING) {
			for (i = 0; i < init->expr->string.size && i < init->end - init->start; ++i) {
				dst.addr = funcinst(func, IADD, &iptr, d->value, mkintconst(&iptr, init->start + i));
				funcstore(func, &typechar, QUALNONE, dst, mkintconst(&i8, init->expr->string.data[i]));
				funcstore(func, &typechar, QUALNONE, dst, mkintconst(&i8, init->expr->string.data8[i]));
			}
			offset = init->start + i;
		} else {
@@ -1346,7 +1346,7 @@ emitdata(struct decl *d, struct init *init)
			*/
			assert(cur->expr->kind == EXPRSTRING);
			assert(init->expr->kind == EXPRCONST);
			cur->expr->string.data[init->start - cur->start] = init->expr->constant.i;
			cur->expr->string.data8[init->start - cur->start] = init->expr->constant.i;
		}
		start = cur->start + cur->bits.before / 8;
		end = cur->end - (cur->bits.after + 7) / 8;
-- 
2.31.1

[PATCH v2 4/5] expr.c: handle prefixed string literals Export this patch

Nihal Jere
---
To fix the escape sequence truncation issue, I chose to decode the
string literal into UTF-32, and then reencode later once we know the
width.

 expr.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 12 deletions(-)

diff --git a/expr.c b/expr.c
index 8b10bf4..e3ddbbe 100644
--- a/expr.c
+++ b/expr.c
@@ -7,6 +7,7 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "utf.h"
#include "util.h"
#include "cc.h"

@@ -366,7 +367,8 @@ isodigit(int c)
static int
unescape(char **p)
{
	int c;
	Rune c;
	int ret;
	char *s = *p;

	if (*s == '\\') {
@@ -397,7 +399,10 @@ unescape(char **p)
			while (isodigit(*++s));
		}
	} else {
		c = *s++;
		if ((ret = chartorune(&c, s)) == Runeerror)
			error(&tok.loc, "invalid UTF-8 string");
		fprintf(stderr, "");
		s += ret;
	}
	*p = s;
	return c;
@@ -456,8 +461,10 @@ primaryexpr(struct scope *s)
	struct expr *e;
	struct decl *d;
	struct type *t;
	char *src, *dst, *end;
	char *src, *end, p = 0, np;
	Rune *dst, *tmp;
	int base;
	size_t tmpsize, count;

	switch (tok.kind) {
	case TIDENT:
@@ -473,21 +480,66 @@ primaryexpr(struct scope *s)
		next();
		break;
	case TSTRINGLIT:
		e = mkexpr(EXPRSTRING, mkarraytype(&typechar, QUALNONE, 0));
		e->lvalue = true;
		e->string.size = 0;
		e->string.data = NULL;
		tmp = NULL;
		tmpsize = 0;
		do {
			e->string.data = xreallocarray(e->string.data, e->string.size + strlen(tok.lit), 1);
			dst = e->string.data + e->string.size;
			src = tok.lit;
			if (*src != '"')
				fatal("wide string literal not yet implemented");
			switch (*src) {
			default: error(&tok.loc, "invalid prefix for string literal");
			case 'L': ++src; np = 'w'; goto typeknown;
			case 'U': ++src; np = 'U'; goto typeknown;
			case '"':        np = 'c'; goto typeknown;
			case 'u': ++src;
			}
			switch (*src) {
			default: error(&tok.loc, "invalid prefix for string literal");
			case '"':        np = 'u'; break;
			case '8': ++src; np = '8';
			}
typeknown:
			if (p && p != np && !(p == 'c' || np == 'c'))
				error(&tok.loc, "cannot have adjacent string literals with different prefixes");
			if (!p || np != 'c')
				p = np;
			tmp = xreallocarray(tmp, tmpsize + utflen(tok.lit), sizeof(Rune));
			dst = tmp + tmpsize;
			for (++src; *src != '"'; ++dst)
				*dst = unescape(&src);
			e->string.size = dst - e->string.data;
			tmpsize = dst - tmp;
			next();
		} while (tok.kind == TSTRINGLIT);
		switch (p) {
		case 'c':
		case '8': t = &typechar;       break;
		case 'w': t = targ->typewchar; break;
		case 'u': t = &typeushort;     break;
		case 'U': t = &typeuint;       break;
		}
		e = mkexpr(EXPRSTRING, mkarraytype(t, QUALNONE, 0));
		e->lvalue = true;
		e->string.data8 = NULL;
		switch (p) {
		case 'c':
		case '8': e->string.size = runenlen(tmp, tmpsize);          break;
		case 'u': e->string.size = runenlen16(tmp, tmpsize);        break;
		case 'U':
		case 'w': e->string.size = tmpsize; e->string.data32 = tmp; break;
		default: assert(0);
		}
		e->string.data8 = xreallocarray(e->string.data8, e->string.size, t->size);
		switch (p) {
		case 'c':
		case '8': count = runestochars(e->string.data8, tmp, tmpsize); break;
		case 'u': count = runestochar16s(e->string.data16, tmp, tmpsize); break;
		case 'U': 
		case 'w': goto postconvert;
		default: assert(0);
		}
		if (count == -1)
			error(&tok.loc, "string literal contains invalid multibyte sequence");
		assert(count == e->string.size);
		free(tmp);
postconvert:
		e->type->array.length = e->string.size + 1;
		e->type->size = e->type->array.length * e->type->base->size;
		e->type->incomplete = false;
-- 
2.31.1

[PATCH v2 5/5] add test for prefixed literals Export this patch

Nihal Jere
---
 test/prefixed-string.c   |  6 ++++++
 test/prefixed-string.qbe | 12 ++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 test/prefixed-string.c
 create mode 100644 test/prefixed-string.qbe

diff --git a/test/prefixed-string.c b/test/prefixed-string.c
new file mode 100644
index 0000000..e6b2630
--- /dev/null
+++ b/test/prefixed-string.c
@@ -0,0 +1,6 @@
void *e = L"";
void *a = L"⌷ā¾" "½";
unsigned short *b = u"⋄¿ß" "å";
unsigned *c = U"¸…¨" "´";
char *d = u8"āāā" "d";
void *hex = L"\x1F600";
diff --git a/test/prefixed-string.qbe b/test/prefixed-string.qbe
new file mode 100644
index 0000000..f87efee
--- /dev/null
+++ b/test/prefixed-string.qbe
@@ -0,0 +1,12 @@
data $.Lstring.1 = align 4 { w 0 , }
export data $e = align 8 { l $.Lstring.1, }
data $.Lstring.2 = align 4 { w 9015 257 190 189 0 , }
export data $a = align 8 { l $.Lstring.2, }
data $.Lstring.3 = align 2 { h 8900 191 223 229 0 , }
export data $b = align 8 { l $.Lstring.3, }
data $.Lstring.4 = align 4 { w 184 8230 168 180 0 , }
export data $c = align 8 { l $.Lstring.4, }
data $.Lstring.5 = align 1 { b 196 129 196 129 196 129 100 0 , }
export data $d = align 8 { l $.Lstring.5, }
data $.Lstring.6 = align 4 { w 128512 0 , }
export data $hex = align 8 { l $.Lstring.6, }
-- 
2.31.1