~mcf/cproc

add utf.*: a subset of sbase's libutf v4 PROPOSED

Nihal Jere
Nihal Jere: 2
 add utf.*: a subset of sbase's libutf
 handle prefixed string literals

 20 files changed, 698 insertions(+), 63 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~mcf/cproc/patches/22238/mbox | git am -3
Learn more about email & git
View this thread in the archives

[PATCH v4 1/2] add utf.*: a subset of sbase's libutf Export this patch

Nihal Jere
Also add a struct sitem that records whether each item of a string
literal must be encoded or stored verbatim, together with the associated
encoding and length functions, and some functions for UTF-16.
---
 Makefile |   1 +
 utf.c    | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf.h    |  53 ++++++++++
 3 files changed, 356 insertions(+)
 create mode 100644 utf.c
 create mode 100644 utf.h

diff --git a/Makefile b/Makefile
index f53225d..4cf0b0d 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ SRC=\
	token.c\
	tree.c\
	type.c\
	utf.c\
	util.c\
	$(BACKEND).c
OBJ=$(SRC:%.c=$(objdir)/%.o)
diff --git a/utf.c b/utf.c
new file mode 100644
index 0000000..fcd5f31
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,302 @@
/* MIT/X Consortium Copyright (c) 2012 Connor Lane Smith <cls@lubutu.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include <assert.h>
#include <string.h>

#include "utf.h"

/* Smaller of x and y.  Each argument is evaluated more than once, so
 * do not pass expressions with side effects. */
#define MIN(x,y)  ((x) < (y) ? (x) : (y))

/* Length in bytes of the UTF-8 sequence announced by lead byte x, judged
 * from its high bits alone: 1 to 6 for a lead byte, 0 for a continuation
 * byte (10xxxxxx) and for any other pattern (0xFE, 0xFF). */
#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \
                 : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \
                 : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \
                 : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \
                 : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \
                 : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \
                 : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \
                                          : 0 )

/* True when x is rejected as a rune: negative, greater than Runemax,
 * low 16 bits equal to 0xFFFE or 0xFFFF, within 0xD800-0xDFFF (the
 * UTF-16 surrogate range), or within 0xFDD0-0xFDEF. */
#define BADRUNE(x) ((x) < 0 || (x) > Runemax \
                || ((x) & 0xFFFE) == 0xFFFE \
                || ((x) >= 0xD800 && (x) <= 0xDFFF) \
                || ((x) >= 0xFDD0 && (x) <= 0xFDEF))

/*
 * Encode the rune *p as UTF-8 into s.
 *
 * Returns the number of bytes stored (1 to 4).  When runelen() rejects
 * the rune, nothing is written and 0 is returned.  The caller must
 * provide room for the bytes; no bound is checked here.
 */
int
runetochar(char *s, const Rune *p)
{
	const Rune rune = *p;
	const int len = runelen(rune);

	if (len == 1) {
		/* 0aaaaaaa */
		s[0] = rune;
	} else if (len == 2) {
		/* 00000aaa aabbbbbb -> 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((rune >> 6) & 0x1F);
		s[1] = 0x80 | (rune & 0x3F);
	} else if (len == 3) {
		/* aaaabbbb bbcccccc -> 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((rune >> 12) & 0x0F);
		s[1] = 0x80 | ((rune >> 6) & 0x3F);
		s[2] = 0x80 | (rune & 0x3F);
	} else if (len == 4) {
		/* 000aaabb bbbbcccc ccdddddd -> 11110aaa 10bbbbbb 10cccccc 10dddddd */
		s[0] = 0xF0 | ((rune >> 18) & 0x07);
		s[1] = 0x80 | ((rune >> 12) & 0x3F);
		s[2] = 0x80 | ((rune >> 6) & 0x3F);
		s[3] = 0x80 | (rune & 0x3F);
	} else {
		return 0; /* error */
	}
	return len;
}

/*
 * Encode the rune *p as UTF-16 code units into s.
 *
 * Returns 1 when the rune is stored as a single unit, 2 when it is
 * split into a high (0xD800-based) and low (0xDC00-based) surrogate,
 * and 0 without writing anything when runelen16() rejects the rune.
 */
int
runetochar16(unsigned short *s, const Rune *p)
{
	const Rune rune = *p;
	const int units = runelen16(rune);

	if (units == 1) {
		s[0] = rune;
		return 1;
	}
	if (units == 2) {
		/* high surrogate carries the top 10 bits of (rune - 0x10000),
		 * low surrogate the bottom 10 bits */
		s[0] = 0xD800 | ((rune - 0x10000) >> 10);
		s[1] = 0xDC00 | (rune & 0x3FF);
		return 2;
	}
	return 0; /* error */
}

int
sitemstochars(char *s, const struct sitem *p, size_t n)
{
	int ret;
	char *ptr = s;
	for (int i = 0; i < n; i++) {
		switch (p[i].type) {
		case SITEM_LITERAL:
			*ptr = p[i].val;
			ptr++;
			break;
		case SITEM_ENCODED:
			if ((ret = runetochar(ptr, &p[i].val)) == 0)
				return -1;
			ptr += ret;
			break;
		default: assert(0);
		}
	}

	return ptr - s;
}

int
sitemstochar16s(unsigned short *s, const struct sitem *p, size_t n)
{
	int ret;
	unsigned short *ptr = s;
	for (int i = 0; i < n; i++) {
		switch (p[i].type) {
		case SITEM_LITERAL:
			*ptr = p[i].val;
			ptr++;
			break;
		case SITEM_ENCODED:
			if ((ret = runetochar16(ptr, &p[i].val)) == 0)
				return -1;
			ptr += ret;
			break;
		default: assert(0);
		}
	}

	return ptr - s;
}

/*
 * Write the n items of p to s as 32-bit units, one unit per item.
 *
 * Every item's val is stored as-is; the item type is not consulted and
 * the value is not validated.  Returns the number of units written,
 * which is n.  No terminator is appended and no bound is checked: the
 * caller sizes s.
 */
int
sitemstochar32s(unsigned *s, const struct sitem *p, size_t n)
{
	unsigned *ptr = s;

	for (size_t i = 0; i < n; i++) {
		*ptr = p[i].val;
		ptr++;
	}

	return ptr - s;
}

/*
 * Decode one rune from s into *p, letting charntorune() look at up to
 * UTFmax bytes.  Returns whatever charntorune() returns for that call.
 */
int
chartorune(Rune *p, const char *s)
{
	return charntorune(p, s, UTFmax);
}

/*
 * Decode one rune from at most len bytes of s into *p.
 *
 * Return value and effect on *p:
 *   - len is 0: returns 0, *p is not written.
 *   - s[0] is not a lead byte (UTFSEQ gives 0): *p = Runeerror,
 *     returns 1.
 *   - byte s[i] of the sequence is not a continuation byte:
 *     *p = Runeerror, returns i, i.e. the offending byte itself is not
 *     counted.
 *   - len runs out before the sequence is complete: returns 0, *p is
 *     not written.
 *   - otherwise: returns the sequence length n.  *p is the decoded
 *     value, or Runeerror when that value is a BADRUNE or could have
 *     been encoded in fewer than n bytes.  Since runelen() never
 *     returns more than 4, 5- and 6-byte sequences always end up as
 *     Runeerror.
 */
int
charntorune(Rune *p, const char *s, size_t len)
{
	unsigned int i, n;
	Rune r;

	if(len == 0) /* can't even look at s[0] */
		return 0;

	/* the lead byte fixes the sequence length and supplies the top bits */
	switch((n = UTFSEQ(s[0]))) {
	case 1: r = s[0];        break; /* 0xxxxxxx */
	case 2: r = s[0] & 0x1F; break; /* 110xxxxx */
	case 3: r = s[0] & 0x0F; break; /* 1110xxxx */
	case 4: r = s[0] & 0x07; break; /* 11110xxx */
	case 5: r = s[0] & 0x03; break; /* 111110xx */
	case 6: r = s[0] & 0x01; break; /* 1111110x */
	default: /* invalid sequence */
		*p = Runeerror;
		return 1;
	}
	/* add values from continuation bytes */
	for(i = 1; i < MIN(n, len); i++)
		if((s[i] & 0xC0) == 0x80) {
			/* add bits from continuation byte to rune value
			 * cannot overflow: 6 byte sequences contain 31 bits */
			r = (r << 6) | (s[i] & 0x3F); /* 10xxxxxx */
		}
		else { /* expected continuation */
			*p = Runeerror;
			return i;
		}

	if(i < n) /* must have reached len limit */
		return 0;

	/* reject invalid or overlong sequences */
	if(BADRUNE(r) || runelen(r) < (int)n)
		r = Runeerror;

	*p = r;
	return n;
}

/*
 * Number of bytes runetochar() uses for r: 1 to 4, or 0 when r is a
 * BADRUNE and cannot be encoded.
 */
int
runelen(Rune r)
{
	if (BADRUNE(r))
		return 0; /* error */
	if (r < 0x80)
		return 1;
	if (r < 0x800)
		return 2;
	if (r < 0x10000)
		return 3;
	return 4;
}

/*
 * Number of 16-bit units runetochar16() uses for r: 1 for values up to
 * 0xFFFF, 2 above that, or 0 when r is a BADRUNE.
 */
int
runelen16(Rune r)
{
	if (BADRUNE(r))
		return 0; /* error */
	return r > 0xFFFF ? 2 : 1;
}

/*
 * Sum of runelen() over the first len runes of p.  Runes that
 * runelen() rejects contribute 0.
 */
size_t
runenlen(const Rune *p, size_t len)
{
	size_t total = 0;

	while (len-- > 0)
		total += runelen(*p++);
	return total;
}

size_t
sitemnlen(const struct sitem *p, size_t len)
{
	size_t i, n = 0;

	for(i = 0; i < len; i++) {
		switch (p[i].type) {
		case SITEM_LITERAL:
			n += 1;
			break;
		case SITEM_ENCODED:
			n += runelen(p[i].val);
			break;
		default: assert(0);
		}
	}
	return n;
}

/*
 * Sum of runelen16() over the first len runes of p.  Runes that
 * runelen16() rejects contribute 0.
 */
size_t
runenlen16(const Rune *p, size_t len)
{
	size_t total = 0;

	while (len-- > 0)
		total += runelen16(*p++);
	return total;
}

size_t
sitemnlen16(const struct sitem *p, size_t len)
{
	size_t i, n = 0;

	for(i = 0; i < len; i++) {
		switch (p[i].type) {
		case SITEM_LITERAL:
			n += 1;
			break;
		case SITEM_ENCODED:
			n += runelen16(p[i].val);
			break;
		default: assert(0);
		}
	}
	return n;
}

/*
 * Count the runes in the NUL-terminated string s by repeatedly calling
 * chartorune() and advancing by the number of bytes it reports.  Each
 * call counts as one rune, including calls that yield Runeerror.
 */
size_t
utflen(const char *s)
{
	size_t count = 0;
	Rune scratch;

	while (*s != '\0') {
		s += chartorune(&scratch, s);
		count++;
	}
	return count;
}

/*
 * Count the runes in the first len bytes of s.  Counting stops at the
 * first of: charntorune() returning 0 (input exhausted or sequence cut
 * off by len), or a decoded NUL rune.
 */
size_t
utfnlen(const char *s, size_t len)
{
	size_t count = 0;
	size_t used = 0;
	Rune r;
	int n;

	for (;;) {
		n = charntorune(&r, s + used, len - used);
		/* r is only meaningful when n != 0, so test n first */
		if (n == 0 || r == '\0')
			break;
		used += n;
		count++;
	}
	return count;
}
diff --git a/utf.h b/utf.h
new file mode 100644
index 0000000..d272d35
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,53 @@
/* MIT/X Consortium Copyright (c) 2012 Connor Lane Smith <cls@lubutu.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * One element of a string literal before it is converted to its final
 * character width.
 */
struct sitem {
	enum {
		SITEM_LITERAL,  /* val is written out as a single unit, unchanged */
		SITEM_ENCODED   /* val is a rune, expanded by runetochar()/runetochar16() */
	} type;
	unsigned int val;       /* the unit or rune value, per type */
};

/* A rune value; signed int. */
typedef int Rune;

enum {
	UTFmax    = 6,       /* maximum bytes per rune */
	Runeself  = 0x80,    /* rune and utf are equal (<) */
	Runeerror = 0xFFFD,  /* decoding error in utf */
	Runemax   = 0x10FFFF /* maximum rune value */
};

/*
 * NOTE(review): these prototypes use size_t, but this header includes
 * nothing itself; whoever includes it must already have a header that
 * defines size_t in scope.
 */

/* single rune <-> UTF-8 / UTF-16 */
int runetochar(char *, const Rune *);
int runetochar16(unsigned short *, const Rune *);
int chartorune(Rune *, const char *);
int charntorune(Rune *, const char *, size_t);
/* encoded length of one rune, and of an array of runes */
int runelen(Rune);
int runelen16(Rune);
size_t runenlen(const Rune *, size_t);
size_t runenlen16(const Rune *, size_t);
/* number of runes in a UTF-8 string */
size_t utflen(const char *);
size_t utfnlen(const char *, size_t);
/* convert an array of n sitems to 8-, 16- or 32-bit units */
int sitemstochars(char *s, const struct sitem *p, size_t n);
int sitemstochar16s(unsigned short *s, const struct sitem *p, size_t n);
int sitemstochar32s(unsigned *s, const struct sitem *p, size_t n);
/* number of 8- or 16-bit units needed for an array of len sitems */
size_t sitemnlen(const struct sitem *p, size_t len);
size_t sitemnlen16(const struct sitem *p, size_t len);
-- 
2.31.1

[PATCH v4 2/2] handle prefixed string literals Export this patch

Nihal Jere
---
 cc.h                                 |  6 +-
 decl.c                               | 15 +++--
 expr.c                               | 92 +++++++++++++++++++++++-----
 init.c                               |  9 +--
 qbe.c                                | 58 ++++++++++++------
 test/hello.qbe                       |  2 +-
 test/initializer-replace-local.c     | 10 +++
 test/initializer-replace-local.qbe   | 51 +++++++++++++--
 test/initializer-replace-static.c    | 14 +++++
 test/initializer-replace-static.qbe  |  4 +-
 test/initializer-string-array.qbe    | 34 ++++++++--
 test/initializer-string-braces.qbe   |  2 +-
 test/initializer-string-wide.c       | 11 ++++
 test/initializer-string-wide.qbe     | 88 ++++++++++++++++++++++++++
 test/initializer-string.c            |  5 +-
 test/initializer-string.qbe          |  2 +-
 test/initializer-unsigned-string.qbe |  2 +-
 17 files changed, 342 insertions(+), 63 deletions(-)
 create mode 100644 test/initializer-string-wide.c
 create mode 100644 test/initializer-string-wide.qbe

diff --git a/cc.h b/cc.h
index 3df0880..f10d4b6 100644
--- a/cc.h
+++ b/cc.h
@@ -332,7 +332,11 @@ struct expr {
			double f;
		} constant;
		struct {
			char *data;
			union {
				char *data8;
				unsigned short *data16;
				unsigned *data32;
			};
			size_t size;
		} string;
		struct {
diff --git a/decl.c b/decl.c
index 0d47b8e..d0f5307 100644
--- a/decl.c
+++ b/decl.c
@@ -760,14 +760,19 @@ staticassert(struct scope *s)
		return false;
	expect(TLPAREN, "after _Static_assert");
	c = intconstexpr(s, true);
	e = NULL;
	if (consume(TCOMMA)) {
		e = assignexpr(s);
		if (!e->decayed || e->base->kind != EXPRSTRING)
			error(&tok.loc, "expected string literal after static assertion expression");
		if (!c)
			error(&tok.loc, "static assertion failed: %.*s", (int)e->base->string.size, e->base->string.data);
	} else if (!c) {
		error(&tok.loc, "static assertion failed");
		if (e->base->type->base->size != 1)
			e = NULL;
	}
	if (!c) {
		if (e)
			error(&tok.loc, "static assertion failed: %.*s", (int)e->base->string.size, e->base->string.data8);
		else
			error(&tok.loc, "static assertion failed");
	}
	expect(TRPAREN, "after static assertion");
	expect(TSEMICOLON, "after static assertion");
@@ -1030,7 +1035,7 @@ struct decl *stringdecl(struct expr *expr)
	if (!strings)
		strings = mkmap(64);
	assert(expr->kind == EXPRSTRING);
	mapkey(&key, expr->string.data, expr->string.size);
	mapkey(&key, expr->string.data8, expr->string.size);
	entry = mapput(strings, &key);
	d = *entry;
	if (!d) {
diff --git a/expr.c b/expr.c
index 8b10bf4..eedcad4 100644
--- a/expr.c
+++ b/expr.c
@@ -7,6 +7,7 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "utf.h"
#include "util.h"
#include "cc.h"

@@ -363,10 +364,14 @@ isodigit(int c)
	return '0' <= c && c <= '8';
}

static int
static struct sitem
unescape(char **p)
{
	int c;
	struct sitem item = {
		.type = SITEM_LITERAL
	};
	Rune c;
	int ret;
	char *s = *p;

	if (*s == '\\') {
@@ -397,10 +402,14 @@ unescape(char **p)
			while (isodigit(*++s));
		}
	} else {
		c = *s++;
		item.type = SITEM_ENCODED;
		if ((ret = chartorune(&c, s)) == Runeerror)
			error(&tok.loc, "invalid UTF-8 string");
		s += ret;
	}
	*p = s;
	return c;
	item.val = c;
	return item;
}

static struct expr *
@@ -456,8 +465,10 @@ primaryexpr(struct scope *s)
	struct expr *e;
	struct decl *d;
	struct type *t;
	char *src, *dst, *end;
	char *src, *end, p = 'c', np;
	struct sitem *dst, *tmp;
	int base;
	size_t tmpsize, count;

	switch (tok.kind) {
	case TIDENT:
@@ -473,22 +484,69 @@ primaryexpr(struct scope *s)
		next();
		break;
	case TSTRINGLIT:
		e = mkexpr(EXPRSTRING, mkarraytype(&typechar, QUALNONE, 0));
		e->lvalue = true;
		e->string.size = 0;
		e->string.data = NULL;
		tmp = NULL;
		tmpsize = 0;
		do {
			e->string.data = xreallocarray(e->string.data, e->string.size + strlen(tok.lit), 1);
			dst = e->string.data + e->string.size;
			src = tok.lit;
			if (*src != '"')
				fatal("wide string literal not yet implemented");
			switch (*src) {
			case 'L': np = 'w'; ++src; break;
			case 'U': np = 'U'; ++src; break;
			case '"': np = 'c'; break;
			case 'u': np = src[1] == '8' ? (++src, '8') : 'u'; ++src; break;
			default: error(&tok.loc, "invalid prefix for string literal");
			}

			if (np != 'c' && p != np) {
				if (p == 'c')
					p = np;
				else
					error(&tok.loc, "cannot have adjacent string literals with different prefixes");
			}

			tmp = xreallocarray(tmp, tmpsize + utflen(tok.lit), sizeof(struct sitem));
			dst = tmp + tmpsize;
			for (++src; *src != '"'; ++dst)
				*dst = unescape(&src);
			e->string.size = dst - e->string.data;
			tmpsize = dst - tmp;
			next();
		} while (tok.kind == TSTRINGLIT);
		e->type->array.length = e->string.size + 1;
		switch (p) {
		case 'c':
		case '8': t = &typechar;       break;
		case 'w': t = targ->typewchar; break;
		case 'u': t = &typeushort;     break;
		case 'U': t = &typeuint;       break;
		}
		e = mkexpr(EXPRSTRING, mkarraytype(t, QUALNONE, 0));
		e->lvalue = true;
		e->string.data8 = NULL;
		switch (p) {
		case 'c':
		case '8': e->string.size = sitemnlen(tmp, tmpsize);   break;
		case 'u': e->string.size = sitemnlen16(tmp, tmpsize); break;
		case 'U':
		case 'w': e->string.size = tmpsize; break;
		default: assert(0);
		}
		/* account for null terminator */
		e->string.size += 1;
		e->string.data8 = xreallocarray(e->string.data8, e->string.size, t->size);
		switch (p) {
		case 'c':
		case '8': count = sitemstochars(e->string.data8, tmp, tmpsize);
		          e->string.data8[e->string.size-1] = 0; break;
		case 'u': count = sitemstochar16s(e->string.data16, tmp, tmpsize);
		          e->string.data16[e->string.size-1] = 0; break;
		case 'U': 
		case 'w': count = sitemstochar32s(e->string.data32, tmp, tmpsize);
		          e->string.data32[e->string.size-1] = 0; break;
		default: assert(0);
		}
		if (count == -1)
			error(&tok.loc, "string literal contains invalid multibyte sequence");
		assert(count == e->string.size - 1);
		free(tmp);
		e->type->array.length = e->string.size;
		e->type->size = e->type->array.length * e->type->base->size;
		e->type->incomplete = false;
		e = decay(e);
@@ -503,7 +561,7 @@ primaryexpr(struct scope *s)
		}
		assert(*src == '\'');
		++src;
		e = mkconstexpr(t, unescape(&src));
		e = mkconstexpr(t, unescape(&src).val);
		if (*src != '\'')
			error(&tok.loc, "character constant contains more than one character: %c", *src);
		next();
@@ -620,7 +678,7 @@ builtinfunc(struct scope *s, enum builtinkind kind)
		break;
	case BUILTINNANF:
		e = assignexpr(s);
		if (!e->decayed || e->base->kind != EXPRSTRING || e->base->string.size > 0)
		if (!e->decayed || e->base->kind != EXPRSTRING || e->base->string.size > 1)
			error(&tok.loc, "__builtin_nanf currently only supports empty string literals");
		e = mkexpr(EXPRCONST, &typefloat);
		/* TODO: use NAN here when we can handle musl's math.h */
diff --git a/init.c b/init.c
index 0e890c0..13a8af8 100644
--- a/init.c
+++ b/init.c
@@ -246,12 +246,13 @@ parseinit(struct scope *s, struct type *t)
				if (!expr->decayed || expr->base->kind != EXPRSTRING)
					break;
				base = t->base;
				/* XXX: wide string literals */
				if (!(base->prop & PROPCHAR))
					break;
				expr = expr->base;
				if (!(typecompatible(base, expr->type->base) || (base->prop & TYPECHAR) && (expr->type->prop & TYPECHAR))) {
					error(&tok.loc, "cannot initialize array with string literal of different width");
					break;
				}
				if (t->incomplete)
					updatearray(t, expr->string.size);
					updatearray(t, expr->string.size - 1);
				goto add;
			case TYPESTRUCT:
			case TYPEUNION:
diff --git a/qbe.c b/qbe.c
index 91b88fd..8683287 100644
--- a/qbe.c
+++ b/qbe.c
@@ -945,7 +945,7 @@ funcinit(struct func *func, struct decl *d, struct init *init)
	struct lvalue dst;
	struct value *src, *v;
	uint64_t offset = 0, max = 0;
	size_t i;
	size_t i, w;

	funcalloc(func, d);
	if (!init)
@@ -954,13 +954,18 @@ funcinit(struct func *func, struct decl *d, struct init *init)
		zero(func, d->value, d->type->align, offset, init->start);
		dst.bits = init->bits;
		if (init->expr->kind == EXPRSTRING) {
			for (i = 0; i < init->expr->string.size && i < init->end - init->start; ++i) {
				v = mkintconst(&iptr, init->start + i);
			w = init->expr->type->base->size;
			for (i = 0; i < init->expr->string.size && i < (init->end - init->start)/w; ++i) {
				v = mkintconst(&iptr, init->start + i*w);
				dst.addr = funcinst(func, IADD, &iptr, d->value, v);
				v = mkintconst(&i8, init->expr->string.data[i]);
				funcstore(func, &typechar, QUALNONE, dst, v);
				switch (w) {
					case 1: v = mkintconst(&i8, (unsigned char) init->expr->string.data8[i]); break;
					case 2: v = mkintconst(&i16, init->expr->string.data16[i]); break;
					case 4: v = mkintconst(&i32, init->expr->string.data32[i]); break;
				}
				funcstore(func, init->expr->type->base, QUALNONE, dst, v);
			}
			offset = init->start + i;
			offset = init->start + i*w;
		} else {
			if (offset < init->end && (dst.bits.before || dst.bits.after))
				zero(func, d->value, d->type->align, offset, init->end);
@@ -1251,7 +1256,7 @@ static void
dataitem(struct expr *expr, uint64_t size)
{
	struct decl *decl;
	size_t i;
	size_t i, w;
	char c;

	switch (expr->kind) {
@@ -1280,17 +1285,28 @@ dataitem(struct expr *expr, uint64_t size)
			printf("%" PRIu64, expr->constant.i);
		break;
	case EXPRSTRING:
		fputc('"', stdout);
		for (i = 0; i < expr->string.size && i < size; ++i) {
			c = expr->string.data[i];
			if (isprint(c) && c != '"' && c != '\\')
				putchar(c);
			else
				printf("\\%03hho", c);
		w = expr->type->base->size;
		if (w == 1) {
			fputc('"', stdout);
			for (i = 0; i < expr->string.size && i < size; ++i) {
				c = expr->string.data8[i];
				if (isprint(c) && c != '"' && c != '\\')
					putchar(c);
				else
					printf("\\%03hho", c);
			}
			fputc('"', stdout);
		} else {
			for (i = 0; i < expr->string.size && i*w < size; ++i) {
				switch (w) {
				case 2: printf("%hu ", expr->string.data16[i]); break;
				case 4: printf("%u ", expr->string.data32[i]);  break;
				default: assert(0);
				}
			}
		}
		fputc('"', stdout);
		if (i < size)
			printf(", z %" PRIu64, size - i);
		if (i*w < size)
			printf(", z %" PRIu64, size - i*w);
		break;
	default:
		error(&tok.loc, "initializer is not a constant expression");
@@ -1303,6 +1319,7 @@ emitdata(struct decl *d, struct init *init)
	struct init *cur;
	struct type *t;
	uint64_t offset = 0, start, end, bits = 0;
	size_t idx;

	if (!d->align)
		d->align = d->type->align;
@@ -1326,7 +1343,12 @@ emitdata(struct decl *d, struct init *init)
			*/
			assert(cur->expr->kind == EXPRSTRING);
			assert(init->expr->kind == EXPRCONST);
			cur->expr->string.data[init->start - cur->start] = init->expr->constant.i;
			idx = (init->start - cur->start) / cur->expr->type->base->size;
			switch (cur->expr->type->base->size) {
			case 1: cur->expr->string.data8[idx]  = init->expr->constant.i; break;
			case 2: cur->expr->string.data16[idx] = init->expr->constant.i; break;
			case 4: cur->expr->string.data32[idx] = init->expr->constant.i; break;
			}
		}
		start = cur->start + cur->bits.before / 8;
		end = cur->end - (cur->bits.after + 7) / 8;
diff --git a/test/hello.qbe b/test/hello.qbe
index 3e695ab..1826315 100644
--- a/test/hello.qbe
+++ b/test/hello.qbe
@@ -1,4 +1,4 @@
data $.Lstring.2 = align 1 { b "hello", z 1, }
data $.Lstring.2 = align 1 { b "hello\000", }
export
function w $main() {
@start.1
diff --git a/test/initializer-replace-local.c b/test/initializer-replace-local.c
index 8b93ef2..be6a58f 100644
--- a/test/initializer-replace-local.c
+++ b/test/initializer-replace-local.c
@@ -1,10 +1,20 @@
void f(void) {
	struct {
		char s[6];
		unsigned short t[6];
		unsigned u[6];
	} x = {
		.s[0] = 'x',
		.s[4] = 'y',
		.s = "hello",
		.s[1] = 'a',
		.t[0] = u'x',
		.t[4] = u'y',
		.t = u"hello",
		.t[1] = u'a',
		.u[0] = U'x',
		.u[4] = U'y',
		.u = U"hello",
		.u[1] = u'a',
	};
}
diff --git a/test/initializer-replace-local.qbe b/test/initializer-replace-local.qbe
index 72ad90a..480773b 100644
--- a/test/initializer-replace-local.qbe
+++ b/test/initializer-replace-local.qbe
@@ -1,7 +1,7 @@
export
function $f() {
@start.1
	%.1 =l alloc4 6
	%.1 =l alloc4 44
@body.2
	%.2 =l add %.1, 0
	storeb 104, %.2
@@ -13,10 +13,49 @@ function $f() {
	storeb 108, %.5
	%.6 =l add %.1, 4
	storeb 111, %.6
	%.7 =l add %.1, 1
	%.8 =w copy 97
	storeb %.8, %.7
	%.9 =l add %.1, 5
	storeb 0, %.9
	%.7 =l add %.1, 5
	storeb 0, %.7
	%.8 =l add %.1, 1
	%.9 =w copy 97
	storeb %.9, %.8
	%.10 =l add %.1, 2
	storeh 0, %.10
	%.11 =l add %.1, 4
	storew 0, %.11
	%.12 =l add %.1, 6
	storeh 104, %.12
	%.13 =l add %.1, 8
	storeh 101, %.13
	%.14 =l add %.1, 10
	storeh 108, %.14
	%.15 =l add %.1, 12
	storeh 108, %.15
	%.16 =l add %.1, 14
	storeh 111, %.16
	%.17 =l add %.1, 16
	storeh 0, %.17
	%.18 =l add %.1, 8
	storeh 97, %.18
	%.19 =l add %.1, 10
	storeh 0, %.19
	%.20 =l add %.1, 12
	storew 0, %.20
	%.21 =l add %.1, 16
	storew 0, %.21
	%.22 =l add %.1, 20
	storew 104, %.22
	%.23 =l add %.1, 24
	storew 101, %.23
	%.24 =l add %.1, 28
	storew 108, %.24
	%.25 =l add %.1, 32
	storew 108, %.25
	%.26 =l add %.1, 36
	storew 111, %.26
	%.27 =l add %.1, 40
	storew 0, %.27
	%.28 =l add %.1, 24
	%.29 =w extuh 97
	storew %.29, %.28
	ret
}
diff --git a/test/initializer-replace-static.c b/test/initializer-replace-static.c
index c1fa376..a6839c9 100644
--- a/test/initializer-replace-static.c
+++ b/test/initializer-replace-static.c
@@ -4,3 +4,17 @@ struct {
	.s = "hello",
	.s[1] = 'a',
};

struct {
	unsigned l[5];
} y = {
	.l = L"a😐Ϩ€",
	.l[1] = L'😃',
};

struct {
	unsigned short u[6];
} z = {
	.u = u"a😐Ϩ€",
	.u[1] = u'😃',
};
diff --git a/test/initializer-replace-static.qbe b/test/initializer-replace-static.qbe
index 18b774e..01d7435 100644
--- a/test/initializer-replace-static.qbe
+++ b/test/initializer-replace-static.qbe
@@ -1 +1,3 @@
export data $x = align 1 { b "hallo", z 1, }
export data $x = align 1 { b "hallo\000", }
export data $y = align 4 { w 97 128515 1000 8364 0 , }
export data $z = align 2 { h 97 62979 56848 1000 8364 0 , }
diff --git a/test/initializer-string-array.qbe b/test/initializer-string-array.qbe
index 87a03fd..cfae412 100644
--- a/test/initializer-string-array.qbe
+++ b/test/initializer-string-array.qbe
@@ -1,7 +1,7 @@
export
function $f() {
@start.1
	%.1 =l alloc4 8
	%.1 =l alloc4 16
@body.2
	%.2 =l add %.1, 0
	storeb 97, %.2
@@ -11,13 +11,37 @@ function $f() {
	storeb 99, %.4
	%.5 =l add %.1, 3
	storeb 0, %.5
	%.6 =l add %.1, 4
	%.6 =l add %.1, 0
	storeb 120, %.6
	%.7 =l add %.1, 5
	%.7 =l add %.1, 1
	storeb 121, %.7
	%.8 =l add %.1, 6
	%.8 =l add %.1, 2
	storeb 122, %.8
	%.9 =l add %.1, 7
	%.9 =l add %.1, 3
	storeb 0, %.9
	%.10 =l add %.1, 4
	storeb 0, %.10
	%.11 =l add %.1, 5
	storeb 0, %.11
	%.12 =l add %.1, 6
	storeb 0, %.12
	%.13 =l add %.1, 7
	storeb 0, %.13
	%.14 =l add %.1, 8
	storeb 0, %.14
	%.15 =l add %.1, 9
	storeb 0, %.15
	%.16 =l add %.1, 10
	storeb 0, %.16
	%.17 =l add %.1, 11
	storeb 0, %.17
	%.18 =l add %.1, 12
	storeb 0, %.18
	%.19 =l add %.1, 13
	storeb 0, %.19
	%.20 =l add %.1, 14
	storeb 0, %.20
	%.21 =l add %.1, 15
	storeb 0, %.21
	ret
}
diff --git a/test/initializer-string-braces.qbe b/test/initializer-string-braces.qbe
index 46be123..5d44117 100644
--- a/test/initializer-string-braces.qbe
+++ b/test/initializer-string-braces.qbe
@@ -1 +1 @@
export data $s = align 1 { b "abc", z 1, }
export data $s = align 1 { b "abc\000", }
diff --git a/test/initializer-string-wide.c b/test/initializer-string-wide.c
new file mode 100644
index 0000000..f409dab
--- /dev/null
+++ b/test/initializer-string-wide.c
@@ -0,0 +1,11 @@
char b[] = u8"سلام عليكم";
unsigned short c[] = u"नमस्ते";
unsigned d[] = U"Привет";
unsigned e[] = L"你好";

void f(void) {
	char w[] = u8"سلام عليكم";
	unsigned short x[] = u"नमस्ते";
	unsigned y[] = U"Привет";
	unsigned z[] = L"你好";
}
diff --git a/test/initializer-string-wide.qbe b/test/initializer-string-wide.qbe
new file mode 100644
index 0000000..4ca9264
--- /dev/null
+++ b/test/initializer-string-wide.qbe
@@ -0,0 +1,88 @@
export data $b = align 1 { b "\330\263\331\204\330\247\331\205 \330\271\331\204\331\212\331\203\331\205\000", }
export data $c = align 2 { h 2344 2350 2360 2381 2340 2375 0 , }
export data $d = align 4 { w 1055 1088 1080 1074 1077 1090 0 , }
export data $e = align 4 { w 20320 22909 0 , }
export
function $f() {
@start.1
	%.1 =l alloc4 20
	%.22 =l alloc4 14
	%.30 =l alloc4 28
	%.38 =l alloc4 12
@body.2
	%.2 =l add %.1, 0
	storeb 216, %.2
	%.3 =l add %.1, 1
	storeb 179, %.3
	%.4 =l add %.1, 2
	storeb 217, %.4
	%.5 =l add %.1, 3
	storeb 132, %.5
	%.6 =l add %.1, 4
	storeb 216, %.6
	%.7 =l add %.1, 5
	storeb 167, %.7
	%.8 =l add %.1, 6
	storeb 217, %.8
	%.9 =l add %.1, 7
	storeb 133, %.9
	%.10 =l add %.1, 8
	storeb 32, %.10
	%.11 =l add %.1, 9
	storeb 216, %.11
	%.12 =l add %.1, 10
	storeb 185, %.12
	%.13 =l add %.1, 11
	storeb 217, %.13
	%.14 =l add %.1, 12
	storeb 132, %.14
	%.15 =l add %.1, 13
	storeb 217, %.15
	%.16 =l add %.1, 14
	storeb 138, %.16
	%.17 =l add %.1, 15
	storeb 217, %.17
	%.18 =l add %.1, 16
	storeb 131, %.18
	%.19 =l add %.1, 17
	storeb 217, %.19
	%.20 =l add %.1, 18
	storeb 133, %.20
	%.21 =l add %.1, 19
	storeb 0, %.21
	%.23 =l add %.22, 0
	storeh 2344, %.23
	%.24 =l add %.22, 2
	storeh 2350, %.24
	%.25 =l add %.22, 4
	storeh 2360, %.25
	%.26 =l add %.22, 6
	storeh 2381, %.26
	%.27 =l add %.22, 8
	storeh 2340, %.27
	%.28 =l add %.22, 10
	storeh 2375, %.28
	%.29 =l add %.22, 12
	storeh 0, %.29
	%.31 =l add %.30, 0
	storew 1055, %.31
	%.32 =l add %.30, 4
	storew 1088, %.32
	%.33 =l add %.30, 8
	storew 1080, %.33
	%.34 =l add %.30, 12
	storew 1074, %.34
	%.35 =l add %.30, 16
	storew 1077, %.35
	%.36 =l add %.30, 20
	storew 1090, %.36
	%.37 =l add %.30, 24
	storew 0, %.37
	%.39 =l add %.38, 0
	storew 20320, %.39
	%.40 =l add %.38, 4
	storew 22909, %.40
	%.41 =l add %.38, 8
	storew 0, %.41
	ret
}
diff --git a/test/initializer-string.c b/test/initializer-string.c
index c92f897..a16a882 100644
--- a/test/initializer-string.c
+++ b/test/initializer-string.c
@@ -1,4 +1,5 @@
char x[] = "hello";
char a[] = "hello";

void f(void) {
	char y[] = "hello";
	char v[] = "hello";
}
diff --git a/test/initializer-string.qbe b/test/initializer-string.qbe
index ba992da..26a0df5 100644
--- a/test/initializer-string.qbe
+++ b/test/initializer-string.qbe
@@ -1,4 +1,4 @@
export data $x = align 1 { b "hello", z 1, }
export data $a = align 1 { b "hello\000", }
export
function $f() {
@start.1
diff --git a/test/initializer-unsigned-string.qbe b/test/initializer-unsigned-string.qbe
index 46be123..5d44117 100644
--- a/test/initializer-unsigned-string.qbe
+++ b/test/initializer-unsigned-string.qbe
@@ -1 +1 @@
export data $s = align 1 { b "abc", z 1, }
export data $s = align 1 { b "abc\000", }
-- 
2.31.1