~sircmpwn/hare-users

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch

[PATCH hare-message v2] Decode and encode non-ascii header values (UTF-8)

Details
Message ID
<20240425200828.41745-2-max@mxsr.de>
DKIM signature
pass
Download raw message
Patch: +193 -3
Non-ASCII strings are represented as "encoded-words" as defined in
RFC 2047:
https://datatracker.ietf.org/doc/html/rfc2047

This commit handles only the UTF-8 charset, in both the "B" and "Q" encodings.
---
 message/canonical.ha   |   2 +-
 message/encodedword.ha | 178 +++++++++++++++++++++++++++++++++++++++++
 message/header.ha      |  16 +++-
 3 files changed, 193 insertions(+), 3 deletions(-)
 create mode 100644 message/encodedword.ha

diff --git a/message/canonical.ha b/message/canonical.ha
index 0e231bc..5497f12 100644
--- a/message/canonical.ha
+++ b/message/canonical.ha
@@ -46,7 +46,7 @@ export fn canonical_mime_header_key(key: str) str = {
		const rn = match (strings::next(&iter)) {
		case let rn: rune =>
			yield rn;
		case void =>
		case done =>
			break;
		};
		if (!ascii::valid(rn) || !valid_header_field(rn: u32: u8)) {
diff --git a/message/encodedword.ha b/message/encodedword.ha
new file mode 100644
index 0000000..a9b465f
--- /dev/null
+++ b/message/encodedword.ha
@@ -0,0 +1,178 @@
use ascii;
use encoding::base64;
use encoding::utf8;
use regex;
use strconv;
use strings;

// Compiled pattern for one RFC 2047 encoded-word of the form
// =?charset?encoding?text?=. Capture 1 is the charset, capture 2 is the
// encoding letter (b, B, q or Q) and capture 3 is the encoded text. Charset
// and text are non-empty runs that contain neither "?" nor a space. The
// initializer below is only a placeholder; init() assigns the real value.
let re_encoded_word: regex::regex = regex::regex { ... };

// Compiles the encoded-word pattern into re_encoded_word. The "!" aborts the
// program if regex::compile reports an error.
@init fn init() void = {
	re_encoded_word = regex::compile(`=\?([^? ]+)\?([bBqQ])\?([^? ]+)\?=`)!;
};

// Releases the resources held by re_encoded_word.
@fini fn fini() void = {
	regex::finish(&re_encoded_word);
};

// The encoding that get_recommended_encoding suggests for a header value.
//
// See RFC 2047, Section 4, for the definition of Q and B encodings:
// https://datatracker.ietf.org/doc/html/rfc2047#section-4
type recommended_encoding = enum {
	// Every rune of the value is printable ASCII; encode() returns such a
	// value unchanged.
	NONE,
	// At least half of the runes are printable ASCII.
	Q,
	// Fewer than half of the runes are printable ASCII.
	B,
};

// Returns the numeric value of an ASCII hexadecimal digit (either case), or
// void if the byte is not a hexadecimal digit.
fn q_hexval(b: u8) (u8 | void) = {
	if (b >= '0': u8 && b <= '9': u8) {
		return b - ('0': u8);
	};
	if (b >= 'A': u8 && b <= 'F': u8) {
		return b - ('A': u8) + 10;
	};
	if (b >= 'a': u8 && b <= 'f': u8) {
		return b - ('a': u8) + 10;
	};
	return void;
};

// Decodes the text of a "Q"-encoded word (RFC 2047, section 4.2) whose charset
// is UTF-8: "_" becomes a space, "=XX" becomes the byte with hexadecimal value
// XX, and every other byte is copied as is. An "=" followed by two bytes that
// are not both hexadecimal digits decodes to "?". An "=" with fewer than two
// bytes after it is copied literally.
//
// The input is processed bytewise, so raw multi-byte runes in the input cannot
// shift the position of a following "=XX" escape.
//
// The result is always newly allocated and must be freed by the caller. If the
// decoded bytes are not valid UTF-8, a copy of the undecoded input is returned
// instead of aborting, since header content is untrusted.
fn decode_utf8q(s: str) str = {
	const bytes = strings::toutf8(s);
	// Decoding never produces more bytes than it consumes, so the input
	// length is a sufficient capacity.
	let result: []u8 = alloc([0...], len(bytes));
	let j = 0z;
	for (let i = 0z; i < len(bytes); i += 1) {
		if (bytes[i] == '=' && i + 2 < len(bytes)) {
			let byte = '?': u8;
			const hi = q_hexval(bytes[i + 1]);
			const lo = q_hexval(bytes[i + 2]);
			if (hi is u8 && lo is u8) {
				byte = ((hi as u8) << 4) | (lo as u8);
			};
			result[j] = byte;
			// Skip the two bytes that followed the "=".
			i += 2;
		} else if (bytes[i] == '_') {
			result[j] = ' ';
		} else {
			result[j] = bytes[i];
		};
		j += 1;
	};

	match (strings::fromutf8(result[..j])) {
	case let decoded: str =>
		return decoded;
	case =>
		free(result);
		return strings::dup(s);
	};
};

// Checks decode_utf8q against a table of (encoded, expected) pairs.
@test fn decode_utf8q() void = {
	const cases: [_](str, str) = [
		("M=C3=BCller", "Müller"),
		("B=C3=A9la_Bart=C3=B3k", "Béla Bartók"),
		("=F0=9F=98=8E", "😎"),
	];
	for (let c .. cases) {
		assert(decode_utf8q(c.0) == c.1);
	};
};

// Replaces RFC 2047 encoded-words in a header value with their decoded text.
// Only the UTF-8 charset is handled, in both the "B" (base64) and "Q"
// encodings.
//
// Decoding is best-effort and stops at the first encoded-word that cannot be
// handled (unsupported charset, malformed base64, or base64 data that is not
// valid UTF-8); everything from that word onwards is left untouched.
//
// Ownership: if nothing was decoded, the line argument itself is returned
// (borrowed). Otherwise the return value is a newly allocated string.
// NOTE(review): callers cannot tell these two cases apart from the return
// value alone; confirm how the call sites manage the result.
fn decode_encoded_words(line: str) str = {
	// True once line refers to a string allocated by this function, so that
	// it can be freed when a later iteration replaces it again.
	let owned = false;
	for (strings::contains(line, "=?")) {
		const matches = regex::find(&re_encoded_word, line);
		defer regex::result_free(matches);
		if (len(matches) == 0) {
			break;
		};

		// ascii::strlower allocates; free the copies when the iteration
		// ends.
		const charset = ascii::strlower(matches[1].content);
		defer free(charset);
		const enc = ascii::strlower(matches[2].content);
		defer free(enc);
		const encoded_text = matches[3].content;

		if (charset != "utf-8") {
			// TODO: Handle charsets other than UTF-8,
			// especially ISO-8859-1(5) and Windows-1252
			break;
		};

		let decoded = "";
		switch (enc) {
		case "b" =>
			const raw = match (encoding::base64::decodestr(
				&encoding::base64::std_encoding, encoded_text)) {
			case let buf: []u8 =>
				yield buf;
			case =>
				// Malformed base64 in untrusted input: keep the
				// word as it is instead of aborting.
				break;
			};
			decoded = match (strings::fromutf8(raw)) {
			case let text: str =>
				yield text;
			case =>
				free(raw);
				break;
			};
		case "q" =>
			decoded = decode_utf8q(encoded_text);
		case =>
			break; // warning?
		};
		defer free(decoded);

		// matches[0].content still points into the current line, so the
		// old line may only be released after the replacement was built.
		const replaced = strings::replace(line, matches[0].content, decoded);
		if (owned) {
			free(line);
		};
		line = replaced;
		owned = true;
	};
	return line;
};

// Checks decode_encoded_words against a table of (header value, expected)
// pairs covering one Q-encoded and one B-encoded word.
@test fn decode_encoded_words() void = {
	const cases: [_](str, str) = [
		("=?UTF-8?Q?M=C3=B6ller?=", "Möller"),
		("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>",
			"张三 <zhang.san@example.com>"),
	];
	for (let c .. cases) {
		assert(decode_encoded_words(c.0) == c.1);
	};
};

// Chooses an RFC 2047 encoding for s from the share of its runes that are
// printable ASCII: NONE if all of them are, Q if at least half of them are,
// and B otherwise.
//
// RFC 2047, Section 4:
// The "Q" encoding is recommended for use when most of the characters
// to be encoded are in the ASCII character set; otherwise, the "B"
// encoding should be used.
fn get_recommended_encoding(s: str) recommended_encoding = {
	let total = 0z;
	let printable = 0z;
	let runes = strings::iter(s);
	for (let r => strings::next(&runes)) {
		total += 1;
		if (ascii::isprint(r)) {
			printable += 1;
		};
	};

	if (printable == total) {
		return recommended_encoding::NONE;
	};
	const half = total: f32 / 2.0;
	if (printable: f32 >= half) {
		return recommended_encoding::Q;
	};
	return recommended_encoding::B;
};

// Checks get_recommended_encoding against a table of (value, expected
// encoding) pairs.
@test fn get_recommended_encoding() void = {
	const cases: [_](str, recommended_encoding) = [
		("John Doe <john@example.org>", recommended_encoding::NONE),
		("Möller", recommended_encoding::Q),
		("张三 <zhang.san@example.com>", recommended_encoding::Q),
		("张三", recommended_encoding::B),
		("😎", recommended_encoding::B),
	];
	for (let c .. cases) {
		assert(get_recommended_encoding(c.0) == c.1);
	};
};

// Encodes value with the "Q" encoding of RFC 2047, section 4.2, using its
// UTF-8 representation. A space becomes "_". Printable ASCII is copied as is,
// except for "=", "?" and "_", which have a special meaning inside an
// encoded-word and are therefore escaped. Every other rune is written as one
// "=XX" escape (two upper-case hexadecimal digits) per UTF-8 byte.
//
// The result is newly allocated and must be freed by the caller.
fn encode_utf8q(value: str) str = {
	const hexdigits = strings::toutf8("0123456789ABCDEF");
	let bytes_encoded: []u8 = [];
	let iter = strings::iter(value);
	for (let r => strings::next(&iter)) {
		if (r == ' ') {
			append(bytes_encoded, '_');
			continue;
		};

		// These would be misread when decoding: "=" starts an escape,
		// "?" delimits the encoded-word and "_" stands for a space.
		const special = r == '=' || r == '?' || r == '_';
		if (ascii::isprint(r) && !special) {
			append(bytes_encoded, r: u32: u8);
			continue;
		};

		const bytes = encoding::utf8::encoderune(r);
		for (let b .. bytes) {
			// Always two digits, so that bytes below 0x10 stay
			// decodable.
			append(bytes_encoded, '=');
			append(bytes_encoded, hexdigits[(b >> 4): size]);
			append(bytes_encoded, hexdigits[(b & 0x0F): size]);
		};
	};
	return strings::fromutf8_unsafe(bytes_encoded);
};

// Checks encode_utf8q against a table of (value, expected) pairs.
@test fn encode_utf8q() void = {
	const cases: [_](str, str) = [
		("Dr. Möller", "Dr._M=C3=B6ller"),
		("张三", "=E5=BC=A0=E4=B8=89"),
		("😎", "=F0=9F=98=8E"),
	];
	for (let c .. cases) {
		assert(encode_utf8q(c.0) == c.1);
	};
};

// Encodes a header value as a single RFC 2047 encoded-word with the UTF-8
// charset, using the encoding chosen by get_recommended_encoding.
//
// Ownership: a value that needs no encoding is returned as is (borrowed);
// otherwise the result is a newly allocated string.
// NOTE(review): the whole value becomes one encoded-word; no length limit or
// splitting is applied here.
fn encode(value: str) str = {
	switch (get_recommended_encoding(value)) {
	case recommended_encoding::NONE =>
		return value;
	case recommended_encoding::B =>
		// encodestr allocates; only the concatenation is handed to the
		// caller, so release the intermediate string.
		const payload = encoding::base64::encodestr(
			&encoding::base64::std_encoding, strings::toutf8(value));
		defer free(payload);
		return strings::concat("=?UTF-8?B?", payload, "?=");
	case recommended_encoding::Q =>
		const payload = encode_utf8q(value);
		defer free(payload);
		return strings::concat("=?UTF-8?Q?", payload, "?=");
	};
};

// Checks encode against a table of (value, expected header text) pairs.
@test fn encode() void = {
	const cases: [_](str, str) = [
		("John Doe <john@example.org>", "John Doe <john@example.org>"),
		("Möller", "=?UTF-8?Q?M=C3=B6ller?="),
		("张三", "=?UTF-8?B?5byg5LiJ?="),
		("张三 <zhang.san@example.com>",
			"=?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?="),
		("😎", "=?UTF-8?B?8J+Yjg==?="),
	];
	for (let c .. cases) {
		assert(encode(c.0) == c.1);
	};
};
diff --git a/message/header.ha b/message/header.ha
index cd35132..483a7dc 100644
--- a/message/header.ha
+++ b/message/header.ha
@@ -84,6 +84,7 @@ export fn header_add(head: *header, key: str, val: str) void = {
	defer free(key);
	let map = header_get_mapkey(head, key);

	const val = encode(val);
	const field = alloc(new_header_field(key, val, []));
	append(head.fields, field);
	append(map.fields, field);
@@ -101,7 +102,7 @@ export fn header_get(head: *header, key: str) str = {
		if (map.key != key) {
			continue;
		};
		return map.fields[len(map.fields) - 1].val;
		return decode_encoded_words(map.fields[len(map.fields) - 1].val);
	};

	return "";
@@ -123,6 +124,9 @@ export fn header_get(head: *header, key: str) str = {
	header_add(&head, "User-Agent", "Harriet");
	assert(header_get(&head, "User-Agent") == "Harriet");

	header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
	assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");

	assert(header_get(&head, "foobar") == "");
};

@@ -356,6 +360,7 @@ export fn read_header(
		};

		const val = decode_header_value(kv[i+1..]);
		const val = decode_encoded_words(val);
		const field = alloc(header_field {
			raw = kv,
			key = key,
@@ -371,6 +376,7 @@ export fn read_header(
	const input =
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"Content-Type: text/plain\r\n"
		"DKIM-Signature: a=rsa-sha256;\r\n"
		" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
@@ -383,6 +389,7 @@ export fn read_header(

	assert(header_get(&head, "To") == "Drew DeVault <sir@cmpwn.com>");
	assert(header_get(&head, "From") == "Harriet <harriet@harelang.org>");
	assert(header_get(&head, "Cc") == "张三 <zhang.san@example.com>");
	assert(header_get(&head, "Content-Type") == "text/plain");
	assert(header_get(&head, "Dkim-Signature") == "a=rsa-sha256; bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple; d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;");
};
@@ -410,6 +417,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	header_add(&head, "Content-Type", "text/plain");
	header_add(&head, "FROM", "Harriet <harriet@harelang.org>");
	header_add(&head, "to", "Drew DeVault <sir@cmpwn.com>");
	header_add(&head, "cc", "张三 <zhang.san@example.com>");

	const sink = memio::dynamic();
	defer io::close(&sink)!;
@@ -417,6 +425,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	const result = memio::string(&sink)!;

	const expect =
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Content-Type: text/plain\r\n"
@@ -428,6 +437,8 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
		" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
		" d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;\r\n"
		"\r\n";
	// fmt::printfln(`expect = "{}"`, expect)!;
	// fmt::printfln(`result = "{}"`, result)!;
	assert(result == expect);
};

@@ -435,6 +446,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	const input =
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"Content-Type: text/plain\r\n"
		"DKIM-Signature: a=rsa-sha256;\r\n"
		" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
@@ -481,7 +493,7 @@ fn header_field_raw(hf: *header_field) ([]u8 | errors::invalid) = {
		const rn = match (strings::next(&iter)) {
		case let rn: rune =>
			yield rn;
		case void =>
		case done =>
			break;
		};

-- 
2.44.0
Reply to thread Export thread (mbox)