~sircmpwn/hare-users

hare-message: Decode encoded-word strings (UTF-8 only) v1 NEEDS REVISION

Max Schillinger: 1
 Decode encoded-word strings (UTF-8 only)

 2 files changed, 77 insertions(+), 1 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/hare-users/patches/50587/mbox | git am -3
Learn more about email & git

[PATCH hare-message] Decode encoded-word strings (UTF-8 only) Export this patch

---
 message/encodedword.ha | 73 ++++++++++++++++++++++++++++++++++++++++++
 message/header.ha      |  5 ++-
 2 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 message/encodedword.ha

diff --git a/message/encodedword.ha b/message/encodedword.ha
new file mode 100644
index 0000000..99c45bd
--- /dev/null
+++ b/message/encodedword.ha
@@ -0,0 +1,73 @@
use encoding::base64;
use regex;
use strconv;
use strings;

// Compiled pattern matching a single RFC 2047 encoded-word whose charset is
// UTF-8. Capture 1 is the charset name, capture 2 the encoding letter (B or Q)
// and capture 3 the encoded text. The pattern is compiled by init and
// released by fini.
let re_encoded_word_utf8: regex::regex = regex::regex { ... };

@init fn init() void = {
	// RFC 2047 declares charset and encoding names case-independent, so
	// every letter of "utf" is matched in either case (e.g. "Utf-8").
	re_encoded_word_utf8 = regex::compile(
		`=\?([uU][tT][fF]-8)\?([bBqQ])\?([^?]+)\?=`)!;
};

@fini fn fini() void = {
	regex::finish(&re_encoded_word_utf8);
};

// Decodes the text of a "Q"-encoded word (RFC 2047 section 4.2): "=XX" becomes
// the byte with hexadecimal value XX, "_" becomes a space and every other byte
// is copied unchanged. An "=" followed by two characters that are not a valid
// hexadecimal number decodes to "?"; an "=" with fewer than two characters
// after it is copied literally.
//
// The return value is always newly allocated and must be freed by the caller.
// If the decoded bytes are not valid UTF-8, a copy of the undecoded input is
// returned instead.
fn decode_q_encoded_string(s: str) str = {
	const bytes = strings::toutf8(s);
	// Decoding never grows the text, so the input length is an upper bound.
	let result: []u8 = alloc([0...], len(bytes));
	let j = 0z;
	for (let i = 0z; i < len(bytes); i += 1) {
		if (bytes[i] == '=' && i + 2 < len(bytes)) {
			// Slice the byte view rather than using the rune-indexed
			// strings::sub, so that non-ASCII text earlier in the
			// input cannot shift the offsets. Only ASCII digits are
			// handed to strconv, which keeps the temporary string
			// valid UTF-8.
			const hi = bytes[i + 1], lo = bytes[i + 2];
			result[j] = if (hi < 0x80 && lo < 0x80) {
				const digits =
					strings::fromutf8_unsafe(bytes[i + 1..i + 3]);
				yield match (strconv::stou8b(digits, 16)) {
				case let b: u8 =>
					yield b;
				case =>
					yield '?': u8;
				};
			} else '?': u8;
			// Skip the two digits; the loop skips the "=" itself.
			i += 2;
		} else if (bytes[i] == '_') {
			result[j] = ' ';
		} else {
			result[j] = bytes[i];
		};
		j += 1;
	};
	match (strings::fromutf8(result[..j])) {
	case let decoded: str =>
		return decoded;
	case =>
		// The escapes did not form valid UTF-8; hand back the input
		// untouched instead of aborting on malformed mail.
		free(result);
		return strings::dup(s);
	};
};

// Checks Q-decoding of two-, three- and four-byte UTF-8 sequences as well as
// the "_" to space substitution.
@test fn decode_q_encoded_string() void = {
	const cases: [_](str, str) = [
		("M=C3=BCller", "Müller"),
		("B=C3=A9la_Bart=C3=B3k", "Béla Bartók"),
		("=F0=9F=98=8E", "😎"),
	];
	for (let n = 0z; n < len(cases); n += 1) {
		const got = decode_q_encoded_string(cases[n].0);
		assert(got == cases[n].1);
	};
};

// Replaces every UTF-8 encoded-word (RFC 2047, "=?UTF-8?B?...?=" or
// "=?UTF-8?Q?...?=") found in line with its decoded text.
//
// If line contains no encoded-word it is returned as is, borrowed from the
// caller; otherwise the return value is a newly allocated string.
//
// NOTE(review): a malformed base64 payload, or one that does not decode to
// valid UTF-8, still aborts through the error assertions below. Skipping such
// a word requires scanning past it rather than re-searching from the start;
// confirm the desired behaviour before changing this.
fn decode_encoded_words(line: str) str = {
	// True once line refers to a string allocated by this function, so
	// that intermediate results can be released on later iterations.
	let owned = false;
	for (true) {
		// TODO: Handle charsets other than UTF-8
		const matches = regex::find(&re_encoded_word_utf8, line);
		defer free(matches);
		if (len(matches) != 4) {
			break;
		};
		const decoded: str = switch (matches[2].content) {
		case "B", "b" =>
			const raw = encoding::base64::decodestr(
				&encoding::base64::std_encoding,
				matches[3].content)!;
			yield strings::fromutf8(raw)!;
		case "Q", "q" =>
			yield decode_q_encoded_string(matches[3].content);
		case =>
			// The pattern only admits B and Q; stop instead of
			// searching for the same match forever.
			break;
		};
		const replaced = strings::replace(line, matches[0].content,
			decoded);
		// The capture contents borrow from line, so the old line may
		// only be released after the replacement has been built.
		free(decoded);
		if (owned) {
			free(line);
		};
		line = replaced;
		owned = true;
	};
	return line;
};

// Checks that a Q-encoded word and a B-encoded word followed by plain text
// are both expanded in place.
@test fn decode_encoded_words() void = {
	const cases: [_](str, str) = [
		("=?UTF-8?Q?M=C3=B6ller?=", "Möller"),
		("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>",
			"张三 <zhang.san@example.com>"),
	];
	for (let n = 0z; n < len(cases); n += 1) {
		const got = decode_encoded_words(cases[n].0);
		assert(got == cases[n].1);
	};
};
diff --git a/message/header.ha b/message/header.ha
index cd35132..4d31298 100644
--- a/message/header.ha
+++ b/message/header.ha
@@ -101,7 +101,7 @@ export fn header_get(head: *header, key: str) str = {
		if (map.key != key) {
			continue;
		};
		return map.fields[len(map.fields) - 1].val;
		return decode_encoded_words(map.fields[len(map.fields) - 1].val);
	};

	return "";
@@ -123,6 +123,9 @@ export fn header_get(head: *header, key: str) str = {
	header_add(&head, "User-Agent", "Harriet");
	assert(header_get(&head, "User-Agent") == "Harriet");

	header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
	assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");

	assert(header_get(&head, "foobar") == "");
};

-- 
2.44.0
I'll need a few more things here before this patch is acceptable.

1. We need to be able to encode these, not just decode them
2. We need a test that the round-trip behavior works (and produces an
   email identical to the original)
3. To ease code review, please add the necessary RFC references into the
   commit message.

Thanks!