~sircmpwn/hare-users

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
1

[PATCH hare-message] Decode encoded-word strings (UTF-8 only)

Details
Message ID
<20240329121226.27728-1-max@mxsr.de>
DKIM signature
pass
Download raw message
Patch: +77 -1
---
 message/encodedword.ha | 73 ++++++++++++++++++++++++++++++++++++++++++
 message/header.ha      |  5 ++-
 2 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 message/encodedword.ha

diff --git a/message/encodedword.ha b/message/encodedword.ha
new file mode 100644
index 0000000..99c45bd
--- /dev/null
+++ b/message/encodedword.ha
@@ -0,0 +1,73 @@
use encoding::base64;
use regex;
use strconv;
use strings;

// Compiled pattern used by decode_encoded_words to locate encoded-words such
// as "=?UTF-8?Q?...?=". This declaration only provides a placeholder value;
// the pattern is compiled by the @init function below and released by the
// @fini function.
let re_encoded_word_utf8: regex::regex = regex::regex { ... };

// Compiles the encoded-word pattern once at program start.
//
// Capture groups: 1 = charset, 2 = encoding ("B" or "Q", either case),
// 3 = encoded text. The charset is matched letter by letter in either case
// because RFC 2047 declares charset names case-independent ("Utf-8" is as valid
// as "UTF-8"). It is kept as a single group so the number of captures seen by
// the decoder does not change.
@init fn init() void = {
	re_encoded_word_utf8 =
		regex::compile(`=\?([uU][tT][fF]-8)\?([bBqQ])\?([^?]+)\?=`)!;
};

// Releases the resources held by the compiled encoded-word pattern at program
// exit.
@fini fn fini() void = {
	regex::finish(&re_encoded_word_utf8);
};

// Returns the numeric value of an ASCII hexadecimal digit (either case), or
// void if the byte is not a hexadecimal digit.
fn q_hexval(c: u8) (u8 | void) = {
	if (c >= ('0': u8) && c <= ('9': u8)) {
		return c - ('0': u8);
	};
	if (c >= ('A': u8) && c <= ('F': u8)) {
		return c - ('A': u8) + 10;
	};
	if (c >= ('a': u8) && c <= ('f': u8)) {
		return c - ('a': u8) + 10;
	};
};

// Decodes the payload of a "Q" encoded-word (RFC 2047 section 4.2): "=XX" is
// replaced by the byte with hexadecimal value XX and "_" by a space; all other
// bytes are copied unchanged.
//
// An "=" followed by two bytes which are not both hexadecimal digits is
// replaced by "?" (consuming all three bytes). An "=" with fewer than two
// bytes after it is copied literally. If the decoded bytes do not form valid
// UTF-8, a copy of the undecoded input is returned instead of aborting, since
// the input comes from untrusted mail.
//
// The return value is always heap-allocated; the caller must free it.
fn decode_q_encoded_string(s: str) str = {
	const input = strings::toutf8(s);
	// Decoding never grows the data, so the input length is an upper bound.
	let output: []u8 = alloc([0...], len(input));
	let n = 0z;
	for (let i = 0z; i < len(input); i += 1) {
		const c = input[i];
		if (c == '=' && i + 2 < len(input)) {
			// Work on bytes directly: rune-based substring functions
			// would mis-index whenever a multi-byte character
			// precedes the escape sequence.
			const hi = q_hexval(input[i + 1]);
			const lo = q_hexval(input[i + 2]);
			output[n] = if (hi is u8 && lo is u8) {
				yield ((hi as u8) << 4) | (lo as u8);
			} else {
				yield '?': u8;
			};
			// Skip the two digits; the loop advances past the "=".
			i += 2;
		} else if (c == '_') {
			output[n] = ' ';
		} else {
			output[n] = c;
		};
		n += 1;
	};
	return match (strings::fromutf8(output[0..n])) {
	case let decoded: str =>
		yield decoded;
	case =>
		free(output);
		yield strings::dup(s);
	};
};

// Checks Q decoding of two-, three- and four-byte UTF-8 sequences and of "_"
// as an encoded space.
@test fn decode_q_encoded_string() void = {
	const cases: [_](str, str) = [
		("M=C3=BCller", "Müller"),
		("B=C3=A9la_Bart=C3=B3k", "Béla Bartók"),
		("=F0=9F=98=8E", "😎"),
	];
	for (let k = 0z; k < len(cases); k += 1) {
		const got = decode_q_encoded_string(cases[k].0);
		assert(got == cases[k].1);
	};
};

// Decodes the payload of a "B" (base64) encoded-word. Returns void if the
// payload is not valid base64 or does not decode to valid UTF-8. On success the
// caller must free the returned string.
fn decode_b_encoded_string(s: str) (str | void) = {
	const raw = match (encoding::base64::decodestr(
		&encoding::base64::std_encoding, s)) {
	case let bytes: []u8 =>
		yield bytes;
	case =>
		return;
	};
	match (strings::fromutf8(raw)) {
	case let text: str =>
		return text;
	case =>
		free(raw);
		return;
	};
};

// Replaces every UTF-8 encoded-word (RFC 2047) in a header value with its
// decoded text. The line is scanned once from left to right, so text produced
// by decoding is never scanned again. Encoded-words whose payload cannot be
// decoded are left in the output verbatim rather than aborting, since the
// input comes from untrusted mail.
//
// If nothing was decoded, the input string itself is returned (borrowed);
// otherwise the result is newly allocated.
fn decode_encoded_words(line: str) str = {
	let buf: []u8 = [];
	let rest = line;
	let changed = false;
	for (true) {
		const matches = regex::find(&re_encoded_word_utf8, rest);
		defer free(matches);
		if (len(matches) != 4) {
			break;
		};
		const word = matches[0].content;
		// The regex match is the leftmost one, so the first occurrence
		// of its text in rest is the match itself.
		const (before, after) = strings::cut(rest, word);
		// TODO: Handle charsets other than UTF-8
		const decoded: (str | void) = switch (matches[2].content) {
		case "B", "b" =>
			yield decode_b_encoded_string(matches[3].content);
		case "Q", "q" =>
			yield decode_q_encoded_string(matches[3].content);
		case =>
			yield void;
		};
		append(buf, strings::toutf8(before)...);
		match (decoded) {
		case let text: str =>
			append(buf, strings::toutf8(text)...);
			free(text);
			changed = true;
		case void =>
			append(buf, strings::toutf8(word)...);
		};
		rest = after;
	};
	if (!changed) {
		free(buf);
		return line;
	};
	append(buf, strings::toutf8(rest)...);
	// buf is a concatenation of valid UTF-8 strings, so this cannot fail.
	return strings::fromutf8(buf)!;
};

// Checks decoding of one Q-encoded and one B-encoded word, the latter followed
// by plain text that must be preserved.
@test fn decode_encoded_words() void = {
	const cases: [_](str, str) = [
		("=?UTF-8?Q?M=C3=B6ller?=", "Möller"),
		("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>",
			"张三 <zhang.san@example.com>"),
	];
	for (let k = 0z; k < len(cases); k += 1) {
		const got = decode_encoded_words(cases[k].0);
		assert(got == cases[k].1);
	};
};
diff --git a/message/header.ha b/message/header.ha
index cd35132..4d31298 100644
--- a/message/header.ha
+++ b/message/header.ha
@@ -101,7 +101,7 @@ export fn header_get(head: *header, key: str) str = {
		if (map.key != key) {
			continue;
		};
		return map.fields[len(map.fields) - 1].val;
		return decode_encoded_words(map.fields[len(map.fields) - 1].val);
	};

	return "";
@@ -123,6 +123,9 @@ export fn header_get(head: *header, key: str) str = {
	header_add(&head, "User-Agent", "Harriet");
	assert(header_get(&head, "User-Agent") == "Harriet");

	header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
	assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");

	assert(header_get(&head, "foobar") == "");
};

-- 
2.44.0
Details
Message ID
<D0NZIVAXTFHN.EKLRATXTNGXO@cmpwn.com>
In-Reply-To
<20240329121226.27728-1-max@mxsr.de> (view parent)
DKIM signature
pass
Download raw message
I'll need a few more things here before this patch is acceptable.

1. We need to be able to encode these, not just decode them
2. We need a test that the round-trip behavior works (and produces an
   email identical to the original)
3. To ease code review, please add the necessary RFC references into the
   commit message.

Thanks!
Reply to thread Export thread (mbox)