[PATCH hare-message] Decode encoded-word strings (UTF-8 only)
Export this patch
---
message/encodedword.ha | 73 ++++++++++++++++++++++++++++++++++++++++++
message/header.ha | 5 ++ -
2 files changed, 77 insertions(+), 1 deletion(-)
create mode 100644 message/encodedword.ha
diff --git a/message/encodedword.ha b/message/encodedword.ha
new file mode 100644
index 0000000..99c45bd
--- /dev/null
+++ b/message/encodedword.ha
@@ -0,0 +1,73 @@
+ use encoding::base64;
+ use regex;
+ use strconv;
+ use strings;
+
+ // Compiled pattern matching a single RFC 2047 encoded-word whose charset is
+ // UTF-8. Capture groups: 1 = the "utf" part of the charset, 2 = the encoding
+ // letter ("B" or "Q", either case), 3 = the encoded text. Compiled by the
+ // @init function below; callers rely on a match yielding exactly 4 captures.
+ let re_encoded_word_utf8: regex::regex = regex::regex { ... };
+
+ // Compiles the encoded-word pattern once at start-up.
+ @init fn init() void = {
+ 	// RFC 2047 section 2 makes charset names case-independent, so accept
+ 	// any capitalisation of "utf" (e.g. "Utf-8"), not only "utf"/"UTF".
+ 	// The charset prefix stays inside a group so capture indexes are
+ 	// unchanged.
+ 	re_encoded_word_utf8 = regex::compile(`=\?([uU][tT][fF])-8\?([bBqQ])\?([^?]+)\?=`)!;
+ };
+
+ // @fini finalizer: releases the compiled encoded-word pattern held in
+ // re_encoded_word_utf8.
+ @fini fn fini() void = {
+ regex::finish(&re_encoded_word_utf8);
+ };
+
+ // Returns the numeric value of the ASCII hexadecimal digit c (either case), or
+ // void if c is not a hexadecimal digit.
+ fn q_hex_digit(c: u8) (u8 | void) = {
+ 	if (c >= ('0': u8) && c <= ('9': u8)) {
+ 		return c - ('0': u8);
+ 	};
+ 	if (c >= ('A': u8) && c <= ('F': u8)) {
+ 		return c - ('A': u8) + 10;
+ 	};
+ 	if (c >= ('a': u8) && c <= ('f': u8)) {
+ 		return c - ('a': u8) + 10;
+ 	};
+ 	return void;
+ };
+
+ // Combines two ASCII hexadecimal digits into the octet they denote, or returns
+ // void if either one is not a hexadecimal digit.
+ fn q_hex_octet(hi: u8, lo: u8) (u8 | void) = {
+ 	const high = match (q_hex_digit(hi)) {
+ 	case let v: u8 =>
+ 		yield v;
+ 	case void =>
+ 		return void;
+ 	};
+ 	const low = match (q_hex_digit(lo)) {
+ 	case let v: u8 =>
+ 		yield v;
+ 	case void =>
+ 		return void;
+ 	};
+ 	return (high << 4) | low;
+ };
+
+ // Decodes the text of a "Q"-encoded word (RFC 2047 section 4.2).
+ //
+ // "_" becomes a space and "=XX" becomes the octet with hexadecimal value XX.
+ // An "=" followed by two characters that are not hexadecimal digits consumes
+ // all three and yields "?". An "=" with fewer than two characters after it is
+ // copied through unchanged, as is every other byte.
+ //
+ // If the decoded octets are not valid UTF-8, a copy of the undecoded input is
+ // returned instead of aborting, since the input comes from untrusted mail.
+ //
+ // The returned string is heap-allocated; the caller must free it.
+ fn decode_q_encoded_string(s: str) str = {
+ 	// Work on bytes throughout: the escapes are defined on octets, and
+ 	// rune-based substring indexes drift from byte offsets as soon as the
+ 	// input contains a multi-byte character.
+ 	const bytes = strings::toutf8(s);
+ 	// Decoding never produces more bytes than it consumes.
+ 	let result: []u8 = alloc([0...], len(bytes));
+ 	let j = 0z;
+ 	for (let i = 0z; i < len(bytes); i += 1) {
+ 		if (bytes[i] == '=' && i + 2 < len(bytes)) {
+ 			result[j] = match (q_hex_octet(bytes[i + 1], bytes[i + 2])) {
+ 			case let b: u8 =>
+ 				yield b;
+ 			case void =>
+ 				yield '?': u8;
+ 			};
+ 			// Skip the two digits; the loop header skips the "=".
+ 			i += 2;
+ 		} else if (bytes[i] == '_') {
+ 			result[j] = ' ';
+ 		} else {
+ 			result[j] = bytes[i];
+ 		};
+ 		j += 1;
+ 	};
+ 	const decoded = match (strings::fromutf8(result[..j])) {
+ 	case let text: str =>
+ 		yield text;
+ 	case =>
+ 		free(result);
+ 		yield strings::dup(s);
+ 	};
+ 	return decoded;
+ };
+
+ // Checks Q-decoding of two-, three- and four-byte UTF-8 sequences and of the
+ // "_" shorthand for a space.
+ @test fn decode_q_encoded_string() void = {
+ 	// Each entry is (Q-encoded input, expected decoded text).
+ 	const cases: [_](str, str) = [
+ 		("M=C3=BCller", "Müller"),
+ 		("B=C3=A9la_Bart=C3=B3k", "Béla Bartók"),
+ 		("=F0=9F=98=8E", "😎"),
+ 	];
+ 	for (let n = 0z; n < len(cases); n += 1) {
+ 		const decoded = decode_q_encoded_string(cases[n].0);
+ 		assert(decoded == cases[n].1);
+ 	};
+ };
+
+ // Reports whether s consists solely of spaces, tabs, CRs and LFs (this is
+ // also true for the empty string).
+ fn is_encoded_word_gap(s: str) bool = {
+ 	const bytes = strings::toutf8(s);
+ 	for (let i = 0z; i < len(bytes); i += 1) {
+ 		const c = bytes[i];
+ 		if (c != ' ' && c != '\t' && c != '\r' && c != '\n') {
+ 			return false;
+ 		};
+ 	};
+ 	return true;
+ };
+
+ // Decodes the encoded text of one encoded-word. enc is the encoding letter
+ // ("B"/"b" for base64, "Q"/"q" for Q-encoding). Returns a heap-allocated
+ // string the caller must free, or void if the payload cannot be decoded to
+ // valid UTF-8 or the encoding letter is unknown.
+ fn decode_word_payload(enc: str, payload: str) (str | void) = {
+ 	switch (enc) {
+ 	case "B", "b" =>
+ 		const raw = match (encoding::base64::decodestr(
+ 			&encoding::base64::std_encoding, payload)) {
+ 		case let b: []u8 =>
+ 			yield b;
+ 		case =>
+ 			return void;
+ 		};
+ 		match (strings::fromutf8(raw)) {
+ 		case let text: str =>
+ 			return text;
+ 		case =>
+ 			free(raw);
+ 			return void;
+ 		};
+ 	case "Q", "q" =>
+ 		return decode_q_encoded_string(payload);
+ 	case =>
+ 		return void;
+ 	};
+ };
+
+ // Replaces every UTF-8 encoded-word (RFC 2047) in line with its decoded text.
+ //
+ // The line is scanned once from left to right, so decoded output is never
+ // itself re-interpreted as an encoded-word. An encoded-word whose payload is
+ // malformed (bad base64 or invalid UTF-8) is left in place verbatim rather
+ // than aborting, because header values are untrusted input. Whitespace that
+ // separates two successfully decoded encoded-words is dropped, as RFC 2047
+ // section 6.2 requires.
+ //
+ // NOTE: if line contains no encoded-word it is returned unchanged (borrowed
+ // from the caller); otherwise the result is a newly allocated string.
+ //
+ // TODO: Handle charsets other than UTF-8
+ fn decode_encoded_words(line: str) str = {
+ 	let out: []u8 = [];
+ 	let rest = line;
+ 	let found = false;
+ 	let prev_decoded = false;
+ 	for (true) {
+ 		const matches = regex::find(&re_encoded_word_utf8, rest);
+ 		defer regex::result_free(matches);
+ 		if (len(matches) != 4) {
+ 			break;
+ 		};
+ 		found = true;
+
+ 		// regex::find reports the leftmost match, so the first
+ 		// occurrence of the matched text is the match itself and
+ 		// cutting on it splits rest exactly around the encoded-word.
+ 		const word = matches[0].content;
+ 		const (before, after) = strings::cut(rest, word);
+ 		match (decode_word_payload(matches[2].content, matches[3].content)) {
+ 		case let text: str =>
+ 			defer free(text);
+ 			if (!prev_decoded || !is_encoded_word_gap(before)) {
+ 				append(out, strings::toutf8(before)...);
+ 			};
+ 			append(out, strings::toutf8(text)...);
+ 			prev_decoded = true;
+ 		case void =>
+ 			append(out, strings::toutf8(before)...);
+ 			append(out, strings::toutf8(word)...);
+ 			prev_decoded = false;
+ 		};
+ 		rest = after;
+ 	};
+ 	if (!found) {
+ 		return line;
+ 	};
+ 	append(out, strings::toutf8(rest)...);
+ 	// Every appended piece is valid UTF-8, hence so is the concatenation.
+ 	return strings::fromutf8_unsafe(out);
+ };
+
+ // Checks decoding of a Q-encoded and a base64-encoded word, the latter
+ // followed by plain text that must be preserved.
+ @test fn decode_encoded_words() void = {
+ 	// Each entry is (raw header value, expected decoded value).
+ 	const cases: [_](str, str) = [
+ 		("=?UTF-8?Q?M=C3=B6ller?=", "Möller"),
+ 		("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>",
+ 			"张三 <zhang.san@example.com>"),
+ 	];
+ 	for (let n = 0z; n < len(cases); n += 1) {
+ 		const decoded = decode_encoded_words(cases[n].0);
+ 		assert(decoded == cases[n].1);
+ 	};
+ };
diff --git a/message/header.ha b/message/header.ha
index cd35132..4d31298 100644
--- a/message/header.ha
@@ -101,7 +101,7 @@ export fn header_get(head: *header, key: str) str = {
if (map.key != key) {
continue;
};
- return map.fields[len(map.fields) - 1].val;
+ return decode_encoded_words(map.fields[len(map.fields) - 1].val);
};
return "";
@@ -123,6 +123,9 @@ export fn header_get(head: *header, key: str) str = {
header_add(&head, "User-Agent", "Harriet");
assert(header_get(&head, "User-Agent") == "Harriet");
+ header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
+ assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");
+
assert(header_get(&head, "foobar") == "");
};
--
2.44.0
I'll need a few more things here before this patch is acceptable.
1. We need to be able to encode these, not just decode them.
2. We need a test showing that the round-trip behavior works (and produces an
email identical to the original).
3. To ease code review, please add the necessary RFC references to the
commit message.
Thanks!