Max Schillinger: 1 Decode and encode non-ascii header values (UTF-8) 3 files changed, 193 insertions(+), 3 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~sircmpwn/hare-users/patches/51354/mbox | git am -3
Learn more about email & git
Non-ascii strings are represented as "encoded-words" as defined in RFC 2047: https://datatracker.ietf.org/doc/html/rfc2047 This commit handles only UTF-8 charsets, both "B" and "Q" encodings. --- message/canonical.ha | 2 +- message/encodedword.ha | 178 +++++++++++++++++++++++++++++++++++++++++ message/header.ha | 16 +++- 3 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 message/encodedword.ha diff --git a/message/canonical.ha b/message/canonical.ha index 0e231bc..5497f12 100644 --- a/message/canonical.ha +++ b/message/canonical.ha @@ -46,7 +46,7 @@ export fn canonical_mime_header_key(key: str) str = { const rn = match (strings::next(&iter)) { case let rn: rune => yield rn; - case void => + case done => break; }; if (!ascii::valid(rn) || !valid_header_field(rn: u32: u8)) { diff --git a/message/encodedword.ha b/message/encodedword.ha new file mode 100644 index 0000000..a9b465f --- /dev/null +++ b/message/encodedword.ha @@ -0,0 +1,178 @@ +use ascii; +use encoding::base64; +use encoding::utf8; +use regex; +use strconv; +use strings; + +let re_encoded_word: regex::regex = regex::regex { ... }; + +@init fn init() void = { + re_encoded_word = regex::compile(`=\?([^? ]+)\?([bBqQ])\?([^? 
]+)\?=`)!; +}; + +@fini fn fini() void = { + regex::finish(&re_encoded_word); +}; + +// See RFC 2047, Section 4, for the definition of Q and B encodings: +// https://datatracker.ietf.org/doc/html/rfc2047#section-4 +type recommended_encoding = enum { + NONE, + Q, + B, +}; + +fn decode_utf8q(s: str) str = { + let result: []u8 = alloc([0...], len(s)); + let bytes = strings::toutf8(s); + let j = 0z; + for (let i = 0z; i < len(s); i += 1) { + if (bytes[i] == '=' && i+2 < len(s)) { + i += 1; + const byte = match (strconv::stou8(strings::sub(s, i, i+2), 16)) { + case let b: u8 => yield b; + case => yield '?': u8; + }; + result[j] = byte; + i += 1; + } else if (bytes[i] == '_') { + result[j] = ' '; + } else { + result[j] = bytes[i]; + }; + j += 1; + }; + return strings::fromutf8(result[0..j])!; +}; + +@test fn decode_utf8q() void = { + assert(decode_utf8q("M=C3=BCller") == "Müller"); + assert(decode_utf8q("B=C3=A9la_Bart=C3=B3k") == "Béla Bartók"); + assert(decode_utf8q("=F0=9F=98=8E") == "😎"); +}; + +fn decode_encoded_words(line: str) str = { + let matches: regex::result = []; + for (strings::contains(line, "=?")) { + matches = regex::find(&re_encoded_word, line); + defer regex::result_free(matches); + if (len(matches) == 0) + break; + + const charset = ascii::strlower(matches[1].content); + const encoding = ascii::strlower(matches[2].content); + const encoded_text = matches[3].content; + + switch (charset) { + case "utf-8" => + switch (encoding) { + case "b" => + const decoded_slice = encoding::base64::decodestr( + &encoding::base64::std_encoding, encoded_text)!; + defer free(decoded_slice); + const decoded_string = strings::fromutf8_unsafe(decoded_slice); + line = strings::replace(line, matches[0].content, decoded_string); + case "q" => + const decoded = decode_utf8q(encoded_text); + defer free(decoded); + line = strings::replace(line, matches[0].content, decoded); + case => return line; // warning? 
+ }; + case => + // TODO: Handle charsets other than UTF-8, + // especially ISO-8859-1(5) and Windows-1252 + break; + }; + }; + return line; +}; + +@test fn decode_encoded_words() void = { + assert(decode_encoded_words("=?UTF-8?Q?M=C3=B6ller?=") == "Möller"); + assert(decode_encoded_words("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>") + == "张三 <zhang.san@example.com>"); +}; + +fn get_recommended_encoding(s: str) recommended_encoding = { + let iter = strings::iter(s); + let ascii_count = 0z; + let rune_count = 0z; + for (let r => strings::next(&iter)) { + rune_count += 1; + if (ascii::isprint(r)) { + ascii_count += 1; + }; + }; + // RFC 2047, Section 4: + // The "Q" encoding is recommended for use when most of the characters + // to be encoded are in the ASCII character set; otherwise, the "B" + // encoding should be used. + if (ascii_count == rune_count) { + return recommended_encoding::NONE; + } else if (ascii_count: f32 >= rune_count: f32 / 2.0) { + return recommended_encoding::Q; + } else { + return recommended_encoding::B; + }; +}; + +@test fn get_recommended_encoding() void = { + assert(get_recommended_encoding("John Doe <john@example.org>") + == recommended_encoding::NONE); + assert(get_recommended_encoding("Möller") == recommended_encoding::Q); + assert(get_recommended_encoding("张三 <zhang.san@example.com>") + == recommended_encoding::Q); + assert(get_recommended_encoding("张三") == recommended_encoding::B); + assert(get_recommended_encoding("😎") == recommended_encoding::B); +}; + +fn encode_utf8q(value: str) str = { + let bytes_encoded: []u8 = []; + let iter = strings::iter(value); + for (let r => strings::next(&iter)) { + if (r == ' ') { + append(bytes_encoded, '_'); + } else if (ascii::isprint(r)) { + append(bytes_encoded, r: u8); + } else { + const bytes = encoding::utf8::encoderune(r); + for (let b .. 
bytes) { + const byte_encoded = + strings::toutf8(strings::dup(strconv::u8tos(b, 16))); + append(bytes_encoded, '='); + append(bytes_encoded, byte_encoded...); + }; + }; + }; + return strings::fromutf8_unsafe(bytes_encoded); +}; + +@test fn encode_utf8q() void = { + assert(encode_utf8q("Dr. Möller") == "Dr._M=C3=B6ller"); + assert(encode_utf8q("张三") == "=E5=BC=A0=E4=B8=89"); + assert(encode_utf8q("😎") == "=F0=9F=98=8E"); +}; + +fn encode(value: str) str = { + switch (get_recommended_encoding(value)) { + case recommended_encoding::B => + return strings::concat("=?UTF-8?B?", + encoding::base64::encodestr(&encoding::base64::std_encoding, + strings::toutf8(value)), "?="); + case recommended_encoding::Q => + return strings::concat("=?UTF-8?Q?", encode_utf8q(value), "?="); + case recommended_encoding::NONE => + return value; + }; +}; + +@test fn encode() void = { + assert(encode("John Doe <john@example.org>") + == "John Doe <john@example.org>"); + assert(encode("Möller") == "=?UTF-8?Q?M=C3=B6ller?="); + assert(encode("张三") == "=?UTF-8?B?5byg5LiJ?="); + assert(encode("张三 <zhang.san@example.com>") + == "=?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?="); + assert(encode("😎") == "=?UTF-8?B?8J+Yjg==?="); +}; diff --git a/message/header.ha b/message/header.ha index cd35132..483a7dc 100644 --- a/message/header.ha +++ b/message/header.ha @@ -84,6 +84,7 @@ export fn header_add(head: *header, key: str, val: str) void = { defer free(key); let map = header_get_mapkey(head, key); + const val = encode(val); const field = alloc(new_header_field(key, val, [])); append(head.fields, field); append(map.fields, field); @@ -101,7 +102,7 @@ export fn header_get(head: *header, key: str) str = { if (map.key != key) { continue; }; - return map.fields[len(map.fields) - 1].val; + return decode_encoded_words(map.fields[len(map.fields) - 1].val); }; return ""; @@ -123,6 +124,9 @@ export fn header_get(head: *header, key: str) str = { header_add(&head, "User-Agent", "Harriet"); 
assert(header_get(&head, "User-Agent") == "Harriet"); + header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>"); + assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>"); + assert(header_get(&head, "foobar") == ""); }; @@ -356,6 +360,7 @@ export fn read_header( }; const val = decode_header_value(kv[i+1..]); + const val = decode_encoded_words(val); const field = alloc(header_field { raw = kv, key = key, @@ -371,6 +376,7 @@ export fn read_header( const input = "To: Drew DeVault <sir@cmpwn.com>\r\n" "From: Harriet <harriet@harelang.org>\r\n" + "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n" "Content-Type: text/plain\r\n" "DKIM-Signature: a=rsa-sha256;\r\n" " bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n" @@ -383,6 +389,7 @@ export fn read_header( assert(header_get(&head, "To") == "Drew DeVault <sir@cmpwn.com>"); assert(header_get(&head, "From") == "Harriet <harriet@harelang.org>"); + assert(header_get(&head, "Cc") == "张三 <zhang.san@example.com>"); assert(header_get(&head, "Content-Type") == "text/plain"); assert(header_get(&head, "Dkim-Signature") == "a=rsa-sha256; bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple; d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;"); }; @@ -410,6 +417,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = { header_add(&head, "Content-Type", "text/plain"); header_add(&head, "FROM", "Harriet <harriet@harelang.org>"); header_add(&head, "to", "Drew DeVault <sir@cmpwn.com>"); + header_add(&head, "cc", "张三 <zhang.san@example.com>"); const sink = memio::dynamic(); defer io::close(&sink)!; @@ -417,6 +425,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = { const result = memio::string(&sink)!; const expect = + "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n" "To: Drew DeVault <sir@cmpwn.com>\r\n" "From: Harriet <harriet@harelang.org>\r\n" "Content-Type: 
text/plain\r\n" @@ -428,6 +437,8 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = { " bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n" " d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;\r\n" "\r\n"; + // fmt::printfln(`expect = "{}"`, expect)!; + // fmt::printfln(`result = "{}"`, result)!; assert(result == expect); }; @@ -435,6 +446,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = { const input = "To: Drew DeVault <sir@cmpwn.com>\r\n" "From: Harriet <harriet@harelang.org>\r\n" + "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n" "Content-Type: text/plain\r\n" "DKIM-Signature: a=rsa-sha256;\r\n" " bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n" @@ -481,7 +493,7 @@ fn header_field_raw(hf: *header_field) ([]u8 | errors::invalid) = { const rn = match (strings::next(&iter)) { case let rn: rune => yield rn; - case void => + case done => break; }; -- 2.44.0