Non-ASCII strings in header values are represented as "encoded-words", as
defined in RFC 2047:
https://datatracker.ietf.org/doc/html/rfc2047
This commit handles only the UTF-8 charset, in both the "B" and "Q" encodings.
---
message/canonical.ha | 2 +-
message/encodedword.ha | 178 +++++++++++++++++++++++++++++++++++++++++
message/header.ha | 16 +++-
3 files changed, 193 insertions(+), 3 deletions(-)
create mode 100644 message/encodedword.ha
diff --git a/message/canonical.ha b/message/canonical.ha
index 0e231bc..5497f12 100644
--- a/message/canonical.ha
+++ b/message/canonical.ha
@@ -46,7 +46,7 @@ export fn canonical_mime_header_key(key: str) str = {
const rn = match (strings::next(&iter)) {
case let rn: rune =>
yield rn;
- case void =>
+ case done =>
break;
};
if (!ascii::valid(rn) || !valid_header_field(rn: u32: u8)) {
diff --git a/message/encodedword.ha b/message/encodedword.ha
new file mode 100644
index 0000000..a9b465f
--- /dev/null
+++ b/message/encodedword.ha
@@ -0,0 +1,178 @@
+use ascii;
+use encoding::base64;
+use encoding::utf8;
+use regex;
+use strconv;
+use strings;
+
+// Compiled pattern matching one RFC 2047 encoded-word of the form
+// "=?charset?encoding?encoded-text?=". Capture group 1 is the charset, group 2
+// the encoding letter (b, B, q or Q) and group 3 the encoded text. The variable
+// holds a default-initialized placeholder until init() compiles the pattern.
+let re_encoded_word: regex::regex = regex::regex { ... };
+
+// Compiles the encoded-word pattern into re_encoded_word. The "!" aborts the
+// program if the pattern fails to compile.
+@init fn init() void = {
+ re_encoded_word = regex::compile(`=\?([^? ]+)\?([bBqQ])\?([^? ]+)\?=`)!;
+};
+
+// Releases the resources held by the compiled encoded-word pattern.
+@fini fn fini() void = {
+ regex::finish(&re_encoded_word);
+};
+
+// See RFC 2047, Section 4, for the definition of Q and B encodings:
+// https://datatracker.ietf.org/doc/html/rfc2047#section-4
+//
+// Result of [[get_recommended_encoding]]: which encoded-word encoding, if any,
+// [[encode]] applies to a value.
+type recommended_encoding = enum {
+ // Every rune is printable ASCII; the value is emitted unchanged.
+ NONE,
+ // At least half of the runes are printable ASCII.
+ Q,
+ // Fewer than half of the runes are printable ASCII.
+ B,
+};
+
+// Returns the numeric value (0-15) of an ASCII hexadecimal digit, or void if
+// the byte is not a hexadecimal digit.
+fn q_hexval(b: u8) (u8 | void) = {
+	if (b >= '0' && b <= '9') {
+		return b - ('0': u8);
+	};
+	if (b >= 'A' && b <= 'F') {
+		return b - ('A': u8) + 10;
+	};
+	if (b >= 'a' && b <= 'f') {
+		return b - ('a': u8) + 10;
+	};
+	return;
+};
+
+// Decodes the encoded-text of a "Q"-encoded UTF-8 encoded-word (RFC 2047,
+// Section 4.2): "_" becomes a space, "=XX" becomes the byte with hexadecimal
+// value XX, and every other byte is copied unchanged. An "=" followed by two
+// bytes that are not both hexadecimal digits decodes to "?"; an "=" with fewer
+// than two bytes after it is copied literally.
+//
+// The text comes from untrusted message headers, so malformed input must not
+// abort the program: if the decoded bytes are not valid UTF-8, a copy of the
+// undecoded input is returned instead.
+//
+// The return value is always newly allocated and must be freed by the caller.
+fn decode_utf8q(s: str) str = {
+	const bytes = strings::toutf8(s);
+	// Decoding never produces more bytes than it consumes, so the input
+	// length is a safe upper bound for the output buffer.
+	let result: []u8 = alloc([0...], len(bytes));
+	let j = 0z;
+	for (let i = 0z; i < len(bytes); i += 1) {
+		if (bytes[i] == '=' && i + 2 < len(bytes)) {
+			// Read the two digits from the byte slice: i is a byte
+			// offset, which only equals a rune index for pure-ASCII
+			// input.
+			const hi = q_hexval(bytes[i + 1]);
+			const lo = q_hexval(bytes[i + 2]);
+			const byte: u8 = if (hi is u8 && lo is u8) {
+				yield ((hi as u8) << 4) | (lo as u8);
+			} else {
+				yield '?': u8;
+			};
+			result[j] = byte;
+			// Skip the two digits; the loop header skips the "=".
+			i += 2;
+		} else if (bytes[i] == '_') {
+			result[j] = ' ';
+		} else {
+			result[j] = bytes[i];
+		};
+		j += 1;
+	};
+	match (strings::fromutf8(result[..j])) {
+	case let decoded: str =>
+		return decoded;
+	case =>
+		// The escapes did not form valid UTF-8: hand back the raw text
+		// rather than crashing on attacker-controlled input.
+		free(result);
+		return strings::dup(s);
+	};
+};
+
+// Checks Q-decoding of two-, three- and four-byte UTF-8 sequences and of "_"
+// as an encoded space. Each case is (encoded-text, expected decoded text).
+@test fn decode_utf8q() void = {
+	const cases: [_](str, str) = [
+		("M=C3=BCller", "Müller"),
+		("B=C3=A9la_Bart=C3=B3k", "Béla Bartók"),
+		("=F0=9F=98=8E", "😎"),
+	];
+	for (let c .. cases) {
+		assert(decode_utf8q(c.0) == c.1);
+	};
+};
+
+// Decodes the payload of a single encoded-word given its three components.
+// Only the UTF-8 charset is supported; "B" payloads are base64-decoded and
+// validated as UTF-8, "Q" payloads are handed to [[decode_utf8q]]. Charset and
+// encoding are compared case-insensitively.
+//
+// Returns a newly allocated string which the caller must free, or void if the
+// charset or encoding is unsupported or the payload is malformed.
+fn decode_one_word(charset: str, enc: str, text: str) (str | void) = {
+	// ascii::strlower allocates, so both copies are released on return.
+	const charset_lc = ascii::strlower(charset);
+	defer free(charset_lc);
+	const enc_lc = ascii::strlower(enc);
+	defer free(enc_lc);
+
+	// TODO: Handle charsets other than UTF-8,
+	// especially ISO-8859-1(5) and Windows-1252
+	if (charset_lc != "utf-8") {
+		return;
+	};
+	switch (enc_lc) {
+	case "b" =>
+		// Header content is untrusted: malformed base64 or invalid
+		// UTF-8 is reported as void instead of aborting the program.
+		const raw = match (encoding::base64::decodestr(
+			&encoding::base64::std_encoding, text)) {
+		case let raw: []u8 =>
+			yield raw;
+		case =>
+			return;
+		};
+		match (strings::fromutf8(raw)) {
+		case let decoded: str =>
+			return decoded;
+		case =>
+			free(raw);
+			return;
+		};
+	case "q" =>
+		return decode_utf8q(text);
+	case =>
+		return;
+	};
+};
+
+// Replaces every RFC 2047 encoded-word in a header value with its decoded
+// text. The value is scanned once from left to right, so decoded text is never
+// itself re-interpreted as an encoded-word. Words that cannot be decoded
+// (unsupported charset, malformed payload) are copied through verbatim and
+// scanning continues with the words that follow. Whitespace that separates two
+// adjacent decoded encoded-words is dropped, per RFC 2047, Section 6.2.
+//
+// If nothing was decoded, the "line" argument itself is returned (borrowed);
+// otherwise the result is a newly allocated string.
+fn decode_encoded_words(line: str) str = {
+	let out: []u8 = [];
+	let rest = line;
+	let decoded_any = false;
+	let prev_decoded = false;
+	for (strings::contains(rest, "=?")) {
+		const matches = regex::find(&re_encoded_word, rest);
+		defer regex::result_free(matches);
+		if (len(matches) == 0) {
+			break;
+		};
+
+		// regex::find reports the leftmost match, so the first
+		// occurrence of its text in "rest" is the match itself.
+		const word = matches[0].content;
+		const (before, after) = strings::cut(rest, word);
+		const decoded = decode_one_word(matches[1].content,
+			matches[2].content, matches[3].content);
+
+		// Linear whitespace between two encoded-words only exists to
+		// separate them and is not part of the displayed text.
+		const is_separator = prev_decoded && decoded is str
+			&& strings::trim(before) == "";
+		if (!is_separator) {
+			append(out, strings::toutf8(before)...);
+		};
+
+		match (decoded) {
+		case let text: str =>
+			append(out, strings::toutf8(text)...);
+			free(text);
+			decoded_any = true;
+			prev_decoded = true;
+		case void =>
+			append(out, strings::toutf8(word)...);
+			prev_decoded = false;
+		};
+		rest = after;
+	};
+
+	if (!decoded_any) {
+		// Nothing changed: return the caller's string unmodified.
+		free(out);
+		return line;
+	};
+	append(out, strings::toutf8(rest)...);
+	// Every appended piece is valid UTF-8: either a substring of "line" or
+	// the output of decode_one_word.
+	return strings::fromutf8_unsafe(out);
+};
+
+// Checks that a Q-encoded and a B-encoded word are decoded and that text
+// following an encoded-word is kept. Each case is (header value, expected).
+@test fn decode_encoded_words() void = {
+	const cases: [_](str, str) = [
+		("=?UTF-8?Q?M=C3=B6ller?=", "Möller"),
+		("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>",
+			"张三 <zhang.san@example.com>"),
+	];
+	for (let c .. cases) {
+		assert(decode_encoded_words(c.0) == c.1);
+	};
+};
+
+// Chooses the RFC 2047 encoding to use for a value, by counting its runes:
+//
+// - NONE if every rune is printable ASCII (including the empty string),
+// - Q if at least half of the runes are printable ASCII,
+// - B otherwise.
+//
+// The comparison uses exact integer counts; a floating-point ratio would lose
+// precision for very long values.
+fn get_recommended_encoding(s: str) recommended_encoding = {
+	let printable = 0z;
+	let other = 0z;
+	let iter = strings::iter(s);
+	for (let r => strings::next(&iter)) {
+		if (ascii::isprint(r)) {
+			printable += 1;
+		} else {
+			other += 1;
+		};
+	};
+	if (other == 0) {
+		return recommended_encoding::NONE;
+	};
+	// RFC 2047, Section 4:
+	// The "Q" encoding is recommended for use when most of the characters
+	// to be encoded are in the ASCII character set; otherwise, the "B"
+	// encoding should be used.
+	//
+	// "printable >= other" is the same test as "printable is at least half
+	// of all runes", without any division.
+	if (printable >= other) {
+		return recommended_encoding::Q;
+	};
+	return recommended_encoding::B;
+};
+
+// Checks the NONE / Q / B decision for pure-ASCII, mostly-ASCII and
+// mostly-non-ASCII values. Each case is (value, expected recommendation).
+@test fn get_recommended_encoding() void = {
+	const cases: [_](str, recommended_encoding) = [
+		("John Doe <john@example.org>", recommended_encoding::NONE),
+		("Möller", recommended_encoding::Q),
+		("张三 <zhang.san@example.com>", recommended_encoding::Q),
+		("张三", recommended_encoding::B),
+		("😎", recommended_encoding::B),
+	];
+	for (let c .. cases) {
+		assert(get_recommended_encoding(c.0) == c.1);
+	};
+};
+
+// Encodes a UTF-8 string as the encoded-text of a "Q" encoded-word (RFC 2047,
+// Section 4.2), working byte by byte:
+//
+// - a space becomes "_",
+// - printable ASCII other than "=", "?" and "_" is copied unchanged,
+// - every other byte becomes "=XX" with two upper-case hexadecimal digits.
+//
+// "=", "?" and "_" must be escaped because they carry meaning inside an
+// encoded-word: "=" starts an escape, "?" delimits the word and "_" decodes to
+// a space. Control characters are escaped with a leading zero ("=09").
+//
+// The return value is newly allocated and must be freed by the caller.
+fn encode_utf8q(value: str) str = {
+	const hex = strings::toutf8("0123456789ABCDEF");
+	let out: []u8 = [];
+	for (let b .. strings::toutf8(value)) {
+		if (b == ' ') {
+			append(out, '_');
+		} else if (b > 0x20 && b < 0x7F
+				&& b != '=' && b != '?' && b != '_') {
+			append(out, b);
+		} else {
+			// All bytes of a multi-byte UTF-8 sequence are >= 0x80
+			// and therefore end up here, one escape per byte.
+			append(out, '=');
+			append(out, hex[(b >> 4): size]);
+			append(out, hex[(b & 0x0F): size]);
+		};
+	};
+	// Only ASCII bytes were appended, so the buffer is valid UTF-8.
+	return strings::fromutf8_unsafe(out);
+};
+
+// Checks Q-encoding of spaces and of two-, three- and four-byte UTF-8
+// sequences. Each case is (plain text, expected encoded-text).
+@test fn encode_utf8q() void = {
+	const cases: [_](str, str) = [
+		("Dr. Möller", "Dr._M=C3=B6ller"),
+		("张三", "=E5=BC=A0=E4=B8=89"),
+		("😎", "=F0=9F=98=8E"),
+	];
+	for (let c .. cases) {
+		assert(encode_utf8q(c.0) == c.1);
+	};
+};
+
+// Wraps a value in a single UTF-8 encoded-word (RFC 2047) when it contains
+// anything other than printable ASCII, choosing between the "Q" and "B"
+// encodings with [[get_recommended_encoding]].
+//
+// If no encoding is needed, the "value" argument itself is returned
+// (borrowed); otherwise the result is a newly allocated string.
+//
+// NOTE(review): the whole value is placed in one encoded-word. RFC 2047,
+// Section 2 limits an encoded-word to 75 characters, and Section 5 forbids
+// encoded-words inside an addr-spec, so long values and address headers such
+// as "Name <user@host>" probably need splitting before this is called -
+// confirm against the callers.
+fn encode(value: str) str = {
+	switch (get_recommended_encoding(value)) {
+	case recommended_encoding::NONE =>
+		return value;
+	case recommended_encoding::Q =>
+		// strings::concat copies its arguments, so the intermediate
+		// payload is released once the result has been built.
+		const payload = encode_utf8q(value);
+		defer free(payload);
+		return strings::concat("=?UTF-8?Q?", payload, "?=");
+	case recommended_encoding::B =>
+		const payload = encoding::base64::encodestr(
+			&encoding::base64::std_encoding,
+			strings::toutf8(value));
+		defer free(payload);
+		return strings::concat("=?UTF-8?B?", payload, "?=");
+	};
+};
+
+// Checks that pure-ASCII values pass through unchanged and that other values
+// are wrapped in a Q or B encoded-word. Each case is (value, expected).
+@test fn encode() void = {
+	const cases: [_](str, str) = [
+		("John Doe <john@example.org>", "John Doe <john@example.org>"),
+		("Möller", "=?UTF-8?Q?M=C3=B6ller?="),
+		("张三", "=?UTF-8?B?5byg5LiJ?="),
+		("张三 <zhang.san@example.com>",
+			"=?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?="),
+		("😎", "=?UTF-8?B?8J+Yjg==?="),
+	];
+	for (let c .. cases) {
+		assert(encode(c.0) == c.1);
+	};
+};
diff --git a/message/header.ha b/message/header.ha
index cd35132..483a7dc 100644
--- a/message/header.ha
@@ -84,6 +84,7 @@ export fn header_add(head: *header, key: str, val: str) void = {
defer free(key);
let map = header_get_mapkey(head, key);
+ const val = encode(val);
const field = alloc(new_header_field(key, val, []));
append(head.fields, field);
append(map.fields, field);
@@ -101,7 +102,7 @@ export fn header_get(head: *header, key: str) str = {
if (map.key != key) {
continue;
};
- return map.fields[len(map.fields) - 1].val;
+ return decode_encoded_words(map.fields[len(map.fields) - 1].val);
};
return "";
@@ -123,6 +124,9 @@ export fn header_get(head: *header, key: str) str = {
header_add(&head, "User-Agent", "Harriet");
assert(header_get(&head, "User-Agent") == "Harriet");
+ header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
+ assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");
+
assert(header_get(&head, "foobar") == "");
};
@@ -356,6 +360,7 @@ export fn read_header(
};
const val = decode_header_value(kv[i+1..]);
+ const val = decode_encoded_words(val);
const field = alloc(header_field {
raw = kv,
key = key,
@@ -371,6 +376,7 @@ export fn read_header(
const input =
"To: Drew DeVault <sir@cmpwn.com>\r\n"
"From: Harriet <harriet@harelang.org>\r\n"
+ "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
"Content-Type: text/plain\r\n"
"DKIM-Signature: a=rsa-sha256;\r\n"
" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
@@ -383,6 +389,7 @@ export fn read_header(
assert(header_get(&head, "To") == "Drew DeVault <sir@cmpwn.com>");
assert(header_get(&head, "From") == "Harriet <harriet@harelang.org>");
+ assert(header_get(&head, "Cc") == "张三 <zhang.san@example.com>");
assert(header_get(&head, "Content-Type") == "text/plain");
assert(header_get(&head, "Dkim-Signature") == "a=rsa-sha256; bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple; d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;");
};
@@ -410,6 +417,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
header_add(&head, "Content-Type", "text/plain");
header_add(&head, "FROM", "Harriet <harriet@harelang.org>");
header_add(&head, "to", "Drew DeVault <sir@cmpwn.com>");
+ header_add(&head, "cc", "张三 <zhang.san@example.com>");
const sink = memio::dynamic();
defer io::close(&sink)!;
@@ -417,6 +425,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
const result = memio::string(&sink)!;
const expect =
+ "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
"To: Drew DeVault <sir@cmpwn.com>\r\n"
"From: Harriet <harriet@harelang.org>\r\n"
"Content-Type: text/plain\r\n"
@@ -428,6 +437,8 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
" d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;\r\n"
"\r\n";
+ // fmt::printfln(`expect = "{}"`, expect)!;
+ // fmt::printfln(`result = "{}"`, result)!;
assert(result == expect);
};
@@ -435,6 +446,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
const input =
"To: Drew DeVault <sir@cmpwn.com>\r\n"
"From: Harriet <harriet@harelang.org>\r\n"
+ "Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
"Content-Type: text/plain\r\n"
"DKIM-Signature: a=rsa-sha256;\r\n"
" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"
@@ -481,7 +493,7 @@ fn header_field_raw(hf: *header_field) ([]u8 | errors::invalid) = {
const rn = match (strings::next(&iter)) {
case let rn: rune =>
yield rn;
- case void =>
+ case done =>
break;
};
--
2.44.0