Parse multipart messages on a best-efforts basis. Allow the user to see
as much of the message as possible, but log the errors.
If a charset or encoding error is encountered for a message part of a
multipart message, the error is logged and ignored. In those cases, we
still get a valid message body but the content is just not decoded or
converted. No error will be propagated.
If a multipart message cannot be parsed, ParseEntityStructure will
return a multipart error. This error indicates that the message is
malformed and there is nothing more we can do. The caller is then
advised to use a single text/plain body structure using
CreateTextPlainPart() to provide the entire message content to the user.
Fixes: https://todo.sr.ht/~rjarry/aerc/288
Signed-off-by: Koni Marti <koni.marti@gmail.com>
---
v2->v3:
* fix tests
lib/emlview.go | 6 +-lib/messageview.go | 5 +-lib/rfc822/message.go | 83 ++++++++++++++++---lib/rfc822/message_test.go | 10 +--
.../message/{invalid => malformed}/hexa | 0
5 files changed, 83 insertions(+), 21 deletions(-)
rename lib/rfc822/testdata/message/{invalid => malformed}/hexa (100%)
diff --git a/lib/emlview.go b/lib/emlview.go
index 3f9c3f39..a5740a16 100644
--- a/lib/emlview.go+++ b/lib/emlview.go
@@ -8,6 +8,7 @@ import (
_ "github.com/emersion/go-message/charset"
"git.sr.ht/~rjarry/aerc/lib/crypto"
+ "git.sr.ht/~rjarry/aerc/lib/log" "git.sr.ht/~rjarry/aerc/lib/rfc822"
"git.sr.ht/~rjarry/aerc/models"
)
@@ -71,7 +72,10 @@ func NewEmlMessageView(full []byte, pgp crypto.Provider,
return
}
bs, err := rfc822.ParseEntityStructure(entity)
- if err != nil {+ if rfc822.IsMultipartError(err) {+ log.Warnf("EmlView: %v", err)+ bs = rfc822.CreateTextPlainBody()+ } else if err != nil { cb(nil, err)
return
}
diff --git a/lib/messageview.go b/lib/messageview.go
index b849148a..624e96b5 100644
--- a/lib/messageview.go+++ b/lib/messageview.go
@@ -112,7 +112,10 @@ func NewMessageStoreView(messageInfo *models.MessageInfo, setSeen bool,
return
}
bs, err := rfc822.ParseEntityStructure(decrypted)
- if err != nil {+ if rfc822.IsMultipartError(err) {+ log.Warnf("MessageView: %v", err)+ bs = rfc822.CreateTextPlainBody()+ } else if err != nil { cb(nil, err)
return
}
diff --git a/lib/rfc822/message.go b/lib/rfc822/message.go
index 653cb2fe..e7c9cab1 100644
--- a/lib/rfc822/message.go+++ b/lib/rfc822/message.go
@@ -19,6 +19,22 @@ import (
"github.com/emersion/go-message/mail"
)
+type MultipartError struct {+ e error+}++func (u MultipartError) Unwrap() error { return u.e }++func (u MultipartError) Error() string {+ return "multipart error: " + u.e.Error()+}++// IsMultipartError returns a boolean indicating whether the error is known to+// report that the multipart message is malformed and could not be parsed.+func IsMultipartError(err error) bool {+ return errors.As(err, new(MultipartError))+}+// RFC 1123Z regexp
var dateRe = regexp.MustCompile(`(((Mon|Tue|Wed|Thu|Fri|Sat|Sun))[,]?\s[0-9]{1,2})\s` +
`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s` +
@@ -34,8 +50,14 @@ func FetchEntityPartReader(e *message.Entity, index []int) (io.Reader, error) {
for {
idx++
part, err := mpr.NextPart()
- if err != nil {- return nil, err+ switch {+ case message.IsUnknownCharset(err):+ log.Warnf("FetchEntityPartReader: %v", err)+ case message.IsUnknownEncoding(err):+ log.Warnf("FetchEntityPartReader: %v", err)+ case err != nil:+ log.Warnf("FetchEntityPartReader: %v", err)+ return bufReader(e) }
if idx == index[0] {
rest := index[1:]
@@ -89,6 +111,22 @@ func fixContentType(h message.Header) (string, map[string]string) {
return "text/plain", nil
}
+// ParseEntityStructure will parse the message and create a multipart structure+// for multipart messages. Parsing is done on a best-efforts basis:+//+// If the content-type cannot be parsed, ParseEntityStructure will try to fix+// it; otherwise, it returns a text/plain mime type as a fallback. No error will+// be returned.+//+// If a charset or encoding error is encountered for a message part of a+// multipart message, the error is logged and ignored. In those cases, we still+// get a valid message body but the content is just not decoded or converted. No+// error will be returned.+//+// If reading a multipart message fails, ParseEntityStructure will return a+// multipart error. This error indicates that this message is malformed and+// there is nothing more we can do. The caller is then advised to use a single+// text/plain body structure using CreateTextPlainPart().func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {
var body models.BodyStructure
contentType, ctParams, err := e.Header.ContentType()
@@ -116,10 +154,15 @@ func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {
if mpr := e.MultipartReader(); mpr != nil {
for {
part, err := mpr.NextPart()
- if errors.Is(err, io.EOF) {+ switch {+ case errors.Is(err, io.EOF): return &body, nil
- } else if err != nil {- return nil, err+ case message.IsUnknownCharset(err):+ log.Warnf("ParseEntityStructure: %v", err)+ case message.IsUnknownEncoding(err):+ log.Warnf("ParseEntityStructure: %v", err)+ case err != nil:+ return nil, MultipartError{err} }
ps, err := ParseEntityStructure(part)
if err != nil {
@@ -131,6 +174,16 @@ func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {
return &body, nil
}
+// CreateTextPlainBody creats a plain-vanilla text/plain body structure.+func CreateTextPlainBody() *models.BodyStructure {+ body := &models.BodyStructure{}+ body.MIMEType = "text"+ body.MIMESubType = "plain"+ body.Params = map[string]string{"charset": "utf-8"}+ body.Parts = []*models.BodyStructure{}+ return body+}+func parseEnvelope(h *mail.Header) *models.Envelope {
subj, err := h.Subject()
if err != nil {
@@ -308,8 +361,9 @@ func MessageInfo(raw RawMessage) (*models.MessageInfo, error) {
return nil, fmt.Errorf("could not read message: %w", err)
}
bs, err := ParseEntityStructure(msg)
- if errors.As(err, new(message.UnknownEncodingError)) {- parseErr = err+ if IsMultipartError(err) {+ log.Warnf("multipart error: %v", err)+ bs = CreateTextPlainBody() } else if err != nil {
return nil, fmt.Errorf("could not get structure: %w", err)
}
@@ -394,13 +448,18 @@ func NewCRLFReader(r io.Reader) io.Reader {
// ReadMessage is a wrapper for the message.Read function to read a message
// from r. The message's encoding and charset are automatically decoded to
-// UTF-8. If an unknown charset is encountered, the error is logged but a nil-// error is returned since the entity object can still be read.+// UTF-8. If an unknown charset or unknown encoding is encountered, the error is+// logged but a nil error is returned since the entity object can still be read.func ReadMessage(r io.Reader) (*message.Entity, error) {
entity, err := message.Read(r)
- if message.IsUnknownCharset(err) {- log.Warnf("unknown charset encountered")- } else if err != nil {+ switch {+ case message.IsUnknownCharset(err):+ // message body is valid, just not converted, so continue+ log.Warnf("ReadMessage: %v", err)+ case message.IsUnknownEncoding(err):+ // message body is valid, just not decoded, so continue+ log.Warnf("ReadMessage: %v", err)+ case err != nil: return nil, fmt.Errorf("could not read message: %w", err)
}
return entity, nil
diff --git a/lib/rfc822/message_test.go b/lib/rfc822/message_test.go
index ca5ae014..a5812f8b 100644
--- a/lib/rfc822/message_test.go+++ b/lib/rfc822/message_test.go
@@ -38,8 +38,8 @@ func TestMessageInfoParser(t *testing.T) {
}
}
-func TestMessageInfoHandledError(t *testing.T) {- rootDir := "testdata/message/invalid"+func TestMessageInfoMalformed(t *testing.T) {+ rootDir := "testdata/message/malformed" msgFiles, err := os.ReadDir(rootDir)
die(err)
@@ -51,14 +51,10 @@ func TestMessageInfoHandledError(t *testing.T) {
p := fi.Name()
t.Run(p, func(t *testing.T) {
m := newMockRawMessageFromPath(filepath.Join(rootDir, p))
- mi, err := MessageInfo(m)+ _, err := MessageInfo(m) if err != nil {
t.Fatal(err)
}
-- if perr := mi.Error; perr == nil {- t.Fatal("Expected MessageInfo.Error, got none")- } })
}
}
diff --git a/lib/rfc822/testdata/message/invalid/hexa b/lib/rfc822/testdata/message/malformed/hexa
similarity index 100%
rename from lib/rfc822/testdata/message/invalid/hexa
rename to lib/rfc822/testdata/message/malformed/hexa
--
2.47.0
Koni Marti, Nov 08, 2024 at 17:07:
> Parse multipart messages on a best-efforts basis. Allow the user to see> as much of the message as possible, but log the errors.>> If a charset or encoding error is encountered for a message part of a> multipart message, the error is logged and ignored. In those cases, we> still get a valid message body but the content is just not decoded or> converted. No error will be propagated.>> If a multipart message cannot be parsed, ParseEntityStructure will> return a multipart error. This error indicates that the message is> malformed and there is nothing more we can do. The caller is then> advised to use a single text/plain body structure using> CreateTextPlainPart() to provide the entire message content to the user.>> Fixes: https://todo.sr.ht/~rjarry/aerc/288> Signed-off-by: Koni Marti <koni.marti@gmail.com>> ---> v2->v3:>> * fix tests
Hi Koni,
I am trying to wrap my head around this workaround. See my comments
below. Thanks!
>> lib/emlview.go | 6 +-> lib/messageview.go | 5 +-> lib/rfc822/message.go | 83 ++++++++++++++++---> lib/rfc822/message_test.go | 10 +--> .../message/{invalid => malformed}/hexa | 0> 5 files changed, 83 insertions(+), 21 deletions(-)> rename lib/rfc822/testdata/message/{invalid => malformed}/hexa (100%)>> diff --git a/lib/emlview.go b/lib/emlview.go> index 3f9c3f39..a5740a16 100644> --- a/lib/emlview.go> +++ b/lib/emlview.go> @@ -8,6 +8,7 @@ import (> _ "github.com/emersion/go-message/charset"> > "git.sr.ht/~rjarry/aerc/lib/crypto"> + "git.sr.ht/~rjarry/aerc/lib/log"> "git.sr.ht/~rjarry/aerc/lib/rfc822"> "git.sr.ht/~rjarry/aerc/models"> )> @@ -71,7 +72,10 @@ func NewEmlMessageView(full []byte, pgp crypto.Provider,> return> }> bs, err := rfc822.ParseEntityStructure(entity)> - if err != nil {> + if rfc822.IsMultipartError(err) {> + log.Warnf("EmlView: %v", err)> + bs = rfc822.CreateTextPlainBody()
Maybe I am not getting this correctly but it seems that you are
effectively dropping the message body completely?
> + } else if err != nil {> cb(nil, err)> return> }> diff --git a/lib/messageview.go b/lib/messageview.go> index b849148a..624e96b5 100644> --- a/lib/messageview.go> +++ b/lib/messageview.go> @@ -112,7 +112,10 @@ func NewMessageStoreView(messageInfo *models.MessageInfo, setSeen bool,> return> }> bs, err := rfc822.ParseEntityStructure(decrypted)> - if err != nil {> + if rfc822.IsMultipartError(err) {> + log.Warnf("MessageView: %v", err)> + bs = rfc822.CreateTextPlainBody()
Maybe I am not getting this correctly but it seems that you are
effectively dropping the message body completely?
> + } else if err != nil {> cb(nil, err)> return> }> diff --git a/lib/rfc822/message.go b/lib/rfc822/message.go> index 653cb2fe..e7c9cab1 100644> --- a/lib/rfc822/message.go> +++ b/lib/rfc822/message.go> @@ -19,6 +19,22 @@ import (> "github.com/emersion/go-message/mail"> )> > +type MultipartError struct {> + e error> +}> +> +func (u MultipartError) Unwrap() error { return u.e }> +> +func (u MultipartError) Error() string {> + return "multipart error: " + u.e.Error()> +}> +> +// IsMultipartError returns a boolean indicating whether the error is known to> +// report that the multipart message is malformed and could not be parsed.> +func IsMultipartError(err error) bool {> + return errors.As(err, new(MultipartError))> +}> +> // RFC 1123Z regexp> var dateRe = regexp.MustCompile(`(((Mon|Tue|Wed|Thu|Fri|Sat|Sun))[,]?\s[0-9]{1,2})\s` +> `(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s` +> @@ -34,8 +50,14 @@ func FetchEntityPartReader(e *message.Entity, index []int) (io.Reader, error) {> for {> idx++> part, err := mpr.NextPart()> - if err != nil {> - return nil, err> + switch {> + case message.IsUnknownCharset(err):> + log.Warnf("FetchEntityPartReader: %v", err)> + case message.IsUnknownEncoding(err):> + log.Warnf("FetchEntityPartReader: %v", err)> + case err != nil:> + log.Warnf("FetchEntityPartReader: %v", err)> + return bufReader(e)
Or maybe does this mean that all the message buffer will be sent to the
message view? Does this include top level headers, or only the multipart
headers?
> }> if idx == index[0] {> rest := index[1:]> @@ -89,6 +111,22 @@ func fixContentType(h message.Header) (string, map[string]string) {> return "text/plain", nil> }> > +// ParseEntityStructure will parse the message and create a multipart structure> +// for multipart messages. Parsing is done on a best-efforts basis:> +//> +// If the content-type cannot be parsed, ParseEntityStructure will try to fix> +// it; otherwise, it returns a text/plain mime type as a fallback. No error will> +// be returned.> +//> +// If a charset or encoding error is encountered for a message part of a> +// multipart message, the error is logged and ignored. In those cases, we still> +// get a valid message body but the content is just not decoded or converted. No> +// error will be returned.> +//> +// If reading a multipart message fails, ParseEntityStructure will return a> +// multipart error. This error indicates that this message is malformed and> +// there is nothing more we can do. The caller is then advised to use a single> +// text/plain body structure using CreateTextPlainPart().> func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {> var body models.BodyStructure> contentType, ctParams, err := e.Header.ContentType()> @@ -116,10 +154,15 @@ func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {> if mpr := e.MultipartReader(); mpr != nil {> for {> part, err := mpr.NextPart()> - if errors.Is(err, io.EOF) {> + switch {> + case errors.Is(err, io.EOF):> return &body, nil> - } else if err != nil {> - return nil, err> + case message.IsUnknownCharset(err):> + log.Warnf("ParseEntityStructure: %v", err)> + case message.IsUnknownEncoding(err):> + log.Warnf("ParseEntityStructure: %v", err)> + case err != nil:> + return nil, MultipartError{err}> }> ps, err := ParseEntityStructure(part)> if err != nil {> @@ -131,6 +174,16 @@ func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {> return &body, nil> }> > +// CreateTextPlainBody creats a plain-vanilla text/plain body structure.> +func CreateTextPlainBody() *models.BodyStructure {> + body := &models.BodyStructure{}> + body.MIMEType = "text"> + body.MIMESubType = "plain"> + body.Params = map[string]string{"charset": "utf-8"}> + body.Parts = []*models.BodyStructure{}> + return body> +}> +> func parseEnvelope(h *mail.Header) *models.Envelope {> subj, err := h.Subject()> if err != nil {> @@ -308,8 +361,9 @@ func MessageInfo(raw RawMessage) (*models.MessageInfo, error) {> return nil, fmt.Errorf("could not read message: %w", err)> }> bs, err := ParseEntityStructure(msg)> - if errors.As(err, new(message.UnknownEncodingError)) {> - parseErr = err> + if IsMultipartError(err) {> + log.Warnf("multipart error: %v", err)> + bs = CreateTextPlainBody()
how come you don't need to pass msg to CreateTextPlainBody()?
> } else if err != nil {> return nil, fmt.Errorf("could not get structure: %w", err)> }> @@ -394,13 +448,18 @@ func NewCRLFReader(r io.Reader) io.Reader {> > // ReadMessage is a wrapper for the message.Read function to read a message> // from r. The message's encoding and charset are automatically decoded to> -// UTF-8. If an unknown charset is encountered, the error is logged but a nil> -// error is returned since the entity object can still be read.> +// UTF-8. If an unknown charset or unknown encoding is encountered, the error is> +// logged but a nil error is returned since the entity object can still be read.> func ReadMessage(r io.Reader) (*message.Entity, error) {> entity, err := message.Read(r)> - if message.IsUnknownCharset(err) {> - log.Warnf("unknown charset encountered")> - } else if err != nil {> + switch {> + case message.IsUnknownCharset(err):> + // message body is valid, just not converted, so continue> + log.Warnf("ReadMessage: %v", err)> + case message.IsUnknownEncoding(err):> + // message body is valid, just not decoded, so continue> + log.Warnf("ReadMessage: %v", err)> + case err != nil:> return nil, fmt.Errorf("could not read message: %w", err)> }> return entity, nil> diff --git a/lib/rfc822/message_test.go b/lib/rfc822/message_test.go> index ca5ae014..a5812f8b 100644> --- a/lib/rfc822/message_test.go> +++ b/lib/rfc822/message_test.go> @@ -38,8 +38,8 @@ func TestMessageInfoParser(t *testing.T) {> }> }> > -func TestMessageInfoHandledError(t *testing.T) {> - rootDir := "testdata/message/invalid"> +func TestMessageInfoMalformed(t *testing.T) {> + rootDir := "testdata/message/malformed"> msgFiles, err := os.ReadDir(rootDir)> die(err)> > @@ -51,14 +51,10 @@ func TestMessageInfoHandledError(t *testing.T) {> p := fi.Name()> t.Run(p, func(t *testing.T) {> m := newMockRawMessageFromPath(filepath.Join(rootDir, p))> - mi, err := MessageInfo(m)> + _, err := MessageInfo(m)> if err != nil {> t.Fatal(err)> }> -> - if perr := mi.Error; perr == nil {> - t.Fatal("Expected MessageInfo.Error, got none")> - }> })> }> }> diff --git a/lib/rfc822/testdata/message/invalid/hexa b/lib/rfc822/testdata/message/malformed/hexa> similarity index 100%> rename from lib/rfc822/testdata/message/invalid/hexa> rename to lib/rfc822/testdata/message/malformed/hexa> -- > 2.47.0
Hi Robin
thanks for your input. I understand that this is a bit of a
"workaround", but it shows a "possible" approach to deal with malformed
multipart messages. We can also let that patch sit around a bit and try
to come up with a better solution.
A proper fix would probably be to write our own multipart parser, try to
detect errors (missing or wrong boundaries) and correct them?
Anyways, I've tried to answer your remarks below.
On Sat Nov 9, 2024 at 7:02 PM CET, Robin Jarry wrote:
> Koni Marti, Nov 08, 2024 at 17:07:>> diff --git a/lib/emlview.go b/lib/emlview.go>> index 3f9c3f39..a5740a16 100644>> --- a/lib/emlview.go>> +++ b/lib/emlview.go>> @@ -8,6 +8,7 @@ import (>> _ "github.com/emersion/go-message/charset">> >> "git.sr.ht/~rjarry/aerc/lib/crypto">> + "git.sr.ht/~rjarry/aerc/lib/log">> "git.sr.ht/~rjarry/aerc/lib/rfc822">> "git.sr.ht/~rjarry/aerc/models">> )>> @@ -71,7 +72,10 @@ func NewEmlMessageView(full []byte, pgp crypto.Provider,>> return>> }>> bs, err := rfc822.ParseEntityStructure(entity)>> - if err != nil {>> + if rfc822.IsMultipartError(err) {>> + log.Warnf("EmlView: %v", err)>> + bs = rfc822.CreateTextPlainBody()>> Maybe I am not getting this correctly but it seems that you are > effectively dropping the message body completely?
If there's a multipart error, CreateTextPlainBody() will create a new
*models.BodyStructure resembling one for a "normal", non-multipart
message. In such a situation, the message structure is messed up anyways
and not really useful.
The message content itself is never touched. You could still do a :pipe
-m less and get the same raw message.
>>> + } else if err != nil {>> cb(nil, err)>> return>> }>> diff --git a/lib/messageview.go b/lib/messageview.go>> index b849148a..624e96b5 100644>> --- a/lib/messageview.go>> +++ b/lib/messageview.go>> @@ -112,7 +112,10 @@ func NewMessageStoreView(messageInfo *models.MessageInfo, setSeen bool,>> return>> }>> bs, err := rfc822.ParseEntityStructure(decrypted)>> - if err != nil {>> + if rfc822.IsMultipartError(err) {>> + log.Warnf("MessageView: %v", err)>> + bs = rfc822.CreateTextPlainBody()>> Maybe I am not getting this correctly but it seems that you are > effectively dropping the message body completely?
Yes, only the *models.BodyStructure info is overwritten. This is
"meta"-information that our rfc822.ParseEntityStructure() extracts from
the message content.
>>> + } else if err != nil {>> cb(nil, err)>> return>> }>> diff --git a/lib/rfc822/message.go b/lib/rfc822/message.go>> index 653cb2fe..e7c9cab1 100644>> --- a/lib/rfc822/message.go>> +++ b/lib/rfc822/message.go>> @@ -19,6 +19,22 @@ import (>> "github.com/emersion/go-message/mail">> )>> >> +type MultipartError struct {>> + e error>> +}>> +>> +func (u MultipartError) Unwrap() error { return u.e }>> +>> +func (u MultipartError) Error() string {>> + return "multipart error: " + u.e.Error()>> +}>> +>> +// IsMultipartError returns a boolean indicating whether the error is known to>> +// report that the multipart message is malformed and could not be parsed.>> +func IsMultipartError(err error) bool {>> + return errors.As(err, new(MultipartError))>> +}>> +>> // RFC 1123Z regexp>> var dateRe = regexp.MustCompile(`(((Mon|Tue|Wed|Thu|Fri|Sat|Sun))[,]?\s[0-9]{1,2})\s` +>> `(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s` +>> @@ -34,8 +50,14 @@ func FetchEntityPartReader(e *message.Entity, index []int) (io.Reader, error) {>> for {>> idx++>> part, err := mpr.NextPart()>> - if err != nil {>> - return nil, err>> + switch {>> + case message.IsUnknownCharset(err):>> + log.Warnf("FetchEntityPartReader: %v", err)>> + case message.IsUnknownEncoding(err):>> + log.Warnf("FetchEntityPartReader: %v", err)>> + case err != nil:>> + log.Warnf("FetchEntityPartReader: %v", err)>> + return bufReader(e)>> Or maybe does this mean that all the message buffer will be sent to the > message view? Does this include top level headers, or only the multipart > headers?
At that point, we want to return as much information as we can. If
mpr.NextPart() returns an error, the part is nil.
Applied: [PATCH aerc v3] rfc822: parse multipart messages on a best efforts basis
Koni Marti <koni.marti@gmail.com> wrote:
> Parse multipart messages on a best-efforts basis. Allow the user to see> as much of the message as possible, but log the errors.>> If a charset or encoding error is encountered for a message part of a> multipart message, the error is logged and ignored. In those cases, we> still get a valid message body but the content is just not decoded or> converted. No error will be propagated.>> If a multipart message cannot be parsed, ParseEntityStructure will> return a multipart error. This error indicates that the message is> malformed and there is nothing more we can do. The caller is then> advised to use a single text/plain body structure using> CreateTextPlainPart() to provide the entire message content to the user.>> Fixes: https://todo.sr.ht/~rjarry/aerc/288> Signed-off-by: Koni Marti <koni.marti@gmail.com>> ---> v2->v3:>> * fix tests
Acked-by: Robin Jarry <robin@jarry.cc>
Applied, thanks.
To git@git.sr.ht:~rjarry/aerc
1a3b2b24eb51..c2048ef30452 master -> master