diff --git a/go.mod b/go.mod index c9d1a209d8..ebdafff8da 100644 --- a/go.mod +++ b/go.mod @@ -13,11 +13,12 @@ require ( github.com/mjl-/sherpats v0.0.6 github.com/prometheus/client_golang v1.18.0 github.com/russross/blackfriday/v2 v2.1.0 + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d go.etcd.io/bbolt v1.3.11 golang.org/x/crypto v0.27.0 golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f golang.org/x/net v0.29.0 - golang.org/x/text v0.18.0 + golang.org/x/text v0.19.0 rsc.io/qr v0.2.0 ) diff --git a/go.sum b/go.sum index 54735006fb..76740712f1 100644 --- a/go.sum +++ b/go.sum @@ -63,6 +63,8 @@ github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= @@ -97,8 +99,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= -golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= diff --git a/store/account.go b/store/account.go index 846e28653c..4fba597d39 100644 --- a/store/account.go +++ b/store/account.go @@ -1855,12 +1855,31 @@ ruleset: header: for _, t := range rs.HeadersRegexpCompiled { + isSubjectMatch := t[0].MatchString("subject") for k, vl := range header { k = strings.ToLower(k) + if t[0].MatchString("body") { // message body match + ws := PrepareWordSearch([]string{t[1].String()}, []string{}) + // todo: regexp match + ok, err := ws.MatchPart(log, &p, true) + if err != nil { + log.Errorx("Failed to match body: %v", err) + } + if ok { + continue header + } + } if !t[0].MatchString(k) { continue } for _, v := range vl { + if isSubjectMatch { + // todo: memorize decoded text + v, err = decodeRFC2047(v) + if err != nil { + log.Errorx("Failed to decode subject: %v", err, slog.String("v", v)) + } + } v = strings.ToLower(strings.TrimSpace(v)) if t[1].MatchString(v) { continue header diff --git a/store/search.go b/store/search.go index ec4858adde..da6f5ea19e 100644 --- a/store/search.go +++ b/store/search.go @@ -2,13 +2,22 @@ package store import ( "bytes" + "encoding/base64" + "fmt" "io" + "mime/quotedprintable" + "regexp" "strings" "unicode" "unicode/utf8" "github.com/mjl-/mox/message" "github.com/mjl-/mox/mlog" + + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/japanese" + encUnicode "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" ) // WordSearch holds context for a search, with scratch buffers to prevent @@ -193,3 +202,61 @@ func toLower(buf []byte) []byte { } return r } + +func decodeRFC2047(encoded string) (string, error) { + // match e.g. =?(iso-2022-jp)?(B)?(Rnc6...)?= + r := regexp.MustCompile(`(?i)=\?([^?]+)\?([BQ])\?([^?]+)\?=`) + matches := r.FindAllStringSubmatch(encoded, -1) + + if len(matches) == 0 { // no match. Looks ASCII. + return encoded, nil + } + + var decodedStrings []string + for _, match := range matches { + charset := match[1] + encodingName := strings.ToUpper(match[2]) + encodedText := match[3] + + // Decode Base64 or Quoted-Printable + var decodedBytes []byte + var err error + switch encodingName { + case "B": // Base64 + decodedBytes, err = base64.StdEncoding.DecodeString(encodedText) + if err != nil { + return encoded, fmt.Errorf("Base64 decode error: %w", err) + } + case "Q": // Quoted-Printable + decodedBytes, err = io.ReadAll(quotedprintable.NewReader(strings.NewReader(encodedText))) + if err != nil { + return "", fmt.Errorf("Quoted-Printable decode error: %w", err) + } + default: + return encoded, fmt.Errorf("not supported encoding: %s", encodingName) + } + + // Select charset + var enc encoding.Encoding + switch strings.ToLower(charset) { + case "iso-2022-jp": + enc = japanese.ISO2022JP + case "utf-8": + enc = encUnicode.UTF8 + default: + return encoded, fmt.Errorf("not supported charset: %s", charset) + } + + // Decode with charset + reader := transform.NewReader(strings.NewReader(string(decodedBytes)), enc.NewDecoder()) + decodedText, err := io.ReadAll(reader) + if err != nil { + return encoded, err + } + + decodedStrings = append(decodedStrings, string(decodedText)) + } + + // Concat multiple strings + return strings.Join(decodedStrings, ""), nil +} diff --git a/store/search_test.go b/store/search_test.go new file mode 100644 index 0000000000..c573e4c995 --- /dev/null +++ b/store/search_test.go @@ -0,0 +1,38 @@ +package store + +import ( + "fmt" + "testing" +) + +func TestSubjectMatch(t *testing.T) { + // Auto detect subject text encoding and decode + + //log := mlog.New("search", nil) + + originalSubject := `テストテキスト Abc 123...` + asciiSubject := "test text Abc 123..." + + encodedSubjectUTF8 := `=?UTF-8?b?44OG44K544OI44OG44Kt44K544OIIEFiYyAxMjMuLi4=?=` + encodedSubjectISO2022 := `=?iso-2022-jp?B?GyRCJUYlOSVIJUYlLSU5JUgbKEIgQWJjIDEyMy4uLg==?=` + encodedSubjectUTF8 = encodedSubjectUTF8 + " \n " + encodedSubjectUTF8 + encodedSubjectISO2022 = encodedSubjectISO2022 + " \n " + encodedSubjectISO2022 + originalSubject = originalSubject + originalSubject + + encodedTexts := map[string]string{encodedSubjectUTF8: originalSubject, encodedSubjectISO2022: originalSubject, asciiSubject: asciiSubject} + + for encodedSubject, originalSubject := range encodedTexts { + + // Autodetect & decode + decodedSubject, err := decodeRFC2047(encodedSubject) + + fmt.Printf("decoded text:%s\n", decodedSubject) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + if originalSubject != decodedSubject { + t.Fatalf("Decode mismatch %s != %s", originalSubject, decodedSubject) + } + } +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 716e159777..3313fb6398 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -61,6 +61,9 @@ github.com/prometheus/procfs/internal/util # github.com/russross/blackfriday/v2 v2.1.0 ## explicit github.com/russross/blackfriday/v2 +# github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d +## explicit +github.com/saintfish/chardet # go.etcd.io/bbolt v1.3.11 ## explicit; go 1.22 go.etcd.io/bbolt @@ -97,7 +100,7 @@ golang.org/x/sync/errgroup golang.org/x/sys/cpu golang.org/x/sys/unix golang.org/x/sys/windows -# golang.org/x/text v0.18.0 +# golang.org/x/text v0.19.0 ## explicit; go 1.18 golang.org/x/text/cases golang.org/x/text/encoding diff --git a/webaccount/account.js b/webaccount/account.js index bf9f62d8b7..53e680b8b0 100644 --- a/webaccount/account.js +++ b/webaccount/account.js @@ -1692,7 +1692,7 @@ const destination = async (name) => { let fullName; let saveButton; const addresses = [name, ...Object.keys(acc.Destinations || {}).filter(a => !a.startsWith('@') && a !== name)]; - dom._kids(page, crumbs(crumblink('Mox Account', '#'), 'Destination ' + name), dom.div(dom.span('Default mailbox', attr.title('Default mailbox where email for this recipient is delivered to if it does not match any ruleset. Default is Inbox.')), dom.br(), defaultMailbox = dom.input(attr.value(dest.Mailbox), attr.placeholder('Inbox'))), dom.br(), dom.div(dom.span('Full name', attr.title('Name to use in From header when composing messages. If not set, the account default full name is used.')), dom.br(), fullName = dom.input(attr.value(dest.FullName))), dom.br(), dom.h2('Rulesets'), dom.p('Incoming messages are checked against the rulesets. If a ruleset matches, the message is delivered to the mailbox configured for the ruleset instead of to the default mailbox.'), dom.p('"Is Forward" does not affect matching, but changes prevents the sending mail server from being included in future junk classifications by clearing fields related to the forwarding email server (IP address, EHLO domain, MAIL FROM domain and a matching DKIM domain), and prevents DMARC rejects for forwarded messages.'), dom.p('"List allow domain" does not affect matching, but skips the regular spam checks if one of the verified domains is a (sub)domain of the domain mentioned here.'), dom.p('"Accept rejects to mailbox" does not affect matching, but causes messages classified as junk to be accepted and delivered to this mailbox, instead of being rejected during the SMTP transaction. Useful for incoming forwarded messages where rejecting incoming messages may cause the forwarding server to stop forwarding.'), dom.table(dom.thead(dom.tr(dom.th('SMTP "MAIL FROM" regexp', attr.title('Matches if this regular expression matches (a substring of) the SMTP MAIL FROM address (not the message From-header). E.g. user@example.org.')), dom.th('Message "From" address regexp', attr.title('Matches if this regular expression matches (a substring of) the single address in the message From header.')), dom.th('Verified domain', attr.title('Matches if this domain matches an SPF- and/or DKIM-verified (sub)domain.')), dom.th('Headers regexp', attr.title('Matches if these header field/value regular expressions all match (substrings of) the message headers. Header fields and valuees are converted to lower case before matching. Whitespace is trimmed from the value before matching. A header field can occur multiple times in a message, only one instance has to match. For mailing lists, you could match on ^list-id$ with the value typically the mailing list address in angled brackets with @ replaced with a dot, e.g. .')), dom.th('Is Forward', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. Can only be used together with SMTPMailFromRegexp and VerifiedDomain. SMTPMailFromRegexp must be set to the address used to deliver the forwarded message, e.g. '^user(|\\+.*)@forward\\.example$'. Changes to junk analysis: 1. Messages are not rejected for failing a DMARC policy, because a legitimate forwarded message without valid/intact/aligned DKIM signature would be rejected because any verified SPF domain will be 'unaligned', of the forwarding mail server. 2. The sending mail server IP address, and sending EHLO and MAIL FROM domains and matching DKIM domain aren't used in future reputation-based spam classifications (but other verified DKIM domains are) because the forwarding server is not a useful spam signal for future messages.")), dom.th('List allow domain', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If this domain matches an SPF- and/or DKIM-verified (sub)domain, the message is accepted without further spam checks, such as a junk filter or DMARC reject evaluation. DMARC rejects should not apply for mailing lists that are not configured to rewrite the From-header of messages that don't have a passing DKIM signature of the From-domain. Otherwise, by rejecting messages, you may be automatically unsubscribed from the mailing list. The assumption is that mailing lists do their own spam filtering/moderation.")), dom.th('Allow rejects to mailbox', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If a message is classified as spam, it isn't rejected during the SMTP transaction (the normal behaviour), but accepted during the SMTP transaction and delivered to the specified mailbox. The specified mailbox is not automatically cleaned up like the account global Rejects mailbox, unless set to that Rejects mailbox.")), dom.th('Mailbox', attr.title('Mailbox to deliver to if this ruleset matches.')), dom.th('Comment', attr.title('Free-form comments.')), dom.th('Action'))), rulesetsTbody, dom.tfoot(dom.tr(dom.td(attr.colspan('9')), dom.td(dom.clickbutton('Add ruleset', function click() { + dom._kids(page, crumbs(crumblink('Mox Account', '#'), 'Destination ' + name), dom.div(dom.span('Default mailbox', attr.title('Default mailbox where email for this recipient is delivered to if it does not match any ruleset. Default is Inbox.')), dom.br(), defaultMailbox = dom.input(attr.value(dest.Mailbox), attr.placeholder('Inbox'))), dom.br(), dom.div(dom.span('Full name', attr.title('Name to use in From header when composing messages. If not set, the account default full name is used.')), dom.br(), fullName = dom.input(attr.value(dest.FullName))), dom.br(), dom.h2('Rulesets'), dom.p('Incoming messages are checked against the rulesets. If a ruleset matches, the message is delivered to the mailbox configured for the ruleset instead of to the default mailbox.'), dom.p('"Is Forward" does not affect matching, but changes prevents the sending mail server from being included in future junk classifications by clearing fields related to the forwarding email server (IP address, EHLO domain, MAIL FROM domain and a matching DKIM domain), and prevents DMARC rejects for forwarded messages.'), dom.p('"List allow domain" does not affect matching, but skips the regular spam checks if one of the verified domains is a (sub)domain of the domain mentioned here.'), dom.p('"Accept rejects to mailbox" does not affect matching, but causes messages classified as junk to be accepted and delivered to this mailbox, instead of being rejected during the SMTP transaction. Useful for incoming forwarded messages where rejecting incoming messages may cause the forwarding server to stop forwarding.'), dom.table(dom.thead(dom.tr(dom.th('SMTP "MAIL FROM" regexp', attr.title('Matches if this regular expression matches (a substring of) the SMTP MAIL FROM address (not the message From-header). E.g. user@example.org.')), dom.th('Message "From" address regexp', attr.title('Matches if this regular expression matches (a substring of) the single address in the message From header.')), dom.th('Verified domain', attr.title('Matches if this domain matches an SPF- and/or DKIM-verified (sub)domain.')), dom.th('Headers/body regexp', attr.title('Matches if these header field/value regular expressions all match (substrings of) the message headers. Header fields and valuees are converted to lower case before matching. Whitespace is trimmed from the value before matching. A header field can occur multiple times in a message, only one instance has to match. For mailing lists, you could match on ^list-id$ with the value typically the mailing list address in angled brackets with @ replaced with a dot, e.g. . Use field "body" for simple message content match')), dom.th('Is Forward', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. Can only be used together with SMTPMailFromRegexp and VerifiedDomain. SMTPMailFromRegexp must be set to the address used to deliver the forwarded message, e.g. '^user(|\\+.*)@forward\\.example$'. Changes to junk analysis: 1. Messages are not rejected for failing a DMARC policy, because a legitimate forwarded message without valid/intact/aligned DKIM signature would be rejected because any verified SPF domain will be 'unaligned', of the forwarding mail server. 2. The sending mail server IP address, and sending EHLO and MAIL FROM domains and matching DKIM domain aren't used in future reputation-based spam classifications (but other verified DKIM domains are) because the forwarding server is not a useful spam signal for future messages.")), dom.th('List allow domain', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If this domain matches an SPF- and/or DKIM-verified (sub)domain, the message is accepted without further spam checks, such as a junk filter or DMARC reject evaluation. DMARC rejects should not apply for mailing lists that are not configured to rewrite the From-header of messages that don't have a passing DKIM signature of the From-domain. Otherwise, by rejecting messages, you may be automatically unsubscribed from the mailing list. The assumption is that mailing lists do their own spam filtering/moderation.")), dom.th('Allow rejects to mailbox', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If a message is classified as spam, it isn't rejected during the SMTP transaction (the normal behaviour), but accepted during the SMTP transaction and delivered to the specified mailbox. The specified mailbox is not automatically cleaned up like the account global Rejects mailbox, unless set to that Rejects mailbox.")), dom.th('Mailbox', attr.title('Mailbox to deliver to if this ruleset matches.')), dom.th('Comment', attr.title('Free-form comments.')), dom.th('Action'))), rulesetsTbody, dom.tfoot(dom.tr(dom.td(attr.colspan('9')), dom.td(dom.clickbutton('Add ruleset', function click() { addRulesetsRow({ SMTPMailFromRegexp: '', MsgFromRegexp: '', diff --git a/webaccount/account.ts b/webaccount/account.ts index c328d04a8f..f64bf9a552 100644 --- a/webaccount/account.ts +++ b/webaccount/account.ts @@ -1582,7 +1582,7 @@ const destination = async (name: string) => { dom.th('SMTP "MAIL FROM" regexp', attr.title('Matches if this regular expression matches (a substring of) the SMTP MAIL FROM address (not the message From-header). E.g. user@example.org.')), dom.th('Message "From" address regexp', attr.title('Matches if this regular expression matches (a substring of) the single address in the message From header.')), dom.th('Verified domain', attr.title('Matches if this domain matches an SPF- and/or DKIM-verified (sub)domain.')), - dom.th('Headers regexp', attr.title('Matches if these header field/value regular expressions all match (substrings of) the message headers. Header fields and valuees are converted to lower case before matching. Whitespace is trimmed from the value before matching. A header field can occur multiple times in a message, only one instance has to match. For mailing lists, you could match on ^list-id$ with the value typically the mailing list address in angled brackets with @ replaced with a dot, e.g. .')), + dom.th('Headers/body regexp', attr.title('Matches if these header field/value regular expressions all match (substrings of) the message headers. Header fields and valuees are converted to lower case before matching. Whitespace is trimmed from the value before matching. A header field can occur multiple times in a message, only one instance has to match. For mailing lists, you could match on ^list-id$ with the value typically the mailing list address in angled brackets with @ replaced with a dot, e.g. . Use field "body" for simple message content match')), dom.th('Is Forward', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. Can only be used together with SMTPMailFromRegexp and VerifiedDomain. SMTPMailFromRegexp must be set to the address used to deliver the forwarded message, e.g. '^user(|\\+.*)@forward\\.example$'. Changes to junk analysis: 1. Messages are not rejected for failing a DMARC policy, because a legitimate forwarded message without valid/intact/aligned DKIM signature would be rejected because any verified SPF domain will be 'unaligned', of the forwarding mail server. 2. The sending mail server IP address, and sending EHLO and MAIL FROM domains and matching DKIM domain aren't used in future reputation-based spam classifications (but other verified DKIM domains are) because the forwarding server is not a useful spam signal for future messages.")), dom.th('List allow domain', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If this domain matches an SPF- and/or DKIM-verified (sub)domain, the message is accepted without further spam checks, such as a junk filter or DMARC reject evaluation. DMARC rejects should not apply for mailing lists that are not configured to rewrite the From-header of messages that don't have a passing DKIM signature of the From-domain. Otherwise, by rejecting messages, you may be automatically unsubscribed from the mailing list. The assumption is that mailing lists do their own spam filtering/moderation.")), dom.th('Allow rejects to mailbox', attr.title("Influences spam filtering only, this option does not change whether a message matches this ruleset. If a message is classified as spam, it isn't rejected during the SMTP transaction (the normal behaviour), but accepted during the SMTP transaction and delivered to the specified mailbox. The specified mailbox is not automatically cleaned up like the account global Rejects mailbox, unless set to that Rejects mailbox.")),