-# Copyright (C) 2013, Eric Wong <normalperson@yhbt.net> and all contributors
+# Copyright (C) 2013-2015 all contributors <meta@public-inbox.org>
# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
#
+# Used to filter incoming mail for -mda and importers
# This only exposes one function: run
# Note: the settings here are highly opinionated. Obviously, this is
# Free Software (AGPLv3), so you may change it if you host yourself.
use Email::Filter;
use IPC::Run;
our $VERSION = '0.0.1';
+use constant NO_HTML => '*** We only accept plain-text email, no HTML ***';
+use constant TEXT_ONLY => '*** We only accept plain-text email ***';
# start with the same defaults as mailman
# (in this patch, "-" lines are the old patterns and "+" lines replace them)
-our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i;
-our $MIME_HTML = qr!\btext/html\b!i;
# Filename extensions that are rejected outright.  The "+" pattern adds
# zip, tolerates trailing whitespace, and captures the extension so the
# match leaves it in $1.
+our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i;
# Content types treated as HTML; the "+" pattern also matches text/xhtml.
+our $MIME_HTML = qr!\btext/x?html\b!i;
# Any text/<subtype> content type.
our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i;
# this is highly opinionated delivery
# returns 0 only if there is nothing to deliver
#
# This listing is a patch: "-" lines are the removed ($simple-based)
# version, "+" lines are the replacement.  The notes below describe the
# "+" side.
#
# Arguments: $class (not used in the visible body), $mime (the message;
# its header/body accessors are used here) and an optional $filter.
# When $filter is true, $filter->reject is called with an explanatory
# message for HTML-only and non-text mail.
# NOTE(review): whether reject() returns to the caller is not visible
# here -- confirm.  If it does return, the scrubbing code after each
# reject call still runs.
#
# Returns 1 for text/plain and for HTML that was converted, whatever
# strip_multipart returns for multipart/*, and 0 for any other type.
sub run {
- my ($class, $simple) = @_;
-
- my $content_type = $simple->header("Content-Type") || "text/plain";
-
- # kill potentially bad/confusing headers
- # Note: ssoma already does this, but since we mangle the message,
- # we should do this before it gets to ssoma.
- # We also kill Mail-{Followup,Reply}-To and Reply-To headers due to
- # the nature of public-inbox having no real subscribers.
- foreach my $d (qw(status lines content-length
- mail-followup-to mail-reply-to reply-to)) {
- $simple->header_set($d);
- }
+ my ($class, $mime, $filter) = @_;
+
# a missing (or false) Content-Type header is treated as text/plain
+ my $content_type = $mime->header('Content-Type') || 'text/plain';
if ($content_type =~ m!\btext/plain\b!i) {
return 1; # yay, nothing to do
} elsif ($content_type =~ $MIME_HTML) {
+ $filter->reject(NO_HTML) if $filter;
# HTML-only, non-multipart
- my $body = $simple->body;
+ my $body = $mime->body;
# hand the declared charset (if any) to the HTML dumper, which rewrites
# $body in place before it replaces the message body
my $ct_parsed = parse_content_type($content_type);
dump_html(\$body, $ct_parsed->{attributes}->{charset});
- replace_body($simple, $body);
+ replace_body($mime, $body);
return 1;
} elsif ($content_type =~ m!\bmultipart/!i) {
- return strip_multipart($simple, $content_type);
+ return strip_multipart($mime, $content_type, $filter);
} else {
# any other top-level type: the body is replaced by a short notice
- replace_body($simple, "$content_type message scrubbed");
+ $filter->reject(TEXT_ONLY) if $filter;
+ replace_body($mime, "$content_type message scrubbed");
return 0;
}
}
sub replace_part {
- my ($simple, $part, $type) = ($_[0], $_[1], $_[3]);
+ my ($mime, $part, $type) = ($_[0], $_[1], $_[3]);
# don't copy $_[2], that's the body (it may be huge)
# Email::MIME insists on setting Date:, so just set it consistently
# to avoid git merge conflicts in a split-brain
# situation.
unless (defined $part->header('Date')) {
- my $date = $simple->header('Date') ||
+ my $date = $mime->header('Date') ||
'Thu, 01 Jan 1970 00:00:00 +0000';
$part->header_set('Date', $date);
}
# converts one part of a multipart message to text
#
# Arguments: the message handed down by the caller ($mime on the "+"
# side of this patch, $simple on the "-" side) and the $part to convert.
# The part's body is passed by reference to dump_html together with the
# charset attribute parsed from the part's own content type; the result
# is then stored through replace_part with the type 'text/plain'.
sub html_part_to_text {
- my ($simple, $part) = @_;
+ my ($mime, $part) = @_;
my $body = $part->body;
my $ct_parsed = parse_content_type($part->content_type);
dump_html(\$body, $ct_parsed->{attributes}->{charset});
- replace_part($simple, $part, $body, 'text/plain');
+ replace_part($mime, $part, $body, 'text/plain');
}
# modifies $_[0] in place
push @cmd, "-assume_charset=$charset";
}
if (IPC::Run::run(\@cmd, $body, \$out, \$err)) {
+ $out =~ s/\r\n/\n/sg;
$$body = $out;
} else {
# give them an ugly version:
}
}
-# this is to correct user errors and not expected to cover all corner cases
-# if users don't want to hit this, they should be sending text/plain messages
-# unfortunately, too many people send HTML mail and we'll attempt to convert
-# it to something safer, smaller and harder-to-track.
+# this is to correct old archives during import.
sub strip_multipart {
- my ($simple, $content_type) = @_;
- my $mime = Email::MIME->new($simple->as_string);
+ my ($mime, $content_type, $filter) = @_;
my (@html, @keep);
my $rejected = 0;
# some extensions are just bad, reject them outright
my $fn = $part->filename;
if (defined($fn) && $fn =~ $BAD_EXT) {
+ $filter->reject("Bad file type: $1") if $filter;
$rejected++;
return;
}
- my $part_type = $part->content_type;
+ my $part_type = $part->content_type || '';
if ($part_type =~ m!\btext/plain\b!i) {
push @keep, $part;
} elsif ($part_type =~ $MIME_HTML) {
+ $filter->reject(NO_HTML) if $filter;
push @html, $part;
} elsif ($part_type =~ $MIME_TEXT_ANY) {
# Give other text attachments the benefit of the doubt,
# help with.
push @keep, $part;
- } elsif ($part_type =~ m!\Aapplication/octet-stream\z!i) {
+ } elsif ($part_type eq '' ||
+ $part_type =~ m!\bapplication/octet-stream\b!i) {
# unfortunately, some mailers don't set correct types,
# let messages of unknown type through but do not
# change the sender-specified type
if (recheck_type_ok($part)) {
push @keep, $part;
+ } elsif ($filter) {
+ $filter->reject("Bad attachment: $part_type ".
+ TEXT_ONLY);
} else {
$rejected++;
}
- } elsif ($part_type =~ m!\Aapplication/pgp-signature\z!i) {
+ } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) {
# PGP signatures are not huge, we may keep them.
# They can only be valid if it's the last element,
# so we keep them iff the message is unmodified:
if ($rejected == 0 && !@html) {
push @keep, $part;
}
+ } elsif ($filter) {
+ $filter->reject("unacceptable mime-type: $part_type ".
+ TEXT_ONLY);
} else {
# reject everything else, including non-PGP signatures
$rejected++;
if ($content_type =~ m!\bmultipart/alternative\b!i) {
if (scalar @keep == 1) {
- return collapse($simple, $keep[0]);
+ return collapse($mime, $keep[0]);
}
} else { # convert HTML parts to plain text
foreach my $part (@html) {
- html_part_to_text($simple, $part);
+ html_part_to_text($mime, $part);
push @keep, $part;
}
}
}
if (scalar(@html) || $rejected) {
$mime->parts_set(\@keep);
- $simple->body_set($mime->body_raw);
- mark_changed($simple);
+ $mime->body_set($mime->body_raw);
+ mark_changed($mime);
} # else: no changes
return $ok;
}
# Stamps the message with an X-Content-Filtered-By header whose value is
# this package's name followed by $VERSION.  The other helpers call this
# after they rewrite a message.
# (patch listing: "-" lines are removed, "+" lines are their replacement)
sub mark_changed {
- my ($simple) = @_;
- $simple->header_set("X-Content-Filtered-By", __PACKAGE__ ." $VERSION");
+ my ($mime) = @_;
+ $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION");
}
# Collapses the message down to a single part: the part's content type
# and raw body are copied onto the message, and the message is marked as
# changed.  Always returns 1.
# (patch listing: "-" lines are removed, "+" lines are their replacement)
sub collapse {
- my ($simple, $part) = @_;
- $simple->header_set("Content-Type", $part->content_type);
- $simple->body_set($part->body_raw);
- mark_changed($simple);
+ my ($mime, $part) = @_;
+ $mime->header_set('Content-Type', $part->content_type);
+ $mime->body_set($part->body_raw);
# The "+" side also carries over the part's Content-Transfer-Encoding
# when it is defined and non-empty.
# NOTE(review): presumably needed because body_raw is still
# transfer-encoded -- confirm against the Email::MIME documentation.
+ my $cte = $part->header('Content-Transfer-Encoding');
+ if (defined($cte) && $cte ne '') {
+ $mime->header_set('Content-Transfer-Encoding', $cte);
+ }
+ mark_changed($mime);
return 1;
}
# Replaces the body of the message in $_[0] with the string in $_[1],
# sets Content-Type to text/plain and marks the message as changed.
# The new body is read straight from @_ rather than copied to a lexical.
# NOTE(review): presumably to avoid duplicating a large body -- confirm.
# (patch listing: "-" lines are removed, "+" lines are their replacement)
sub replace_body {
- my $simple = $_[0];
- $simple->body_set($_[1]);
- $simple->header_set("Content-Type", "text/plain");
- if ($simple->header("Content-Transfer-Encoding")) {
- $simple->header_set("Content-Transfer-Encoding", undef);
+ my $mime = $_[0];
+ $mime->body_set($_[1]);
+ $mime->header_set('Content-Type', 'text/plain');
# an existing (true-valued) transfer encoding is set to undef, since it
# described the old body
+ if ($mime->header('Content-Transfer-Encoding')) {
+ $mime->header_set('Content-Transfer-Encoding', undef);
}
- mark_changed($simple);
+ mark_changed($mime);
}
# Check for display-able text, no messed up binaries
#
# Takes a message part and returns true only when its body is shorter
# than 0x10000 (65536) and passes the character check below.
# (patch listing: "-" lines are removed, "+" lines are their replacement)
sub recheck_type_ok {
my ($part) = @_;
my $s = $part->body;
# The "+" line measures with plain length() instead of bytes::length()
# and requires the whole body to consist of [:print:] or whitespace
# characters; the "-" line matched \P{XPosixPrint} (the negated
# property) plus \f\n\r\t.
- ((bytes::length($s) < 0x10000) &&
- ($s =~ /\A([\P{XPosixPrint}\f\n\r\t]+)\z/))
+ ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s));
}
1;