X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=blobdiff_plain;f=lib%2FPublicInbox%2FFilter.pm;h=8b78a44163743dcd4a235a1bc661471e29f091e6;hp=e5a8fafe52d66cd07fb06b73018c8b8cf3218d18;hb=c1abb946e53e4179666ebb290e31c2d9ddc40711;hpb=546ff31b4c11d74f63aa4501659224dbf864c21c diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm index e5a8fafe..8b78a441 100644 --- a/lib/PublicInbox/Filter.pm +++ b/lib/PublicInbox/Filter.pm @@ -1,6 +1,7 @@ -# Copyright (C) 2013, Eric Wong and all contributors +# Copyright (C) 2013-2015 all contributors # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) # +# Used to filter incoming mail for -mda and importers # This only exposes one function: run # Note: the settings here are highly opinionated. Obviously, this is # Free Software (AGPLv3), so you may change it if you host yourself. @@ -12,55 +13,49 @@ use Email::MIME::ContentType qw/parse_content_type/; use Email::Filter; use IPC::Run; our $VERSION = '0.0.1'; +use constant NO_HTML => '*** We only accept plain-text email, no HTML ***'; +use constant TEXT_ONLY => '*** We only accept plain-text email ***'; # start with the same defaults as mailman -our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i; -our $MIME_HTML = qr!\btext/html\b!i; +our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i; +our $MIME_HTML = qr!\btext/x?html\b!i; our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; # this is highly opinionated delivery # returns 0 only if there is nothing to deliver sub run { - my ($class, $simple) = @_; - - my $content_type = $simple->header("Content-Type") || "text/plain"; - - # kill potentially bad/confusing headers - # Note: ssoma already does this, but since we mangle the message, - # we should do this before it gets to ssoma. - # We also kill Mail-{Followup,Reply}-To and Reply-To headers due to - # the nature of public-inbox having no real subscribers. - foreach my $d (qw(status lines content-length - mail-followup-to mail-reply-to reply-to)) { - $simple->header_set($d); - } + my ($class, $mime, $filter) = @_; + + my $content_type = $mime->header('Content-Type') || 'text/plain'; if ($content_type =~ m!\btext/plain\b!i) { return 1; # yay, nothing to do } elsif ($content_type =~ $MIME_HTML) { + $filter->reject(NO_HTML) if $filter; # HTML-only, non-multipart - my $body = $simple->body; + my $body = $mime->body; my $ct_parsed = parse_content_type($content_type); dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_body($simple, $body); + replace_body($mime, $body); return 1; } elsif ($content_type =~ m!\bmultipart/!i) { - return strip_multipart($simple, $content_type); + return strip_multipart($mime, $content_type, $filter); } else { - replace_body($simple, "$content_type message scrubbed"); + $filter->reject(TEXT_ONLY) if $filter; + replace_body($mime, "$content_type message scrubbed"); return 0; } } sub replace_part { - my ($simple, $part, $type) = ($_[0], $_[1], $_[3]); + my ($mime, $part, $type) = ($_[0], $_[1], $_[3]); # don't copy $_[2], that's the body (it may be huge) # Email::MIME insists on setting Date:, so just set it consistently # to avoid conflicts to avoid git merge conflicts in a split brain # situation. unless (defined $part->header('Date')) { - my $date = $simple->header('Date') || + my $date = $mime->header('Date') || 'Thu, 01 Jan 1970 00:00:00 +0000'; $part->header_set('Date', $date); } @@ -77,11 +72,11 @@ sub replace_part { # converts one part of a multipart message to text sub html_part_to_text { - my ($simple, $part) = @_; + my ($mime, $part) = @_; my $body = $part->body; my $ct_parsed = parse_content_type($part->content_type); dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_part($simple, $part, $body, 'text/plain'); + replace_part($mime, $part, $body, 'text/plain'); } # modifies $_[0] in place @@ -93,10 +88,11 @@ sub dump_html { my $err = ""; # be careful about remote command injection! - if ($charset =~ /\A[A-Za-z0-9\-]+\z/) { + if ($charset =~ /\A([A-Za-z0-9\-]+)\z/) { push @cmd, "-assume_charset=$charset"; } if (IPC::Run::run(\@cmd, $body, \$out, \$err)) { + $out =~ s/\r\n/\n/sg; $$body = $out; } else { # give them an ugly version: @@ -105,13 +101,9 @@ sub dump_html { } } -# this is to correct user errors and not expected to cover all corner cases -# if users don't want to hit this, they should be sending text/plain messages -# unfortunately, too many people send HTML mail and we'll attempt to convert -# it to something safer, smaller and harder-to-track. +# this is to correct old archives during import. sub strip_multipart { - my ($simple, $content_type) = @_; - my $mime = Email::MIME->new($simple->as_string); + my ($mime, $content_type, $filter) = @_; my (@html, @keep); my $rejected = 0; @@ -125,14 +117,16 @@ sub strip_multipart { # some extensions are just bad, reject them outright my $fn = $part->filename; if (defined($fn) && $fn =~ $BAD_EXT) { + $filter->reject("Bad file type: $1") if $filter; $rejected++; return; } - my $part_type = $part->content_type; + my $part_type = $part->content_type || ''; if ($part_type =~ m!\btext/plain\b!i) { push @keep, $part; } elsif ($part_type =~ $MIME_HTML) { + $filter->reject(NO_HTML) if $filter; push @html, $part; } elsif ($part_type =~ $MIME_TEXT_ANY) { # Give other text attachments the benefit of the doubt, @@ -140,22 +134,29 @@ sub strip_multipart { # help with. push @keep, $part; - } elsif ($part_type =~ m!\Aapplication/octet-stream\z!i) { + } elsif ($part_type eq '' || + $part_type =~ m!\bapplication/octet-stream\b!i) { # unfortunately, some mailers don't set correct types, # let messages of unknown type through but do not # change the sender-specified type if (recheck_type_ok($part)) { push @keep, $part; + } elsif ($filter) { + $filter->reject("Bad attachment: $part_type ". + TEXT_ONLY); } else { $rejected++; } - } elsif ($part_type =~ m!\Aapplication/pgp-signature\z!i) { + } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) { # PGP signatures are not huge, we may keep them. # They can only be valid if it's the last element, # so we keep them iff the message is unmodified: if ($rejected == 0 && !@html) { push @keep, $part; } + } elsif ($filter) { + $filter->reject("unacceptable mime-type: $part_type ". + TEXT_ONLY); } else { # reject everything else, including non-PGP signatures $rejected++; @@ -164,11 +165,11 @@ sub strip_multipart { if ($content_type =~ m!\bmultipart/alternative\b!i) { if (scalar @keep == 1) { - return collapse($simple, $keep[0]); + return collapse($mime, $keep[0]); } } else { # convert HTML parts to plain text foreach my $part (@html) { - html_part_to_text($simple, $part); + html_part_to_text($mime, $part); push @keep, $part; } } @@ -186,34 +187,38 @@ sub strip_multipart { } if (scalar(@html) || $rejected) { $mime->parts_set(\@keep); - $simple->body_set($mime->body_raw); - mark_changed($simple); + $mime->body_set($mime->body_raw); + mark_changed($mime); } # else: no changes return $ok; } sub mark_changed { - my ($simple) = @_; - $simple->header_set("X-Content-Filtered-By", __PACKAGE__ ." $VERSION"); + my ($mime) = @_; + $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION"); } sub collapse { - my ($simple, $part) = @_; - $simple->header_set("Content-Type", $part->content_type); - $simple->body_set($part->body_raw); - mark_changed($simple); + my ($mime, $part) = @_; + $mime->header_set('Content-Type', $part->content_type); + $mime->body_set($part->body_raw); + my $cte = $part->header('Content-Transfer-Encoding'); + if (defined($cte) && $cte ne '') { + $mime->header_set('Content-Transfer-Encoding', $cte); + } + mark_changed($mime); return 1; } sub replace_body { - my $simple = $_[0]; - $simple->body_set($_[1]); - $simple->header_set("Content-Type", "text/plain"); - if ($simple->header("Content-Transfer-Encoding")) { - $simple->header_set("Content-Transfer-Encoding", undef); + my $mime = $_[0]; + $mime->body_set($_[1]); + $mime->header_set('Content-Type', 'text/plain'); + if ($mime->header('Content-Transfer-Encoding')) { + $mime->header_set('Content-Transfer-Encoding', undef); } - mark_changed($simple); + mark_changed($mime); } # Check for display-able text, no messed up binaries @@ -221,8 +226,7 @@ sub replace_body { sub recheck_type_ok { my ($part) = @_; my $s = $part->body; - ((bytes::length($s) < 0x10000) && - ($s =~ /\A([\P{XPosixPrint}\f\n\r\t]+)\z/)) + ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s)); } 1;