X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=blobdiff_plain;f=lib%2FPublicInbox%2FFilter.pm;h=8b78a44163743dcd4a235a1bc661471e29f091e6;hp=0b1ec911407d3fa60ed9b1a3cdcf9ab6023b2641;hb=c1abb946e53e4179666ebb290e31c2d9ddc40711;hpb=5ac572c2b69470db354c6adb241e605eda19f727 diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm index 0b1ec911..8b78a441 100644 --- a/lib/PublicInbox/Filter.pm +++ b/lib/PublicInbox/Filter.pm @@ -1,6 +1,7 @@ -# Copyright (C) 2013, Eric Wong and all contributors +# Copyright (C) 2013-2015 all contributors # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) # +# Used to filter incoming mail for -mda and importers # This only exposes one function: run # Note: the settings here are highly opinionated. Obviously, this is # Free Software (AGPLv3), so you may change it if you host yourself. @@ -13,10 +14,11 @@ use Email::Filter; use IPC::Run; our $VERSION = '0.0.1'; use constant NO_HTML => '*** We only accept plain-text email, no HTML ***'; +use constant TEXT_ONLY => '*** We only accept plain-text email ***'; # start with the same defaults as mailman our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i; -our $MIME_HTML = qr!\btext/html\b!i; +our $MIME_HTML = qr!\btext/x?html\b!i; our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; # this is highly opinionated delivery @@ -26,16 +28,6 @@ sub run { my $content_type = $mime->header('Content-Type') || 'text/plain'; - # kill potentially bad/confusing headers - # Note: ssoma already does this, but since we mangle the message, - # we should do this before it gets to ssoma. - # We also kill Mail-{Followup,Reply}-To and Reply-To headers due to - # the nature of public-inbox having no real subscribers. - foreach my $d (qw(status lines content-length - mail-followup-to mail-reply-to reply-to)) { - $mime->header_set($d); - } - if ($content_type =~ m!\btext/plain\b!i) { return 1; # yay, nothing to do } elsif ($content_type =~ $MIME_HTML) { @@ -49,6 +41,7 @@ sub run { } elsif ($content_type =~ m!\bmultipart/!i) { return strip_multipart($mime, $content_type, $filter); } else { + $filter->reject(TEXT_ONLY) if $filter; replace_body($mime, "$content_type message scrubbed"); return 0; } @@ -108,10 +101,7 @@ sub dump_html { } } -# this is to correct user errors and not expected to cover all corner cases -# if users don't want to hit this, they should be sending text/plain messages -# unfortunately, too many people send HTML mail and we'll attempt to convert -# it to something safer, smaller and harder-to-spy-on-users-with. +# this is to correct old archives during import. sub strip_multipart { my ($mime, $content_type, $filter) = @_; @@ -152,7 +142,8 @@ sub strip_multipart { if (recheck_type_ok($part)) { push @keep, $part; } elsif ($filter) { - $filter->reject('no attachments') + $filter->reject("Bad attachment: $part_type ". + TEXT_ONLY); } else { $rejected++; } @@ -163,8 +154,10 @@ sub strip_multipart { if ($rejected == 0 && !@html) { push @keep, $part; } + } elsif ($filter) { + $filter->reject("unacceptable mime-type: $part_type ". + TEXT_ONLY); } else { - $filter->reject('no attachments') if $filter; # reject everything else, including non-PGP signatures $rejected++; } @@ -211,7 +204,7 @@ sub collapse { $mime->header_set('Content-Type', $part->content_type); $mime->body_set($part->body_raw); my $cte = $part->header('Content-Transfer-Encoding'); - if (defined($cte) && length($cte)) { + if (defined($cte) && $cte ne '') { $mime->header_set('Content-Transfer-Encoding', $cte); } mark_changed($mime); @@ -233,8 +226,7 @@ sub replace_body { sub recheck_type_ok { my ($part) = @_; my $s = $part->body; - ((bytes::length($s) < 0x10000) && - ($s =~ /\A([\P{XPosixPrint}\f\n\r\t]+)\z/)) + ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s)); } 1;