Sergey Matveev's repositories - public-inbox.git/commitdiff
filter: use file(1) to detect mime type if octet-stream
author Eric Wong <normalperson@yhbt.net>
Fri, 28 Mar 2014 08:22:45 +0000 (08:22 +0000)
committer Eric Wong <normalperson@yhbt.net>
Fri, 28 Mar 2014 08:22:45 +0000 (08:22 +0000)
Some mailers do not correctly detect/set the Content-Type header; so
attempt to keep messages based on our server-detected MIME type if
application/octet-stream was specified.

lib/PublicInbox/Filter.pm
t/filter.t

diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm
index a83ecc83088b696e71f24ab9d7d1f9ed8623c746..64c31e8e3fdb1c46c03dd53c8f20945594a7a9fa 100644
@@ -15,6 +15,8 @@ our $VERSION = '0.0.1';
 
 # start with the same defaults as mailman
 our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i;
+our $MIME_HTML = qr!\btext/html\b!i;
+our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i;
 
 # this is highly opinionated delivery
 # returns 0 only if there is nothing to deliver
@@ -35,7 +37,7 @@ sub run {
 
        if ($content_type =~ m!\btext/plain\b!i) {
                return 1; # yay, nothing to do
-       } elsif ($content_type =~ m!\btext/html\b!i) {
+       } elsif ($content_type =~ $MIME_HTML) {
                # HTML-only, non-multipart
                my $body = $simple->body;
                my $ct_parsed = parse_content_type($content_type);
@@ -129,14 +131,23 @@ sub strip_multipart {
                my $part_type = $part->content_type;
                if ($part_type =~ m!\btext/plain\b!i) {
                        push @keep, $part;
-               } elsif ($part_type =~ m!\btext/html\b!i) {
+               } elsif ($part_type =~ $MIME_HTML) {
                        push @html, $part;
-               } elsif ($part_type =~ m!\btext/[a-z0-9\+\._-]+\b!i) {
+               } elsif ($part_type =~ $MIME_TEXT_ANY) {
                        # Give other text attachments the benefit of the doubt,
                        # here?  Could be source code or script the user wants
                        # help with.
 
                        push @keep, $part;
+               } elsif ($part_type =~ m!\Aapplication/octet-stream\z!i) {
+                       # unfortunately, some mailers don't set correct types,
+                       # let messages of unknown type through but do not
+                       # change the sender-specified type
+                       if (recheck_type_ok($part)) {
+                               push @keep, $part;
+                       } else {
+                               $rejected++;
+                       }
                } else {
                        # reject everything else
                        #
@@ -216,4 +227,23 @@ sub replace_body {
        mark_changed($simple);
 }
 
+# Run file(1) on the part body to detect its MIME type (no shell involved).
+# Not using File::MMagic for now since that requires extra configuration
+# Note: we do not rewrite the message with the detected mime type
+# Returns true only for a detected text/* type other than text/html.
+sub recheck_type_ok {
+       my ($part) = @_;
+       my $pid = open2(my $out, my $in, qw(file --mime-type -b -));
+       my $type = do {
+               # file(1) may exit early on big input: take EPIPE, not SIGPIPE
+               local $SIG{PIPE} = 'IGNORE';
+               print $in $part->body;
+               close $in;
+               local $/;
+               <$out>;
+       };
+       waitpid($pid, 0);
+       # no output from file(1) means unknown type: treat as not OK
+       (defined($type) && ($type =~ $MIME_TEXT_ANY) && ($type !~ $MIME_HTML))
+}
+
 1;
diff --git a/t/filter.t b/t/filter.t
index 12f4ed6f1f84e52ace60eab5591d23b8b38c7773..0aa26a5f88977105857a804a8c90e226f40ceb66 100644
@@ -278,5 +278,40 @@ sub count_body_parts {
        is(undef, $f->simple->header("Mail-Followup-To"), "mft stripped");
 }
 
+# multi-part with application/octet-stream
+{
+       my $os = 'application/octet-stream';
+       my $parts = [
+               Email::MIME->create(
+                       attributes => { content_type => $os },
+                       body => <<EOF
+#include <stdio.h>
+int main(void)
+{
+       printf("Hello world\\n");
+       return 0;
+}
+EOF
+               ),
+               Email::MIME->create(
+                       attributes => {
+                               filename => 'zero.data',
+                               encoding => 'base64',
+                               content_type => $os,
+                       },
+                       body => ("\0" x 4096),
+               )
+       ];
+       my $email = Email::MIME->create(
+               header_str => [ From => 'a@example.com', Subject => 'blah' ],
+               parts => $parts,
+       );
+       my $f = Email::Filter->new(data => $email->as_string);
+       is(1, PublicInbox::Filter->run($f->simple), "run was a success");
+       my $parsed = Email::MIME->new($f->simple->as_string);
+       is(scalar $parsed->parts, 1, "only one remaining part");
+       like($f->simple->header("X-Content-Filtered-By"),
+               qr/PublicInbox::Filter/, "XCFB header added");
+}
 
 done_testing();