1 # Copyright (C) 2013-2015 all contributors <meta@public-inbox.org>
2 # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
4 # Used to filter incoming mail for -mda and importers
5 # This only exposes one function: run
6 # Note: the settings here are highly opinionated. Obviously, this is
7 # Free Software (AGPLv3), so you may change it if you host yourself.
8 package PublicInbox::Filter;
12 use Email::MIME::ContentType qw/parse_content_type/;
# Module version; it is interpolated into the X-Content-Filtered-By header
# value further down in this file.
15 our $VERSION = '0.0.1';
# Messages passed to $filter->reject when HTML (NO_HTML) or some other
# non-plain-text content type (TEXT_ONLY) is encountered.
16 use constant NO_HTML => '*** We only accept plain-text email, no HTML ***';
17 use constant TEXT_ONLY => '*** We only accept plain-text email ***';
19 # start with the same defaults as mailman
# $BAD_EXT: case-insensitive match for a filename that ends in one of the
# listed extensions (trailing whitespace is tolerated); the extension
# itself is captured in $1.
20 our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i;
# $MIME_HTML: matches "text/html" as well as "text/xhtml", case-insensitively.
21 our $MIME_HTML = qr!\btext/x?html\b!i;
# $MIME_TEXT_ANY: matches any "text/<subtype>" whose subtype is made of
# letters, digits, "+", ".", "_" or "-".
22 our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i;
# Entry point of the filter (the "run" function mentioned in the file header).
# Arguments: ($class, $mime, $filter)
#   $mime   - the message object; header, header_set and body are called on
#             it (presumably an Email::MIME -- confirm against callers)
#   $filter - optional; when true, its reject method is called with a
#             message for content that is not plain text
# Dispatches on the Content-Type header: text/plain is accepted as-is, an
# HTML-only message is converted via dump_html, multipart messages are
# handed to strip_multipart, and anything else has its body replaced by a
# "... message scrubbed" placeholder.
24 # this is highly opinionated delivery
25 # returns 0 only if there is nothing to deliver
27 my ($class, $mime, $filter) = @_;
# a missing or empty Content-Type header is treated as text/plain
29 my $content_type = $mime->header('Content-Type') || 'text/plain';
31 # kill potentially bad/confusing headers
32 # Note: ssoma already does this, but since we mangle the message,
33 # we should do this before it gets to ssoma.
34 # We also kill Mail-{Followup,Reply}-To headers due to
35 # the nature of public-inbox having no real subscribers.
# NOTE(review): the list below only names status, lines and content-length;
# no Mail-Followup-To / Mail-Reply-To entry is visible here -- confirm
# whether the sentence above is stale or the list is incomplete.
36 foreach my $d (qw(status lines content-length)) {
# header_set is called with a name only (no value) for each listed header
37 $mime->header_set($d);
40 if ($content_type =~ m!\btext/plain\b!i) {
41 return 1; # yay, nothing to do
42 } elsif ($content_type =~ $MIME_HTML) {
# the whole message is HTML: ask the filter (if any) to reject it; the
# conversion below runs when no $filter was given or reject returned
43 $filter->reject(NO_HTML) if $filter;
44 # HTML-only, non-multipart
45 my $body = $mime->body;
46 my $ct_parsed = parse_content_type($content_type);
# dump_html rewrites $body through the reference, using the charset
# attribute parsed out of the Content-Type header
47 dump_html(\$body, $ct_parsed->{attributes}->{charset});
48 replace_body($mime, $body);
50 } elsif ($content_type =~ m!\bmultipart/!i) {
51 return strip_multipart($mime, $content_type, $filter);
# any remaining content type: reject if a filter is present, otherwise
# replace the body with a short placeholder naming the original type
53 $filter->reject(TEXT_ONLY) if $filter;
54 replace_body($mime, "$content_type message scrubbed");
# Rewrites $part in place with a new body and content type.
# Arguments (positional): $mime, $part, body, $type -- this is the order
# html_part_to_text uses when it calls replace_part.
# The part's charset, name, filename, format and disposition are cleared,
# its encoding is set to 8bit, its content type to $type and its body to
# the third argument.
60 my ($mime, $part, $type) = ($_[0], $_[1], $_[3]);
61 # don't copy $_[2], that's the body (it may be huge)
63 # Email::MIME insists on setting Date:, so just set it consistently
64 # to avoid git merge conflicts in a split brain
# only parts without a Date header are touched; the value comes from the
# enclosing message, or a fixed epoch timestamp when that is unset/empty
66 unless (defined $part->header('Date')) {
67 my $date = $mime->header('Date') ||
68 'Thu, 01 Jan 1970 00:00:00 +0000';
69 $part->header_set('Date', $date);
72 $part->charset_set(undef);
73 $part->name_set(undef);
74 $part->filename_set(undef);
75 $part->format_set(undef);
76 $part->encoding_set('8bit');
77 $part->disposition_set(undef);
78 $part->content_type_set($type);
# body is read straight from @_ so the (possibly large) string is not copied
79 $part->body_set($_[2]);
82 # converts one HTML part of a multipart message to text/plain, in place
# Arguments: ($mime, $part) -- the enclosing message and the part to convert.
83 sub html_part_to_text {
84 my ($mime, $part) = @_;
85 my $body = $part->body;
# pull the charset attribute out of the part's own Content-Type
86 my $ct_parsed = parse_content_type($part->content_type);
# dump_html rewrites $body through the reference
87 dump_html(\$body, $ct_parsed->{attributes}->{charset});
# swap the converted text into the part and relabel it as text/plain
88 replace_part($mime, $part, $body, 'text/plain');
# Converts HTML to text by piping it through lynx.
# Arguments: ($body, $charset) -- $body is a REFERENCE to the HTML string
# (callers pass \$body); $charset is optional.
91 # modifies the string referenced by $_[0] in place
93 my ($body, $charset) = @_;
# an undefined or empty charset falls back to US-ASCII
94 $charset ||= 'US-ASCII';
# command is kept as a list and handed to IPC::Run::run as an array ref
95 my @cmd = qw(lynx -stdin -stderr -dump);
99 # be careful about remote command injection!
# the charset comes from the message, so it is only passed on when it
# consists solely of ASCII letters, digits and hyphens
100 if ($charset =~ /\A([A-Za-z0-9\-]+)\z/) {
101 push @cmd, "-assume_charset=$charset";
# $body (the scalar ref) is the command's input; $out and $err receive its
# output streams (their declarations are on lines not shown here)
103 if (IPC::Run::run(\@cmd, $body, \$out, \$err)) {
# normalise CRLF line endings in the converted text; presumably $out then
# replaces $$body -- that assignment is not visible here
104 $out =~ s/\r\n/\n/sg;
107 # give them an ugly version:
# on failure the body becomes an error notice that includes lynx's stderr
108 $$body = "public-inbox HTML conversion failed: $err\n" .
# Filters the parts of a multipart message.
# Arguments: ($mime, $content_type, $filter) -- same meaning as in the
# caller: the message, its Content-Type header value, and an optional
# object whose reject method is called for unacceptable parts.
# Relies on @keep, @html, $rejected and $part, which are declared on lines
# not shown here; presumably @keep collects parts to retain, @html collects
# HTML parts, and $rejected counts dropped parts -- confirm.
113 # this is to correct old archives during import.
114 sub strip_multipart {
115 my ($mime, $content_type, $filter) = @_;
121 # scan through all parts once
122 $mime->walk_parts(sub {
124 return if $part->subparts; # walk_parts already recurses
126 # some extensions are just bad, reject them outright
127 my $fn = $part->filename;
128 if (defined($fn) && $fn =~ $BAD_EXT) {
# $1 is the extension captured by $BAD_EXT
129 $filter->reject("Bad file type: $1") if $filter;
# a part without a content type is classified using the empty string
134 my $part_type = $part->content_type || '';
# classification order matters: text/plain and HTML are tested before the
# generic text/* pattern, which would otherwise match them too
135 if ($part_type =~ m!\btext/plain\b!i) {
137 } elsif ($part_type =~ $MIME_HTML) {
138 $filter->reject(NO_HTML) if $filter;
140 } elsif ($part_type =~ $MIME_TEXT_ANY) {
141 # Give other text attachments the benefit of the doubt,
142 # here? Could be source code or script the user wants
146 } elsif ($part_type eq '' ||
147 $part_type =~ m!\bapplication/octet-stream\b!i) {
148 # unfortunately, some mailers don't set correct types,
149 # let messages of unknown type through but do not
150 # change the sender-specified type
# recheck_type_ok decides whether the untyped/octet-stream part looks
# like displayable text
151 if (recheck_type_ok($part)) {
154 $filter->reject("Bad attachment: $part_type ".
159 } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) {
160 # PGP signatures are not huge, we may keep them.
161 # They can only be valid if it's the last element,
162 # so we keep them iff the message is unmodified:
163 if ($rejected == 0 && !@html) {
167 $filter->reject("unacceptable mime-type: $part_type ".
170 # reject everything else, including non-PGP signatures
# after the walk: multipart/alternative gets special treatment
175 if ($content_type =~ m!\bmultipart/alternative\b!i) {
# exactly one kept alternative: collapse the container into that part
176 if (scalar @keep == 1) {
177 return collapse($mime, $keep[0]);
179 } else { # convert HTML parts to plain text
180 foreach my $part (@html) {
181 html_part_to_text($mime, $part);
# @keep is replaced by a single newly created US-ASCII text/plain notice
# part; the condition guarding this is on lines not shown here
187 @keep = (Email::MIME->create(
189 content_type => 'text/plain',
190 charset => 'US-ASCII',
193 body_str => 'all attachments scrubbed by '. __PACKAGE__
# only rewrite the message when something was converted or dropped
197 if (scalar(@html) || $rejected) {
198 $mime->parts_set(\@keep);
199 $mime->body_set($mime->body_raw);
208 $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION");
# Makes $mime take on the content of a single $part.
# Arguments: ($mime, $part). strip_multipart calls this when exactly one
# part of a multipart/alternative message is kept.
212 my ($mime, $part) = @_;
# the message adopts the part's content type and its raw body
213 $mime->header_set('Content-Type', $part->content_type);
214 $mime->body_set($part->body_raw);
# the part's transfer encoding is copied over too, so that it still
# describes the raw body -- but only when it is defined and non-empty
215 my $cte = $part->header('Content-Transfer-Encoding');
216 if (defined($cte) && $cte ne '') {
217 $mime->header_set('Content-Transfer-Encoding', $cte);
# Replaces the whole message body with the text in $_[1] and relabels the
# message as text/plain. Callers pass ($mime, $body); $mime is bound on a
# line not shown here (presumably from $_[0] -- confirm).
# The body is read directly from @_ rather than a named copy.
225 $mime->body_set($_[1]);
226 $mime->header_set('Content-Type', 'text/plain');
# an existing (true-valued) Content-Transfer-Encoding header is set to undef
227 if ($mime->header('Content-Transfer-Encoding')) {
228 $mime->header_set('Content-Transfer-Encoding', undef);
233 # Check for display-able text, no messed up binaries
234 # Note: we can not rewrite the message with the detected mime type
# Called by strip_multipart with a part that has no content type or is
# labelled application/octet-stream.
# $s is assigned on lines not shown here; presumably the part's body -- confirm.
235 sub recheck_type_ok {
# true only when $s is shorter than 0x10000 (65536) and consists entirely
# of printable characters and whitespace; the "+" means an empty string
# does not pass
238 ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s));