1 # Copyright (C) 2013, Eric Wong <normalperson@yhbt.net> and all contributors
2 # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
4 # This only exposes one function: run
5 # Note: the settings here are highly opinionated. Obviously, this is
6 # Free Software (AGPLv3), so you may change it if you host yourself.
7 package PublicInbox::Filter;
11 use Email::MIME::ContentType qw/parse_content_type/;
14 our $VERSION = '0.0.1';
16 # start with the same defaults as mailman
# $BAD_EXT: case-insensitive match for a name ending in one of the listed
# executable/script extensions (anchored at the end of the string with \z).
17 our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i;
# $MIME_HTML: case-insensitive match for "text/html" anywhere in a
# Content-Type value.
18 our $MIME_HTML = qr!\btext/html\b!i;
# $MIME_TEXT_ANY: case-insensitive match for any "text/<subtype>" whose
# subtype is made of letters, digits, "+", ".", "_" or "-".
19 our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i;
21 # this is highly opinionated delivery
22 # returns 0 only if there is nothing to deliver
# Takes ($class, $simple).  $simple is the message object; its headers and
# body are modified in place by the code below.
24 my ($class, $simple) = @_;
# a message without a Content-Type header is handled as text/plain
26 my $content_type = $simple->header("Content-Type") || "text/plain";
28 # kill potentially bad/confusing headers
29 # Note: ssoma already does this, but since we mangle the message,
30 # we should do this before it gets to ssoma.
31 # We also kill Mail-{Followup,Reply}-To and Reply-To headers due to
32 # the nature of public-inbox having no real subscribers.
# header_set() is called with the header name only (no value) for each entry
33 foreach my $d (qw(status lines content-length
34 mail-followup-to mail-reply-to reply-to)) {
35 $simple->header_set($d);
# dispatch on the top-level content type
38 if ($content_type =~ m!\btext/plain\b!i) {
39 return 1; # yay, nothing to do
40 } elsif ($content_type =~ $MIME_HTML) {
41 # HTML-only, non-multipart
42 my $body = $simple->body;
43 my $ct_parsed = parse_content_type($content_type);
# relies on dump_html() modifying its first argument in place: $body is
# passed in as HTML and then handed to replace_body() as the new body.
# The charset attribute comes from the parsed Content-Type and may be undef.
44 dump_html($body, $ct_parsed->{attributes}->{charset});
45 replace_body($simple, $body);
47 } elsif ($content_type =~ m!\bmultipart/!i) {
# multipart messages are handled entirely by strip_multipart(), whose
# return value is passed straight back to the caller
48 return strip_multipart($simple, $content_type);
# the notice text given to replace_body() names the original content type
50 replace_body($simple, "$content_type message scrubbed");
# Positional arguments: $_[0] the enclosing message, $_[1] the part to
# rewrite, $_[2] the new body, $_[3] the new content type.
56 my ($simple, $part, $type) = ($_[0], $_[1], $_[3]);
57 # don't copy $_[2], that's the body (it may be huge)
59 # Email::MIME insists on setting Date:, so just set it consistently
60 # to avoid git merge conflicts in a split-brain situation.
62 unless (defined $part->header('Date')) {
# prefer the enclosing message's Date; fall back to the Unix epoch
63 my $date = $simple->header('Date') ||
64 'Thu, 01 Jan 1970 00:00:00 +0000';
65 $part->header_set('Date', $date);
# reset charset, name, filename, format and disposition, force the 8bit
# transfer encoding, then install the new content type and body
68 $part->charset_set(undef);
69 $part->name_set(undef);
70 $part->filename_set(undef);
71 $part->format_set(undef);
72 $part->encoding_set('8bit');
73 $part->disposition_set(undef);
74 $part->content_type_set($type);
# the body is read directly from $_[2] so it is never copied into a lexical
75 $part->body_set($_[2]);
78 # converts one part of a multipart message to text
# Arguments: $simple (the enclosing message) and $part (the part to convert).
# The part's body is run through dump_html(), which is also given the charset
# attribute parsed from the part's Content-Type, and the result is written
# back through replace_part() with the type text/plain.
79 sub html_part_to_text {
80 my ($simple, $part) = @_;
81 my $body = $part->body;
82 my $ct_parsed = parse_content_type($part->content_type);
# relies on dump_html() modifying $body in place; charset may be undef
83 dump_html($body, $ct_parsed->{attributes}->{charset});
84 replace_part($simple, $part, $body, 'text/plain');
87 # modifies $_[0] in place
# $_[1] is an optional charset name; US-ASCII is used when it is false
89 my $charset = $_[1] || 'US-ASCII';
90 my $cmd = "lynx -stdin -dump";
92 # be careful about remote command injection!
# the charset is appended only when it consists solely of ASCII letters,
# digits and "-", because $cmd is built as one interpolated command string
93 if ($charset =~ /\A[A-Za-z0-9\-]+\z/) {
94 $cmd .= " -assume_charset=$charset";
# NOTE(review): open2 is presumably IPC::Open2::open2 (its import is not
# visible here); $out and $in would then be the child's stdout and stdin
# handles and $pid its process id -- confirm
97 my $pid = open2(my $out, my $in, $cmd);
107 # this is to correct user errors and not expected to cover all corner cases
108 # if users don't want to hit this, they should be sending text/plain messages
109 # unfortunately, too many people send HTML mail and we'll attempt to convert
110 # it to something safer, smaller and harder-to-track.
# Arguments: $simple (the message being filtered) and $content_type (its
# top-level Content-Type value).
111 sub strip_multipart {
112 my ($simple, $content_type) = @_;
# re-parse the serialized message with Email::MIME to get at its parts
113 my $mime = Email::MIME->new($simple->as_string);
119 # scan through all parts once
120 $mime->walk_parts(sub {
122 return if $part->subparts; # walk_parts already recurses
124 # some extensions are just bad, reject them outright
125 my $fn = $part->filename;
126 if (defined($fn) && $fn =~ $BAD_EXT) {
# classify the leaf part by its Content-Type value
131 my $part_type = $part->content_type;
132 if ($part_type =~ m!\btext/plain\b!i) {
134 } elsif ($part_type =~ $MIME_HTML) {
136 } elsif ($part_type =~ $MIME_TEXT_ANY) {
137 # Give other text attachments the benefit of the doubt,
138 # here? Could be source code or script the user wants
# NOTE(review): this test and the pgp-signature test below are anchored
# with \A...\z, unlike the \b-delimited text/* tests above.  A Content-Type
# value that carries parameters (html_part_to_text parses a charset
# attribute out of the same kind of value) will match neither of them --
# confirm that this strictness is intended.
142 } elsif ($part_type =~ m!\Aapplication/octet-stream\z!i) {
143 # unfortunately, some mailers don't set correct types,
144 # let messages of unknown type through but do not
145 # change the sender-specified type
146 if (recheck_type_ok($part)) {
151 } elsif ($part_type =~ m!\Aapplication/pgp-signature\z!i) {
152 # PGP signatures are not huge, we may keep them.
153 # They can only be valid if it's the last element,
154 # so we keep them iff the message is unmodified:
155 if ($rejected == 0 && !@html) {
159 # reject everything else, including non-PGP signatures
# multipart/alternative with exactly one kept part: hand the message and
# that single part to collapse() and return its result
164 if ($content_type =~ m!\bmultipart/alternative\b!i) {
165 if (scalar @keep == 1) {
166 return collapse($simple, $keep[0]);
168 } else { # convert HTML parts to plain text
169 foreach my $part (@html) {
170 html_part_to_text($simple, $part);
# @keep is replaced by a single generated text/plain (US-ASCII) part whose
# body is a notice naming this package
176 @keep = (Email::MIME->create(
178 content_type => 'text/plain',
179 charset => 'US-ASCII',
182 body_str => 'all attachments scrubbed by '. __PACKAGE__
# when any HTML part was collected or anything was rejected, rebuild the
# MIME structure from @keep, copy its raw body back into $simple and mark
# the message as changed
186 if (scalar(@html) || $rejected) {
187 $mime->parts_set(\@keep);
188 $simple->body_set($mime->body_raw);
189 mark_changed($simple);
# Stamp the message with an X-Content-Filtered-By header whose value is this
# package name followed by a space and $VERSION.
197 $simple->header_set("X-Content-Filtered-By", __PACKAGE__ ." $VERSION");
# Takes ($simple, $part): copies the part's Content-Type value and raw body
# onto the message itself, then calls mark_changed() on the message.
201 my ($simple, $part) = @_;
202 $simple->header_set("Content-Type", $part->content_type);
203 $simple->body_set($part->body_raw);
204 mark_changed($simple);
# the new body is read straight from $_[1] rather than copied into a lexical
210 $simple->body_set($_[1]);
# the replacement body is always labelled text/plain
211 $simple->header_set("Content-Type", "text/plain");
# when a Content-Transfer-Encoding header is present, header_set() is called
# on it with undef as the value
212 if ($simple->header("Content-Transfer-Encoding")) {
213 $simple->header_set("Content-Transfer-Encoding", undef);
215 mark_changed($simple);
218 # Check for display-able text, no messed up binaries
219 # Note: we can not rewrite the message with the detected mime type
# Evaluates to true only when $s is shorter than 0x10000 bytes and consists
# entirely of printable characters plus form feed, newline, carriage return
# and tab.
220 sub recheck_type_ok {
223 ((bytes::length($s) < 0x10000) &&
# \p{XPosixPrint} (lower-case p) is the printable set.  The complement
# \P{XPosixPrint} must not be used here: it already contains \f\n\r\t and
# would reject any body that contains ordinary printable text.
224 ($s =~ /\A([\p{XPosixPrint}\f\n\r\t]+)\z/))