1 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
3 # This library is free software; you can redistribute it and/or modify
4 # it under the same terms as Perl itself.
6 # License: GPL-1.0+ or Artistic-1.0-Perl
7 # <https://www.gnu.org/licenses/gpl-1.0.txt>
8 # <https://dev.perl.org/licenses/artistic.html>
10 # This license differs from the rest of public-inbox
12 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
14 # This is a fork of the Email::MIME::ContentType 1.022 with
15 # minor improvements and incompatibilities; namely changes to
16 # quiet warnings with legacy data.
17 package PublicInbox::EmlContentFoo;
19 use parent qw(Exporter);
22 # find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
23 # while we support 2.35 shipped with Perl 5.10.1
24 use Encode 2.35 qw(find_encoding);
25 my %mime_name_map; # $enc->mime_name => $enc object
27 eval { Encode->import('find_mime_encoding') };
29 *find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
30 %mime_name_map = map {;
31 my $enc = find_encoding($_);
32 my $m = lc($enc->mime_name // '');
34 } Encode->encodings(':all');
36 # delete fallback for encodings w/o ->mime_name:
37 delete $mime_name_map{''};
39 # an extra alias see Encode::MIME::NAME
40 $mime_name_map{'utf8'} = find_encoding('UTF-8');
44 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
46 our $STRICT_PARAMS = 1;
48 my $ct_default = 'text/plain; charset=us-ascii';
50 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
51 qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
53 my $re_token_non_strict = # allow CTLs and above ASCII
54 qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
56 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
57 qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
58 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
59 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
61 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
62 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
63 my $re_quoted_string_non_strict =
64 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
66 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
67 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
68 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
70 sub parse_content_type {
73 # If the header isn't there or is empty, give default answer.
74 $ct = $ct_default unless defined($ct) && length($ct);
79 # It is also recommend (sic.) that this default be assumed when a
80 # syntactically invalid Content-Type header field is encountered.
81 unless ($ct =~ s/^($re_token)\/($re_token)//) {
82 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
83 ($re_token_non_strict)//x) {
84 #carp "Invalid Content-Type '$ct'";
85 return parse_content_type($ct_default);
89 my ($type, $subtype) = (lc $1, lc $2);
95 if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
96 # carp "Missing ';' before first Content-Type parameter '$ct'";
98 $attributes = _process_rfc2231(_parse_attributes($ct));
104 attributes => $attributes,
106 # This is dumb. Really really dumb. For backcompat. -- rjbs,
109 composite => $subtype,
113 my $cd_default = 'attachment';
115 sub parse_content_disposition {
118 $cd = $cd_default unless defined($cd) && length($cd);
121 _clean_comments($cd);
123 unless ($cd =~ s/^($re_token)//) {
124 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
125 #carp "Invalid Content-Disposition '$cd'";
126 return parse_content_disposition($cd_default);
132 _clean_comments($cd);
136 if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
137 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
139 $attributes = _process_rfc2231(_parse_attributes($cd));
144 attributes => $attributes,
149 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
152 sub _clean_comments {
153 my $ret = ($_[0] =~ s/^\s+//);
154 while (length $_[0]) {
155 last unless $_[0] =~ s/^\(//;
157 while (length $_[0]) {
158 my $ch = substr $_[0], 0, 1, '';
161 } elsif ($ch eq ')') {
164 } elsif ($ch eq '\\') {
165 substr $_[0], 0, 1, '';
168 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
169 $ret |= ($_[0] =~ s/^\s+//);
174 sub _process_rfc2231 {
178 foreach (keys %{$attribs}) {
179 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
180 my ($attr, $sec) = ($1, $2);
181 $cont{$attr}->[$sec] = $attribs->{$_};
182 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
183 delete $attribs->{$_};
185 foreach (keys %cont) {
187 $key .= '*' if $encoded{$_};
188 $attribs->{$key} = join '', @{$cont{$_}};
190 foreach (keys %{$attribs}) {
191 next unless $_ =~ m/^(.*)\*$/;
193 next unless ($attribs->{$_} // '') =~ m/^$re_exvalue$/;
194 my ($charset, $value) = ($1, $2);
195 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
196 if (length $charset) {
197 my $enc = find_mime_encoding($charset);
199 $value = $enc->decode($value);
201 #carp "Unknown charset '$charset' in
202 #attribute '$key' value";
205 $attribs->{$key} = $value;
206 delete $attribs->{$_};
211 sub _parse_attributes {
213 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
216 s/^;// or $STRICT_PARAMS and do {
217 #carp "Missing semicolon before parameter '$_'";
222 # Some mail software generates a Content-Type like this:
223 # "Content-Type: text/plain;"
224 # RFC 1521 section 3 says a parameter must exist if
225 # there is a semicolon.
226 #carp "Extra semicolon after last parameter" if
231 if (s/^($re_token)=//) {
234 if ($STRICT_PARAMS) {
235 # carp "Illegal parameter '$_'";
238 if (s/^($re_token_non_strict)=//) {
241 unless (s/^([^;=\s]+)\s*=//) {
242 #carp "Cannot parse parameter '$_'";
249 my $value = _extract_attribute_value();
250 $attribs->{$attribute} = $value;
256 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
259 if (s/^($re_token)//) {
261 } elsif (s/^$re_quoted_string//) {
263 $sub =~ s/\\(.)/$1/g;
265 } elsif ($STRICT_PARAMS) {
266 #my $char = substr $_, 0, 1;
267 #carp "Unquoted '$char' not allowed";
269 } elsif (s/^($re_token_non_strict)//) {
271 } elsif (s/^$re_quoted_string_non_strict//) {
273 $sub =~ s/\\(.)/$1/g;
276 my $erased = _clean_comments($_);
277 last if !length $_ or /^;/;
278 if ($STRICT_PARAMS) {
279 #my $char = substr $_, 0, 1;
280 #carp "Extra '$char' found after parameter";
284 # Sometimes semicolon is missing, so check for = char
285 last if m/^$re_token_non_strict=/;
288 $value .= substr $_, 0, 1, '';
295 =func parse_content_type
297 This routine is exported by default.
299 This routine parses email content type headers according to section 5.1 of RFC
300 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
301 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
304 For backward compatibility with a really unfortunate misunderstanding of RFC
305 2045 by the early implementors of this module, C<discrete> and C<composite> are
306 also present in the returned hashref, with the values of C<type> and C<subtype>
309 =func parse_content_disposition
311 This routine is exported by default.
313 This routine parses email Content-Disposition headers according to RFC 2183 and
314 RFC 2231. It returns a hash as above, with entries for the C<type>, and a hash