1 # Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
3 # This library is free software; you can redistribute it and/or modify
4 # it under the same terms as Perl itself.
6 # License: GPL-1.0+ or Artistic-1.0-Perl
7 # <https://www.gnu.org/licenses/gpl-1.0.txt>
8 # <https://dev.perl.org/licenses/artistic.html>
10 # This license differs from the rest of public-inbox
12 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
14 # This is a fork of the Email::MIME::ContentType 1.022 with
15 # minor improvements and incompatibilities; namely changes to
16 # quiet warnings with legacy data.
17 package PublicInbox::EmlContentFoo;
19 use parent qw(Exporter);
22 # find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
23 # while we support 2.35 shipped with Perl 5.10.1
24 use Encode 2.35 qw(find_encoding);
25 my %mime_name_map; # $enc->mime_name => $enc object
27 eval { Encode->import('find_mime_encoding') };
29 *find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
30 %mime_name_map = map {;
31 my $enc = find_encoding($_);
32 my $m = lc($enc->mime_name // '');
34 } Encode->encodings(':all');
36 # delete fallback for encodings w/o ->mime_name:
37 delete $mime_name_map{''};
39 # an extra alias see Encode::MIME::NAME
40 $mime_name_map{'utf8'} = find_encoding('UTF-8');
44 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
46 our $STRICT_PARAMS = 1;
48 my $ct_default = 'text/plain; charset=us-ascii';
50 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
51 qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
53 my $re_token_non_strict = # allow CTLs and above ASCII
54 qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
56 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
57 qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
58 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
59 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
61 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
62 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
63 my $re_quoted_string_non_strict =
64 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
66 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
67 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
68 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
70 sub parse_content_type {
73 # If the header isn't there or is empty, give default answer.
74 $ct = $ct_default unless defined($ct) && length($ct);
79 # It is also recommend (sic.) that this default be assumed when a
80 # syntactically invalid Content-Type header field is encountered.
81 unless ($ct =~ s/^($re_token)\/($re_token)//) {
82 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
83 ($re_token_non_strict)//x) {
84 #carp "Invalid Content-Type '$ct'";
85 return parse_content_type($ct_default);
89 my ($type, $subtype) = (lc $1, lc $2);
95 if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
96 # carp "Missing ';' before first Content-Type parameter '$ct'";
98 $attributes = _process_rfc2231(_parse_attributes($ct));
104 attributes => $attributes,
108 my $cd_default = 'attachment';
110 sub parse_content_disposition {
113 $cd = $cd_default unless defined($cd) && length($cd);
116 _clean_comments($cd);
118 unless ($cd =~ s/^($re_token)//) {
119 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
120 #carp "Invalid Content-Disposition '$cd'";
121 return parse_content_disposition($cd_default);
127 _clean_comments($cd);
131 if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
132 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
134 $attributes = _process_rfc2231(_parse_attributes($cd));
139 attributes => $attributes,
144 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
147 sub _clean_comments {
148 my $ret = ($_[0] =~ s/^\s+//);
149 while (length $_[0]) {
150 last unless $_[0] =~ s/^\(//;
152 while (length $_[0]) {
153 my $ch = substr $_[0], 0, 1, '';
156 } elsif ($ch eq ')') {
159 } elsif ($ch eq '\\') {
160 substr $_[0], 0, 1, '';
163 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
164 $ret |= ($_[0] =~ s/^\s+//);
169 sub _process_rfc2231 {
173 foreach (keys %{$attribs}) {
174 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
175 my ($attr, $sec) = ($1, $2);
176 $cont{$attr}->[$sec] = $attribs->{$_};
177 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
178 delete $attribs->{$_};
180 foreach (keys %cont) {
182 $key .= '*' if $encoded{$_};
183 $attribs->{$key} = join '', @{$cont{$_}};
185 foreach (keys %{$attribs}) {
186 next unless $_ =~ m/^(.*)\*$/;
188 next unless ($attribs->{$_} // '') =~ m/^$re_exvalue$/;
189 my ($charset, $value) = ($1, $2);
190 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
191 if (length $charset) {
192 my $enc = find_mime_encoding($charset);
194 $value = $enc->decode($value);
196 #carp "Unknown charset '$charset' in
197 #attribute '$key' value";
200 $attribs->{$key} = $value;
201 delete $attribs->{$_};
206 sub _parse_attributes {
208 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
211 s/^;// or $STRICT_PARAMS and do {
212 #carp "Missing semicolon before parameter '$_'";
217 # Some mail software generates a Content-Type like this:
218 # "Content-Type: text/plain;"
219 # RFC 1521 section 3 says a parameter must exist if
220 # there is a semicolon.
221 #carp "Extra semicolon after last parameter" if
226 if (s/^($re_token)=//) {
229 if ($STRICT_PARAMS) {
230 # carp "Illegal parameter '$_'";
233 if (s/^($re_token_non_strict)=//) {
236 unless (s/^([^;=\s]+)\s*=//) {
237 #carp "Cannot parse parameter '$_'";
244 my $value = _extract_attribute_value();
245 $attribs->{$attribute} = $value;
251 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
254 if (s/^($re_token)//) {
256 } elsif (s/^$re_quoted_string//) {
258 $sub =~ s/\\(.)/$1/g;
260 } elsif ($STRICT_PARAMS) {
261 #my $char = substr $_, 0, 1;
262 #carp "Unquoted '$char' not allowed";
264 } elsif (s/^($re_token_non_strict)//) {
266 } elsif (s/^$re_quoted_string_non_strict//) {
268 $sub =~ s/\\(.)/$1/g;
271 my $erased = _clean_comments($_);
272 last if !length $_ or /^;/;
273 if ($STRICT_PARAMS) {
274 #my $char = substr $_, 0, 1;
275 #carp "Extra '$char' found after parameter";
279 # Sometimes semicolon is missing, so check for = char
280 last if m/^$re_token_non_strict=/;
283 $value .= substr $_, 0, 1, '';
290 =func parse_content_type
292 This routine is exported by default.
294 This routine parses email content type headers according to section 5.1 of RFC
295 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
296 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
299 =func parse_content_disposition
301 This routine is exported by default.
303 This routine parses email Content-Disposition headers according to RFC 2183 and
304 RFC 2231. It returns a hash as above, with entries for the C<type>, and a hash