1 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
3 # This library is free software; you can redistribute it and/or modify
4 # it under the same terms as Perl itself.
6 # License: GPL-1.0+ or Artistic-1.0-Perl
7 # <https://www.gnu.org/licenses/gpl-1.0.txt>
8 # <https://dev.perl.org/licenses/artistic.html>
10 # This license differs from the rest of public-inbox
12 # This is a fork of the Email::MIME::ContentType 1.022 with
13 # minor improvements and incompatibilities; namely changes to
14 # quiet warnings with legacy data.
15 package PublicInbox::EmlContentFoo;
17 use parent qw(Exporter);
18 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
20 use Encode 2.87 qw(find_mime_encoding);
21 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
23 our $STRICT_PARAMS = 1;
25 my $ct_default = 'text/plain; charset=us-ascii';
27 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
28 qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
30 my $re_token_non_strict = # allow CTLs and above ASCII
31 qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
33 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
34 qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
35 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
36 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
38 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
39 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
40 my $re_quoted_string_non_strict =
41 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
43 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
44 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
45 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
47 sub parse_content_type {
50 # If the header isn't there or is empty, give default answer.
51 $ct = $ct_default unless defined($ct) && length($ct);
56 # It is also recommend (sic.) that this default be assumed when a
57 # syntactically invalid Content-Type header field is encountered.
58 unless ($ct =~ s/^($re_token)\/($re_token)//) {
59 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
60 ($re_token_non_strict)//x) {
61 #carp "Invalid Content-Type '$ct'";
62 return parse_content_type($ct_default);
66 my ($type, $subtype) = (lc $1, lc $2);
72 if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
73 # carp "Missing ';' before first Content-Type parameter '$ct'";
75 $attributes = _process_rfc2231(_parse_attributes($ct));
81 attributes => $attributes,
83 # This is dumb. Really really dumb. For backcompat. -- rjbs,
86 composite => $subtype,
90 my $cd_default = 'attachment';
92 sub parse_content_disposition {
95 $cd = $cd_default unless defined($cd) && length($cd);
100 unless ($cd =~ s/^($re_token)//) {
101 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
102 #carp "Invalid Content-Disposition '$cd'";
103 return parse_content_disposition($cd_default);
109 _clean_comments($cd);
113 if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
114 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
116 $attributes = _process_rfc2231(_parse_attributes($cd));
121 attributes => $attributes,
126 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
129 sub _clean_comments {
130 my $ret = ($_[0] =~ s/^\s+//);
131 while (length $_[0]) {
132 last unless $_[0] =~ s/^\(//;
134 while (length $_[0]) {
135 my $ch = substr $_[0], 0, 1, '';
138 } elsif ($ch eq ')') {
141 } elsif ($ch eq '\\') {
142 substr $_[0], 0, 1, '';
145 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
146 $ret |= ($_[0] =~ s/^\s+//);
151 sub _process_rfc2231 {
155 foreach (keys %{$attribs}) {
156 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
157 my ($attr, $sec) = ($1, $2);
158 $cont{$attr}->[$sec] = $attribs->{$_};
159 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
160 delete $attribs->{$_};
162 foreach (keys %cont) {
164 $key .= '*' if $encoded{$_};
165 $attribs->{$key} = join '', @{$cont{$_}};
167 foreach (keys %{$attribs}) {
168 next unless $_ =~ m/^(.*)\*$/;
170 next unless $attribs->{$_} =~ m/^$re_exvalue$/;
171 my ($charset, $value) = ($1, $2);
172 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
173 if (length $charset) {
174 my $enc = find_mime_encoding($charset);
176 $value = $enc->decode($value);
178 #carp "Unknown charset '$charset' in
179 #attribute '$key' value";
182 $attribs->{$key} = $value;
183 delete $attribs->{$_};
188 sub _parse_attributes {
190 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
193 s/^;// or $STRICT_PARAMS and do {
194 #carp "Missing semicolon before parameter '$_'";
199 # Some mail software generates a Content-Type like this:
200 # "Content-Type: text/plain;"
201 # RFC 1521 section 3 says a parameter must exist if
202 # there is a semicolon.
203 #carp "Extra semicolon after last parameter" if
208 if (s/^($re_token)=//) {
211 if ($STRICT_PARAMS) {
212 # carp "Illegal parameter '$_'";
215 if (s/^($re_token_non_strict)=//) {
218 unless (s/^([^;=\s]+)\s*=//) {
219 #carp "Cannot parse parameter '$_'";
226 my $value = _extract_attribute_value();
227 $attribs->{$attribute} = $value;
233 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
236 if (s/^($re_token)//) {
238 } elsif (s/^$re_quoted_string//) {
240 $sub =~ s/\\(.)/$1/g;
242 } elsif ($STRICT_PARAMS) {
243 #my $char = substr $_, 0, 1;
244 #carp "Unquoted '$char' not allowed";
246 } elsif (s/^($re_token_non_strict)//) {
248 } elsif (s/^$re_quoted_string_non_strict//) {
250 $sub =~ s/\\(.)/$1/g;
253 my $erased = _clean_comments($_);
254 last if !length $_ or /^;/;
255 if ($STRICT_PARAMS) {
256 #my $char = substr $_, 0, 1;
257 #carp "Extra '$char' found after parameter";
261 # Sometimes semicolon is missing, so check for = char
262 last if m/^$re_token_non_strict=/;
265 $value .= substr $_, 0, 1, '';
272 =func parse_content_type
274 This routine is exported by default.
276 This routine parses email content type headers according to section 5.1 of RFC
277 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
278 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
281 For backward compatibility with a really unfortunate misunderstanding of RFC
282 2045 by the early implementors of this module, C<discrete> and C<composite> are
283 also present in the returned hashref, with the values of C<type> and C<subtype>
286 =func parse_content_disposition
288 This routine is exported by default.
290 This routine parses email Content-Disposition headers according to RFC 2183 and
291 RFC 2231. It returns a hash as above, with entries for the C<type>, and a hash