lib/PublicInbox/EmlContentFoo.pm

   1 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
   2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
   3 # This library is free software; you can redistribute it and/or modify
   4 # it under the same terms as Perl itself.
   5 #
   6 # License: GPL-1.0+ or Artistic-1.0-Perl
   7 #  <https://www.gnu.org/licenses/gpl-1.0.txt>
   8 #  <https://dev.perl.org/licenses/artistic.html>
   9 #
  10 # This license differs from the rest of public-inbox
  11 #
  12 # This is a fork of the Email::MIME::ContentType 1.022 with
  13 # minor improvements and incompatibilities; namely changes to
  14 # quiet warnings with legacy data.
  15 package PublicInbox::EmlContentFoo;
  16 use strict;
  17 use parent qw(Exporter);
  18 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
  19
  20 use Encode 2.87 qw(find_mime_encoding);
  21 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
  22
  23 our $STRICT_PARAMS = 1;
  24
  25 my $ct_default = 'text/plain; charset=us-ascii';
  26
  27 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
  28         qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
  29
  30 my $re_token_non_strict = # allow CTLs and above ASCII
  31         qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
  32
  33 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
  34         qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
  35 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
  36 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
  37
  38 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
  39 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
  40 my $re_quoted_string_non_strict =
  41 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
  42
  43 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
  44 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
  45 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
  46
  47 sub parse_content_type {
  48         my ($ct) = @_;
  49
  50         # If the header isn't there or is empty, give default answer.
  51         $ct = $ct_default unless defined($ct) && length($ct);
  52
  53         _unfold_lines($ct);
  54         _clean_comments($ct);
  55
  56         # It is also recommend (sic.) that this default be assumed when a
  57         # syntactically invalid Content-Type header field is encountered.
  58         unless ($ct =~ s/^($re_token)\/($re_token)//) {
  59                 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
  60                                                 ($re_token_non_strict)//x) {
  61                         #carp "Invalid Content-Type '$ct'";
  62                         return parse_content_type($ct_default);
  63                 }
  64         }
  65
  66         my ($type, $subtype) = (lc $1, lc $2);
  67
  68         _clean_comments($ct);
  69         $ct =~ s/\s+$//;
  70
  71         my $attributes = {};
  72         if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
  73                 # carp "Missing ';' before first Content-Type parameter '$ct'";
  74         } else {
  75                 $attributes = _process_rfc2231(_parse_attributes($ct));
  76         }
  77
  78         {
  79                 type       => $type,
  80                 subtype => $subtype,
  81                 attributes => $attributes,
  82
  83                 # This is dumb.  Really really dumb.  For backcompat. -- rjbs,
  84                 # 2013-08-10
  85                 discrete   => $type,
  86                 composite  => $subtype,
  87         };
  88 }
  89
  90 my $cd_default = 'attachment';
  91
  92 sub parse_content_disposition {
  93         my ($cd) = @_;
  94
  95         $cd = $cd_default unless defined($cd) && length($cd);
  96
  97         _unfold_lines($cd);
  98         _clean_comments($cd);
  99
 100         unless ($cd =~ s/^($re_token)//) {
 101                 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
 102                         #carp "Invalid Content-Disposition '$cd'";
 103                         return parse_content_disposition($cd_default);
 104                 }
 105         }
 106
 107         my $type = lc $1;
 108
 109         _clean_comments($cd);
 110         $cd =~ s/\s+$//;
 111
 112         my $attributes = {};
 113         if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
 114 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
 115         } else {
 116                 $attributes = _process_rfc2231(_parse_attributes($cd));
 117         }
 118
 119         {
 120                 type       => $type,
 121                 attributes => $attributes,
 122         };
 123 }
 124
 125 sub _unfold_lines {
 126         $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
 127 }
 128
 129 sub _clean_comments {
 130         my $ret = ($_[0] =~ s/^\s+//);
 131         while (length $_[0]) {
 132                 last unless $_[0] =~ s/^\(//;
 133                 my $level = 1;
 134                 while (length $_[0]) {
 135                         my $ch = substr $_[0], 0, 1, '';
 136                         if ($ch eq '(') {
 137                                 $level++;
 138                         } elsif ($ch eq ')') {
 139                                 $level--;
 140                                 last if $level == 0;
 141                         } elsif ($ch eq '\\') {
 142                                 substr $_[0], 0, 1, '';
 143                         }
 144                 }
 145                 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
 146                 $ret |= ($_[0] =~ s/^\s+//);
 147         }
 148         $ret;
 149 }
 150
 151 sub _process_rfc2231 {
 152         my ($attribs) = @_;
 153         my %cont;
 154         my %encoded;
 155         foreach (keys %{$attribs}) {
 156                 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
 157                 my ($attr, $sec) = ($1, $2);
 158                 $cont{$attr}->[$sec] = $attribs->{$_};
 159                 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
 160                 delete $attribs->{$_};
 161         }
 162         foreach (keys %cont) {
 163                 my $key = $_;
 164                 $key .= '*' if $encoded{$_};
 165                 $attribs->{$key} = join '', @{$cont{$_}};
 166         }
 167         foreach (keys %{$attribs}) {
 168                 next unless $_ =~ m/^(.*)\*$/;
 169                 my $key = $1;
 170                 next unless $attribs->{$_} =~ m/^$re_exvalue$/;
 171                 my ($charset, $value) = ($1, $2);
 172                 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
 173                 if (length $charset) {
 174                         my $enc = find_mime_encoding($charset);
 175                         if (defined $enc) {
 176                                 $value = $enc->decode($value);
 177                         # } else {
 178                                 #carp "Unknown charset '$charset' in
 179                                 #attribute '$key' value";
 180                         }
 181                 }
 182                 $attribs->{$key} = $value;
 183                 delete $attribs->{$_};
 184         }
 185         $attribs;
 186 }
 187
 188 sub _parse_attributes {
 189         local $_ = shift;
 190         substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
 191         my $attribs = {};
 192         while (length $_) {
 193                 s/^;// or $STRICT_PARAMS and do {
 194                         #carp "Missing semicolon before parameter '$_'";
 195                         return $attribs;
 196                 };
 197                 _clean_comments($_);
 198                 unless (length $_) {
 199                         # Some mail software generates a Content-Type like this:
 200                         # "Content-Type: text/plain;"
 201                         # RFC 1521 section 3 says a parameter must exist if
 202                         # there is a semicolon.
 203                         #carp "Extra semicolon after last parameter" if
 204                         #$STRICT_PARAMS;
 205                         return $attribs;
 206                 }
 207                 my $attribute;
 208                 if (s/^($re_token)=//) {
 209                         $attribute = lc $1;
 210                 } else {
 211                         if ($STRICT_PARAMS) {
 212                                 # carp "Illegal parameter '$_'";
 213                                 return $attribs;
 214                         }
 215                         if (s/^($re_token_non_strict)=//) {
 216                                 $attribute = lc $1;
 217                         } else {
 218                                 unless (s/^([^;=\s]+)\s*=//) {
 219                                         #carp "Cannot parse parameter '$_'";
 220                                         return $attribs;
 221                                 }
 222                                 $attribute = lc $1;
 223                         }
 224                 }
 225                 _clean_comments($_);
 226                 my $value = _extract_attribute_value();
 227                 $attribs->{$attribute} = $value;
 228                 _clean_comments($_);
 229         }
 230         $attribs;
 231 }
 232
 233 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
 234         my $value;
 235         while (length $_) {
 236                 if (s/^($re_token)//) {
 237                         $value .= $1;
 238                 } elsif (s/^$re_quoted_string//) {
 239                         my $sub = $1;
 240                         $sub =~ s/\\(.)/$1/g;
 241                         $value .= $sub;
 242                 } elsif ($STRICT_PARAMS) {
 243                         #my $char = substr $_, 0, 1;
 244                         #carp "Unquoted '$char' not allowed";
 245                         return;
 246                 } elsif (s/^($re_token_non_strict)//) {
 247                         $value .= $1;
 248                 } elsif (s/^$re_quoted_string_non_strict//) {
 249                         my $sub = $1;
 250                         $sub =~ s/\\(.)/$1/g;
 251                         $value .= $sub;
 252                 }
 253                 my $erased = _clean_comments($_);
 254                 last if !length $_ or /^;/;
 255                 if ($STRICT_PARAMS) {
 256                         #my $char = substr $_, 0, 1;
 257                         #carp "Extra '$char' found after parameter";
 258                         return;
 259                 }
 260                 if ($erased) {
 261                         # Sometimes semicolon is missing, so check for = char
 262                         last if m/^$re_token_non_strict=/;
 263                         $value .= ' ';
 264                 }
 265                 $value .= substr $_, 0, 1, '';
 266         }
 267         $value;
 268 }
 269
 270 1;
 271 __END__
 272 =func parse_content_type
 273
 274 This routine is exported by default.
 275
 276 This routine parses email content type headers according to section 5.1 of RFC
 277 2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
 278 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
 279 C<attributes>.
 280
 281 For backward compatibility with a really unfortunate misunderstanding of RFC
 282 2045 by the early implementors of this module, C<discrete> and C<composite> are
 283 also present in the returned hashref, with the values of C<type> and C<subtype>
 284 respectively.
 285
 286 =func parse_content_disposition
 287
 288 This routine is exported by default.
 289
 290 This routine parses email Content-Disposition headers according to RFC 2183 and
 291 RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
 292 of C<attributes>.
 293
 294 =cut