lib/PublicInbox/EmlContentFoo.pm

   1 # Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
   2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
   3 # This library is free software; you can redistribute it and/or modify
   4 # it under the same terms as Perl itself.
   5 #
   6 # License: GPL-1.0+ or Artistic-1.0-Perl
   7 #  <https://www.gnu.org/licenses/gpl-1.0.txt>
   8 #  <https://dev.perl.org/licenses/artistic.html>
   9 #
  10 # This license differs from the rest of public-inbox
  11 #
  12 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
  13 #
  14 # This is a fork of the Email::MIME::ContentType 1.022 with
  15 # minor improvements and incompatibilities; namely changes to
  16 # quiet warnings with legacy data.
  17 package PublicInbox::EmlContentFoo;
  18 use strict;
  19 use parent qw(Exporter);
  20 use v5.10.1;
  21
  22 # find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
  23 # while we support 2.35 shipped with Perl 5.10.1
  24 use Encode 2.35 qw(find_encoding);
  25 my %mime_name_map; # $enc->mime_name => $enc object
  26 BEGIN {
  27         eval { Encode->import('find_mime_encoding') };
  28         if ($@) {
  29                 *find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
  30                 %mime_name_map = map {;
  31                         my $enc = find_encoding($_);
  32                         my $m = lc($enc->mime_name // '');
  33                         $m => $enc;
  34                 } Encode->encodings(':all');
  35
  36                 # delete fallback for encodings w/o ->mime_name:
  37                 delete $mime_name_map{''};
  38
  39                 # an extra alias see Encode::MIME::NAME
  40                 $mime_name_map{'utf8'} = find_encoding('UTF-8');
  41         }
  42 }
  43
  44 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
  45
  46 our $STRICT_PARAMS = 1;
  47
  48 my $ct_default = 'text/plain; charset=us-ascii';
  49
  50 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
  51         qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
  52
  53 my $re_token_non_strict = # allow CTLs and above ASCII
  54         qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
  55
  56 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
  57         qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
  58 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
  59 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
  60
  61 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
  62 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
  63 my $re_quoted_string_non_strict =
  64 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
  65
  66 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
  67 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
  68 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
  69
  70 sub parse_content_type {
  71         my ($ct) = @_;
  72
  73         # If the header isn't there or is empty, give default answer.
  74         $ct = $ct_default unless defined($ct) && length($ct);
  75
  76         _unfold_lines($ct);
  77         _clean_comments($ct);
  78
  79         # It is also recommend (sic.) that this default be assumed when a
  80         # syntactically invalid Content-Type header field is encountered.
  81         unless ($ct =~ s/^($re_token)\/($re_token)//) {
  82                 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
  83                                                 ($re_token_non_strict)//x) {
  84                         #carp "Invalid Content-Type '$ct'";
  85                         return parse_content_type($ct_default);
  86                 }
  87         }
  88
  89         my ($type, $subtype) = (lc $1, lc $2);
  90
  91         _clean_comments($ct);
  92         $ct =~ s/\s+$//;
  93
  94         my $attributes = {};
  95         if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
  96                 # carp "Missing ';' before first Content-Type parameter '$ct'";
  97         } else {
  98                 $attributes = _process_rfc2231(_parse_attributes($ct));
  99         }
 100
 101         {
 102                 type       => $type,
 103                 subtype => $subtype,
 104                 attributes => $attributes,
 105         };
 106 }
 107
 108 my $cd_default = 'attachment';
 109
 110 sub parse_content_disposition {
 111         my ($cd) = @_;
 112
 113         $cd = $cd_default unless defined($cd) && length($cd);
 114
 115         _unfold_lines($cd);
 116         _clean_comments($cd);
 117
 118         unless ($cd =~ s/^($re_token)//) {
 119                 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
 120                         #carp "Invalid Content-Disposition '$cd'";
 121                         return parse_content_disposition($cd_default);
 122                 }
 123         }
 124
 125         my $type = lc $1;
 126
 127         _clean_comments($cd);
 128         $cd =~ s/\s+$//;
 129
 130         my $attributes = {};
 131         if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
 132 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
 133         } else {
 134                 $attributes = _process_rfc2231(_parse_attributes($cd));
 135         }
 136
 137         {
 138                 type       => $type,
 139                 attributes => $attributes,
 140         };
 141 }
 142
 143 sub _unfold_lines {
 144         $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
 145 }
 146
 147 sub _clean_comments {
 148         my $ret = ($_[0] =~ s/^\s+//);
 149         while (length $_[0]) {
 150                 last unless $_[0] =~ s/^\(//;
 151                 my $level = 1;
 152                 while (length $_[0]) {
 153                         my $ch = substr $_[0], 0, 1, '';
 154                         if ($ch eq '(') {
 155                                 $level++;
 156                         } elsif ($ch eq ')') {
 157                                 $level--;
 158                                 last if $level == 0;
 159                         } elsif ($ch eq '\\') {
 160                                 substr $_[0], 0, 1, '';
 161                         }
 162                 }
 163                 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
 164                 $ret |= ($_[0] =~ s/^\s+//);
 165         }
 166         $ret;
 167 }
 168
 169 sub _process_rfc2231 {
 170         my ($attribs) = @_;
 171         my %cont;
 172         my %encoded;
 173         foreach (keys %{$attribs}) {
 174                 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
 175                 my ($attr, $sec) = ($1, $2);
 176                 $cont{$attr}->[$sec] = $attribs->{$_};
 177                 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
 178                 delete $attribs->{$_};
 179         }
 180         foreach (keys %cont) {
 181                 my $key = $_;
 182                 $key .= '*' if $encoded{$_};
 183                 $attribs->{$key} = join '', @{$cont{$_}};
 184         }
 185         foreach (keys %{$attribs}) {
 186                 next unless $_ =~ m/^(.*)\*$/;
 187                 my $key = $1;
 188                 next unless ($attribs->{$_} // '') =~ m/^$re_exvalue$/;
 189                 my ($charset, $value) = ($1, $2);
 190                 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
 191                 if (length $charset) {
 192                         my $enc = find_mime_encoding($charset);
 193                         if (defined $enc) {
 194                                 $value = $enc->decode($value);
 195                         # } else {
 196                                 #carp "Unknown charset '$charset' in
 197                                 #attribute '$key' value";
 198                         }
 199                 }
 200                 $attribs->{$key} = $value;
 201                 delete $attribs->{$_};
 202         }
 203         $attribs;
 204 }
 205
 206 sub _parse_attributes {
 207         local $_ = shift;
 208         substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
 209         my $attribs = {};
 210         while (length $_) {
 211                 s/^;// or $STRICT_PARAMS and do {
 212                         #carp "Missing semicolon before parameter '$_'";
 213                         return $attribs;
 214                 };
 215                 _clean_comments($_);
 216                 unless (length $_) {
 217                         # Some mail software generates a Content-Type like this:
 218                         # "Content-Type: text/plain;"
 219                         # RFC 1521 section 3 says a parameter must exist if
 220                         # there is a semicolon.
 221                         #carp "Extra semicolon after last parameter" if
 222                         #$STRICT_PARAMS;
 223                         return $attribs;
 224                 }
 225                 my $attribute;
 226                 if (s/^($re_token)=//) {
 227                         $attribute = lc $1;
 228                 } else {
 229                         if ($STRICT_PARAMS) {
 230                                 # carp "Illegal parameter '$_'";
 231                                 return $attribs;
 232                         }
 233                         if (s/^($re_token_non_strict)=//) {
 234                                 $attribute = lc $1;
 235                         } else {
 236                                 unless (s/^([^;=\s]+)\s*=//) {
 237                                         #carp "Cannot parse parameter '$_'";
 238                                         return $attribs;
 239                                 }
 240                                 $attribute = lc $1;
 241                         }
 242                 }
 243                 _clean_comments($_);
 244                 my $value = _extract_attribute_value();
 245                 $attribs->{$attribute} = $value;
 246                 _clean_comments($_);
 247         }
 248         $attribs;
 249 }
 250
 251 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
 252         my $value;
 253         while (length $_) {
 254                 if (s/^($re_token)//) {
 255                         $value .= $1;
 256                 } elsif (s/^$re_quoted_string//) {
 257                         my $sub = $1;
 258                         $sub =~ s/\\(.)/$1/g;
 259                         $value .= $sub;
 260                 } elsif ($STRICT_PARAMS) {
 261                         #my $char = substr $_, 0, 1;
 262                         #carp "Unquoted '$char' not allowed";
 263                         return;
 264                 } elsif (s/^($re_token_non_strict)//) {
 265                         $value .= $1;
 266                 } elsif (s/^$re_quoted_string_non_strict//) {
 267                         my $sub = $1;
 268                         $sub =~ s/\\(.)/$1/g;
 269                         $value .= $sub;
 270                 }
 271                 my $erased = _clean_comments($_);
 272                 last if !length $_ or /^;/;
 273                 if ($STRICT_PARAMS) {
 274                         #my $char = substr $_, 0, 1;
 275                         #carp "Extra '$char' found after parameter";
 276                         return;
 277                 }
 278                 if ($erased) {
 279                         # Sometimes semicolon is missing, so check for = char
 280                         last if m/^$re_token_non_strict=/;
 281                         $value .= ' ';
 282                 }
 283                 $value .= substr $_, 0, 1, '';
 284         }
 285         $value;
 286 }
 287
 288 1;
 289 __END__
 290 =func parse_content_type
 291
 292 This routine is exported by default.
 293
 294 This routine parses email content type headers according to section 5.1 of RFC
 295 2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
 296 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
 297 C<attributes>.
 298
 299 =func parse_content_disposition
 300
 301 This routine is exported by default.
 302
 303 This routine parses email Content-Disposition headers according to RFC 2183 and
 304 RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
 305 of C<attributes>.
 306
 307 =cut