lib/PublicInbox/EmlContentFoo.pm

   1 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
   2 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
   3 # This library is free software; you can redistribute it and/or modify
   4 # it under the same terms as Perl itself.
   5 #
   6 # License: GPL-1.0+ or Artistic-1.0-Perl
   7 #  <https://www.gnu.org/licenses/gpl-1.0.txt>
   8 #  <https://dev.perl.org/licenses/artistic.html>
   9 #
  10 # This license differs from the rest of public-inbox
  11 #
  12 # ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
  13 #
  14 # This is a fork of the Email::MIME::ContentType 1.022 with
  15 # minor improvements and incompatibilities; namely changes to
  16 # quiet warnings with legacy data.
  17 package PublicInbox::EmlContentFoo;
  18 use strict;
  19 use parent qw(Exporter);
  20 use v5.10.1;
  21
  22 # find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
  23 # while we support 2.35 shipped with Perl 5.10.1
  24 use Encode 2.35 qw(find_encoding);
  25 my %mime_name_map; # $enc->mime_name => $enc object
  26 BEGIN {
  27         eval { Encode->import('find_mime_encoding') };
  28         if ($@) {
  29                 *find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
  30                 %mime_name_map = map {;
  31                         my $enc = find_encoding($_);
  32                         my $m = lc($enc->mime_name // '');
  33                         $m => $enc;
  34                 } Encode->encodings(':all');
  35
  36                 # delete fallback for encodings w/o ->mime_name:
  37                 delete $mime_name_map{''};
  38
  39                 # an extra alias see Encode::MIME::NAME
  40                 $mime_name_map{'utf8'} = find_encoding('UTF-8');
  41         }
  42 }
  43
  44 our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
  45
  46 our $STRICT_PARAMS = 1;
  47
  48 my $ct_default = 'text/plain; charset=us-ascii';
  49
  50 my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
  51         qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
  52
  53 my $re_token_non_strict = # allow CTLs and above ASCII
  54         qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
  55
  56 my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
  57         qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
  58 my $re_quoted_pair = qr/\\[\x00-\x7F]/;
  59 my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
  60
  61 my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
  62 my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
  63 my $re_quoted_string_non_strict =
  64 qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
  65
  66 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
  67 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
  68 my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
  69
  70 sub parse_content_type {
  71         my ($ct) = @_;
  72
  73         # If the header isn't there or is empty, give default answer.
  74         $ct = $ct_default unless defined($ct) && length($ct);
  75
  76         _unfold_lines($ct);
  77         _clean_comments($ct);
  78
  79         # It is also recommend (sic.) that this default be assumed when a
  80         # syntactically invalid Content-Type header field is encountered.
  81         unless ($ct =~ s/^($re_token)\/($re_token)//) {
  82                 unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
  83                                                 ($re_token_non_strict)//x) {
  84                         #carp "Invalid Content-Type '$ct'";
  85                         return parse_content_type($ct_default);
  86                 }
  87         }
  88
  89         my ($type, $subtype) = (lc $1, lc $2);
  90
  91         _clean_comments($ct);
  92         $ct =~ s/\s+$//;
  93
  94         my $attributes = {};
  95         if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
  96                 # carp "Missing ';' before first Content-Type parameter '$ct'";
  97         } else {
  98                 $attributes = _process_rfc2231(_parse_attributes($ct));
  99         }
 100
 101         {
 102                 type       => $type,
 103                 subtype => $subtype,
 104                 attributes => $attributes,
 105
 106                 # This is dumb.  Really really dumb.  For backcompat. -- rjbs,
 107                 # 2013-08-10
 108                 discrete   => $type,
 109                 composite  => $subtype,
 110         };
 111 }
 112
 113 my $cd_default = 'attachment';
 114
 115 sub parse_content_disposition {
 116         my ($cd) = @_;
 117
 118         $cd = $cd_default unless defined($cd) && length($cd);
 119
 120         _unfold_lines($cd);
 121         _clean_comments($cd);
 122
 123         unless ($cd =~ s/^($re_token)//) {
 124                 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
 125                         #carp "Invalid Content-Disposition '$cd'";
 126                         return parse_content_disposition($cd_default);
 127                 }
 128         }
 129
 130         my $type = lc $1;
 131
 132         _clean_comments($cd);
 133         $cd =~ s/\s+$//;
 134
 135         my $attributes = {};
 136         if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
 137 # carp "Missing ';' before first Content-Disposition parameter '$cd'";
 138         } else {
 139                 $attributes = _process_rfc2231(_parse_attributes($cd));
 140         }
 141
 142         {
 143                 type       => $type,
 144                 attributes => $attributes,
 145         };
 146 }
 147
 148 sub _unfold_lines {
 149         $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
 150 }
 151
 152 sub _clean_comments {
 153         my $ret = ($_[0] =~ s/^\s+//);
 154         while (length $_[0]) {
 155                 last unless $_[0] =~ s/^\(//;
 156                 my $level = 1;
 157                 while (length $_[0]) {
 158                         my $ch = substr $_[0], 0, 1, '';
 159                         if ($ch eq '(') {
 160                                 $level++;
 161                         } elsif ($ch eq ')') {
 162                                 $level--;
 163                                 last if $level == 0;
 164                         } elsif ($ch eq '\\') {
 165                                 substr $_[0], 0, 1, '';
 166                         }
 167                 }
 168                 # carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
 169                 $ret |= ($_[0] =~ s/^\s+//);
 170         }
 171         $ret;
 172 }
 173
 174 sub _process_rfc2231 {
 175         my ($attribs) = @_;
 176         my %cont;
 177         my %encoded;
 178         foreach (keys %{$attribs}) {
 179                 next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
 180                 my ($attr, $sec) = ($1, $2);
 181                 $cont{$attr}->[$sec] = $attribs->{$_};
 182                 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
 183                 delete $attribs->{$_};
 184         }
 185         foreach (keys %cont) {
 186                 my $key = $_;
 187                 $key .= '*' if $encoded{$_};
 188                 $attribs->{$key} = join '', @{$cont{$_}};
 189         }
 190         foreach (keys %{$attribs}) {
 191                 next unless $_ =~ m/^(.*)\*$/;
 192                 my $key = $1;
 193                 next unless $attribs->{$_} =~ m/^$re_exvalue$/;
 194                 my ($charset, $value) = ($1, $2);
 195                 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
 196                 if (length $charset) {
 197                         my $enc = find_mime_encoding($charset);
 198                         if (defined $enc) {
 199                                 $value = $enc->decode($value);
 200                         # } else {
 201                                 #carp "Unknown charset '$charset' in
 202                                 #attribute '$key' value";
 203                         }
 204                 }
 205                 $attribs->{$key} = $value;
 206                 delete $attribs->{$_};
 207         }
 208         $attribs;
 209 }
 210
 211 sub _parse_attributes {
 212         local $_ = shift;
 213         substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
 214         my $attribs = {};
 215         while (length $_) {
 216                 s/^;// or $STRICT_PARAMS and do {
 217                         #carp "Missing semicolon before parameter '$_'";
 218                         return $attribs;
 219                 };
 220                 _clean_comments($_);
 221                 unless (length $_) {
 222                         # Some mail software generates a Content-Type like this:
 223                         # "Content-Type: text/plain;"
 224                         # RFC 1521 section 3 says a parameter must exist if
 225                         # there is a semicolon.
 226                         #carp "Extra semicolon after last parameter" if
 227                         #$STRICT_PARAMS;
 228                         return $attribs;
 229                 }
 230                 my $attribute;
 231                 if (s/^($re_token)=//) {
 232                         $attribute = lc $1;
 233                 } else {
 234                         if ($STRICT_PARAMS) {
 235                                 # carp "Illegal parameter '$_'";
 236                                 return $attribs;
 237                         }
 238                         if (s/^($re_token_non_strict)=//) {
 239                                 $attribute = lc $1;
 240                         } else {
 241                                 unless (s/^([^;=\s]+)\s*=//) {
 242                                         #carp "Cannot parse parameter '$_'";
 243                                         return $attribs;
 244                                 }
 245                                 $attribute = lc $1;
 246                         }
 247                 }
 248                 _clean_comments($_);
 249                 my $value = _extract_attribute_value();
 250                 $attribs->{$attribute} = $value;
 251                 _clean_comments($_);
 252         }
 253         $attribs;
 254 }
 255
 256 sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
 257         my $value;
 258         while (length $_) {
 259                 if (s/^($re_token)//) {
 260                         $value .= $1;
 261                 } elsif (s/^$re_quoted_string//) {
 262                         my $sub = $1;
 263                         $sub =~ s/\\(.)/$1/g;
 264                         $value .= $sub;
 265                 } elsif ($STRICT_PARAMS) {
 266                         #my $char = substr $_, 0, 1;
 267                         #carp "Unquoted '$char' not allowed";
 268                         return;
 269                 } elsif (s/^($re_token_non_strict)//) {
 270                         $value .= $1;
 271                 } elsif (s/^$re_quoted_string_non_strict//) {
 272                         my $sub = $1;
 273                         $sub =~ s/\\(.)/$1/g;
 274                         $value .= $sub;
 275                 }
 276                 my $erased = _clean_comments($_);
 277                 last if !length $_ or /^;/;
 278                 if ($STRICT_PARAMS) {
 279                         #my $char = substr $_, 0, 1;
 280                         #carp "Extra '$char' found after parameter";
 281                         return;
 282                 }
 283                 if ($erased) {
 284                         # Sometimes semicolon is missing, so check for = char
 285                         last if m/^$re_token_non_strict=/;
 286                         $value .= ' ';
 287                 }
 288                 $value .= substr $_, 0, 1, '';
 289         }
 290         $value;
 291 }
 292
 293 1;
 294 __END__
 295 =func parse_content_type
 296
 297 This routine is exported by default.
 298
 299 This routine parses email content type headers according to section 5.1 of RFC
 300 2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
 301 a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
 302 C<attributes>.
 303
 304 For backward compatibility with a really unfortunate misunderstanding of RFC
 305 2045 by the early implementors of this module, C<discrete> and C<composite> are
 306 also present in the returned hashref, with the values of C<type> and C<subtype>
 307 respectively.
 308
 309 =func parse_content_disposition
 310
 311 This routine is exported by default.
 312
 313 This routine parses email Content-Disposition headers according to RFC 2183 and
 314 RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
 315 of C<attributes>.
 316
 317 =cut