HTML: various encoding fixups

author Eric Wong <e@80x24.org>

Thu, 17 Apr 2014 20:10:38 +0000 (20:10 +0000)

committer Eric Wong <e@80x24.org>

Thu, 17 Apr 2014 21:02:03 +0000 (21:02 +0000)
author Eric Wong <e@80x24.org>
Thu, 17 Apr 2014 20:10:38 +0000 (20:10 +0000)
committer Eric Wong <e@80x24.org>
Thu, 17 Apr 2014 21:02:03 +0000 (21:02 +0000)
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm

index 0a652f6e7636319978efac8a5c3087debc2f3d64..33406522b59bf53fe4823c5420d18dbae7e83e95 100644 (file)
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -7,7 +7,7 @@ use XML::Atom::SimpleFeed;
  use Email::MIME;
  use Email::Address;
  use URI::Escape qw/uri_escape/;
-use Encode qw/encode decode/;
+use Encode qw/find_encoding/;
  use Encode::MIME::Header;
  use CGI qw(escapeHTML);
  use POSIX qw(strftime);
@@ -15,6 +15,9 @@ use Date::Parse qw(strptime);
  use constant DATEFMT => '%Y-%m-%dT%H:%M:%SZ';
  use PublicInbox::View;
  use Mail::Thread;
+my $enc_utf8 = find_encoding('utf8');
+my $enc_ascii = find_encoding('us-ascii');
+my $enc_mime = find_encoding('MIME-Header');
  
  # FIXME: workaround https://rt.cpan.org/Public/Bug/Display.html?id=22817
  
@@ -52,7 +55,7 @@ sub generate_html_index {
         my $top = $args->{top}; # bool
         local $ENV{GIT_DIR} = $args->{git_dir};
         my $feed_opts = get_feedopts($args);
-       my $title = escapeHTML($feed_opts->{description} || "");
+       my $title = xs_html($feed_opts->{description} || "");
         my @messages;
         each_recent_blob($max, sub {
                 my $str = `git cat-file blob $_[0]`;
@@ -146,8 +149,9 @@ sub utf8_header {
         my ($simple, $name) = @_;
         my $val = $simple->header($name);
         return "" unless defined $val;
-       $val =~ tr/\t\r\n / /s;
-       encode('utf8', decode('MIME-Header', $val));
+       $val =~ tr/\t\n / /s;
+       $val =~ tr/\r//d;
+       $enc_utf8->encode($enc_mime->decode($val));
  }
  
  sub feed_date {
@@ -220,9 +224,9 @@ sub dump_html_line {
                 my @from = Email::Address->parse($from);
                 $from = $from[0]->name;
                 (defined($from) && length($from)) or $from = $from[0]->address;
-               $from = escapeHTML($from);
-               $subj = escapeHTML($subj);
-               $args->[0] .= "<a href=\"$url.html\">`-&gt; $subj</a> $from\n";
+               $from = xs_html($from);
+               $subj = xs_html($subj);
+               $args->[0] .= "<a href=\"$url.html\">$subj</a> $from\n";
         } else {
                 $args->[0] .= "[ Message not available ]\n";
         }
@@ -230,4 +234,9 @@ sub dump_html_line {
         dump_html_line($self->next, $level, $args) if $self->next;
  }
  
+sub xs_html {
+       $enc_ascii->encode(escapeHTML($enc_utf8->decode($_[0])),
+                       Encode::HTMLCREF);
+}
+
  1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm

index 2efbf1b54d771ad7ad48e0c0d32a75db99bc3816..84c7393b197131ee01c4ba94935578c0ef51bf29 100644 (file)
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -5,8 +5,13 @@ use strict;
  use warnings;
  use URI::Escape qw/uri_escape/;
  use CGI qw/escapeHTML/;
-use Encode qw/decode encode/;
+use Encode qw/find_encoding/;
  use Encode::MIME::Header;
+use Email::MIME::ContentType qw/parse_content_type/;
+
+my $enc_utf8 = find_encoding('utf8');
+my $enc_ascii = find_encoding('us-ascii');
+my $enc_mime = find_encoding('MIME-Header');
  
  # public functions:
  sub as_html {
@@ -26,28 +31,42 @@ sub as_feed_entry {
  
  # only private functions below.
  
+sub enc_for {
+       my ($ct) = @_;
+       defined $ct or return $enc_utf8;
+       my $ct_parsed = parse_content_type($ct);
+       if ($ct_parsed) {
+               if (my $charset = $ct_parsed->{attributes}->{charset}) {
+                       my $enc = find_encoding($charset);
+                       return $enc if $enc;
+               }
+       }
+       $enc_utf8;
+}
+
  sub multipart_text_as_html {
         my ($mime, $full_pfx) = @_;
         my $rv = "";
         my $part_nr = 0;
+       my $enc_msg = enc_for($mime->header("Content-Type"));
  
         # scan through all parts, looking for displayable text
         $mime->walk_parts(sub {
                 my ($part) = @_;
                 return if $part->subparts; # walk_parts already recurses
-
-               my $fn = $part->filename;
+               my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8;
  
                 if ($part_nr > 0) {
+                       my $fn = $part->filename;
                         defined($fn) or $fn = "part #" . ($part_nr + 1);
-                       $rv .= add_filename_line($fn);
+                       $rv .= add_filename_line($enc->decode($fn));
                 }
  
                 if (defined $full_pfx) {
-                       $rv .= add_text_body_short($part, $part_nr,
+                       $rv .= add_text_body_short($enc, $part, $part_nr,
                                                 $full_pfx);
                 } else {
-                       $rv .= add_text_body_full($part, $part_nr);
+                       $rv .= add_text_body_full($enc, $part, $part_nr);
                 }
                 $rv .= "\n" unless $rv =~ /\n\z/s;
                 ++$part_nr;
@@ -62,13 +81,13 @@ sub add_filename_line {
  
         $len -= length($fn);
         $pad x= ($len/2) if ($len > 0);
-       "$pad " . escapeHTML($fn) . " $pad\n";
+       "$pad " . ascii_html($fn) . " $pad\n";
  }
  
  sub add_text_body_short {
-       my ($part, $part_nr, $full_pfx) = @_;
+       my ($enc, $part, $part_nr, $full_pfx) = @_;
         my $n = 0;
-       my $s = escapeHTML($part->body);
+       my $s = ascii_html($enc->decode($part->body));
         $s =~ s!^((?:(?:&gt;[^\n]+)\n)+)!
                 my $cur = $1;
                 my @lines = split(/\n/, $cur);
@@ -93,9 +112,9 @@ sub add_text_body_short {
  }
  
  sub add_text_body_full {
-       my ($part, $part_nr) = @_;
+       my ($enc, $part, $part_nr) = @_;
         my $n = 0;
-       my $s = escapeHTML($part->body);
+       my $s = ascii_html($enc->decode($part->body));
         $s =~ s!^((?:(?:&gt;[^\n]+)\n)+)!
                 my $cur = $1;
                 my @lines = split(/\n/, $cur);
@@ -110,14 +129,19 @@ sub add_text_body_full {
  
  sub trim_message_id {
         my ($mid) = @_;
-       $mid =~ s/\A<//;
-       $mid =~ s/>\z//;
-       my $html = escapeHTML($mid);
-       my $href = escapeHTML(uri_escape($mid));
+       $mid = $enc_mime->decode($mid);
+       $mid =~ s/\A\s*<//;
+       $mid =~ s/>\s*\z//;
+       my $html = ascii_html($mid);
+       my $href = ascii_html(uri_escape($mid));
  
         ($html, $href);
  }
  
+sub ascii_html {
+       $enc_ascii->encode(escapeHTML($_[0]), Encode::HTMLCREF);
+}
+
  sub headers_to_html_header {
         my ($simple) = @_;
  
@@ -126,10 +150,9 @@ sub headers_to_html_header {
         foreach my $h (qw(From To Cc Subject Date)) {
                 my $v = $simple->header($h);
                 defined $v or next;
-               $v = decode("MIME-Header", $v);
-               $v = encode("utf8", $v);
-               $v = escapeHTML($v);
-               $v =~ tr/\n/ /;
+               $v =~ tr/\n/ /s;
+               $v =~ tr/\r//d;
+               $v = ascii_html($enc_mime->decode($v));
                 $rv .= "$h: $v\n";
  
                 if ($h eq "From" || $h eq "Subject") {
author	Eric Wong <e@80x24.org>
	Thu, 17 Apr 2014 20:10:38 +0000 (20:10 +0000)
committer	Eric Wong <e@80x24.org>
	Thu, 17 Apr 2014 21:02:03 +0000 (21:02 +0000)
lib/PublicInbox/Feed.pm		patch | blob | history
lib/PublicInbox/View.pm		patch | blob | history