From 5198c976ce8b1954f0f76a0da152cc434411f147 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Nov 2022 21:31:55 +0000 Subject: [PATCH] eml: header_raw converts octets to Perl UTF-8 This fixes the display of raw (non-RFC 2047) names and subjects in HTML message views. SMTPUTF8 (RFC 6531) allows raw UTF-8 in headers without RFC 2047 encoding, so let Perl handle it as a character sequence for the rest of our consumers. Thus, the old special case in PublicInbox::Smsg->populate is no longer necessary and gone. The one regression noticed so far (and fixed here) is that compressed IMAP envelope responses still need raw bytes since the zlib wrapper is designed for octets, not Perl UTF-8 chars. Thus we reverse utf8::decode with utf8::encode in PublicInbox::IMAP::_esc. ->header_set also forces encoding to bytes, since all existing callers would either be dealing with ->header_raw results or be RFC-2047-encoded anyways. Reindexing is not necessary with this change due to the prior PublicInbox::Smsg->populate special case. 
Reported-by: Konstantin Ryabitsev Link: https://public-inbox.org/meta/20221124153715.3nenjpjzj43vqxr2@meerkat.local/ --- lib/PublicInbox/Eml.pm | 8 +++++--- lib/PublicInbox/IMAP.pm | 2 ++ lib/PublicInbox/Smsg.pm | 3 --- t/imapd.t | 28 ++++++++++++++++++++++++++++ t/psgi_search.t | 7 ++++++- 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 485f637a..8b999e1a 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # # Lazy MIME parser, it still slurps the full message but keeps short @@ -144,6 +144,7 @@ sub header_raw { my $re = re_memo($_[1]); my @v = (${ $_[0]->{hdr} } =~ /$re/g); for (@v) { + utf8::decode($_); # SMTPUTF8 # for compatibility w/ Email::Simple::Header, s/\s+\z//s; s/\A\s+//s; @@ -359,14 +360,15 @@ sub header_set { $pfx .= ': '; my $len = 78 - length($pfx); @vals = map {; + utf8::encode(my $v = $_); # to bytes, support SMTPUTF8 # folding differs from Email::Simple::Header, # we favor tabs for visibility (and space savings :P) if (length($_) >= $len && (/\n[^ \t]/s || !/\n/s)) { local $Text::Wrap::columns = $len; local $Text::Wrap::huge = 'overflow'; - $pfx . wrap('', "\t", $_) . $self->{crlf}; + $pfx . wrap('', "\t", $v) . $self->{crlf}; } else { - $pfx . $_ . $self->{crlf}; + $pfx . $v . $self->{crlf}; } } @vals; $$hdr =~ s!$re!shift(@vals) // ''!ge; # replace current headers, first diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 1f65aa65..37317948 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -426,8 +426,10 @@ sub _esc ($) { if (!defined($v)) { 'NIL'; } elsif ($v =~ /[{"\r\n%*\\\[]/) { # literal string + utf8::encode($v); '{' . length($v) . "}\r\n" . 
$v; } else { # quoted string + utf8::encode($v); qq{"$v"} } } diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 2026c7d9..b132381b 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -99,9 +99,6 @@ sub populate { # to protect git and NNTP clients $val =~ tr/\0\t\n/ /; - # rare: in case headers have wide chars (not RFC2047-encoded) - utf8::decode($val); - # lower-case fields for read-only stuff $self->{lc($f)} = $val; diff --git a/t/imapd.t b/t/imapd.t index 3c74aefd..cbd6c1b9 100644 --- a/t/imapd.t +++ b/t/imapd.t @@ -534,6 +534,34 @@ SKIP: { } } +{ + ok(my $ic = $imap_client->new(%mic_opt), 'logged in'); + my $mb = "$ibx[0]->{newsgroup}.$first_range"; + ok($ic->examine($mb), "EXAMINE $mb"); + my $uidnext = $ic->uidnext($mb); # we'll fetch BODYSTRUCTURE on this + my $im = $ibx[0]->importer(0); + $im->add(PublicInbox::Eml->new(< +From: Ævar Arnfjörð Bjarmason +To: git\@vger.kernel.org + +EOF + $im->done; + my $envl = $ic->get_envelope($uidnext); + is($envl->{subject}, 'test Ævar', 'UTF-8 subject'); + is($envl->{sender}->[0]->{personalname}, 'Ævar Arnfjörð Bjarmason', + 'UTF-8 sender[0].personalname'); + SKIP: { + skip 'need compress for comparisons', 1 if !$can_compress; + ok($ic = $imap_client->new(%mic_opt), 'uncompressed logged in'); + ok($ic && $ic->compress, 'compress enabled'); + ok($ic->examine($mb), "EXAMINE $mb"); + my $raw = $ic->get_envelope($uidnext); + is_deeply($envl, $raw, 'raw and compressed match'); + } +} + $td->kill; $td->join; is($?, 0, 'no error in exited process') if !$ENV{TEST_KILL_IMAPD}; diff --git a/t/psgi_search.t b/t/psgi_search.t index 3da93eda..8868f67e 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -1,5 +1,5 @@ #!perl -w -# Copyright (C) 2017-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ use strict; use v5.10.1; @@ -103,6 +103,11 @@ test_psgi(sub { $www->call(@_) }, sub { like($res->content, $mid_re, 'found mid in response'); chop($digits); } + $res = 
$cb->(GET("/test/$mid/")); + $html = $res->content; + like($html, qr/\bFrom: Ævar /, + "displayed Ævar's name properly in permalink From:"); + unlike($html, qr/Ã/, 'no raw octets in permalink HTML'); $res = $cb->(GET('/test/')); $html = $res->content; -- 2.44.0