From 86c28d2432292c6bee149f59175486e5610e4462 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 19 Aug 2020 08:02:33 +0000 Subject: [PATCH] smsg: handle wide characters in raw mail headers There may be messages in the wild with wide characters in headers which aren't RFC2047-encoded. Assume UTF-8 so those fields can round trip through over.sqlite3. This doesn't affect docdata.glass in Xapian, but it does affect how over.sqlite3 stores the same deflated info. --- lib/PublicInbox/Smsg.pm | 3 +++ t/psgi_search.t | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index aaf88f35..62cb951e 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -105,6 +105,9 @@ sub populate { # to protect git and NNTP clients $val =~ tr/\0\t\n/ /; + # rare: in case headers have wide chars (not RFC2047-encoded) + utf8::decode($val); + # lower-case fields for read-only stuff $self->{lc($f)} = $val; diff --git a/t/psgi_search.t b/t/psgi_search.t index 2d12ba6a..5d537363 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -28,8 +28,10 @@ my $im = $ibx->importer(0); my $digits = '10010260936330'; my $ua = 'Pine.LNX.4.10'; my $mid = "$ua.$digits.2460-100000\@penguin.transmeta.com"; + +# n.b. these headers are not properly RFC2047-encoded my $mime = PublicInbox::Eml->new(< From: Ævar Arnfjörð Bjarmason To: git\@vger.kernel.org @@ -102,6 +104,8 @@ test_psgi(sub { $www->call(@_) }, sub { 'subject-less message linked from "/$INBOX/"'); like($html, qr/\bhref="blank-subject[^>]+>\(no subject\)(GET('/test/?q=tc:git')); like($html, qr/\bhref="no-subject-at-all[^>]+>\(no subject\)