From: Eric Wong
Date: Thu, 31 Oct 2019 03:12:19 +0000 (+0000)
Subject: msgiter: attempt to decode all text/* bodies
X-Git-Tag: v1.2.0~10
X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=e5cc97f6a2ff53f072a5d692e56d0918b33c5081;hp=130af18f06ae9b91e07985ff56b4dd90cedbd744

msgiter: attempt to decode all text/* bodies

We want to index text/x-patch and text/x-diff, at least, since
"git format-patch" can generate a patch series as attachments
using --attach.
---

diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index f11ba223..d9df32ab 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -45,12 +45,18 @@ sub msg_part_text ($$) {
 	# times when it should not have been:
 	# <87llgalspt.fsf@free.fr>
 	# <200308111450.h7BEoOu20077@mail.osdl.org>
-	if ($err && ($ct =~ m!\btext/plain\b!i ||
+	if ($err && ($ct =~ m!\btext/\b!i ||
 			$ct =~ m!\bmultipart/mixed\b!i)) {
-		# Try to assume UTF-8 because Alpine seems to
-		# do wacky things and set charset=X-UNKNOWN
-		$part->charset_set('UTF-8');
-		$s = eval { $part->body_str };
+		my $cte = $part->header_raw('Content-Transfer-Encoding');
+		if (defined($cte) && $cte =~ /\b7bit\b/i) {
+			$s = $part->body;
+			$err = undef if $s =~ /\A[[:ascii:]]+\z/s;
+		} else {
+			# Try to assume UTF-8 because Alpine seems to
+			# do wacky things and set charset=X-UNKNOWN
+			$part->charset_set('UTF-8');
+			$s = eval { $part->body_str };
+		}
 
 		# If forcing charset=UTF-8 failed,
 		# caller will warn further down...