From a79671f4c2f50ecba1271b85051f732b4ee04a46 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 18 Aug 2015 01:11:05 +0000 Subject: [PATCH] search: common Subject: normalization for Re: prefixes Drop German ("Aw:") support since it's non-standard and is not supported by Mail::Thread and non-English prefixes are more likely to conflict with prefixes used in Free Software development where ("subsection:") prefixes are common and English is the common language. Anyways we don't filter "Vs: " (Finnish) or "Sv: " (Norwegian, Swedish, Danish, Icelandic), either. ref: https://en.wikipedia.org/wiki/RE_(e-mail)#Abbreviations_in_other_languages --- lib/PublicInbox/Search.pm | 6 +++++- lib/PublicInbox/View.pm | 19 +++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index db86301d..6a05ce7a 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -10,6 +10,10 @@ require PublicInbox::View; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compressed/; +# This is English-only, everything else is non-standard and may be confused as +# a prefix common in patch emails +our $REPLY_RE = qr/^re:\s+/i; + use constant { TS => 0, # SCHEMA_VERSION history @@ -490,7 +494,7 @@ sub subject_path { $subj =~ s/\A\s+//; $subj =~ s/\s+\z//; - $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) + $subj =~ s/$REPLY_RE//igo; # remove reply prefix $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; lc($subj); } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index b0b8e140..7122a38d 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -457,6 +457,7 @@ sub html_footer { if (my $c = $res->{count}) { $c = $c == 1 ? '1 followup' : "$c followups"; $idx .= "\n$c:\n"; + $res->{srch} = $srch; thread_followups(\$idx, $mime, $res); } else { $idx .= "\n(no followups, yet)\n"; @@ -493,13 +494,14 @@ sub anchor_for { sub simple_dump { my ($dst, $root, $node, $level) = @_; + # $root = [ Root Message-ID, \%seen, $srch ]; my $pfx = ' ' x $level; $$dst .= $pfx; if (my $x = $node->message) { my $mid = $x->header('Message-ID'); if ($root->[0] ne $mid) { my $s = $x->header('Subject'); - my $h = hash_subj($s); + my $h = $root->[2]->subject_path($s); if ($root->[1]->{$h}) { $s = ''; } else { @@ -525,15 +527,6 @@ sub simple_dump { simple_dump($dst, $root, $node->next, $level) if $node->next; } -sub hash_subj { - my ($subj) = @_; - $subj =~ s/\A\s+//; - $subj =~ s/\s+\z//; - $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) - $subj =~ s/\s+/ /; - Digest::SHA::sha1($subj); -} - sub thread_followups { my ($dst, $root, $res) = @_; my @msgs = map { $_->mini_mime } @{$res->{msgs}}; @@ -542,8 +535,10 @@ sub thread_followups { my $th = PublicInbox::Thread->new($root, @msgs); $th->thread; $th->order(*PublicInbox::Thread::sort_ts); - $root = [ $root->header('Message-ID'), - { hash_subj($root->header('Subject')) => 1 } ]; + my $srch = $res->{srch}; + my $subj = $srch->subject_path($root->header('Subject')); + my %seen = ($subj => 1); + $root = [ $root->header('Message-ID'), \%seen, $srch ]; simple_dump($dst, $root, $_, 0) for $th->rootset; } -- 2.44.0