From a9c903a57ff9a18c56a53bcba4316eade423fef6 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 9 Sep 2016 00:01:24 +0000 Subject: [PATCH] search: more granular message body searching "bs:" and "b:" are adapted from mairix(1) We will also support searching explicitly for quoted vs non-quoted text via "q:" and "nq:" prefixes since sometimes readers will not care for quoted text. In the future, we will support parsing diffs (perhaps when repobrowse integration is complete). Note: this roughly doubles the size of the Xapian database due to the additional information; so this change may not be worth it. --- lib/PublicInbox/Search.pm | 18 ++++++++++++------ lib/PublicInbox/SearchIdx.pm | 17 ++++++++++++++--- t/search.t | 25 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 3b25b662..f74129d5 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -58,16 +58,22 @@ my %bool_pfx_external = ( ); my %prob_prefix = ( - s => 'S', # for mairix compatibility + # for mairix compatibility + s => 'S', m => 'Q', # 'mid' is exact, 'm' can do partial - f => 'A', # for mairix compatibility - t => 'XTO', # for mairix compatibility - tc => 'XTC', # for mairix compatibility - c => 'XCC', # for mairix compatibility - tcf => 'XTCF', # for mairix compatibility + f => 'A', + t => 'XTO', + tc => 'XTC', + c => 'XCC', + tcf => 'XTCF', + b => 'XBODY', + bs => 'XBS', + # n.b.: leaving out "a:" alias for "tcf:" even though # mairix supports it. It is only mentioned in passing in mairix(1) # and the extra two letters are not significantly longer. + q => 'XQUOT', + nq => 'XNQ', ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 37fefbea..cd27a294 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -173,7 +173,10 @@ sub add_message { my $tg = $self->term_generator; $tg->set_document($doc); - $tg->index_text($subj, 1, 'S') if $subj; + if ($subj) { + $tg->index_text($subj, 1, 'S'); + $tg->index_text($subj, 1, 'XBS'); + } $tg->increase_termpos; $tg->index_text($subj) if $subj; $tg->increase_termpos; @@ -199,13 +202,21 @@ sub add_message { } } if (@quot) { - $tg->index_text(join("\n", @quot), 0); + my $s = join("\n", @quot); @quot = (); + $tg->index_text($s, 1, 'XQUOT'); + $tg->index_text($s, 0, 'XBS'); + $tg->index_text($s, 0, 'XBODY'); + $tg->index_text($s, 0); $tg->increase_termpos; } if (@orig) { - $tg->index_text(join("\n", @orig)); + my $s = join("\n", @orig); @orig = (); + $tg->index_text($s, 1, 'XNQ'); + $tg->index_text($s, 1, 'XBS'); + $tg->index_text($s, 1, 'XBODY'); + $tg->index_text($s); $tg->increase_termpos; } }); diff --git a/t/search.t b/t/search.t index 7abaf832..bddb545a 100644 --- a/t/search.t +++ b/t/search.t @@ -361,6 +361,31 @@ sub filter_mids { } } +{ + $rw_commit->(); + $ro->reopen; + my $res = $ro->query('b:hello'); + is(scalar @{$res->{msgs}}, 0, 'no match on body search only'); + $res = $ro->query('bs:smith'); + is(scalar @{$res->{msgs}}, 0, + 'no match on body+subject search for From'); + + $res = $ro->query('q:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one quoted body'); + like($res->{msgs}->[0]->from, qr/\AQuoter/, 'got quoted body'); + + $res = $ro->query('nq:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one non-quoted body'); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body'); + + foreach my $pfx (qw(b: bs:)) { + $res = $ro->query($pfx . 'theatre'); + is(scalar @{$res->{msgs}}, 2, "searched both bodies for $pfx"); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, + "non-quoter first for $pfx"); + } +} + done_testing(); 1; -- 2.44.0