From b714ab45d30d6f0298d73ef4281c1d0263a02493 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 7 May 2020 03:00:09 +0000 Subject: [PATCH] search: support searching on List-Id We'll support both probabilistic matches via `l:' and boolean matches via `lid:' for exact matches, similar to how both `m:' and `mid:' are supported. Only text inside angle braces (`<' and `>') is supported, since I'm not sure if there's value in searching on the optional phrases (which would require decoding with ->header_str instead of ->header_raw). --- lib/PublicInbox/Search.pm | 9 +++++++++ lib/PublicInbox/SearchIdx.pm | 6 ++++++ t/search.t | 31 +++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 86a6ad67..b7db2b9f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -77,11 +77,17 @@ use constant { # 15 - see public-inbox-v2-format(5) # further bumps likely unnecessary, we'll suggest in-place # "--reindex" use for further fixes and tweaks + # + # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15): + # * "lid:" and "l:" for List-Id searches SCHEMA_VERSION => 15, }; +# note: the non-X term prefix allocations are shared with +# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue + lid => 'G', # newsGroup (or similar entity), just inside <> dfpre => 'XDFPRE', dfpost => 'XDFPOST', dfblob => 'XDFPRE XDFPOST', @@ -92,6 +98,7 @@ my %prob_prefix = ( # for mairix compatibility s => 'S', m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', t => 'XTO', tc => 'XTO XCC', @@ -134,6 +141,8 @@ EOF 'f:' => 'match within the From header', 'a:' => 'match within the To, Cc, and From headers', 'tc:' => 'match within the To and Cc headers', + 'lid:' => 'exact contents of the List-Id', + 'l:' => 'partial match 
contents of the List-Id header', 'bs:' => 'match within the Subject and body', 'dfn:' => 'match filename from diff', 'dfa:' => 'match diff removed (-) lines', diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 25118f43..998341a7 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -352,6 +352,12 @@ sub add_xapian ($$$$) { } } $doc->add_boolean_term('Q' . $_) foreach @$mids; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } $self->{xdb}->replace_document($smsg->{num}, $doc); } diff --git a/t/search.t b/t/search.t index 83986837..92f3305d 100644 --- a/t/search.t +++ b/t/search.t @@ -66,6 +66,7 @@ Subject: Hello world Message-ID: From: John Smith To: list@example.com +List-Id: I'm not mad \m/ EOF @@ -77,6 +78,7 @@ Message-ID: From: John Smith To: list@example.com Cc: foo@example.com +List-Id: there's nothing goodbye forever :< EOF @@ -448,6 +450,35 @@ EOF is($ro->query("m:Pine m:LNX m:10010260936330", {mset=>1})->size, 1); }); +{ # List-Id searching + my $found = $ro->query('lid:i.m.just.bored'); + is_deeply([ filter_mids($found) ], [ 'root@s' ], + 'got expected mid on exact lid: search'); + + $found = $ro->query('lid:just.bored'); + is_deeply($found, [], 'got nothing on lid: search'); + + $found = $ro->query('lid:*.just.bored'); + is_deeply($found, [], 'got nothing on lid: search'); + + $found = $ro->query('l:i.m.just.bored'); + is_deeply([ filter_mids($found) ], [ 'root@s' ], + 'probabilistic search works on full List-Id contents'); + + $found = $ro->query('l:just.bored'); + is_deeply([ filter_mids($found) ], [ 'root@s' ], + 'probabilistic search works on partial List-Id contents'); + + $found = $ro->query('lid:mad'); + is_deeply($found, [], 'no match on phrase with lid:'); + + $found = $ro->query('lid:bored'); + is_deeply($found, [], 'no match on partial List-Id with lid:'); + + 
$found = $ro->query('l:nothing'); + is_deeply($found, [], 'matched on phrase with l:'); +} + done_testing(); 1; -- 2.44.0