X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearch.pm;h=9ab5afe6d619860a1277314b5680350a109897b8;hb=3d41aa23f35501ca92aab8aa42980fa73f7fa74f;hp=c8e297f42634a842c3b9b3ba033d21cd71353d9e;hpb=21ab8f3cc530d9483091f32c0865ba1ce867cef8;p=public-inbox.git
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index c8e297f4..9ab5afe6 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -1,5 +1,5 @@
-# Copyright (C) 2015 all contributors
-# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+# Copyright (C) 2015-2018 all contributors
+# License: AGPL-3.0+
# based on notmuch, but with no concept of folders, files or flags
#
# Read-only search interface for use by the web and NNTP interfaces
@@ -16,7 +16,7 @@ use constant YYYYMMDD => 4; # for searching in the WWW UI
use Search::Xapian qw/:standard/;
use PublicInbox::SearchMsg;
-use Email::MIME;
+use PublicInbox::MIME;
use PublicInbox::MID qw/mid_clean id_compress/;
# This is English-only, everything else is non-standard and may be confused as
@@ -38,7 +38,11 @@ use constant {
# 9 - disable Message-ID compression (SHA-1)
# 10 - optimize doc for NNTP overviews
# 11 - merge threads when vivifying ghosts
- SCHEMA_VERSION => 11,
+ # 12 - change YYYYMMDD value column to numeric
+ # 13 - fix threading for empty References/In-Reply-To
+ # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
+ # 14 - fix ghost root vivification
+ SCHEMA_VERSION => 14,
# n.b. FLAG_PURE_NOT is expensive and not suitable for a public website
# as it could become a denial-of-service vector
@@ -52,8 +56,6 @@ my %bool_pfx_internal = (
);
my %bool_pfx_external = (
- # do we still need these? probably not..
- path => 'XPATH',
mid => 'Q', # uniQue id (Message-ID)
);
@@ -66,53 +68,56 @@ my %prob_prefix = (
tc => 'XTO XCC',
c => 'XCC',
tcf => 'XTO XCC A',
+ a => 'XTO XCC A',
b => 'XNQ XQUOT',
bs => 'XNQ XQUOT S',
+ n => 'XFN',
- # n.b.: leaving out "a:" alias for "tcf:" even though
- # mairix supports it. It is only mentioned in passing in mairix(1)
- # and the extra two letters are not significantly longer.
q => 'XQUOT',
nq => 'XNQ',
+ dfn => 'XDFN',
+ dfa => 'XDFA',
+ dfb => 'XDFB',
+ dfhh => 'XDFHH',
+ dfctx => 'XDFCTX',
+ dfpre => 'XDFPRE',
+ dfpost => 'XDFPOST',
+ dfblob => 'XDFPRE XDFPOST',
# default:
- '' => 'XMID S A XNQ XQUOT',
+ '' => 'XMID S A XNQ XQUOT XFN',
);
# not documenting m: and mid: for now, since using the URLs works w/o Xapian
our @HELP = (
- 's:' => < 'match within Subject e.g. s:"a quick brown fox"',
'd:' => < 'match within message body, including text attachments',
+ 'nq:' => 'match non-quoted text within message body',
+ 'q:' => 'match quoted text within message body',
+ 'n:' => 'match filename of attachment(s)',
+ 't:' => 'match within the To header',
+ 'c:' => 'match within the Cc header',
+ 'f:' => 'match within the From header',
+ 'a:' => 'match within the To, Cc, and From headers',
+ 'tc:' => 'match within the To and Cc headers',
+ 'bs:' => 'match within the Subject and body',
+ 'dfn:' => 'match filename from diff',
+ 'dfa:' => 'match diff removed (-) lines',
+ 'dfb:' => 'match diff added (+) lines',
+ 'dfhh:' => 'match diff hunk header context (usually a function name)',
+ 'dfctx:' => 'match diff context lines',
+ 'dfpre:' => 'match pre-image git blob ID',
+ 'dfpost:' => 'match post-image git blob ID',
+ 'dfblob:' => 'match either pre or post-image git blob ID',
);
-# TODO: (from mairix, some of these are maybe)
-# b (body), f (From:), c (Cc:), n (attachment), t (To:)
-# tc (To:+Cc:), bs (body + Subject), tcf (To: +Cc: +From:)
-#
-# Non-mairix:
-# df (filenames from diff)
-# nq (non-quoted body)
-# da (diff a/ removed lines)
-# db (diff b/ added lines)
+chomp @HELP;
-my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
-
-sub xpfx { $all_pfx{$_[0]} }
-
-our %PFX2TERM_RMAP;
-my %meta_pfx = (mid => 1, thread => 1, path => 1);
-while (my ($k, $v) = each %all_pfx) {
- $PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k};
-}
-
-my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
+my $mail_query = Search::Xapian::Query->new('T' . 'mail');
sub xdir {
my (undef, $git_dir) = @_;
@@ -147,34 +152,44 @@ sub get_thread {
my $smsg = eval { $self->lookup_message($mid) };
return { total => 0, msgs => [] } unless $smsg;
- my $qtid = Search::Xapian::Query->new(xpfx('thread').$smsg->thread_id);
+ my $qtid = Search::Xapian::Query->new('G' . $smsg->thread_id);
my $path = $smsg->path;
if (defined $path && $path ne '') {
my $path = id_compress($smsg->path);
- my $qsub = Search::Xapian::Query->new(xpfx('path').$path);
+ my $qsub = Search::Xapian::Query->new('XPATH' . $path);
$qtid = Search::Xapian::Query->new(OP_OR, $qtid, $qsub);
}
$opts ||= {};
$opts->{limit} ||= 1000;
+
+ # always sort threads by timestamp; this makes life easier
+ # for the threading algorithm (in SearchThread.pm)
+ $opts->{asc} = 1;
+
_do_enquire($self, $qtid, $opts);
}
-sub _do_enquire {
- my ($self, $query, $opts) = @_;
+sub retry_reopen {
+ my ($self, $cb) = @_;
my $ret;
for (1..10) {
- eval { $ret = _enquire_once($self, $query, $opts) };
+ eval { $ret = $cb->() };
return $ret unless $@;
# Exception: The revision being read has been discarded -
# you should call Xapian::Database::reopen()
- if (index($@, 'Xapian::Database::reopen') >= 0) {
+ if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') {
reopen($self);
} else {
- die $@;
+ die;
}
}
}
+sub _do_enquire {
+ my ($self, $query, $opts) = @_;
+ retry_reopen($self, sub { _enquire_once($self, $query, $opts) });
+}
+
sub _enquire_once {
my ($self, $query, $opts) = @_;
my $enquire = $self->enquire;
@@ -221,7 +236,7 @@ sub qp {
$qp->set_stemmer($self->stemmer);
$qp->set_stemming_strategy(STEM_SOME);
$qp->add_valuerangeprocessor(
- Search::Xapian::StringValueRangeProcessor->new(YYYYMMDD, 'd:'));
+ Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:'));
while (my ($name, $prefix) = each %bool_pfx_external) {
$qp->add_boolean_prefix($name, $prefix);
@@ -236,11 +251,12 @@ sub qp {
/\Aserial:(\w+):/ or next;
my $pfx = $1;
push @$user_pfx, "$pfx:", < XGMANE
$qp->add_boolean_prefix($pfx, 'X'.uc($pfx));
}
+ chomp @$user_pfx;
}
while (my ($name, $prefix) = each %prob_prefix) {
@@ -269,27 +285,29 @@ sub lookup_message {
my ($self, $mid) = @_;
$mid = mid_clean($mid);
- my $doc_id = $self->find_unique_doc_id('mid', $mid);
+ my $doc_id = $self->find_unique_doc_id('Q' . $mid);
my $smsg;
if (defined $doc_id) {
# raises on error:
my $doc = $self->{xdb}->get_document($doc_id);
$smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
- $smsg->doc_id($doc_id);
+ $smsg->{doc_id} = $doc_id;
}
$smsg;
}
sub lookup_mail { # no ghosts!
my ($self, $mid) = @_;
- my $smsg = lookup_message($self, $mid) or return;
- PublicInbox::SearchMsg->load_doc($smsg->{doc});
+ retry_reopen($self, sub {
+ my $smsg = lookup_message($self, $mid) or return;
+ $smsg->load_expand;
+ });
}
sub find_unique_doc_id {
- my ($self, $term, $value) = @_;
+ my ($self, $termval) = @_;
- my ($begin, $end) = $self->find_doc_ids($term, $value);
+ my ($begin, $end) = $self->find_doc_ids($termval);
return undef if $begin->equal($end); # not found
@@ -297,26 +315,20 @@ sub find_unique_doc_id {
# sanity check
$begin->inc;
- $begin->equal($end) or die "Term '$term:$value' is not unique\n";
+ $begin->equal($end) or die "Term '$termval' is not unique\n";
$rv;
}
# returns begin and end PostingIterator
sub find_doc_ids {
- my ($self, $term, $value) = @_;
-
- $self->find_doc_ids_for_term(xpfx($term) . $value);
-}
-
-# returns begin and end PostingIterator
-sub find_doc_ids_for_term {
- my ($self, $term) = @_;
+ my ($self, $termval) = @_;
my $db = $self->{xdb};
- ($db->postlist_begin($term), $db->postlist_end($term));
+ ($db->postlist_begin($termval), $db->postlist_end($termval));
}
# normalize subjects so they are suitable as pathnames for URLs
+# XXX: consider for removal
sub subject_path {
my $subj = pop;
$subj = subject_normalized($subj);
@@ -334,32 +346,6 @@ sub subject_normalized {
$subj;
}
-# for doc data
-sub subject_summary {
- my $subj = pop;
- my $max = 68;
- if (length($subj) > $max) {
- my @subj = split(/\s+/, $subj);
- $subj = '';
- my $l;
-
- while ($l = shift @subj) {
- my $new = $subj . $l . ' ';
- last if length($new) >= $max;
- $subj = $new;
- }
- if ($subj ne '') {
- my $r = scalar @subj ? ' ...' : '';
- $subj =~ s/ \z/$r/s;
- } else {
- # subject has one REALLY long word, and NOT spam? wtf
- @subj = ($l =~ /\A(.{1,72})/);
- $subj = $subj[0] . ' ...';
- }
- }
- $subj;
-}
-
sub enquire {
my ($self) = @_;
$self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb});