# Read-only search interface for use by the web and NNTP interfaces
package PublicInbox::Search;
use strict;
-use warnings;
+use parent qw(Exporter);
+our @EXPORT_OK = qw(mdocid);
-# values for searching
-use constant TS => 0; # Received: header in Unix time
-use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI
-use constant DT => 2; # Date: YYYYMMDDHHMMSS
+# values for searching, changing the numeric value breaks
+# compatibility with old indices (so don't change them)
+use constant {
+ TS => 0, # Received: header in Unix time (IMAP INTERNALDATE)
+ YYYYMMDD => 1, # Date: header for searching in the WWW UI
+ DT => 2, # Date: YYYYMMDDHHMMSS
+
+ # added for public-inbox 1.6.0+
+ BYTES => 3, # IMAP RFC822.SIZE
+ UID => 4, # IMAP UID == NNTP article number == Xapian docid
+ THREADID => 5, # RFC 8474, RFC 8621
+
+ # TODO
+ # REPLYCNT => ?, # IMAP ANSWERED
+
+ # SCHEMA_VERSION history
+ # 0 - initial
+ # 1 - subject_path is lower-cased
+ # 2 - subject_path is id_compress in the index, only
+ # 3 - message-ID is compressed if it includes '%' (hack!)
+ # 4 - change "Re: " normalization, avoid circular Reference ghosts
+ # 5 - subject_path drops trailing '.'
+ # 6 - preserve References: order in document data
+ # 7 - remove references and inreplyto terms
+ # 8 - remove redundant/unneeded document data
+ # 9 - disable Message-ID compression (SHA-1)
+ # 10 - optimize doc for NNTP overviews
+ # 11 - merge threads when vivifying ghosts
+ # 12 - change YYYYMMDD value column to numeric
+ # 13 - fix threading for empty References/In-Reply-To
+ # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
+ # 14 - fix ghost root vivification
+ # 15 - see public-inbox-v2-format(5)
+ # further bumps likely unnecessary, we'll suggest in-place
+ # "--reindex" use for further fixes and tweaks:
+ #
+ # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
+ # * "lid:" and "l:" for List-Id searches
+ #
+ # v1.6.0 adds BYTES, UID and THREADID values
+ SCHEMA_VERSION => 15,
+};
use PublicInbox::Smsg;
use PublicInbox::Over;
my $QP_FLAGS;
-our %X = map { $_ => 0 } qw(BoolWeight Database Enquire
- NumberValueRangeProcessor QueryParser Stem);
+our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem);
our $Xap; # 'Search::Xapian' or 'Xapian'
+my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
my $ENQ_ASCENDING;
sub load_xapian () {
return 1 if defined $Xap;
- for my $x (qw(Search::Xapian Xapian)) {
+ # n.b. PI_XAPIAN is intended for development use only. We still
+ # favor Search::Xapian since that's what's available in current
+ # Debian stable (10.x) and derived distros.
+ for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') {
eval "require $x";
next if $@;
$x->import(qw(:standard));
$Xap = $x;
+
+ # `version_string' was added in Xapian 1.1
+ my $xver = eval('v'.eval($x.'::version_string()')) //
+ eval('v'.eval($x.'::xapian_version_string()'));
+
+ # NumberRangeProcessor was added in Xapian 1.3.6,
+ # NumberValueRangeProcessor was removed for 1.5.0+,
+ # favor the older /Value/ variant since that's what our
+ # (currently) preferred Search::Xapian supports
+ $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ?
+ 'NumberRangeProcessor' : 'NumberValueRangeProcessor');
$X{$_} = $Xap.'::'.$_ for (keys %X);
# ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm,
# a prefix common in patch emails
our $LANG = 'english';
-use constant {
- # SCHEMA_VERSION history
- # 0 - initial
- # 1 - subject_path is lower-cased
- # 2 - subject_path is id_compress in the index, only
- # 3 - message-ID is compressed if it includes '%' (hack!)
- # 4 - change "Re: " normalization, avoid circular Reference ghosts
- # 5 - subject_path drops trailing '.'
- # 6 - preserve References: order in document data
- # 7 - remove references and inreplyto terms
- # 8 - remove redundant/unneeded document data
- # 9 - disable Message-ID compression (SHA-1)
- # 10 - optimize doc for NNTP overviews
- # 11 - merge threads when vivifying ghosts
- # 12 - change YYYYMMDD value column to numeric
- # 13 - fix threading for empty References/In-Reply-To
- # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
- # 14 - fix ghost root vivification
- # 15 - see public-inbox-v2-format(5)
- # further bumps likely unnecessary, we'll suggest in-place
- # "--reindex" use for further fixes and tweaks
- SCHEMA_VERSION => 15,
-};
-
+# note: the non-X term prefix allocations are shared with
+# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
+ lid => 'G', # newsGroup (or similar entity), just inside <>
dfpre => 'XDFPRE',
dfpost => 'XDFPOST',
dfblob => 'XDFPRE XDFPOST',
# for mairix compatibility
s => 'S',
m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+ l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
f => 'A',
t => 'XTO',
tc => 'XTO XCC',
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
+# not documenting lid: for now, either, it is probably redundant with l:,
+# especially since we don't offer boolean searches for To/Cc/From
+# headers, either
our @HELP = (
's:' => 'match within Subject e.g. s:"a quick brown fox"',
'd:' => <<EOF,
'f:' => 'match within the From header',
'a:' => 'match within the To, Cc, and From headers',
'tc:' => 'match within the To and Cc headers',
+ 'l:' => 'match contents of the List-Id header',
'bs:' => 'match within the Subject and body',
'dfn:' => 'match filename from diff',
'dfa:' => 'match diff removed (-) lines',
sub xdir ($;$) {
my ($self, $rdonly) = @_;
- if ($self->{ibx_ver} == 1) {
- "$self->{inboxdir}/public-inbox/xapian" . SCHEMA_VERSION;
- } else {
- my $dir = "$self->{inboxdir}/xap" . SCHEMA_VERSION;
- return $dir if $rdonly;
-
- my $shard = $self->{shard};
- defined $shard or die "shard not given";
- $dir .= "/$shard";
+ if ($rdonly || !defined($self->{shard})) { # read-only caller, or no {shard} set on $self
+ $self->{xpfx}; # the prefix directory itself is the result
+ } else { # v2 only:
+ "$self->{xpfx}/$self->{shard}"; # per-shard subdirectory under the prefix
}
}
my ($xdb, $slow_phrase);
my $qpf = \($self->{qp_flags} ||= $QP_FLAGS);
if ($self->{ibx_ver} >= 2) {
- foreach my $shard (<$dir/*>) {
- -d $shard && $shard =~ m!/[0-9]+\z! or next;
- my $sub = $X{Database}->new($shard);
- if ($xdb) {
- $xdb->add_database($sub);
- } else {
- $xdb = $sub;
+ my @xdb;
+ opendir(my $dh, $dir) or return; # not initialized yet
+
+ # We need numeric sorting so shard[0] is first for reading
+ # Xapian metadata, if needed
+ for (sort { $a <=> $b } grep(/\A[0-9]+\z/, readdir($dh))) {
+ my $shard_dir = "$dir/$_";
+ if (-d $shard_dir && -r _) {
+ push @xdb, $X{Database}->new($shard_dir);
+ $slow_phrase ||= -f "$shard_dir/iamchert";
+ } else { # gaps from missing epochs throw off mdocid()
+ warn "E: $shard_dir missing or unreadable\n";
+ return;
}
- $slow_phrase ||= -f "$shard/iamchert";
}
+ $self->{nshard} = scalar(@xdb);
+ $xdb = shift @xdb;
+ $xdb->add_database($_) for @xdb;
} else {
$slow_phrase = -f "$dir/iamchert";
$xdb = $X{Database}->new($dir);
$xdb;
}
+# v2 Xapian docids don't conflict, so they're identical to
+# NNTP article numbers and IMAP UIDs.
+# https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
+sub mdocid { # args: shard count, match item; returns the docid with the shard count divided back out
+ my ($nshard, $mitem) = @_;
+ my $docid = $mitem->get_docid; # docid as reported by the (possibly multi-shard) database
+ int(($docid - 1) / $nshard) + 1; # 1-based result; with $nshard == 1 this is $docid unchanged
+}
+
+sub mset_to_artnums { # returns an arrayref holding one mdocid() result per item of $mset
+ my ($self, $mset) = @_;
+ my $nshard = $self->{nshard} // 1; # fall back to 1 when {nshard} is unset
+ [ map { mdocid($nshard, $_) } $mset->items ]; # keeps the order in which $mset->items yields them
+}
+
sub xdb ($) {
my ($self) = @_;
$self->{xdb} ||= do {
};
}
+sub xpfx_init ($) { # appends the SCHEMA_VERSION-suffixed Xapian path to $self->{xpfx} in place
+ my ($self) = @_;
+ if ($self->{ibx_ver} == 1) { # version 1 inbox layout
+ $self->{xpfx} .= '/public-inbox/xapian' . SCHEMA_VERSION;
+ } else { # every other inbox version
+ $self->{xpfx} .= '/xap'.SCHEMA_VERSION; # note: `.=' means a second call would append again
+ }
+}
+
sub new {
my ($class, $ibx) = @_;
ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx";
my $self = bless {
- inboxdir => $ibx->{inboxdir},
+ xpfx => $ibx->{inboxdir}, # for xpfx_init
altid => $ibx->{altid},
ibx_ver => $ibx->version,
}, $class;
+ xpfx_init($self);
my $dir = xdir($self, 1);
$self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3");
$self;
if ($query_string eq '' && !$opts->{mset}) {
$self->{over_ro}->recent($opts);
} else {
- my $qp = qp($self);
+ my $qp = $self->{qp} //= qparse_new($self);
my $qp_flags = $self->{qp_flags};
my $query = $qp->parse_query($query_string, $qp_flags);
$opts->{relevance} = 1 unless exists $opts->{relevance};
retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]);
}
+# returns true if all docs have the THREADID value
+sub has_threadid ($) { # boolean: is the 'has_threadid' metadata key set to '1' in the Xapian DB?
+ my ($self) = @_;
+ (xdb($self)->get_metadata('has_threadid') // '') eq '1'; # an undef value compares as '' and so yields false
+}
+
sub _enquire_once { # retry_reopen callback
my ($self, $query, $opts) = @{$_[0]};
my $xdb = xdb($self);
$enquire->set_query($query);
$opts ||= {};
my $desc = !$opts->{asc};
- if (($opts->{mset} || 0) == 2) {
+ if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID
$enquire->set_docid_order($ENQ_ASCENDING);
$enquire->set_weighting_scheme($X{BoolWeight}->new);
} elsif ($opts->{relevance}) {
} else {
$enquire->set_sort_by_value_then_relevance(TS, $desc);
}
+
+ # `mairix -t / --threads' or JMAP collapseThreads
+ if ($opts->{thread} && has_threadid($self)) {
+ $enquire->set_collapse_key(THREADID);
+ }
+
my $offset = $opts->{offset} || 0;
my $limit = $opts->{limit} || 50;
my $mset = $enquire->get_mset($offset, $limit);
return $mset if $opts->{mset};
- my @msgs = map { PublicInbox::Smsg::from_mitem($_) } $mset->items;
- return \@msgs unless wantarray;
-
- ($mset->get_matches_estimated, \@msgs)
+ my $nshard = $self->{nshard} // 1;
+ my $i = 0;
+ my %order = map { mdocid($nshard, $_) => ++$i } $mset->items;
+ my @msgs = sort {
+ $order{$a->{num}} <=> $order{$b->{num}}
+ } @{$self->{over_ro}->get_all(keys %order)};
+ wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs;
}
# read-write
sub stemmer { $X{Stem}->new($LANG) }
# read-only
-sub qp {
+sub qparse_new ($) {
my ($self) = @_;
- my $qp = $self->{query_parser};
- return $qp if $qp;
my $xdb = xdb($self);
- # new parser
- $qp = $X{QueryParser}->new;
+ my $qp = $X{QueryParser}->new;
$qp->set_default_op(OP_AND());
$qp->set_database($xdb);
- $qp->set_stemmer($self->stemmer);
+ $qp->set_stemmer(stemmer($self));
$qp->set_stemming_strategy(STEM_SOME());
- $qp->set_max_wildcard_expansion(100);
- my $nvrp = $X{NumberValueRangeProcessor};
- $qp->add_valuerangeprocessor($nvrp->new(YYYYMMDD, 'd:'));
- $qp->add_valuerangeprocessor($nvrp->new(DT, 'dt:'));
+ my $cb = $qp->can('set_max_wildcard_expansion') //
+ $qp->can('set_max_expansion'); # Xapian 1.5.0+
+ $cb->($qp, 100);
+ $cb = $qp->can('add_valuerangeprocessor') //
+ $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
+ $cb->($qp, $NVRP->new(YYYYMMDD, 'd:'));
+ $cb->($qp, $NVRP->new(DT, 'dt:'));
+
+ # for IMAP, undocumented for WWW and may be split off or go away
+ $cb->($qp, $NVRP->new(BYTES, 'bytes:'));
+ $cb->($qp, $NVRP->new(TS, 'ts:'));
+ $cb->($qp, $NVRP->new(UID, 'uid:'));
while (my ($name, $prefix) = each %bool_pfx_external) {
$qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);
# we do not actually create AltId objects,
# just parse the spec to avoid the extra DB handles for now.
if (my $altid = $self->{altid}) {
- my $user_pfx = $self->{-user_pfx} ||= [];
+ my $user_pfx = $self->{-user_pfx} = [];
for (@$altid) {
# $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3'
+ # note: Xapian supports multibyte UTF-8, /^[0-9]+$/,
+ # and '_' with prefixes matching \w+
/\Aserial:(\w+):/ or next;
my $pfx = $1;
push @$user_pfx, "$pfx:", <<EOF;
while (my ($name, $prefix) = each %prob_prefix) {
$qp->add_prefix($name, $_) foreach split(/ /, $prefix);
}
-
- $self->{query_parser} = $qp;
+ $qp;
}
sub help {
my ($self) = @_;
- $self->qp; # parse altids
+ $self->{qp} //= qparse_new($self); # parse altids
my @ret = @HELP;
if (my $user_pfx = $self->{-user_pfx}) {
push @ret, @$user_pfx;