# Read-only search interface for use by the web and NNTP interfaces
package PublicInbox::Search;
use strict;
+use parent qw(Exporter);
+our @EXPORT_OK = qw(mdocid);
-# values for searching
+# values for searching; changing a numeric value breaks
+# compatibility with old indices (so don't change them)
use constant {
TS => 0, # Received: header in Unix time (IMAP INTERNALDATE)
YYYYMMDD => 1, # Date: header for searching in the WWW UI
DT => 2, # Date: YYYYMMDDHHMMSS
+
+ # added for public-inbox 1.6.0+
BYTES => 3, # IMAP RFC822.SIZE
UID => 4, # IMAP UID == NNTP article number == Xapian docid
+
# TODO
- # REPLYCNT => 4, # IMAP ANSWERED
+ # THREADID => ?
+ # REPLYCNT => ?, # IMAP ANSWERED
+
+ # SCHEMA_VERSION history
+ # 0 - initial
+ # 1 - subject_path is lower-cased
+ # 2 - subject_path is id_compress in the index, only
+ # 3 - message-ID is compressed if it includes '%' (hack!)
+ # 4 - change "Re: " normalization, avoid circular Reference ghosts
+ # 5 - subject_path drops trailing '.'
+ # 6 - preserve References: order in document data
+ # 7 - remove references and inreplyto terms
+ # 8 - remove redundant/unneeded document data
+ # 9 - disable Message-ID compression (SHA-1)
+ # 10 - optimize doc for NNTP overviews
+ # 11 - merge threads when vivifying ghosts
+ # 12 - change YYYYMMDD value column to numeric
+ # 13 - fix threading for empty References/In-Reply-To
+ # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
+ # 14 - fix ghost root vivification
+ # 15 - see public-inbox-v2-format(5)
+ # further bumps likely unnecessary, we'll suggest in-place
+ # "--reindex" use for further fixes and tweaks:
+ #
+ # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
+ # * "lid:" and "l:" for List-Id searches
+ #
+ # v1.6.0 adds BYTES and UID values
+ SCHEMA_VERSION => 15,
};
use PublicInbox::Smsg;
# a prefix common in patch emails
our $LANG = 'english';
-use constant {
- # SCHEMA_VERSION history
- # 0 - initial
- # 1 - subject_path is lower-cased
- # 2 - subject_path is id_compress in the index, only
- # 3 - message-ID is compressed if it includes '%' (hack!)
- # 4 - change "Re: " normalization, avoid circular Reference ghosts
- # 5 - subject_path drops trailing '.'
- # 6 - preserve References: order in document data
- # 7 - remove references and inreplyto terms
- # 8 - remove redundant/unneeded document data
- # 9 - disable Message-ID compression (SHA-1)
- # 10 - optimize doc for NNTP overviews
- # 11 - merge threads when vivifying ghosts
- # 12 - change YYYYMMDD value column to numeric
- # 13 - fix threading for empty References/In-Reply-To
- # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
- # 14 - fix ghost root vivification
- # 15 - see public-inbox-v2-format(5)
- # further bumps likely unnecessary, we'll suggest in-place
- # "--reindex" use for further fixes and tweaks
- #
- # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
- # * "lid:" and "l:" for List-Id searches
- SCHEMA_VERSION => 15,
-};
-
# note: the non-X term prefix allocations are shared with
# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
# Return the Xapian directory path for this object.
# As patched (+ lines): $self->{xpfx} when $rdonly is true or no
# {shard} is set, otherwise "$self->{xpfx}/$self->{shard}".
# The removed (- lines) rebuilt the path from {inboxdir} and
# SCHEMA_VERSION on each call, and died with "shard not given" when
# {shard} was undefined and $rdonly was false for a non-v1 inbox.
sub xdir ($;$) {
my ($self, $rdonly) = @_;
- if ($self->{ibx_ver} == 1) {
- "$self->{inboxdir}/public-inbox/xapian" . SCHEMA_VERSION;
- } else {
- my $dir = "$self->{inboxdir}/xap" . SCHEMA_VERSION;
- return $dir if $rdonly;
-
- my $shard = $self->{shard};
- defined $shard or die "shard not given";
- $dir .= "/$shard";
+ if ($rdonly || !defined($self->{shard})) {
+ $self->{xpfx};
+ } else { # v2 only:
+ "$self->{xpfx}/$self->{shard}";
}
}
my ($xdb, $slow_phrase);
my $qpf = \($self->{qp_flags} ||= $QP_FLAGS);
if ($self->{ibx_ver} >= 2) {
- my $n = 0;
- foreach my $shard (<$dir/*>) {
- -d $shard && $shard =~ m!/[0-9]+\z! or next;
- my $sub = $X{Database}->new($shard);
- if ($xdb) {
- $xdb->add_database($sub);
- } else {
- $xdb = $sub;
+ my @xdb;
+ opendir(my $dh, $dir) or return; # not initialized yet
+
+ # We need numeric sorting so shard[0] is first for reading
+ # Xapian metadata, if needed
+ for (sort { $a <=> $b } grep(/\A[0-9]+\z/, readdir($dh))) {
+ my $shard_dir = "$dir/$_";
+ if (-d $shard_dir && -r _) {
+ push @xdb, $X{Database}->new($shard_dir);
+ $slow_phrase ||= -f "$shard_dir/iamchert";
+ } else { # gaps from missing epochs throw off mdocid()
+ warn "E: $shard_dir missing or unreadable\n";
+ return;
}
- $slow_phrase ||= -f "$shard/iamchert";
- ++$n;
}
- $self->{nshard} = $n;
+ $self->{nshard} = scalar(@xdb);
+ $xdb = shift @xdb;
+ $xdb->add_database($_) for @xdb;
} else {
$slow_phrase = -f "$dir/iamchert";
$xdb = $X{Database}->new($dir);
};
}
+# Append the version-specific Xapian directory suffix to $self->{xpfx}
+# in place ({xpfx} is seeded by the caller; new() passes the inbox
+# directory).  Version 1 inboxes get "/public-inbox/xapian" followed by
+# SCHEMA_VERSION, every other version gets "/xap" followed by
+# SCHEMA_VERSION.  The resulting string is also the return value.
+sub xpfx_init ($) {
+	my ($self) = @_;
+	my $is_v1 = $self->{ibx_ver} == 1;
+	my $sfx = $is_v1 ? '/public-inbox/xapian' : '/xap';
+	$self->{xpfx} .= $sfx . SCHEMA_VERSION;
+}
+
sub new {
my ($class, $ibx) = @_;
ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx";
my $self = bless {
- inboxdir => $ibx->{inboxdir},
+ xpfx => $ibx->{inboxdir}, # for xpfx_init
altid => $ibx->{altid},
ibx_ver => $ibx->version,
}, $class;
+ xpfx_init($self);
my $dir = xdir($self, 1);
$self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3");
$self;