# Copyright (C) 2020-2021 all contributors
# License: AGPL-3.0+
# like PublicInbox::SearchIdx, but for searching for non-mail messages.
# Things indexed include:
# * inboxes themselves
# * epoch information
# * (maybe) git code repository information
# Expect ~100K-1M documents with no parallelism opportunities,
# so no sharding, here.
#
# See MiscSearch for read-only counterpart
package PublicInbox::MiscIdx;
use strict;
use v5.10.1;
use PublicInbox::InboxWritable;
use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
use PublicInbox::SearchIdx qw(index_text term_generator add_val);
use PublicInbox::Spawn qw(nodatacow_dir);
use Carp qw(croak);
use File::Path ();
use PublicInbox::MiscSearch;
use PublicInbox::Config;
my $json; # shared JSON encoder; set on first new() call, used by index_ibx
# Constructor.  $eidx must provide {xpfx}; the index lives in "$xpfx/misc",
# which is created here if missing.  A true $eidx->{-no_fsync} adds
# DB_NO_SYNC to the flags used when the database is opened later.
# The Xapian database itself is not opened until a transaction begins.
sub new {
	my ($class, $eidx) = @_;

	# must run before the DB_* flag variables below are read
	PublicInbox::SearchIdx::load_xapian_writable();

	my $mi_dir = "$eidx->{xpfx}/misc";
	File::Path::mkpath($mi_dir);
	nodatacow_dir($mi_dir);

	my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
	if ($eidx->{-no_fsync}) {
		$flags |= $PublicInbox::SearchIdx::DB_NO_SYNC;
	}

	# one encoder is shared by every instance
	$json //= PublicInbox::Config::json();

	my $self = {
		mi_dir => $mi_dir,
		flags => $flags,
		indexlevel => 'full', # small DB, no point in medium?
	};
	return bless $self, $class;
}
# Opens the writable Xapian DB at $self->{mi_dir} using $self->{flags},
# starts a transaction on it and returns the DB handle.
# Croaks (from the caller's perspective) if the DB cannot be opened.
sub _begin_txn ($) {
	my ($self) = @_;
	my $xdb = eval {
		my $wdb_class = $PublicInbox::Search::X{WritableDatabase};
		$wdb_class->new($self->{mi_dir}, $self->{flags});
	};
	if ($@) {
		croak "Failed opening $self->{mi_dir}: $@";
	}
	$xdb->begin_transaction;
	return $xdb;
}
# Commits the pending transaction, if any, and forgets the DB handle
# stored in $self->{xdb}.  Does nothing when no handle is stored.
sub commit_txn {
	my ($self) = @_;
	my $xdb = delete $self->{xdb};
	return unless $xdb;
	$xdb->commit_transaction;
}
# Ensures the DB has been opened (begin_txn opens with the flags chosen in
# new()), then immediately commits and releases the handle.
sub create_xdb {
	my ($self) = @_;
	unless (defined $self->{xdb}) {
		$self->{xdb} = _begin_txn($self);
	}
	commit_txn($self);
}
# Deletes every document carrying the unique term 'Q'.$eidx_key, emitting
# one informational warning per deleted docid.  Opens a transaction if
# none is active; the caller is expected to commit_txn afterwards.
sub remove_eidx_key {
	my ($self, $eidx_key) = @_;
	my $xdb = $self->{xdb} //= _begin_txn($self);
	my $uniq_term = 'Q'.$eidx_key;
	my $it = $xdb->postlist_begin($uniq_term);
	my $end = $xdb->postlist_end($uniq_term);

	# collect first, delete afterwards, so the posting list is not
	# modified while it is being walked
	my @docids; # only one, unless we had bugs
	while ($it != $end) {
		push @docids, $it->get_docid;
		$it++;
	}
	foreach my $docid (@docids) {
		$xdb->delete_document($docid);
		warn "I: remove inbox docid #$docid ($eidx_key)\n";
	}
}
# Adds or updates the document describing inbox $ibx, keyed on its
# eidx_key (stored as the boolean term 'Q'.$eidx_key).
#
# If a document with that key already exists it is replaced in place;
# any further documents sharing the key are deleted with a BUG warning.
# Opens a transaction if none is active; the caller is expected to
# commit_txn afterwards.
#
# The document data is a JSON object mapping repository paths
# ("/$name/git/$epoch.git" for v2, "/$name" for v1) to manifest entries.
sub index_ibx {
	my ($self, $ibx) = @_;
	my $eidx_key = $ibx->eidx_key;
	my $xdb = $self->{xdb} //= _begin_txn($self);
	# Q = uniQue in Xapian terminology
	my $head = $xdb->postlist_begin('Q'.$eidx_key);
	my $tail = $xdb->postlist_end('Q'.$eidx_key);
	my ($docid, @drop);
	for (; $head != $tail; $head++) {
		if (defined $docid) {
			# a second (or later) posting for a unique term
			my $i = $head->get_docid;
			push @drop, $i;
			warn "BUG: dropping inbox docid #$i, ".
				"duplicate of #$docid for $eidx_key\n";
		} else {
			# first posting: this is the document we will replace
			$docid = $head->get_docid;
		}
	}
	$xdb->delete_document($_) for @drop; # just in case

	my $doc = $PublicInbox::Search::X{Document}->new;
	term_generator($self)->set_document($doc);

	# allow sorting by modified and uidvalidity (created at)
	add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
	add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);

	$doc->add_boolean_term('Q'.$eidx_key); # uniQue id
	$doc->add_boolean_term('T'.'inbox'); # Type
	if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
		$doc->add_boolean_term('T'.'newsgroup'); # additional Type
	}

	# force reread from disk, {description} could be loaded from {misc}
	delete $ibx->{description};
	my $desc = $ibx->description;

	# description = S/Subject (or title)
	# address = A/Author
	index_text($self, $desc, 1, 'S');
	index_text($self, $ibx->{name}, 1, 'XNAME');
	my %map = (
		address => 'A',
		listid => 'XLISTID',
		infourl => 'XINFOURL',
		url => 'XURL'
	);
	# each of these inbox fields is an optional array ref of strings
	while (my ($f, $pfx) = each %map) {
		for my $v (@{$ibx->{$f} // []}) {
			index_text($self, $v, 1, $pfx);
		}
	}

	my $data = {};
	if (defined(my $max = $ibx->max_git_epoch)) { # v2
		my $pfx = "/$ibx->{name}/git/";
		for my $epoch (0..$max) {
			# a missing epoch aborts here: no document is added
			# or replaced for this inbox
			my $git = $ibx->git_epoch($epoch) or return;
			if (my $ent = $git->manifest_entry($epoch, $desc)) {
				$data->{"$pfx$epoch.git"} = $ent;
				$ent->{git_dir} = $git->{git_dir};
			}
			$git->cleanup; # ->modified starts cat-file --batch
		}
	} elsif (my $ent = $ibx->git->manifest_entry) { # v1
		$ent->{git_dir} = $ibx->{inboxdir};
		$data->{"/$ibx->{name}"} = $ent;
	}
	$doc->set_data($json->encode($data));

	if (defined $docid) {
		$xdb->replace_document($docid, $doc);
	} else {
		$xdb->add_document($doc);
	}
}
1;