-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# based on notmuch, but with no concept of folders, files or flags
+# based on notmuch, but with no concept of folders or files
#
# Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use
# with the web and NNTP interfaces. This index maintains thread
use PublicInbox::Spawn qw(spawn nodatacow_dir);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
+our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
index_text term_generator add_val is_bad_blob);
my $X = \%PublicInbox::Search::X;
our ($DB_CREATE_OR_OPEN, $DB_OPEN);
}
}
$ibx = PublicInbox::InboxWritable->new($ibx);
- my $self = bless {
- ibx => $ibx,
- xpfx => $inboxdir, # for xpfx_init
- -altid => $altid,
- ibx_ver => $version,
- indexlevel => $indexlevel,
- }, $class;
- $self->xpfx_init;
+ my $self = PublicInbox::Search->new($ibx);
+ bless $self, $class;
+ $self->{ibx} = $ibx;
+ $self->{-altid} = $altid;
+ $self->{indexlevel} = $indexlevel;
$self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium';
if ($ibx->{-skip_docdata}) {
$self->{-set_skip_docdata_once} = 1;
$self->{-skip_docdata} = 1;
}
- $ibx->umask_prepare;
if ($version == 1) {
$self->{lock_path} = "$inboxdir/ssoma.lock";
my $dir = $self->xdir;
$DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()');
$DB_OPEN = eval($xap.'::DB_OPEN()');
my $ver = (eval($xap.'::major_version()') << 16) |
- (eval($xap.'::minor_version()') << 8);
+ (eval($xap.'::minor_version()') << 8) |
+ eval($xap.'::revision()');
$DB_NO_SYNC = 0x4 if $ver >= 0x10400;
+ # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks
+ $X->{CLOEXEC_UNSET} = 1 if $ver >= 0x010215 && $ver <= 0x010218;
1;
}
index_headers($self, $smsg);
if (defined(my $eidx_key = $smsg->{eidx_key})) {
- $doc->add_boolean_term('O'.$eidx_key);
+ $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
}
msg_iter($eml, \&index_xapian, [ $self, $doc ]);
index_ids($self, $doc, $eml, $mids);
if (!$self->{-skip_docdata}) {
# WWW doesn't need {to} or {cc}, only NNTP
$smsg->{to} = $smsg->{cc} = '';
- PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
+ $smsg->parse_references($eml, $mids);
my $data = $smsg->to_doc_data;
$doc->set_data($data);
}
+# Build the Xapian document for $eml with eml2doc() and store it in
+# $self->{xdb} via replace_document(), keyed by $smsg->{num}.
+# Args: $self, $eml (message), $smsg (must carry {num}), $mids (passed
+# through to eml2doc unchanged).
sub add_xapian ($$$$) {
my ($self, $eml, $smsg, $mids) = @_;
+# NOTE(review): presumably ensures {xdb} is open and a transaction is
+# active before the write below -- confirm against begin_txn_lazy
+ begin_txn_lazy($self);
my $doc = eml2doc($self, $eml, $smsg, $mids);
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
sub _msgmap_init ($) {
my ($self) = @_;
- die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1;
+ die "BUG: _msgmap_init is only for v1\n" if $self->{ibx}->version != 1;
$self->{mm} //= eval {
require PublicInbox::Msgmap;
my $rw = $self->{ibx}->{-no_fsync} ? 2 : 1;
sub add_message {
# mime = PublicInbox::Eml or Email::MIME object
my ($self, $mime, $smsg, $sync) = @_;
+ begin_txn_lazy($self);
my $mids = mids_for_index($mime);
$smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat
$smsg->{mid} //= $mids->[0]; # v1 compatibility
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
term_generator($self)->set_document($doc);
- $doc->add_boolean_term('O'.$eidx_key);
+ $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
index_list_id($self, $doc, $eml);
$self->{xdb}->replace_document($docid, $doc);
}
$self->{xdb}->replace_document($docid, $doc);
}
+# Make the keyword terms ('K'-prefixed boolean terms) of document $docid
+# match @kw exactly: keywords on the document but not in @kw are removed,
+# keywords in @kw but not on the document are added.
+# Returns early, without writing, if _get_doc() yields nothing or if the
+# document already carries exactly the requested keywords.
+sub set_keywords {
+	my ($self, $docid, @kw) = @_;
+	begin_txn_lazy($self);
+	my $doc = _get_doc($self, $docid) or return;
+	my %keep = map { $_ => 1 } @kw;
+	my %add = %keep; # shrinks to the keywords not yet on $doc
+	my @rm; # keywords on $doc which are absent from @kw
+	my $end = $doc->termlist_end;
+	my $cur = $doc->termlist_begin;
+	# position once at the first term >= 'K'; skip_to is never
+	# called on an already-exhausted iterator
+	$cur->skip_to('K') if $cur != $end;
+	for (; $cur != $end; $cur++) {
+		my $kw = $cur->get_termname;
+		# Xapian yields terms in ascending byte order, so the first
+		# term lacking the 'K' prefix ends the keyword range; there
+		# is no need to walk the rest of the document's terms
+		$kw =~ s/\AK//s or last;
+		if ($keep{$kw}) {
+			delete $add{$kw};
+		} else {
+			push @rm, $kw;
+		}
+	}
+	return unless (scalar(@rm) + scalar(keys %add));
+	$doc->remove_term('K'.$_) for @rm;
+	$doc->add_boolean_term('K'.$_) for (keys %add);
+	$self->{xdb}->replace_document($docid, $doc);
+}
+
+# Attach every keyword in @kw to document $docid as a 'K'-prefixed
+# boolean term, then write the document back to {xdb}.
+# Returns early, without writing, when _get_doc() yields nothing.
+sub add_keywords {
+	my ($self, $docid, @kw) = @_;
+	begin_txn_lazy($self);
+	my $doc = _get_doc($self, $docid);
+	return unless $doc;
+	foreach my $kw (@kw) {
+		$doc->add_boolean_term('K'.$kw);
+	}
+	$self->{xdb}->replace_document($docid, $doc);
+}
+
+# Drop the 'K'-prefixed term for each keyword in @kw from document
+# $docid.  The document is written back to {xdb} only if at least one
+# remove_term() call succeeded.
+# Returns early, without writing, when _get_doc() yields nothing.
+sub remove_keywords {
+	my ($self, $docid, @kw) = @_;
+	begin_txn_lazy($self);
+	my $doc = _get_doc($self, $docid);
+	return unless $doc;
+	my $dirty;
+	foreach my $kw (@kw) {
+		# a remove_term() which dies (Xapian does so for a term the
+		# document lacks) is not fatal and leaves $dirty untouched
+		eval {
+			$doc->remove_term('K'.$kw);
+			$dirty = 1;
+		};
+	}
+	$self->{xdb}->replace_document($docid, $doc) if $dirty;
+}
+
sub smsg_from_doc ($) {
my ($doc) = @_;
my $data = $doc->get_data or return;
+# Delete each of @docids from the Xapian DB held in $self->{xdb}.
+# Returns early if {xdb} is unset after begin_txn_lazy.  A failing
+# delete_document() is ignored: it is wrapped in eval and $@ is never
+# inspected, so removal is best-effort per docid.
sub xdb_remove {
my ($self, @docids) = @_;
+# NOTE(review): called as a method here while the other subs in this
+# patch call begin_txn_lazy($self) as a plain function -- confirm
+# whether the difference is intentional
+ $self->begin_txn_lazy;
my $xdb = $self->{xdb} or return;
for my $docid (@docids) {
eval { $xdb->delete_document($docid) };
}
}
-sub remove_by_docid {
- my ($self, $num) = @_;
- die "BUG: remove_by_docid is v2-only\n" if $self->{oidx};
- $self->begin_txn_lazy;
- xdb_remove($self, $num) if need_xapian($self);
-}
-
sub index_git_blob_id {
my ($doc, $pfx, $objid) = @_;
}
}
-# returns the number of bytes to add if given a non-CRLF arg
-sub crlf_adjust ($) {
- if (index($_[0], "\r\n") < 0) {
- # common case is LF-only, every \n needs an \r;
- # so favor a cheap tr// over an expensive m//g
- $_[0] =~ tr/\n/\n/;
- } else { # count number of '\n' w/o '\r', expensive:
- scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
- }
-}
-
sub is_bad_blob ($$$$) {
my ($oid, $type, $size, $expect_oid) = @_;
if ($type ne 'blob') {
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
- $size += crlf_adjust($$bref);
- my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
+ my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+ $smsg->set_bytes($$bref, $size);
my $self = $sync->{sidx};
local $self->{current_info} = "$self->{current_info}: $oid";
my $eml = PublicInbox::Eml->new($bref);
$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
die "E: could not generate NNTP article number for $oid";
add_message($self, $eml, $smsg, $sync);
+ ++$self->{nidx};
my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing';
${$sync->{latest_cmt}} = $cur_cmt;
}
if (defined(my $cur_cmt = $sync->{cur_cmt})) {
${$sync->{latest_cmt}} = $cur_cmt;
}
+ ++$self->{nidx};
}
sub with_umask {
$self->with_umask(\&_commit_txn, $self);
}
-sub worker_done {
- my ($self) = @_;
- if (need_xapian($self)) {
- die "$$ $0 xdb not released\n" if $self->{xdb};
- }
- die "$$ $0 still in transaction\n" if $self->{txn};
-}
-
sub eidx_shard_new {
my ($class, $eidx, $shard) = @_;
my $self = bless {