X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=88349faa38b7a9325a11900155d4715c9faa90de;hb=af0b0fb7a454470a32c452119d0392e0dedb3fe1;hp=ade5575669c1b601162280369eb795aa3b0c40e1;hpb=a367ec1b15a2458e532245f5308565dd84f8ca63;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ade55756..88349faa 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -1,6 +1,6 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ -# based on notmuch, but with no concept of folders, files or flags +# based on notmuch, but with no concept of folders, files # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread @@ -15,15 +15,17 @@ use PublicInbox::InboxWritable; use PublicInbox::MID qw(mids_for_index mids); use PublicInbox::MsgIter; use PublicInbox::IdxStack; -use Carp qw(croak); +use Carp qw(croak carp); use POSIX qw(strftime); +use Time::Local qw(timegm); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 
0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -31,11 +33,11 @@ use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; +our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; sub new { my ($class, $ibx, $creat, $shard) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $levels = qr/\A(?:full|medium|basic)\z/; my $inboxdir = $ibx->{inboxdir}; my $version = $ibx->version; my $indexlevel = 'full'; @@ -45,32 +47,28 @@ sub new { $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; } if ($ibx->{indexlevel}) { - if ($ibx->{indexlevel} =~ $levels) { + if ($ibx->{indexlevel} =~ $INDEXLEVELS) { $indexlevel = $ibx->{indexlevel}; } else { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } $ibx = PublicInbox::InboxWritable->new($ibx); - my $self = bless { - ibx => $ibx, - xpfx => $inboxdir, # for xpfx_init - -altid => $altid, - ibx_ver => $version, - indexlevel => $indexlevel, - }, $class; - $self->xpfx_init; + my $self = PublicInbox::Search->new($ibx); + bless $self, $class; + $self->{ibx} = $ibx; + $self->{-altid} = $altid; + $self->{indexlevel} = $indexlevel; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; if ($ibx->{-skip_docdata}) { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; - $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); - $self->{over}->{-no_fsync} = 1 if $ibx->{-no_fsync}; + $self->{oidx} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); + $self->{oidx}->{-no_fsync} = 1 if $ibx->{-no_fsync}; } elsif ($version == 2) { defined $shard or die "shard is required for v2\n"; # shard is a number @@ -135,11 +133,9 @@ sub idx_acquire { } } return unless defined $flag; - $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync}; + $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; my 
$xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; - if ($@) { - die "Failed opening $dir: ", $@; - } + croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; } @@ -154,7 +150,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } @@ -325,6 +321,16 @@ sub index_xapian { # msg_iter callback } } +sub index_list_id ($$$) { + my ($self, $doc, $hdr) = @_; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } +} + sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; for my $mid (@$mids) { @@ -338,16 +344,12 @@ sub index_ids ($$$$) { } } $doc->add_boolean_term('Q' . $_) for @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = $1; - $doc->add_boolean_term('G' . 
$lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } + index_list_id($self, $doc, $hdr); } -sub add_xapian ($$$$) { +sub eml2doc ($$$;$) { my ($self, $eml, $smsg, $mids) = @_; + $mids //= mids_for_index($eml); my $doc = $X->{Document}->new; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); @@ -363,6 +365,9 @@ sub add_xapian ($$$$) { $tg->set_document($doc); index_headers($self, $smsg); + if (defined(my $eidx_key = $smsg->{eidx_key})) { + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + } msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); @@ -387,12 +392,18 @@ sub add_xapian ($$$$) { } } } + $doc; +} + +sub add_xapian ($$$$) { + my ($self, $eml, $smsg, $mids) = @_; + my $doc = eml2doc($self, $eml, $smsg, $mids); $self->{xdb}->replace_document($smsg->{num}, $doc); } sub _msgmap_init ($) { my ($self) = @_; - die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1; + die "BUG: _msgmap_init is only for v1\n" if $self->{ibx}->version != 1; $self->{mm} //= eval { require PublicInbox::Msgmap; my $rw = $self->{ibx}->{-no_fsync} ? 2 : 1; @@ -421,8 +432,8 @@ sub add_message { # of the fields which exist in over.sqlite3. We may stop # storing doc_data in Xapian sometime after we get multi-inbox # search working. 
- if (my $over = $self->{over}) { # v1 only - $over->add_overview($mime, $smsg); + if (my $oidx = $self->{oidx}) { # v1 only + $oidx->add_overview($mime, $smsg); } if (need_xapian($self)) { add_xapian($self, $mime, $smsg, $mids); @@ -436,32 +447,122 @@ sub add_message { $smsg->{num}; } +sub _get_doc ($$) { + my ($self, $docid) = @_; + my $doc = eval { $self->{xdb}->get_document($docid) }; + $doc // do { + warn "E: $@\n" if $@; + warn "E: #$docid missing in Xapian\n"; + undef; + } +} + +sub add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + term_generator($self)->set_document($doc); + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + index_list_id($self, $doc, $eml); + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + eval { $doc->remove_term('O'.$eidx_key) }; + warn "W: ->remove_term O$eidx_key: $@\n" if $@; + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + eval { $doc->remove_term('G' . $lid) }; + warn "W: ->remove_term G$lid: $@\n" if $@; + + # nb: we don't remove the XL probabilistic terms + # since terms may overlap if cross-posted. + # + # IOW, a message which has both <foo.example.com> + # and <bar.example.com> would have overlapping + # "XLexample" and "XLcom" as terms and which we + # wouldn't know if they're safe to remove if we just + # unindex <foo.example.com> while preserving + # <bar.example.com>. + # + # In any case, this entire sub is will likely never + # be needed and users using the "l:" prefix are probably + # rarer. 
+ } + $self->{xdb}->replace_document($docid, $doc); +} + +sub set_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my %keep = map { $_ => 1 } @kw; + my %add = %keep; + my @rm; + my $end = $doc->termlist_end; + for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { + $cur->skip_to('K'); + last if $cur == $end; + my $kw = $cur->get_termname; + $kw =~ s/\AK//s or next; + $keep{$kw} ? delete($add{$kw}) : push(@rm, $kw); + } + return unless (scalar(@rm) + scalar(keys %add)); + $doc->remove_term('K'.$_) for @rm; + $doc->add_boolean_term('K'.$_) for (keys %add); + $self->{xdb}->replace_document($docid, $doc); +} + +sub add_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + $doc->add_boolean_term('K'.$_) for @kw; + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my $replace; + eval { + $doc->remove_term('K'.$_); + $replace = 1 + } for @kw; + $self->{xdb}->replace_document($docid, $doc) if $replace; +} + +sub smsg_from_doc ($) { + my ($doc) = @_; + my $data = $doc->get_data or return; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); + $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); + $smsg->load_from_data($data); + $smsg; +} + sub xdb_remove { - my ($self, $oid, @removed) = @_; + my ($self, @docids) = @_; my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = eval { $xdb->get_document($num) }; - unless ($doc) { - warn "E: $@\n" if $@; - warn "E: #$num $oid missing in Xapian\n"; - next; - } - my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->load_expand($doc); - my $blob = $smsg->{blob} // '(unset)'; - if 
($blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; - } + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in in Xapian? $@\n" if $@; } } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{over}; +sub remove_by_docid { + my ($self, $num) = @_; + die "BUG: remove_by_docid is v2-only\n" if $self->{oidx}; $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); + xdb_remove($self, $num) if need_xapian($self); } sub index_git_blob_id { @@ -481,17 +582,13 @@ sub unindex_eml { my $nr = 0; my %tmp; for my $mid (@$mids) { - my @removed = eval { $self->{over}->remove_oid($oid, $mid) }; - if ($@) { - warn "E: failed to remove <$mid> from overview: $@\n"; - } else { - $nr += scalar @removed; - $tmp{$_}++ for @removed; - } + my @removed = $self->{oidx}->remove_oid($oid, $mid); + $nr += scalar @removed; + $tmp{$_}++ for @removed; } if (!$nr) { - $mids = join('> <', @$mids); - warn "W: <$mids> missing for removal from overview\n"; + my $m = join('> <', @$mids); + warn "W: <$m> missing for removal from overview\n"; } while (my ($num, $nr) = each %tmp) { warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; @@ -501,7 +598,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -509,9 +606,9 @@ sub index_mm { my $mids = mids($mime); my $mm = $self->{mm}; if ($sync->{reindex}) { - my $over = $self->{over}; + my $oidx = $self->{oidx}; for my $mid (@$mids) { - my ($num, undef) = $over->num_mid0_for_oid($oid, $mid); + my ($num, undef) = $oidx->num_mid0_for_oid($oid, $mid); return $num if defined $num; } $mm->num_for($mids->[0]) // $mm->mid_insert($mids->[0]); @@ -532,34 +629,63 @@ sub crlf_adjust ($) { } } 
+sub is_bad_blob ($$$$) { + my ($oid, $type, $size, $expect_oid) = @_; + if ($type ne 'blob') { + carp "W: $expect_oid is not a blob (type=$type)"; + return 1; + } + croak "BUG: $oid != $expect_oid" if $oid ne $expect_oid; + $size == 0 ? 1 : 0; # size == 0 means purged +} + sub index_both { # git->cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; $size += crlf_adjust($$bref); my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; + my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + ${$sync->{latest_cmt}} = $cur_cmt; } sub unindex_both { # git->cat_async callback - my ($bref, $oid, $type, $size, $self) = @_; + my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); + my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); + # may be undef if leftover + if (defined(my $cur_cmt = $sync->{cur_cmt})) { + ${$sync->{latest_cmt}} = $cur_cmt; + } + ++$self->{nidx}; +} + +sub with_umask { + my $self = shift; + ($self->{ibx} // $self->{eidx})->with_umask(@_); } # called by public-inbox-index sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; - $self->{ibx}->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + $self->with_umask(\&_index_sync, $self, $opt); + if ($opt->{reindex} && !$opt->{quit}) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } 
@@ -575,46 +701,44 @@ sub check_size { # check_async cb for -index --max-size=... sub v1_checkpoint ($$;$) { my ($self, $sync, $stk) = @_; - $self->{ibx}->git->check_async_wait; - $self->{ibx}->git->cat_async_wait; + $self->{ibx}->git->async_wait_all; - # latest_cmt may be undef - my $newest = $stk ? $stk->{latest_cmt} : undef; - if ($newest) { + # $newest may be undef + my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + if (defined($newest)) { my $cur = $self->{mm}->last_commit || ''; if (need_update($self, $cur, $newest)) { $self->{mm}->last_commit($newest); } - } else { - ${$sync->{max}} = $self->{batch_bytes}; } + ${$sync->{max}} = $self->{batch_bytes}; $self->{mm}->{dbh}->commit; - if ($newest && need_xapian($self)) { - my $xdb = $self->{xdb}; + my $xdb = need_xapian($self) ? $self->{xdb} : undef; + if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } - + } + if ($stk) { # all done if $stk is passed # let SearchView know a full --reindex was done so it can # generate ->has_threadid-dependent links - if ($sync->{reindex} && !ref($sync->{reindex})) { + if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) { my $n = $xdb->get_metadata('has_threadid'); $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } + $self->{oidx}->rethread_done($sync->{-opt}); # all done } - - $self->{over}->rethread_done($sync->{-opt}) if $newest; # all done commit_txn_lazy($self); - $self->{ibx}->git->cleanup; + $sync->{ibx}->git->cleanup; my $nr = ${$sync->{nr}}; idx_release($self, $nr); # let another process do some work... 
if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -623,27 +747,32 @@ sub v1_checkpoint ($$;$) { # only for v1 sub process_stack { my ($self, $sync, $stk) = @_; - my $git = $self->{ibx}->git; + my $git = $sync->{ibx}->git; my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; $sync->{sidx} = $self; + $sync->{latest_cmt} = \(my $latest_cmt); $self->{mm}->{dbh}->begin_work; if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $sync); } } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { + my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid }; + last if $sync->{quit}; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; + $arg->{autime} = $at; + $arg->{cotime} = $ct; if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { @@ -651,17 +780,17 @@ sub process_stack { } v1_checkpoint($self, $sync) if $max <= 0; } elsif ($f eq 'd') { - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? 
undef : $stk); } -sub log2stack ($$$$) { - my ($sync, $git, $range, $ibx) = @_; +sub log2stack ($$$) { + my ($sync, $git, $range) = @_; my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise) my ($add, $del); - if ($ibx->version == 1) { + if ($sync->{ibx}->version == 1) { my $path = $hex.'{2}/'.$hex.'{38}'; $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!; $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!; @@ -675,17 +804,18 @@ sub log2stack ($$$$) { my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H --no-notes --no-color --no-renames --no-abbrev), $range); - my ($at, $ct, $stk); + my ($at, $ct, $stk, $cmt); while (<$fh>) { + return if $sync->{quit}; if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); + ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3); + $stk //= PublicInbox::IdxStack->new($cmt); } elsif (/$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: - $stk->push_rec('d', $at, $ct, $oid); + $stk->push_rec('d', $at, $ct, $oid, $cmt); } } elsif (/$add/) { my $oid = $1; @@ -693,12 +823,10 @@ sub log2stack ($$$$) { my $oid_bin = pack('H*', $oid); my $nr = --$D->{$oid_bin}; delete($D->{$oid_bin}) if $nr <= 0; - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); + next if $nr >= 0; } + $stk->push_rec('m', $at, $ct, $oid, $cmt); } } close $fh or die "git log failed: \$?=$?"; @@ -706,9 +834,9 @@ sub log2stack ($$$$) { $stk->read_prepare; } -sub prepare_stack ($$$) { - my ($self, $sync, $range) = @_; - my $git = $self->{ibx}->git; +sub prepare_stack ($$) { + my ($sync, $range) = @_; + my $git = $sync->{ibx}->git; if (index($range, '..') < 0) { # don't show annoying git errors to users who run -index @@ -717,7 +845,7 @@ sub prepare_stack ($$$) { return PublicInbox::IdxStack->new->read_prepare if $?; } $sync->{D} = $sync->{reindex} ? 
{} : undef; # OID_BIN => NR - log2stack($sync, $git, $range, $self->{ibx}); + log2stack($sync, $git, $range); } # --is-ancestor requires git 1.8.0+ @@ -765,17 +893,32 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? $reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; my $tip = $opt->{ref} || 'HEAD'; - my $git = $self->{ibx}->git; + my $ibx = $self->{ibx}; + local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $git->batch_prepare; + $ibx->git->batch_prepare; my $pr = $opt->{-progress}; - my $sync = { reindex => $opt->{reindex}, -opt => $opt }; + my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; - $self->{over}->rethread_prepare($opt); + $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); if ($sync->{reindex}) { my $last = $mm->last_commit; @@ -791,10 +934,10 @@ sub _index_sync { my $lx = reindex_from($sync->{reindex}, $last_commit); my $range = $lx eq '' ? $tip : "$lx..$tip"; $pr->("counting changes\n\t$range ... ") if $pr; - my $stk = prepare_stack($self, $sync, $range); + my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? 
$stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -806,7 +949,7 @@ sub DESTROY { sub _begin_txn { my ($self) = @_; my $xdb = $self->{xdb} || idx_acquire($self); - $self->{over}->begin_lazy if $self->{over}; + $self->{oidx}->begin_lazy if $self->{oidx}; $xdb->begin_transaction if $xdb; $self->{txn} = 1; $xdb; @@ -814,7 +957,7 @@ sub _begin_txn { sub begin_txn_lazy { my ($self) = @_; - $self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn}; + $self->with_umask(\&_begin_txn, $self) if !$self->{txn}; } # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) @@ -842,17 +985,21 @@ sub set_metadata_once { sub _commit_txn { my ($self) = @_; + if (my $eidx = $self->{eidx}) { + $eidx->git->async_wait_all; + $eidx->{transact_bytes} = 0; + } if (my $xdb = $self->{xdb}) { set_metadata_once($self); $xdb->commit_transaction; } - $self->{over}->commit_lazy if $self->{over}; + $self->{oidx}->commit_lazy if $self->{oidx}; } sub commit_txn_lazy { my ($self) = @_; delete($self->{txn}) and - $self->{ibx}->with_umask(\&_commit_txn, $self); + $self->with_umask(\&_commit_txn, $self); } sub worker_done { @@ -863,4 +1010,39 @@ sub worker_done { die "$$ $0 still in transaction\n" if $self->{txn}; } +sub eidx_shard_new { + my ($class, $eidx, $shard) = @_; + my $self = bless { + eidx => $eidx, + xpfx => $eidx->{xpfx}, + indexlevel => $eidx->{indexlevel}, + -skip_docdata => 1, + shard => $shard, + creat => 1, + }, $class; + $self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium'; + $self; +} + +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? 
+ + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); + } +} + 1;