X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=0124dd11b254f7df5abaaa490cad929e345ada6b;hb=0d38f65c490466837ae091afa7a7b6f59d04ce7c;hp=569efbb0debcdf1b7c9939b6838eda1d8819e017;hpb=4c315ed49fe8a6224264d74c490e0ee552365b2f;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 569efbb0..0124dd11 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,9 +22,10 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -154,7 +155,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } @@ -444,20 +445,20 @@ sub add_message { $smsg->{num}; } -sub _get_doc ($$$) { - my ($self, $docid, $oid) = @_; +sub _get_doc ($$) { + my ($self, $docid) = @_; my $doc = eval { $self->{xdb}->get_document($docid) }; $doc // do { warn "E: $@\n" if $@; - warn "E: #$docid $oid missing in Xapian\n"; + warn "E: #$docid missing in Xapian\n"; undef; } } sub add_eidx_info { - my ($self, $docid, $oid, $eidx_key, $eml) = @_; + my ($self, $docid, $eidx_key, $eml) = @_; begin_txn_lazy($self); - my $doc = _get_doc($self, $docid, $oid) or return; + my $doc = _get_doc($self, $docid) or return; term_generator($self)->set_document($doc); $doc->add_boolean_term('O'.$eidx_key); index_list_id($self, $doc, $eml); @@ -465,14 +466,16 @@ sub add_eidx_info { } sub remove_eidx_info { - my ($self, $docid, $oid, $eidx_key, $eml) = @_; + my ($self, $docid, $eidx_key, $eml) = @_; begin_txn_lazy($self); - my $doc = _get_doc($self, $docid, $oid) or return; - $doc->remove_term('O'.$eidx_key); - for my $l ($eml->header_raw('List-Id')) { + my $doc = _get_doc($self, $docid) or return; + eval { $doc->remove_term('O'.$eidx_key) }; + warn "W: ->remove_term O$eidx_key: $@\n" if $@; + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; - $doc->remove_term('G' . $lid); + eval { $doc->remove_term('G' . $lid) }; + warn "W: ->remove_term G$lid: $@\n" if $@; # nb: we don't remove the XL probabilistic terms # since terms may overlap if cross-posted. @@ -509,25 +512,19 @@ sub smsg_from_doc ($) { } sub xdb_remove { - my ($self, $oid, @removed) = @_; + my ($self, @docids) = @_; my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = _get_doc($self, $num, $oid) or next; - my $smsg = smsg_from_doc($doc); - my $blob = $smsg->{blob}; # may be undef if --skip-docdata - if (!defined($blob) || $blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; - } + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in in Xapian? $@\n" if $@; } } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{oidx}; +sub remove_by_docid { + my ($self, $num) = @_; + die "BUG: remove_by_docid is v2-only\n" if $self->{oidx}; $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); + xdb_remove($self, $num) if need_xapian($self); } sub index_git_blob_id { @@ -552,8 +549,8 @@ sub unindex_eml { $tmp{$_}++ for @removed; } if (!$nr) { - $mids = join('> <', @$mids); - warn "W: <$mids> missing for removal from overview\n"; + my $m = join('> <', @$mids); + warn "W: <$m> missing for removal from overview\n"; } while (my ($num, $nr) = each %tmp) { warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; @@ -563,7 +560,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -606,11 +603,17 @@ sub index_both { # git->cat_async callback $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + ${$sync->{latest_cmt}} = $cur_cmt; } sub unindex_both { # git->cat_async callback - my ($bref, $oid, $type, $size, $self) = @_; - unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); + my ($bref, $oid, $type, $size, $sync) = @_; + unindex_eml($sync->{sidx}, $oid, PublicInbox::Eml->new($bref)); + # may be undef if leftover + if (defined(my $cur_cmt = $sync->{cur_cmt})) { + ${$sync->{latest_cmt}} = $cur_cmt; + } } sub with_umask { @@ -623,10 +626,11 @@ sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; $self->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + if ($opt->{reindex} && !$opt->{quit}) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } @@ -644,34 +648,33 @@ sub v1_checkpoint ($$;$) { my ($self, $sync, $stk) = @_; $self->{ibx}->git->async_wait_all; - # latest_cmt may be undef - my $newest = $stk ? $stk->{latest_cmt} : undef; - if ($newest) { + # $newest may be undef + my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + if (defined($newest)) { my $cur = $self->{mm}->last_commit || ''; if (need_update($self, $cur, $newest)) { $self->{mm}->last_commit($newest); } - } else { - ${$sync->{max}} = $self->{batch_bytes}; } + ${$sync->{max}} = $self->{batch_bytes}; $self->{mm}->{dbh}->commit; - if ($newest && need_xapian($self)) { - my $xdb = $self->{xdb}; + my $xdb = need_xapian($self) ? $self->{xdb} : undef; + if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } - + } + if ($stk) { # all done if $stk is passed # let SearchView know a full --reindex was done so it can # generate ->has_threadid-dependent links - if ($sync->{reindex} && !ref($sync->{reindex})) { + if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) { my $n = $xdb->get_metadata('has_threadid'); $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } + $self->{oidx}->rethread_done($sync->{-opt}); # all done } - - $self->{oidx}->rethread_done($sync->{-opt}) if $newest; # all done commit_txn_lazy($self); $sync->{ibx}->git->cleanup; my $nr = ${$sync->{nr}}; @@ -680,7 +683,7 @@ sub v1_checkpoint ($$;$) { if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -695,21 +698,26 @@ sub process_stack { $sync->{nr} = \$nr; $sync->{max} = \$max; $sync->{sidx} = $self; + $sync->{latest_cmt} = \(my $latest_cmt); $self->{mm}->{dbh}->begin_work; if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $sync); } } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { + my $arg = { %$sync, cur_cmt => $cur_cmt }; + last if $sync->{quit}; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; + $arg->{autime} = $at; + $arg->{cotime} = $ct; if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { @@ -717,10 +725,10 @@ sub process_stack { } v1_checkpoint($self, $sync) if $max <= 0; } elsif ($f eq 'd') { - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? undef : $stk); } sub log2stack ($$$) { @@ -741,17 +749,18 @@ sub log2stack ($$$) { my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H --no-notes --no-color --no-renames --no-abbrev), $range); - my ($at, $ct, $stk); + my ($at, $ct, $stk, $cmt); while (<$fh>) { + return if $sync->{quit}; if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); + ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3); + $stk //= PublicInbox::IdxStack->new($cmt); } elsif (/$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: - $stk->push_rec('d', $at, $ct, $oid); + $stk->push_rec('d', $at, $ct, $oid, $cmt); } } elsif (/$add/) { my $oid = $1; @@ -759,12 +768,10 @@ sub log2stack ($$$) { my $oid_bin = pack('H*', $oid); my $nr = --$D->{$oid_bin}; delete($D->{$oid_bin}) if $nr <= 0; - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); + next if $nr >= 0; } + $stk->push_rec('m', $at, $ct, $oid, $cmt); } } close $fh or die "git log failed: \$?=$?"; @@ -831,6 +838,16 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? $reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; @@ -840,6 +857,10 @@ sub _index_sync { $ibx->git->batch_prepare; my $pr = $opt->{-progress}; my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); @@ -860,7 +881,7 @@ sub _index_sync { my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? $stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -943,4 +964,25 @@ sub eidx_shard_new { $self; } +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? + + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); + } +} + 1;