X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=0124dd11b254f7df5abaaa490cad929e345ada6b;hb=0d38f65c490466837ae091afa7a7b6f59d04ce7c;hp=6ff2cf947189617729c18194a21d1e23a6ed64c5;hpb=355c345f8eb8ac3edd5545a2548c90bd8b32d66e;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 6ff2cf94..0124dd11 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,9 +22,10 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -154,7 +155,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } @@ -444,20 +445,20 @@ sub add_message { $smsg->{num}; } -sub _get_doc ($$$) { - my ($self, $docid, $oid) = @_; +sub _get_doc ($$) { + my ($self, $docid) = @_; my $doc = eval { $self->{xdb}->get_document($docid) }; $doc // do { warn "E: $@\n" if $@; - warn "E: #$docid $oid missing in Xapian\n"; + warn "E: #$docid missing in Xapian\n"; undef; } } sub add_eidx_info { - my ($self, $docid, $oid, $eidx_key, $eml) = @_; + my ($self, $docid, $eidx_key, $eml) = @_; begin_txn_lazy($self); - my $doc = _get_doc($self, $docid, $oid) or return; + my $doc = _get_doc($self, $docid) or return; term_generator($self)->set_document($doc); $doc->add_boolean_term('O'.$eidx_key); index_list_id($self, $doc, $eml); @@ -465,12 +466,12 @@ sub add_eidx_info { } sub remove_eidx_info { - my ($self, $docid, $oid, $eidx_key, $eml) = @_; + my ($self, $docid, $eidx_key, $eml) = @_; begin_txn_lazy($self); - my $doc = _get_doc($self, $docid, $oid) or return; + my $doc = _get_doc($self, $docid) or return; eval { $doc->remove_term('O'.$eidx_key) }; warn "W: ->remove_term O$eidx_key: $@\n" if $@; - for my $l ($eml->header_raw('List-Id')) { + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; eval { $doc->remove_term('G' . $lid) }; @@ -511,25 +512,19 @@ sub smsg_from_doc ($) { } sub xdb_remove { - my ($self, $oid, @removed) = @_; + my ($self, @docids) = @_; my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = _get_doc($self, $num, $oid) or next; - my $smsg = smsg_from_doc($doc); - my $blob = $smsg->{blob}; # may be undef if --skip-docdata - if (!defined($blob) || $blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; - } + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in in Xapian? $@\n" if $@; } } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{oidx}; +sub remove_by_docid { + my ($self, $num) = @_; + die "BUG: remove_by_docid is v2-only\n" if $self->{oidx}; $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); + xdb_remove($self, $num) if need_xapian($self); } sub index_git_blob_id { @@ -565,7 +560,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -631,10 +626,11 @@ sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; $self->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + if ($opt->{reindex} && !$opt->{quit}) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } @@ -687,7 +683,7 @@ sub v1_checkpoint ($$;$) { if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -708,6 +704,7 @@ sub process_stack { if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); $git->cat_async($oid, \&unindex_both, $sync); } @@ -717,6 +714,7 @@ sub process_stack { } while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { my $arg = { %$sync, cur_cmt => $cur_cmt }; + last if $sync->{quit}; if ($f eq 'm') { $arg->{autime} = $at; $arg->{cotime} = $ct; @@ -730,7 +728,7 @@ sub process_stack { $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? undef : $stk); } sub log2stack ($$$) { @@ -840,6 +838,16 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? $reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; @@ -849,6 +857,10 @@ sub _index_sync { $ibx->git->batch_prepare; my $pr = $opt->{-progress}; my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); @@ -869,7 +881,7 @@ sub _index_sync { my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? $stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -952,4 +964,25 @@ sub eidx_shard_new { $self; } +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? + + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); + } +} + 1;