From 017fed7bc4d33ac474a19356994be5bd0bfe68ba Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Wed, 18 Apr 2018 09:13:13 +0000 Subject: [PATCH] searchidx: regenerate and avoid article number gaps on full index Some messages to git@vger went missing from Msgmap from old bugs and became inaccessible via NNTP. Forcing NNTP article numbers when the overview DB came about made the problem more visible when reindexing old (v1) repositories as all removed spam messages took up AUTOINCREMENT numbers again before they were removed. Having large gaps in NNTP article numbers is not good since it throws off NNTP clients. This does NOT prevent NNTP clients from seeing some messages twice, but is better than having them miss several messages entirely. We also avoid depending on --reverse in git-log, as git requires storing an entire commit list in memory for --reverse, so it's cheaper to store only deleted blobs in the %D hash since they do not live long. --- MANIFEST | 1 + lib/PublicInbox/Msgmap.pm | 8 +- lib/PublicInbox/SearchIdx.pm | 186 +++++++++++++++++++++------------- lib/PublicInbox/V2Writable.pm | 15 +-- t/v1-add-remove-add.t | 45 ++++++++ 5 files changed, 171 insertions(+), 84 deletions(-) create mode 100644 t/v1-add-remove-add.t diff --git a/MANIFEST b/MANIFEST index 00a0970b..8038ad48 100644 --- a/MANIFEST +++ b/MANIFEST @@ -192,6 +192,7 @@ t/spawn.t t/thread-cycle.t t/time.t t/utf8.mbox +t/v1-add-remove-add.t t/v2-add-remove-add.t t/v2mda.t t/v2mirror.t diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index feef8ba7..3237a5ed 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -108,10 +108,10 @@ sub created_at { sub mid_insert { my ($self, $mid) = @_; my $dbh = $self->{dbh}; - my $sql = 'INSERT OR IGNORE INTO msgmap (mid) VALUES (?)'; - my $sth = $self->{mid_insert} ||= $dbh->prepare($sql); - $sth->bind_param(1, $mid); - return if $sth->execute == 0; + my $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO msgmap (mid) VALUES (?) + + return if $sth->execute($mid) == 0; $dbh->last_insert_id(undef, undef, 'msgmap', 'num'); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 0b1dc219..2239c902 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -17,6 +17,7 @@ use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); use PublicInbox::OverIdx; +use PublicInbox::Spawn qw(spawn); require PublicInbox::Git; use Compress::Zlib qw(compress); @@ -266,8 +267,8 @@ sub add_message { my $mids = mids($mime->header_obj); $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility unless (defined $num) { # v1 - my $mm = $self->_msgmap_init; - $num = $mm->mid_insert($mid0) || $mm->num_for($mid0); + $self->_msgmap_init; + $num = index_mm($self, $mime); } eval { my $smsg = PublicInbox::SearchMsg->new($mime); @@ -464,8 +465,28 @@ sub index_mm { my ($self, $mime) = @_; my $mid = mid_clean(mid_mime($mime)); my $mm = $self->{mm}; - my $num = $mm->mid_insert($mid); - return $num if defined $num; + my $num; + + if (defined $self->{regen_down}) { + $num = $mm->num_for($mid) and return $num; + + while (($num = $self->{regen_down}--) > 0) { + if ($mm->mid_set($num, $mid) != 0) { + return $num; + } + } + } elsif (defined $self->{regen_up}) { + $num = $mm->num_for($mid) and return $num; + + # this is to fixup old bugs due to add-remove-add + while (($num = ++$self->{regen_up})) { + if ($mm->mid_set($num, $mid) != 0) { + return $num; + } + } + } + + $num = $mm->mid_insert($mid) and return $num; # fallback to num_for since filters like RubyLang set the number $mm->num_for($mid); @@ -476,18 +497,6 @@ sub unindex_mm { $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); } -sub index_mm2 { - my ($self, $mime, $bytes, $blob) = @_; - my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - add_message($self, $mime, $bytes, $num, $blob); -} - -sub unindex_mm2 { - my ($self, $mime) = @_; - $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); - unindex_blob($self, $mime); -} - sub index_both { my ($self, $mime, $bytes, $blob) = @_; my $num = index_mm($self, $mime); @@ -521,12 +530,12 @@ sub batch_adjust ($$$$) { $$max -= $bytes; if ($$max <= 0) { $$max = BATCH_BYTES; - $batch_cb->($latest, 1); + $batch_cb->($latest); } } # only for v1 -sub rlog { +sub read_log { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; @@ -537,23 +546,39 @@ sub rlog { my $bytes; my $max = BATCH_BYTES; local $/ = "\n"; + my %D; my $line; + my $newest; + my $mid = '20170114215743.5igbjup6qpsh3jfg@genre.crustytoothpaste.net'; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; + delete $D{$blob} and next; my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mids = mids($mime->header_obj); + foreach (@$mids) { + warn "ADD $mid\n" if ($_ eq $mid); + } batch_adjust(\$max, $bytes, $batch_cb, $latest); $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - my $mime = do_cat_mail($git, $blob, \$bytes) or next; - batch_adjust(\$max, $bytes, $batch_cb, $latest); - $del_cb->($self, $mime); + $D{$blob} = 1; } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; + $newest ||= $latest; + } + } + # get the leftovers + foreach my $blob (keys %D) { + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mids = mids($mime->header_obj); + foreach (@$mids) { + warn "DEL $mid\n" if ($_ eq $mid); } + $del_cb->($self, $mime); } - $batch_cb->($latest, 0); + $batch_cb->($latest, $newest); } sub _msgmap_init { @@ -567,18 +592,62 @@ sub _msgmap_init { sub _git_log { my ($self, $range) = @_; - $self->{git}->popen(qw/log --reverse --no-notes --no-color + my $git = $self->{git}; + + if (index($range, '..') < 0) { + my $regen_max = 0; + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $range); + ++$regen_max while <$fh>; + my (undef, $max) = $self->{mm}->minmax; + + if ($max && $max == $regen_max) { + # fix up old bugs in full indexes which caused messages to + # not appear in Msgmap + $self->{regen_up} = $max; + } else { + # normal regen is for for fresh data + $self->{regen_down} = $regen_max; + } + } + + $git->popen(qw/log --no-notes --no-color --no-renames --raw -r --no-abbrev/, $range); } +sub is_ancestor ($$$) { + my ($git, $cur, $tip) = @_; + return 0 unless $git->check($cur); + my $cmd = [ 'git', "--git-dir=$git->{git_dir}", + qw(merge-base --is-ancestor), $cur, $tip ]; + my $pid = spawn($cmd); + defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; + waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; + $? == 0; +} + +sub need_update ($$$) { + my ($self, $cur, $new) = @_; + my $git = $self->{git}; + return 1 if $cur && !is_ancestor($git, $cur, $new); + my $range = $cur eq '' ? $new : "$cur..$new"; + chomp(my $n = $git->qx(qw(rev-list --count), $range)); + ($n eq '' || $n > 0); +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; my $reindex = $opts->{reindex}; my ($mkey, $last_commit, $lx, $xlog); - $self->{git}->batch_prepare; + my $git = $self->{git}; + $git->batch_prepare; + my $xdb = $self->begin_txn_lazy; + my $mm = _msgmap_init($self); do { $xlog = undef; $mkey = 'last_commit'; @@ -588,73 +657,54 @@ sub _index_sync { $lx = ''; $mkey = undef if $last_commit ne ''; } + + # use last_commit from msgmap if it is older or unset + my $lm = $mm->last_commit || ''; + if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) { + $lx = $lm; + } + $self->{over}->rollback_lazy; $self->{over}->disconnect; delete $self->{txn}; $xdb->cancel_transaction; $xdb = _xdb_release($self); - # ensure we leak no FDs to "git log" + # ensure we leak no FDs to "git log" with Xapian <= 1.2 my $range = $lx eq '' ? $tip : "$lx..$tip"; $xlog = _git_log($self, $range); $xdb = $self->begin_txn_lazy; } while ($xdb->get_metadata('last_commit') ne $last_commit); - my $mm = _msgmap_init($self); my $dbh = $mm->{dbh} if $mm; - my $mm_only; my $cb = sub { - my ($commit, $more) = @_; + my ($commit, $newest) = @_; if ($dbh) { - $mm->last_commit($commit) if $commit; + if ($newest) { + my $cur = $mm->last_commit || ''; + if (need_update($self, $cur, $newest)) { + $mm->last_commit($newest); + } + } $dbh->commit; } - if (!$mm_only) { - $xdb->set_metadata($mkey, $commit) if $mkey && $commit; - $self->commit_txn_lazy; + if ($mkey && $newest) { + my $cur = $xdb->get_metadata($mkey); + if (need_update($self, $cur, $newest)) { + $xdb->set_metadata($mkey, $newest); + } } + $self->commit_txn_lazy; # let another process do some work... < - if ($more) { - if (!$mm_only) { - $xdb = $self->begin_txn_lazy; - } + if (!$newest) { + $xdb = $self->begin_txn_lazy; $dbh->begin_work if $dbh; } }; - if ($mm) { - $dbh->begin_work; - my $lm = $mm->last_commit || ''; - if ($lm eq $lx) { - # Common case is the indexes are synced, - # we only need to run git-log once: - rlog($self, $xlog, *index_both, *unindex_both, $cb); - } else { - # Uncommon case, msgmap and xapian are out-of-sync - # do not care for performance (but git is fast :>) - # This happens if we have to reindex Xapian since - # msgmap is a frozen format and our Xapian format - # is evolving. - my $r = $lm eq '' ? $tip : "$lm..$tip"; - - # first, ensure msgmap is up-to-date: - my $mkey_prev = $mkey; - $mkey = undef; # ignore xapian, for now - my $mlog = _git_log($self, $r); - $mm_only = 1; - rlog($self, $mlog, *index_mm, *unindex_mm, $cb); - $mm_only = $mlog = undef; - - # now deal with Xapian - $mkey = $mkey_prev; - $dbh = undef; - rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); - } - } else { - # user didn't install DBD::SQLite and DBI - rlog($self, $xlog, *add_message, *unindex_blob, $cb); - } + $dbh->begin_work; + read_log($self, $xlog, *index_both, *unindex_both, $cb); } sub DESTROY { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index e9fd502e..2cc87305 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -15,7 +15,8 @@ use PublicInbox::ContentId qw(content_id content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; use PublicInbox::Msgmap; -use PublicInbox::Spawn; +use PublicInbox::Spawn qw(spawn); +use PublicInbox::SearchIdx; use IO::Handle; # an estimate of the post-packed size to the raw uncompressed size @@ -561,7 +562,6 @@ sub import_init { sub diff ($$$) { my ($mid, $cur, $new) = @_; use File::Temp qw(tempfile); - use PublicInbox::Spawn qw(spawn); my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1); print $ah $cur->as_string or die "print: $!"; @@ -738,16 +738,7 @@ sub last_commits { $heads; } -sub is_ancestor ($$$) { - my ($git, $cur, $tip) = @_; - return 0 unless $git->check($cur); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(merge-base --is-ancestor), $cur, $tip ]; - my $pid = spawn($cmd); - defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0; -} +*is_ancestor = *PublicInbox::SearchIdx::is_ancestor; sub index_prepare { my ($self, $opts, $epoch_max, $ranges) = @_; diff --git a/t/v1-add-remove-add.t b/t/v1-add-remove-add.t new file mode 100644 index 00000000..cd6e2811 --- /dev/null +++ b/t/v1-add-remove-add.t @@ -0,0 +1,45 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::Import; +use PublicInbox::SearchIdx; +use File::Temp qw/tempdir/; + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v1-add-remove-add.t" if $@; +} +my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1); +is(system(qw(git init --bare), $mainrepo), 0); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-add-remove-add', + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + 'Message-ID' => '', + ], + body => "hello world\n", +); +my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx); +ok($im->add($mime), 'message added'); +ok($im->remove($mime), 'message added'); +ok($im->add($mime), 'message added again'); +$im->done; +my $rw = PublicInbox::SearchIdx->new($ibx, 1); +$rw->index_sync; +my $msgs = $ibx->recent({limit => 10}); +is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @$msgs, 1, 'only one message in history'); +is($ibx->mm->num_for('a-mid@b'), 2, 'exists with second article number'); + +done_testing(); -- 2.44.0