From 4c5fa84eec85cf406cbcd900f92dc478ed14ea30 Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Thu, 22 Mar 2018 03:39:30 +0000 Subject: [PATCH] v2writable: add NNTP article number regeneration support Allow best-effort regeneration of NNTP article numbers from cloned git repositories in addition to indexing Xapian. Article numbers will not remain consistent when we add purge support, though. --- MANIFEST | 1 + lib/PublicInbox/V2Writable.pm | 61 +++++++++++++++++++--- script/public-inbox-index | 35 +++++++++++-- t/v2reindex.t | 98 +++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 10 deletions(-) create mode 100644 t/v2reindex.t diff --git a/MANIFEST b/MANIFEST index 567148a4..0f889959 100644 --- a/MANIFEST +++ b/MANIFEST @@ -180,6 +180,7 @@ t/spawn.t t/thread-all.t t/thread-cycle.t t/utf8.mbox +t/v2reindex.t t/v2writable.t t/view.t t/watch_maildir.t diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 550a74d4..605f6887 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -494,7 +494,7 @@ sub mark_deleted { } sub reindex_oid { - my ($self, $mm_tmp, $D, $git, $oid) = @_; + my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_; my $len; my $msgref = $git->cat_file($oid, \$len); my $mime = PublicInbox::MIME->new($$msgref); @@ -514,8 +514,27 @@ sub reindex_oid { $num = $n; } } + if (!defined($mid0) && $regen && !$del) { + $num = $$regen--; + die "BUG: ran out of article numbers\n" if $num <= 0; + my $mm = $self->{skel}->{mm}; + foreach my $mid (@$mids) { + if ($mm->mid_set($num, $mid) == 1) { + $mid0 = $mid; + last; + } + } + if (!defined($mid0)) { + my $id = '<' . join('> <', @$mids) . '>'; + warn "Message-Id $id unusable for $num\n"; + } + } + if (!defined($mid0) || $del) { - return if (!defined($mid0) && $del); # expected for deletes + if (!defined($mid0) && $del) { # expected for deletes + $$regen--; + return + } my $id = '<' . join('> <', @$mids) . 
'>'; defined($mid0) or @@ -546,19 +565,45 @@ sub reindex_oid { } sub reindex { - my ($self) = @_; + my ($self, $regen) = @_; my $ibx = $self->{-inbox}; my $pfx = "$ibx->{mainrepo}/git"; my $max_git; my $latest = git_dir_latest($self, \$max_git); return unless defined $latest; - my @cmd = qw(log --raw -r --pretty=tformat:%h - --no-notes --no-color --no-abbrev); my $head = $ibx->{ref_head} || 'refs/heads/master'; $self->idx_init; # acquire lock my $x40 = qr/[a-f0-9]{40}/; my $mm_tmp = $self->{skel}->{mm}->tmp_clone; + if (!$regen) { + my (undef, $max) = $mm_tmp->minmax; + unless (defined $max) { + $regen = 1; + warn +"empty msgmap.sqlite3, regenerating article numbers\n"; + } + } + my $tip; # latest commit out of all git repos + if ($regen) { + my $regen_max = 0; + for (my $cur = $max_git; $cur >= 0; $cur--) { + die "already reindexing!\n" if $self->{reindex_pipe}; + my $git = PublicInbox::Git->new("$pfx/$cur.git"); + chomp($tip = $git->qx('rev-parse', $head)) unless $tip; + my $h = $cur == $max_git ? $tip : $head; + my @count = ('rev-list', '--count', $h, '--', 'm'); + $regen_max += $git->qx(@count); + } + die "No messages found in $pfx/*.git, bug?\n" unless $regen_max; + $regen = \$regen_max; + } my $D = {}; + my @cmd = qw(log --raw -r --pretty=tformat:%h + --no-notes --no-color --no-abbrev); + + # if we are regenerating, we must not use a newer tip commit than what + # the regeneration counter used: + $tip ||= $head; # work backwards through history for (my $cur = $max_git; $cur >= 0; $cur--) { @@ -566,12 +611,14 @@ sub reindex { my $cmt; my $git_dir = "$pfx/$cur.git"; my $git = PublicInbox::Git->new($git_dir); - my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $head); + my $h = $cur == $max_git ? 
$tip : $head; + my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $h); while (<$fh>) { if (/\A$x40$/o) { chomp($cmt = $_); } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { - $self->reindex_oid($mm_tmp, $D, $git, $1); + $self->reindex_oid($mm_tmp, $D, $git, $1, + $regen); } elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) { $self->mark_deleted($D, $git, $1); } diff --git a/script/public-inbox-index b/script/public-inbox-index index cea35738..52d6ba70 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -23,8 +23,15 @@ if ($@) { } my $reindex; -my %opts = ( '--reindex' => \$reindex ); +my $regen; +my $jobs = undef; +my %opts = ( + '--reindex' => \$reindex, + '--regenerate' => \$regen, + '--jobs|j=i' => \$jobs, +); GetOptions(%opts) or die "bad command-line args\n$usage"; +die "--jobs must be positive\n" if defined $jobs && $jobs <= 0; my @dirs; @@ -93,8 +100,30 @@ sub index_dir { if (ref($repo) && ($repo->{version} || 1) == 2) { eval { require PublicInbox::V2Writable }; die "v2 requirements not met: $@\n" if $@; - my $v2w = PublicInbox::V2Writable->new($repo); - $v2w->reindex; + my $v2w = eval { + local $ENV{NPROC} = $jobs; + PublicInbox::V2Writable->new($repo); + }; + if (defined $jobs) { + if ($jobs == 1) { + $v2w->{parallel} = 0; + } else { + my $n = $v2w->{partitions}; + if ($jobs != $n) { + warn +"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n"; + } + } + } + my $mm = $repo->mm; + my (undef, $max) = $mm->minmax if $mm; + if (defined($max) && !$reindex && !$regen) { + die +"v2 inboxes may only use --reindex and/or --regenerate once\n". 
+"msgmap.sqlite3 is initialized\n"; + } + + $v2w->reindex($regen); $v2w->done; } else { my $s = PublicInbox::SearchIdx->new($repo, 1); diff --git a/t/v2reindex.t b/t/v2reindex.t new file mode 100644 index 00000000..b9540e4a --- /dev/null +++ b/t/v2reindex.t @@ -0,0 +1,98 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::ContentId qw(content_digest); +use File::Temp qw/tempdir/; +use File::Path qw(remove_tree); + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2reindex.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-v2reindex-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); + +my $im = PublicInbox::V2Writable->new($ibx, 1); +$im->{parallel} = 0; +foreach my $i (1..10) { + $mime->header_set('Message-Id', "<$i\@example.com>"); + ok($im->add($mime), "message $i added"); + if ($i == 4) { + $im->remove($mime); + } +} + +if ('test remove later') { + $mime->header_set('Message-Id', "<5\@example.com>"); + $im->remove($mime); +} + +$im->done; +my $minmax = [ $ibx->mm->minmax ]; +ok(defined $minmax->[0] && defined $minmax->[1], 'minmax defined'); + +eval { $im->reindex }; +is($@, '', 'no error from reindexing'); +$im->done; + +my $xap = "$mainrepo/xap".PublicInbox::Search::SCHEMA_VERSION(); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed'); +eval { $im->reindex }; +is($@, '', 'no error from reindexing'); +$im->done; +ok(-d $xap, 'Xapian directories recreated'); + +delete $ibx->{mm}; 
+is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); + +ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap'); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed again'); +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + eval { $im->reindex }; + is($@, '', 'no error from reindexing without msgmap'); + like(join(' ', @warn), qr/regenerat/, 'warned about regenerating'); + $im->done; + ok(-d $xap, 'Xapian directories recreated'); + delete $ibx->{mm}; + is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); +} + +ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap'); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed again'); +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + eval { $im->reindex(my $regen = 1) }; + is($@, '', 'no error from reindexing without msgmap'); + is_deeply(\@warn, [], 'no warnings'); + $im->done; + ok(-d $xap, 'Xapian directories recreated'); + delete $ibx->{mm}; + is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); +} + +done_testing(); -- 2.44.0