# Examine $@ after an eval around Xapian read operations.
# Returns 1 if the pending error is a Xapian DatabaseModifiedError,
# after calling $src->reopen so the caller's do/while loop may retry.
# Re-raises ($@ propagation via bare die) any other pending error.
# Returns 0 when no error is pending.
sub cpdb_retryable ($$) {
	my ($src, $pfx) = @_;
	my $err_type = ref($@);
	if ($err_type =~ /\bDatabaseModifiedError\b/) {
		warn "$pfx Xapian DB modified, reopening and retrying\n";
		$src->reopen;
		return 1;
	}
	if ($@) {
		warn "$pfx E: ", $err_type, "\n";
		die; # propagate $@ to the caller
	}
	return 0;
}
+
# Derive a short progress-output prefix from a work-in-progress dir path.
sub progress_pfx ($) {
	my ($wip) = @_; # tempdir v2: ([0-9])+-XXXXXXXX
	my @parts = split('/', $wip);
	my $base = $parts[-1];

	# return "xap15/0" for v2, or "xapian15" for v1:
	if ($base =~ /\A([0-9]+)/) {
		return "$parts[-2]/$1";
	}
	return $base;
}
+
# xapian-compact(1) wrapper.  $args is [ $src_dir, $newdir ] where
# $newdir is either a dirname string or an object with a ->dirname
# method (e.g. File::Temp).  Recognized $opt keys: 0/1/2 (fd redirects
# for the child), -progress (callback), -progress_pfx, and the
# xapian-compact switches no-full, fuller, blocksize.
sub compact ($$) {
	my ($args, $opt) = @_;
	my ($src, $newdir) = @$args;
	my $dst = ref($newdir) ? $newdir->dirname : $newdir;
	my ($r, $w);
	my $pfx = $opt->{-progress_pfx} ||= progress_pfx($src);
	my $pr = $opt->{-progress};
	my $rdr = {};

	# pass through any caller-requested stdio redirects:
	foreach my $fd (0..2) {
		defined(my $dfd = $opt->{$fd}) or next;
		$rdr->{$fd} = $dfd;
	}
	if ($pr) {
		# fail fast here; the old code ignored pipe() failure and
		# died confusingly at `close $w' after spawning the child
		pipe($r, $w) or die "pipe: $!";
		$rdr->{1} = $w; # capture child stdout for progress relay
	}

	# we rely on --no-renumber to keep docids synched to NNTP
	my $cmd = [ $XAPIAN_COMPACT, '--no-renumber' ];
	for my $sw (qw(no-full fuller)) {
		push @$cmd, "--$sw" if $opt->{$sw};
	}
	for my $sw (qw(blocksize)) {
		defined(my $v = $opt->{$sw}) or next;
		push @$cmd, "--$sw", $v;
	}
	$pr->("$pfx `".join(' ', @$cmd)."'\n") if $pr;
	push @$cmd, $src, $dst;
	my $pid = spawn($cmd, undef, $rdr);
	if ($pr) {
		close $w or die "close: \$w: $!";
		# stream the child's output as it arrives; the old
		# `foreach (<$r>)' slurped until EOF, so progress was
		# only displayed after xapian-compact exited
		while (defined(my $line = <$r>)) {
			$line =~ s/\r/\r$pfx /g;
			$pr->("$pfx $line");
		}
	}
	my $rp = waitpid($pid, 0);
	if ($? || $rp != $pid) {
		die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$rp)\n";
	}
}
+
# Copy every document from Xapian DB $src into writable DB $dst,
# preserving docids, and retrying via cpdb_retryable() whenever the
# source DB is modified underneath us.  With $cur_shard/$reshard
# defined, only docids satisfying (docid % $reshard == $cur_shard)
# are copied (M:N reshard).  $pr_data (optional) is a hashref of
# progress state: pr (callback), fmt (sprintf format), nr (running
# count), pfx (message prefix).
sub cpdb_loop ($$$;$$) {
	my ($src, $dst, $pr_data, $cur_shard, $reshard) = @_;
	my ($pr, $fmt, $nr, $pfx);
	if ($pr_data) {
		$pr = $pr_data->{pr};
		$fmt = $pr_data->{fmt};
		# keep a reference so the count survives retries and is
		# visible to the caller afterwards:
		$nr = \($pr_data->{nr});
		$pfx = $pr_data->{pfx};
	}

	# acquire postlist iterators over all docids (empty term = all),
	# retrying if the source DB changes while we do so:
	my ($it, $end);
	do {
		eval {
			$it = $src->postlist_begin('');
			$end = $src->postlist_end('');
		};
	} while (cpdb_retryable($src, $pfx));

	do {
		eval {
			# NOTE(review): on retry after ->reopen, $it/$end
			# are not re-acquired here; assumed valid under
			# Xapian's reopen semantics -- confirm upstream
			for (; $it != $end; $it++) {
				my $docid = $it->get_docid;
				if (defined $reshard) {
					# skip docs belonging to other shards
					my $dst_shard = $docid % $reshard;
					next if $dst_shard != $cur_shard;
				}
				my $doc = $src->get_document($docid);
				$dst->replace_document($docid, $doc);
				# report progress every 1024 documents
				if ($pr_data && !(++$$nr & 1023)) {
					$pr->(sprintf($fmt, $$nr));
				}
			}

			# unlike copydatabase(1), we don't copy spelling
			# and synonym data (or other user metadata) since
			# the Perl APIs don't expose iterators for them
			# (and public-inbox does not use those features)
		};
	} while (cpdb_retryable($src, $pfx));
}
+
# Copy the Xapian DB(s) in $old into the directory named by $newdir
# (an object with ->dirname), optionally resharding ($opt->{reshard},
# with $old as an arrayref of source shard dirs) and/or compacting
# ($opt->{compact}) the result.
# Like copydatabase(1), this is horribly slow; and it doesn't seem due
# to the overhead of Perl.
sub cpdb ($$) {
	my ($args, $opt) = @_;
	my ($old, $newdir) = @$args;
	my $new = $newdir->dirname;
	my ($src, $cur_shard);
	my $reshard;
	PublicInbox::SearchIdx::load_xapian_writable() or die;
	my $XapianDatabase = $PublicInbox::Search::X{Database};
	if (ref($old) eq 'ARRAY') {
		# the destination shard number comes from the new path
		# (e.g. ".../xap15/2"):
		($cur_shard) = ($new =~ m!xap[0-9]+/([0-9]+)\b!);
		defined $cur_shard or
			die "BUG: could not extract shard # from $new";
		$reshard = $opt->{reshard};
		defined $reshard or die 'BUG: got array src w/o --reshard';

		# resharding, M:N copy means have full read access
		foreach (@$old) {
			if ($src) {
				my $sub = $XapianDatabase->new($_);
				$src->add_database($sub);
			} else {
				$src = $XapianDatabase->new($_);
			}
		}
	} else {
		$src = $XapianDatabase->new($old);
	}

	my ($tmp, $ft);
	# setup_signals() may modify handlers; localize so they are
	# restored when we return:
	local %SIG = %SIG;
	if ($opt->{compact}) {
		my $dir = dirname($new);
		same_fs_or_die($dir, $new);
		$ft = File::Temp->newdir("$new.compact-XXXXXX", DIR => $dir);
		setup_signals();
		$tmp = $ft->dirname;
	} else {
		$tmp = $new;
	}

	# like copydatabase(1), be sure we don't overwrite anything in case
	# of other bugs:
	my $creat = eval($PublicInbox::Search::Xap.'::DB_CREATE()');
	die if $@;
	my $XapianWritableDatabase = $PublicInbox::Search::X{WritableDatabase};
	my $dst = $XapianWritableDatabase->new($tmp, $creat);
	my $pr = $opt->{-progress};
	my $pfx = $opt->{-progress_pfx} = progress_pfx($new);
	# declare separately from the conditional assignment:
	# `my $x = ... if COND' is documented as undefined behavior
	# (perlsyn) and can leak a stale value across calls
	my $pr_data;
	$pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr;

	do {
		eval {
			# update the only metadata key for v1:
			my $lc = $src->get_metadata('last_commit');
			$dst->set_metadata('last_commit', $lc) if $lc;

			# only the first xapian shard (0) gets 'indexlevel'
			if ($new =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\b!) {
				my $l = $src->get_metadata('indexlevel');
				if ($l eq 'medium') {
					$dst->set_metadata('indexlevel', $l);
				}
			}
			if ($pr_data) {
				my $tot = $src->get_doccount;

				# we can only estimate when resharding,
				# because removed spam causes slight imbalance
				my $est = '';
				if (defined $cur_shard && $reshard > 1) {
					$tot = int($tot/$reshard);
					$est = 'around ';
				}
				my $fmt = "$pfx % ".length($tot)."u/$tot\n";
				$pr->("$pfx copying $est$tot documents\n");
				$pr_data->{fmt} = $fmt;
				$pr_data->{total} = $tot;
			}
		};
	} while (cpdb_retryable($src, $pfx));

	if (defined $reshard) {
		# we rely on document IDs matching NNTP article number,
		# so we can't have the Xapian sharding DB support rewriting
		# document IDs.  Thus we iterate through each shard
		# individually.
		$src = undef;
		foreach (@$old) {
			my $old = $XapianDatabase->new($_);
			cpdb_loop($old, $dst, $pr_data, $cur_shard, $reshard);
		}
	} else {
		cpdb_loop($src, $dst, $pr_data);
	}

	# final progress line with the exact count copied:
	$pr->(sprintf($pr_data->{fmt}, $pr_data->{nr})) if $pr;
	return unless $opt->{compact};

	$src = $dst = undef; # flushes and closes

	# this is probably the best place to do xapian-compact
	# since $dst isn't readable by HTTP or NNTP clients, yet:
	compact([ $tmp, $new ], $opt);
	remove_tree($tmp) or die "failed to remove $tmp: $!\n";
}
+