From: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org>
Date: Thu, 29 Mar 2018 20:17:19 +0000 (+0000)
Subject: public-inbox-compact: new tool for driving xapian-compact
X-Git-Tag: v1.1.0-pre1~98
X-Git-Url: http://www.git.stargrave.org/?a=commitdiff_plain;h=e5c2e2588d7ad2243afeabad67b3c951c5b66643;p=public-inbox.git

public-inbox-compact: new tool for driving xapian-compact

Having multiple Xapian partitions is mostly pointless after
the initial import.  We can compact all the partitions into
one while keeping the skeleton separate.
---

diff --git a/Documentation/public-inbox-compact.pod b/Documentation/public-inbox-compact.pod
new file mode 100644
index 00000000..4a519ce9
--- /dev/null
+++ b/Documentation/public-inbox-compact.pod
@@ -0,0 +1,50 @@
+=head1 NAME
+
+public-inbox-compact - compact Xapian DBs
+
+=head1 SYNOPSIS
+
+	public-inbox-compact INBOX_DIR
+
+=head1 DESCRIPTION
+
+public-inbox-compact is a wrapper for L<xapian-compact(1)>
+designed for "v2" inboxes.  It combines multiple Xapian
+partitions into one to reduce space overhead after an initial
+mass import (using multiple partitions) is done.
+
+It locks the inbox and prevents other processes such as
+L<public-inbox-watch(1)> from writing while it operates.
+
+It also supports "v1" (ssoma) inboxes, but offers little
+benefit there over running L<xapian-compact(1)> directly.
+
+=head1 ENVIRONMENT
+
+=over 8
+
+=item PI_CONFIG
+
+The default config file, normally "~/.public-inbox/config".
+See L<public-inbox-config(5)>.
+
+=back
+
+=head1 UPGRADING
+
+=head1 CONTACT
+
+Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org>
+
+The mail archives are hosted at L<https://public-inbox.org/meta/>
+and L<http://hjrcffqmbrq6wope.onion/meta/>
+
+=head1 COPYRIGHT
+
+Copyright 2018 all contributors L<mailto:meta@public-inbox.org>
+
+License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
+
+=head1 SEE ALSO
+
+L<xapian-compact(1)>, L<public-inbox-index(1)>
diff --git a/MANIFEST b/MANIFEST
index 1e48d3a9..ce6cd116 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -7,6 +7,7 @@ Documentation/design_notes.txt
 Documentation/design_www.txt
 Documentation/hosted.txt
 Documentation/include.mk
+Documentation/public-inbox-compact.pod
 Documentation/public-inbox-config.pod
 Documentation/public-inbox-convert.pod
 Documentation/public-inbox-daemon.pod
@@ -110,6 +111,7 @@ sa_config/Makefile
 sa_config/README
 sa_config/root/etc/spamassassin/public-inbox.pre
 sa_config/user/.spamassassin/user_prefs
+script/public-inbox-compact
 script/public-inbox-convert
 script/public-inbox-httpd
 script/public-inbox-index
@@ -137,6 +139,7 @@ t/common.perl
 t/config.t
 t/config_limiter.t
 t/content_id.t
+t/convert-compact.t
 t/emergency.t
 t/fail-bin/spamc
 t/feed.t
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
new file mode 100755
index 00000000..016873d3
--- /dev/null
+++ b/script/public-inbox-compact
@@ -0,0 +1,119 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Compacts the Xapian DB(s) of the inbox at REPO_DIR by running
+# xapian-compact(1).  For a v2 inbox every numbered partition is merged
+# into a single partition "0" and "skel" is compacted separately; for a
+# v1 inbox the one Xapian DB is compacted.  The inbox lock is held from
+# before compaction starts until the result is renamed into place.
+use strict;
+use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+use PublicInbox::V2Writable;
+use PublicInbox::Search;
+use PublicInbox::Config;
+use Cwd 'abs_path';
+use File::Temp qw(tempdir);
+use File::Path qw(remove_tree);
+use PublicInbox::Spawn qw(spawn);
+my $usage = "Usage: public-inbox-compact REPO_DIR\n";
+my $dir = shift or die $usage;
+# Configured paths are compared in abs_path() form below, so resolve the
+# argument the same way; otherwise a relative REPO_DIR never matches.
+# Keep the raw argument if it cannot be resolved.
+my $abs_dir = abs_path($dir);
+$abs_dir = $dir unless defined $abs_dir;
+my $config = PublicInbox::Config->new;
+my $ibx;
+$config->each_inbox(sub {
+	my $repo = abs_path($_[0]->{mainrepo});
+	$ibx = $_[0] if defined($repo) && $repo eq $abs_dir;
+});
+unless ($ibx) {
+	warn "W: $dir not configured in ".
+		PublicInbox::Config::default_file() . "\n";
+	# not in the config: operate on a stand-in inbox with dummy metadata
+	$ibx = {
+		mainrepo => $dir,
+		name => 'ignored',
+		address => [ 'old@example.com' ],
+	};
+	$ibx = PublicInbox::Inbox->new($ibx);
+}
+my $v = ($ibx->{version} || 1);
+if ($v == 2) {
+	require PublicInbox::V2Writable;
+	my $v2w = PublicInbox::V2Writable->new($ibx);
+	my $xap_v = 'xap'.PublicInbox::Search::SCHEMA_VERSION;
+	my $xroot = "$ibx->{mainrepo}/$xap_v";
+	opendir my $dh, $xroot or die "Failed to opendir $xroot: $!\n";
+	$v2w->lock_acquire;
+	# compacted output is built in a temporary directory inside the inbox
+	my $new = tempdir(CLEANUP => 1, DIR => $ibx->{mainrepo});
+	my @parts;
+	my $skel;
+	while (defined(my $dn = readdir($dh))) {
+		if ($dn =~ /\A\d+\z/) {
+			push @parts, "$xroot/$dn";
+		} elsif ($dn eq 'skel') {
+			$skel = "$xroot/$dn";
+		} elsif ($dn eq '.' || $dn eq '..') {
+		} else {
+			warn "W: skipping unknown Xapian DB: $xroot/$dn\n";
+		}
+	}
+	# $dh came from opendir, so it must be released with closedir
+	closedir $dh;
+	my %pids;
+	if (@parts) {
+		my $pid = spawn([ qw(xapian-compact), @parts, "$new/0" ]);
+		# no child has been reaped yet, so $? says nothing here;
+		# presumably spawn() leaves the cause in $! -- TODO confirm
+		defined $pid or die "compact failed: $!\n";
+		$pids{$pid} = 'xapian-compact (parts)';
+	} else {
+		warn "No parts found in $xroot\n";
+	}
+	if (defined $skel) {
+		my $pid = spawn([ qw(xapian-compact), $skel, "$new/skel" ]);
+		defined $pid or die "compact failed: $!\n";
+		$pids{$pid} = 'xapian-compact (skel)';
+	} else {
+		warn "$xroot/skel missing\n";
+	}
+	die "No xapian-compact processes running\n" unless scalar keys %pids;
+	# both compactions run concurrently; reap them as they exit
+	while (scalar keys %pids) {
+		my $pid = waitpid(-1, 0);
+		my $status = $?;
+		# waitpid returns -1 when there is no child left to wait for
+		$pid > 0 or die "waitpid failed: $!\n";
+		my $desc = delete $pids{$pid};
+		next unless defined $desc; # not a process started above
+		die "$desc failed: $status\n" if $status;
+	}
+	# swap the compacted tree into place; the previous tree rides along
+	# as "old" and is deleted once the lock has been released
+	rename($xroot, "$new/old") or die "rename $xroot => $new/old: $!\n";
+	rename($new, $xroot) or die "rename $new => $xroot: $!\n";
+	$v2w->lock_release;
+	remove_tree("$xroot/old") or die "failed to remove $xroot/old: $!\n";
+} elsif ($v == 1) {
+	require PublicInbox::Import;
+	my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
+	my $xap_v = 'xapian'.PublicInbox::Search::SCHEMA_VERSION;
+	my $v1_root = "$ibx->{mainrepo}/public-inbox";
+	my $old = "$v1_root/$xap_v";
+	-d $old or die "$old does not exist\n";
+	my $new = tempdir(CLEANUP => 1, DIR => $v1_root);
+	$im->lock_acquire;
+	PublicInbox::Import::run_die([ qw(xapian-compact), $old, $new ]);
+	# same swap as for v2: the old DB moves inside the new tree first
+	rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
+	rename($new, $old) or die "rename $new => $old: $!\n";
+	$im->lock_release;
+	remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
+} else {
+	die "Unsupported inbox version: $v\n";
+}
diff --git a/t/convert-compact.t b/t/convert-compact.t
new file mode 100644
index 00000000..922ec9c2
--- /dev/null
+++ b/t/convert-compact.t
@@ -0,0 +1,66 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Exercises public-inbox-compact on a v1 inbox, converts that inbox to
+# v2 with public-inbox-convert, then compacts the v2 inbox and checks
+# that a single partition remains.
+use strict;
+use warnings;
+use Test::More;
+use File::Temp qw/tempdir/;
+use PublicInbox::MIME;
+my @mods = qw(DBD::SQLite Search::Xapian);
+foreach my $mod (@mods) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for convert-compact.t" if $@;
+}
+# Loaded at run time: "use" is processed at compile time, ahead of the
+# skip_all guard above, so anything here that needs the optional modules
+# would abort the test instead of skipping it.
+require PublicInbox::V2Writable;
+require PublicInbox::Import;
+my $tmpdir = tempdir('convert-compact-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $ibx = {
+	mainrepo => "$tmpdir/v1",
+	name => 'test-v1',
+	-primary_address => 'test@example.com',
+};
+
+ok(PublicInbox::Import::run_die([qw(git init --bare -q), $ibx->{mainrepo}]),
+	'initialized v1 repo');
+$ibx = PublicInbox::Inbox->new($ibx);
+my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
+my $mime = PublicInbox::MIME->create(
+	header => [
+		From => 'a@example.com',
+		To => 'test@example.com',
+		Subject => 'this is a subject',
+		'Message-ID' => '<a-mid@b>',
+		Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+	],
+	body => "hello world\n",
+);
+ok($im->add($mime), 'added one message');
+$im->done;
+PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
+# pick up the scripts from the build tree first
+local $ENV{PATH} = "blib/script:$ENV{PATH}";
+open my $err, '>>', "$tmpdir/err.log" or die "open: err.log $!\n";
+open my $out, '>>', "$tmpdir/out.log" or die "open: out.log $!\n";
+# presumably maps the children's fds 1 and 2 onto the logs -- confirm
+my $rdr = { 1 => fileno($out), 2 => fileno($err) };
+
+my $cmd = [ 'public-inbox-compact', $ibx->{mainrepo} ];
+ok(PublicInbox::Import::run_die($cmd, undef, $rdr), 'v1 compact works');
+
+$cmd = [ 'public-inbox-convert', $ibx->{mainrepo}, "$tmpdir/v2" ];
+ok(PublicInbox::Import::run_die($cmd, undef, $rdr), 'convert works');
+
+$cmd = [ 'public-inbox-compact', "$tmpdir/v2" ];
+my $env = { NPROC => 2 };
+ok(PublicInbox::Import::run_die($cmd, $env, $rdr), 'v2 compact works');
+$ibx->{mainrepo} = "$tmpdir/v2";
+my $v2w = PublicInbox::V2Writable->new($ibx);
+is($v2w->{partitions}, 1, "only one partition in compacted repo");
+
+done_testing();