Default: C<full>
+=item publicinbox.<name>.indexSequentialShard
+
+See L<public-inbox-index(1)/publicInbox.indexSequentialShard>
+
=item publicinbox.<name>.httpbackendmax
If a digit, the maximum number of parallel
See L<public-inbox-edit(1)>
=item publicinbox.indexMaxSize
+=item publicinbox.indexBatchSize
+=item publicinbox.indexSequentialShard
See L<public-inbox-index(1)>
=item --jobs=JOBS, -j
-Control the number of Xapian indexing jobs in a
+Influences the number of Xapian indexing shards in a
(L<public-inbox-v2-format(5)>) inbox.
C<--jobs=0> is accepted as of public-inbox 1.6.0 (PENDING)
to disable parallel indexing.
+If the inbox has not been indexed, C<JOBS - 1> shards
+will be created (one job is always needed for indexing
+the overview and article number mapping).
+
Default: the number of existing Xapian shards
=item --compact / -c
Available in public-inbox 1.6.0 (PENDING).
+=item --sequential-shard
+
+Sets or overrides L</publicinbox.indexSequentialShard> on a
+per-invocation basis.  See the description of that setting
+below.
+
+Available in public-inbox 1.6.0 (PENDING).
+
=back
=head1 FILES
Default: 1m (one megabyte)
+=item publicinbox.indexBatchSize
+
+Flushes changes to the filesystem and releases locks after
+indexing the given number of bytes. The default value of C<1m>
+(one megabyte) is low to minimize memory use and reduce
+contention with parallel invocations of L<public-inbox-mda(1)>,
+L<public-inbox-learn(1)>, and L<public-inbox-watch(1)>.
+
+Increase this value on powerful systems to improve throughput at
+the expense of memory use.  Locks are then released less often,
+though this may not be noticeable on fast systems.
+
+This option is available in public-inbox 1.6 or later.
+public-inbox 1.5 and earlier used the current default, C<1m>.
+
+For L<public-inbox-v2-format(5)> inboxes, this value is
+multiplied by the number of Xapian shards. Thus a typical v2
+inbox with 3 shards will flush every 3 megabytes by default.
+
+Default: 1m (one megabyte)
+
+=item publicinbox.indexSequentialShard
+=item publicinbox.<inbox_name>.indexSequentialShard
+
+For L<public-inbox-v2-format(5)> inboxes, setting this to C<true>
+allows indexing Xapian shards in multiple passes. This speeds up
+indexing on rotational storage with high seek latency by allowing
+individual shards to fit into the kernel page cache.
+
+Using a higher-than-normal number of C<--jobs> with
+L<public-inbox-init(1)> may be required to ensure individual
+shards are small enough to fit into cache.
+
+Available in public-inbox 1.6.0 (PENDING).
+
+This is ignored on L<public-inbox-v1-format(5)> inboxes.
+
+Default: false, shards are indexed in parallel
+
=back
=head1 ENVIRONMENT
Issuing TRIM commands with L<fstrim(8)> was necessary to maintain
consistent performance while developing this feature.
-Rotational storage devices are NOT recommended for indexing of
-large mail archives; but are fine for backup and usable for
-small instances.
+Rotational storage devices perform significantly worse than
+solid state storage for indexing of large mail archives, but are
+fine for backup and usable for small instances.
+
+As of public-inbox 1.6.0, the C<--sequential-shard> option of
+L<public-inbox-index(1)> may be used with a high shard count
+to ensure individual shards fit into page cache when the entire
+Xapian DB cannot.
Our use of the L</OVERVIEW DB> requires Xapian document IDs to
remain stable. Using L<public-inbox-compact(1)> and
$git;
}
-sub _git_config_bool ($) {
- my ($val) = @_;
+sub git_bool {
+ my ($val) = $_[-1]; # $_[0] may be $self, or $val
if ($val =~ /\A(?:false|no|off|[\-\+]?(?:0x)?0+)\z/i) {
0;
} elsif ($val =~ /\A(?:true|yes|on|[\-\+]?(?:0x)?[0-9]+)\z/i) {
foreach my $k (qw(inboxdir filter newsgroup
watch httpbackendmax
- replyto feedmax nntpserver indexlevel)) {
+ replyto feedmax nntpserver
+ indexlevel indexsequentialshard)) {
my $v = $self->{"$pfx.$k"};
$ibx->{$k} = $v if defined $v;
}
foreach my $k (qw(obfuscate)) {
my $v = $self->{"$pfx.$k"};
defined $v or next;
- if (defined(my $bval = _git_config_bool($v))) {
+ if (defined(my $bval = git_bool($v))) {
$ibx->{$k} = $bval;
} else {
warn "Ignoring $pfx.$k=$v in config, not boolean\n";
$self->{ibx}->git->cleanup; # *async_wait
${$sync->{need_checkpoint}} = 0;
- $sync->{mm_tmp}->atfork_prepare;
+ my $mm_tmp = $sync->{mm_tmp};
+ $mm_tmp->atfork_prepare if $mm_tmp;
$self->done; # release lock
if (my $pr = $sync->{-opt}->{-progress}) {
# allow -watch or -mda to write...
$self->idx_init; # reacquire lock
- $sync->{mm_tmp}->atfork_parent;
+ $mm_tmp->atfork_parent if $mm_tmp;
}
sub index_oid { # cat_async callback
}
$all->cat_async_wait;
}
- return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
+ if (!$regen_max && !keys(%{$self->{unindex_range}})) {
+ $sync->{-regen_fmt} = "%u/?\n";
+ return 0;
+ }
# reindex should NOT see new commits anymore, if we do,
# it's a problem and we need to notice it via die()
$ranges;
}
+sub index_xap_only { # git->cat_async callback: adds one blob to its Xapian shard
+ my ($bref, $oid, $type, $size, $smsg) = @_;
+ my $self = $smsg->{v2w}; # writer object stashed on the smsg by index_seq_shard
+ my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); # shard chosen by article number modulo shard count
+ $idx->begin_txn_lazy;
+ $idx->add_message(PublicInbox::Eml->new($bref), $smsg);
+ $self->{transact_bytes} += $size; # running byte total; index_seq_shard compares it against the batch size
+}
+
+sub index_seq_shard ($$$) { # queue Xapian indexing of every article number congruent to $off modulo the shard count
+ my ($self, $sync, $off) = @_;
+ my $ibx = $self->{ibx};
+ my $max = $ibx->mm->max or return; # nothing to do when mm->max is false (0 or undef)
+ my $all = $ibx->git;
+ my $over = $ibx->over;
+ my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES;
+ if (my $pr = $sync->{-opt}->{-progress}) {
+ $pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n");
+ }
+ for (my $num = $off; $num <= $max; $num += $self->{shards}) { # stepping by the shard count keeps $num % shards constant
+ my $smsg = $over->get_art($num) or next; # skip numbers for which get_art returns nothing
+ $smsg->{v2w} = $self; # lets the index_xap_only callback find this writer
+ $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
+ if ($self->{transact_bytes} >= $batch_bytes) { # transact_bytes is advanced by index_xap_only
+ ${$sync->{nr}} = $num; # store the current article number in the shared counter before checkpointing
+ reindex_checkpoint($self, $sync);
+ }
+ }
+}
+
sub index_epoch ($$$) {
my ($self, $sync, $i) = @_;
my $epoch_max;
my $latest = git_dir_latest($self, \$epoch_max);
return unless defined $latest;
+
+ my $seq = $opt->{sequentialshard};
+ my $idxlevel = $self->{ibx}->{indexlevel};
+ local $self->{ibx}->{indexlevel} = 'basic' if $seq;
+
$self->idx_init($opt); # acquire lock
fill_alternates($self, $epoch_max);
$self->{over}->rethread_prepare($opt);
$pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr;
}
+ if ($seq) { # deal with Xapian shards sequentially
+ my $end = $self->{shards} - 1;
+ $self->{ibx}->{indexlevel} = $idxlevel;
+ delete $sync->{mm_tmp};
+ $self->idx_init($opt); # re-acquire lock
+ index_seq_shard($self, $sync, $_) for (0..$end);
+ $self->{ibx}->git->cat_async_wait;
+ $self->done;
+ }
+
# reindex does not pick up new changes, so we rerun w/o it:
if ($opt->{reindex}) {
my %again = %$opt;
sub cfg_bool ($$$) {
my ($cfg, $key, $url) = @_;
my $orig = $cfg->urlmatch($key, $url) // return;
- my $bool = PublicInbox::Config::_git_config_bool($orig);
+ my $bool = $cfg->git_bool($orig); # undef when $orig is not a git-style boolean; warned about on the next line
warn "W: $key=$orig for $url is not boolean\n" unless defined($bool);
$bool;
}
my $compact_opt;
my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
- indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s))
+ indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
+ sequentialshard|seq-shard|sequential-shard))
or die "bad command-line args\n$usage";
die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
$PublicInbox::SearchIdx::BATCH_BYTES = $bs;
}
+my $s = $opt->{sequentialshard} //
+ $cfg->{lc('publicInbox.indexSequentialShard')}; # config value is only the fallback when the command-line switch is absent
+if (defined $s) {
+ my $v = $cfg->git_bool($s); # undef when $s is not a git-style boolean
+ defined($v) or
+ die "`publicInbox.indexSequentialShard=$s' not boolean\n";
+ $opt->{sequentialshard} = $v; # keep the normalized boolean rather than the raw string
+}
+
my $mods = {};
foreach my $ibx (@ibxs) {
# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
}
$ibx->{-no_sync} = 1 if !$opt->{sync};
- PublicInbox::Admin::index_inbox($ibx, undef, $opt);
+
+ my $ibx_opt = $opt;
+ if (defined(my $s = $ibx->{indexsequentialshard})) {
+ defined(my $v = $cfg->git_bool($s)) or die <<EOL;
+publicInbox.$ibx->{name}.indexSequentialShard not boolean
+EOL
+ $ibx_opt = { %$opt, sequentialshard => $v };
+ }
+ PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt;
}
{
for my $t (qw(TRUE true yes on 1 +1 -1 13 0x1 0x12 0X5)) {
- is(PublicInbox::Config::_git_config_bool($t), 1, "$t is true");
+ is(PublicInbox::Config::git_bool($t), 1, "$t is true");
is(xqx([qw(git -c), "test.val=$t",
qw(config --bool test.val)]),
"true\n", "$t matches git-config behavior");
}
for my $f (qw(FALSE false no off 0 +0 +000 00 0x00 0X0)) {
- is(PublicInbox::Config::_git_config_bool($f), 0, "$f is false");
+ is(PublicInbox::Config::git_bool($f), 0, "$f is false");
is(xqx([qw(git -c), "test.val=$f",
qw(config --bool test.val)]),
"false\n", "$f matches git-config behavior");
}
- is(PublicInbox::Config::_git_config_bool('bogus'), undef,
+ is(PublicInbox::Config::git_bool('bogus'), undef,
'bogus is undef');
}
use warnings;
use Test::More;
use PublicInbox::TestCommon;
+use File::Path qw(remove_tree);
use Cwd qw(abs_path);
require_git(2.6);
local $ENV{HOME} = abs_path('t');
is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
}
+if ('sequential-shard') { # always-true string; serves only as a label for this group of checks
+ $mset = $mibx->search->query('m:15@example.com', {mset => 1});
+ is(scalar($mset->items), 1, 'large message not indexed'); # NOTE(review): expects 1 hit although the label says "not indexed" -- confirm wording
+ remove_tree(glob("$tmpdir/m/xap*"), glob("$tmpdir/m/msgmap.*")); # remove the xap* and msgmap.* paths before running -index again
+ my $cmd = [ qw(-index -j9 --sequential-shard), "$tmpdir/m" ];
+ ok(run_script($cmd), '--sequential-shard works');
+ my @shards = glob("$tmpdir/m/xap*/?"); # one single-character directory name per shard
+ is(scalar(@shards), 8, 'got expected shard count'); # -j9 => 8 shards: JOBS - 1 on a not-yet-indexed inbox
+ PublicInbox::InboxWritable::cleanup($mibx); # presumably drops cached handles so the next query sees the new index -- confirm
+ $mset = $mibx->search->query('m:15@example.com', {mset => 1});
+ is(scalar($mset->items), 1, 'search works after --sequential-shard');
+}
+
if ('max size') {
$mime->header_set('Message-ID', '<2big@a>');
my $max = '2k';