supported, thus the value of C<1m> to prevents indexing of
messages larger than one megabyte.
-This is useful for avoiding memory exhaustion in mirrors.
+This is useful for avoiding memory exhaustion in mirrors
+via git. It does not prevent L<public-inbox-mda(1)> or
+L<public-inbox-watch(1)> from importing (and indexing)
+a message.
+
This option is only available in public-inbox 1.5 or later.
Default: none
Default: C<1>
-=item --indexlevel <basic|medium|full>
+=item -L, --indexlevel <basic|medium|full>
Controls the indexing level for L<public-inbox-index(1)>
my $dir = $self->xdir;
$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
$self->{over}->{-no_fsync} = 1 if $ibx->{-no_fsync};
- $self->{index_max_size} = $ibx->{index_max_size};
} elsif ($version == 2) {
defined $shard or die "shard is required for v2\n";
# shard is a number
sub check_size { # check_async cb for -index --max-size=...
my ($oid, $type, $size, $arg, $git) = @_;
(($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}";
- if ($size <= $arg->{index_max_size}) {
+ if ($size <= $arg->{max_size}) {
$git->cat_async($oid, $arg->{index_oid}, $arg);
} else {
- warn "W: skipping $oid ($size > $arg->{index_max_size})\n";
+ warn "W: skipping $oid ($size > $arg->{max_size})\n";
}
}
$self->{mm}->last_commit($newest);
}
} else {
- ${$sync->{max}} = $BATCH_BYTES;
+ ${$sync->{max}} = $self->{batch_bytes};
}
$self->{mm}->{dbh}->commit;
sub process_stack {
my ($self, $sync, $stk) = @_;
my $git = $self->{ibx}->git;
- my $max = $BATCH_BYTES;
+ my $max = $self->{batch_bytes};
my $nr = 0;
$sync->{nr} = \$nr;
$sync->{max} = \$max;
$git->cat_async($oid, \&unindex_both, $self);
}
}
- if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) {
+ if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
$sync->{index_oid} = \&index_both;
}
while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
if ($f eq 'm') {
my $arg = { %$sync, autime => $at, cotime => $ct };
- if ($sync->{index_max_size}) {
+ if ($sync->{max_size}) {
$git->check_async($oid, \&check_size, $arg);
} else {
$git->cat_async($oid, \&index_both, $arg);
my ($self, $opts) = @_;
my $tip = $opts->{ref} || 'HEAD';
my $git = $self->{ibx}->git;
+ $self->{batch_bytes} = $opts->{batch_size} // $BATCH_BYTES;
$git->batch_prepare;
my $pr = $opts->{-progress};
my $sync = { reindex => $opts->{reindex}, -opt => $opts };
$self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb);
}
-sub batch_bytes ($) {
- my ($self) = @_;
- ($self->{parallel} ? $self->{shards} : 1) *
- $PublicInbox::SearchIdx::BATCH_BYTES;
-}
-
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
- $n >= batch_bytes($self);
+ $n >= $self->{batch_bytes};
}
sub _add {
# xcpdb can change shard count while -watch is idle
my $nshards = count_shards($self);
$self->{shards} = $nshards if $nshards && $nshards != $self->{shards};
+ $self->{batch_bytes} = $opt->{batch_size} //
+ $PublicInbox::SearchIdx::BATCH_BYTES;
+ $self->{batch_bytes} *= $self->{shards} if $self->{parallel};
# need to create all shards before initializing msgmap FD
# idx_shards must be visible to all forked processes
}
# allow -watch or -mda to write...
- $self->idx_init; # reacquire lock
+ $self->idx_init($sync->{-opt}); # reacquire lock
$mm_tmp->atfork_parent if $mm_tmp;
}
$pr->("Xapian indexlevel=$ibx->{indexlevel} ".
"$beg..$end (% $step)\n");
}
- my $batch_bytes = batch_bytes($self);
for (my $num = $beg; $num <= $end; $num += $step) {
my $smsg = $ibx->over->get_art($num) or next;
$smsg->{v2w} = $self;
$ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
- if ($self->{transact_bytes} >= $batch_bytes) {
+ if ($self->{transact_bytes} >= $self->{batch_bytes}) {
${$sync->{nr}} = $num;
reindex_checkpoint($self, $sync);
}
$self->{current_info} = "$i.git $oid";
if ($f eq 'm') {
my $arg = { %$sync, autime => $at, cotime => $ct };
- if ($sync->{index_max_size}) {
+ if ($sync->{max_size}) {
$all->check_async($oid, \&check_size, $arg);
} else {
$all->cat_async($oid, \&index_oid, $arg);
sub xapian_only {
my ($self, $opt, $sync, $art_beg) = @_;
- my $seq = $opt->{sequentialshard};
+ my $seq = $opt->{sequential_shard};
$art_beg //= 0;
local $self->{parallel} = 0 if $seq;
$self->idx_init($opt); # acquire lock
sub index_sync {
my ($self, $opt) = @_;
$opt //= $_[1] //= {};
- goto \&xapian_only if $opt->{xapianonly};
+ goto \&xapian_only if $opt->{xapian_only};
my $pr = $opt->{-progress};
my $epoch_max;
my $latest = git_dir_latest($self, \$epoch_max);
return unless defined $latest;
- my $seq = $opt->{sequentialshard};
+ my $seq = $opt->{sequential_shard};
my $art_beg; # the NNTP article number we start xapian_only at
my $idxlevel = $self->{ibx}->{indexlevel};
local $self->{ibx}->{indexlevel} = 'basic' if $seq;
$art_beg++ if defined($art_beg);
}
}
- if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) {
+ if ($sync->{max_size} = $opt->{max_size}) {
$sync->{index_oid} = \&index_oid;
}
# work forwards through history
options:
--no-fsync speed up indexing, risk corruption on power outage
- --indexlevel=LEVEL `basic', 'medium', or `full' (default: full)
+ -L LEVEL `basic', `medium', or `full' (default: full)
--compact | -c run public-inbox-compact(1) after indexing
--sequential-shard index Xapian shards sequentially for slow storage
--jobs=NUM set or disable parallelization (NUM=0)
See public-inbox-index(1) man page for full documentation.
EOF
my $compact_opt;
-my $opt = { quiet => -1, compact => 0, maxsize => undef, fsync => 1 };
+my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 };
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
- fsync|sync! xapianonly|xapian-only
- indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
- sequentialshard|seq-shard|sequential-shard
+ fsync|sync! xapian_only|xapian-only
+ indexlevel|index-level|L=s max_size|max-size=s
+ batch_size|batch-size=s
+ sequential_shard|seq-shard|sequential-shard
help|?))
or die "bad command-line args\n$usage";
if ($opt->{help}) { print $help; exit 0 };
die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
-if ($opt->{xapianonly} && !$opt->{reindex}) {
+if ($opt->{xapian_only} && !$opt->{reindex}) {
die "--xapian-only requires --reindex\n";
}
PublicInbox::Admin::require_or_die('-index');
unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
-my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+my $max_size = $opt->{max_size} // $cfg->{lc('publicInbox.indexMaxSize')};
if (defined $max_size) {
PublicInbox::Admin::parse_unsigned(\$max_size) or
die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+ $opt->{max_size} = $max_size;
}
-my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')};
+my $bs = $opt->{batch_size} // $cfg->{lc('publicInbox.indexBatchSize')};
if (defined $bs) {
PublicInbox::Admin::parse_unsigned(\$bs) or
die "`publicInbox.indexBatchSize=$bs' not parsed\n";
+ $opt->{batch_size} = $bs;
}
-no warnings 'once';
-local $PublicInbox::SearchIdx::BATCH_BYTES = $bs if defined($bs);
-use warnings 'once';
# out-of-the-box builds of Xapian 1.4.x are still limited to 32-bit
# https://getting-started-with-xapian.readthedocs.io/en/latest/concepts/indexing/limitations.html
local $ENV{XAPIAN_FLUSH_THRESHOLD} ||= '4294967295' if defined($bs);
-my $s = $opt->{sequentialshard} //
+my $s = $opt->{sequential_shard} //
$cfg->{lc('publicInbox.indexSequentialShard')};
if (defined $s) {
my $v = $cfg->git_bool($s);
defined($v) or
die "`publicInbox.indexSequentialShard=$s' not boolean\n";
- $opt->{sequentialshard} = $v;
+ $opt->{sequential_shard} = $v;
}
my $mods = {};
foreach my $ibx (@ibxs) {
# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
- $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ?
+ $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
'full' : PublicInbox::Admin::detect_indexlevel($ibx));
- $ibx->{index_max_size} = $max_size;
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
$ibx->{-no_fsync} = 1 if !$opt->{fsync};
my $ibx_opt = $opt;
- if (defined(my $s = $ibx->{indexsequentialshard})) {
+ if (defined(my $s = $ibx->{lc('indexSequentialShard')})) {
defined(my $v = $cfg->git_bool($s)) or die <<EOL;
publicInbox.$ibx->{name}.indexSequentialShard not boolean
EOL
- $ibx_opt = { %$opt, sequentialshard => $v };
+ $ibx_opt = { %$opt, sequential_shard => $v };
}
PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
if ($compact_opt) {
- local $compact_opt->{jobs} = 0 if $ibx_opt->{sequentialshard};
+ local $compact_opt->{jobs} = 0 if $ibx_opt->{sequential_shard};
PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
}
}
my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs);
my %opts = (
'V|version=i' => \$version,
- 'L|indexlevel=s' => \$indexlevel,
+ 'L|index-level|indexlevel=s' => \$indexlevel,
'S|skip|skip-epoch=i' => \$skip_epoch,
'N|skip-artnum=i' => \$skip_artnum,
'j|jobs=i' => \$jobs,
exit(1) if $conflict;
my $ibx = $cfg->lookup_name($name);
- if ($ibx) {
- if (!defined($indexlevel) && $ibx->{indexlevel}) {
- $indexlevel = $ibx->{indexlevel};
- }
- }
+ $indexlevel //= $ibx->{indexlevel} if $ibx;
}
my $pi_config_tmp = $fh->filename;
close($fh) or die "failed to close $pi_config_tmp: $!\n";