See L<public-inbox-edit(1)>
+=item publicinbox.indexMaxSize
+
+See L<public-inbox-index(1)>
+
=item publicinbox.wwwlisting
Enable a HTML listing style when the root path of the URL '/' is accessed.
L<public-inbox-edit(1)> or L<public-inbox-purge(1)> to ensure data
is expunged from mirrors.
+=item --max-size SIZE
+
+Sets or overrides L</publicinbox.indexMaxSize> on a
+per-invocation basis. See L</publicinbox.indexMaxSize>
+below.
+
=back
=head1 FILES
v2 inboxes are described in L<public-inbox-v2-format>.
+=head1 CONFIGURATION
+
+=over 8
+
+=item publicinbox.indexMaxSize
+
+Prevents indexing of messages larger than the specified size
+value. A single suffix modifier of C<k>, C<m> or C<g> is
+supported, thus the value of C<1m> to prevents indexing of
+messages larger than one megabyte.
+
+This is useful for avoiding memory exhaustion in mirrors.
+
+Default: none
+
+=back
+
=head1 ENVIRONMENT
=over 8
}
}
+# same unit factors as git:
+sub parse_unsigned ($) {
+ my ($max_size) = @_;
+
+ $$max_size =~ /\A([0-9]+)([kmg])?\z/i or return;
+ my ($n, $unit_factor) = ($1, $2 // '');
+ my %u = ( k => 1024, m => 1024**2, g => 1024**3 );
+ $$max_size = $n * ($u{lc($unit_factor)} // 1);
+ 1;
+}
+
1;
$self->{lock_path} = "$inboxdir/ssoma.lock";
my $dir = $self->xdir;
$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
+ $self->{index_max_size} = $ibx->{index_max_size};
} elsif ($version == 2) {
defined $shard or die "shard is required for v2\n";
# shard is a number
}
}
+sub too_big ($$$) {
+ my ($self, $git, $oid) = @_;
+ my $max_size = $self->{index_max_size} or return;
+ my (undef, undef, $size) = $git->check($oid);
+ die "E: bad $oid in $git->{git_dir}\n" if !defined($size);
+ return if $size <= $max_size;
+ warn "W: skipping $oid ($size > $max_size)\n";
+ 1;
+}
+
# only for v1
sub read_log {
my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
}
next;
}
+ next if too_big($self, $git, $blob);
my $mime = do_cat_mail($git, $blob, \$bytes);
my $smsg = bless {}, 'PublicInbox::Smsg';
batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
$add_cb->($self, $mime, $smsg);
} elsif ($line =~ /$delmsg/o) {
my $blob = $1;
- $D{$blob} = 1;
+ $D{$blob} = 1 unless too_big($self, $git, $blob);
} elsif ($line =~ /^commit ($h40)/o) {
$latest = $1;
$newest ||= $latest;
last_commit => [], # git repo -> commit
};
$self->{shards} = count_shards($self) || nproc_shards($creat);
+ $self->{index_max_size} = $v2ibx->{index_max_size};
bless $self, $class;
}
sub mark_deleted ($$$$) {
my ($self, $sync, $git, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my $msgref = $git->cat_file($oid);
my $mime = PublicInbox::MIME->new($$msgref);
my $mids = mids($mime->header_obj);
sub reindex_oid ($$$$) {
my ($self, $sync, $git, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my ($num, $mid0, $len);
my $msgref = $git->cat_file($oid, \$len);
return if $len == 0; # purged
use PublicInbox::Xapcmd;
my $compact_opt;
-my $opt = { quiet => -1, compact => 0 };
-GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s))
+my $opt = { quiet => -1, compact => 0, maxsize => undef };
+GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune
+ indexlevel|L=s maxsize|max-size=s))
or die "bad command-line args\n$usage";
die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
$compact_opt = { -coarse_lock => 1, compact => 1 };
}
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV);
+my $cfg = PublicInbox::Config->new;
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg);
PublicInbox::Admin::require_or_die('-index');
unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
my $mods = {};
+my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+if (defined $max_size) {
+ PublicInbox::Admin::parse_unsigned(\$max_size) or
+ die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+}
+
foreach my $ibx (@ibxs) {
# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
$ibx->{indexlevel} //= $opt->{indexlevel} //
PublicInbox::Admin::detect_indexlevel($ibx);
+ $ibx->{index_max_size} = $max_size;
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
}
chdir '/';
+
+my @pairs = (
+ '1g' => 1024 ** 3,
+ 666 => 666,
+ '1500K' => 1500 * 1024,
+ '15m' => 15 * (1024 ** 2),
+);
+
+while (@pairs) {
+ my ($in, $out) = splice(@pairs, 0, 2);
+ my $orig = $in;
+ ok(PublicInbox::Admin::parse_unsigned(\$in), "parse_unsigned $orig");
+ is($in, $out, "got $orig => ($in == $out)");
+}
+
+for my $v ('', 'bogus', '1p', '1gig') {
+ ok(!PublicInbox::Admin::parse_unsigned(\$v),
+ "parse_unsigned rejects $v");
+}
+
done_testing();
zzzzzz
EOF
- $im->add($mime);
+ ok($im->add($mime), 'added initial message');
+
+ $mime->header_set('Message-ID', '<toobig@example.com>');
+ $mime->body_str_set("z\n" x 1024);
+ ok($im->add($mime), 'added big message');
# deliver a reply, too
- my $reply = Email::MIME->new(<<EOF);
+ $mime = Email::MIME->new(<<EOF);
From: You <you\@example.com>
To: Me <me\@example.com>
Cc: $addr
what?
EOF
- $im->add($reply);
+ ok($im->add($mime), 'added reply');
my $slashy_mid = 'slashy/asdf@example.com';
my $slashy = Email::MIME->new(<<EOF);
slashy
EOF
- $im->add($slashy);
+ ok($im->add($slashy), 'added slash');
$im->done;
my $res = cgi_run("/test/slashy/asdf\@example.com/raw");
my $path = "/test/blahblah\@example.com/t.mbox.gz";
my $res = cgi_run($path);
like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
- my $indexed;
- eval {
- require DBD::SQLite;
- require PublicInbox::SearchIdx;
- my $s = PublicInbox::SearchIdx->new($ibx, 1);
- $s->index_sync;
- $indexed = 1;
- };
+ my $cmd = ['-index', $ibx->{inboxdir}, '--max-size=2k'];
+ my $opt = { 2 => \(my $err) };
+ my $indexed = run_script($cmd, undef, $opt);
if ($indexed) {
$res = cgi_run($path);
like($res->{head}, qr/^Status: 200 /, "search returned mbox");
IO::Uncompress::Gunzip::gunzip(\$in => \$out);
like($out, qr/^From /m, "From lines in mbox");
};
+ $res = cgi_run('/test/toobig@example.com/');
+ like($res->{head}, qr/^Status: 300 /,
+ 'did not index or return >max-size message');
+ like($err, qr/skipping [a-f0-9]{40,}/,
+ 'warned about skipping large OID');
} else {
like($res->{head}, qr/^Status: 501 /, "search not available");
- SKIP: { skip 'DBD::SQLite not available', 2 };
+ SKIP: { skip 'DBD::SQLite not available', 4 };
}
my $have_xml_treepp = eval { require XML::TreePP; 1 } if $indexed;
is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
}
+if ('max size') {
+ $mime->header_set('Message-ID', '<2big@a>');
+ my $max = '2k';
+ $mime->body_str_set("z\n" x 1024);
+ ok($v2w->add($mime), "add big message");
+ $v2w->done;
+ $ibx->cleanup;
+ $fetch_each_epoch->();
+ PublicInbox::InboxWritable::cleanup($mibx);
+ my $cmd = ['-index', "$tmpdir/m", "--max-size=$max" ];
+ my $opt = { 2 => \(my $err) };
+ ok(run_script($cmd, undef, $opt), 'indexed with --max-size');
+ like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+ $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+ is(scalar($mset->items), 0, 'large message not indexed');
+
+ {
+ open my $fh, '>>', $pi_config or die;
+ print $fh <<EOF or die;
+[publicinbox]
+ indexMaxSize = 2k
+EOF
+ close $fh or die;
+ }
+ $cmd = ['-index', "$tmpdir/m", "--reindex" ];
+ ok(run_script($cmd, undef, $opt), 'reindexed w/ indexMaxSize in file');
+ like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+ $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+ is(scalar($mset->items), 0, 'large message not re-indexed');
+}
+
ok($td->kill, 'killed httpd');
$td->join;