Documentation/public-inbox-index.pod | 8 ++++++++ Documentation/public-inbox-init.pod | 10 ++++++++++ lib/PublicInbox/Admin.pm | 12 +++++++----- lib/PublicInbox/SearchIdx.pm | 35 ++++++++++++++++++++++++++--------- lib/PublicInbox/SearchIdxShard.pm | 2 +- script/public-inbox-convert | 3 ++- script/public-inbox-index | 7 +++++-- script/public-inbox-init | 8 ++++++++ t/inbox_idle.t | 2 +- t/index-git-times.t | 11 ++++++++++- t/init.t | 13 +++++++++++++ diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 1ed9f5e7943dd832c656c83df024da592b861db6..46a538254c6d00204c04d143114743cac9b90f6e 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -145,6 +145,14 @@ below. Available in public-inbox 1.6.0 (PENDING). +=item --skip-docdata + +Stop storing document data in Xapian on an existing inbox. + +See L for description and caveats. + +Available in public-inbox 1.6.0 (PENDING). + =back =head1 FILES diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod index 31c5f743edb818dc8edf1d0abdf9558eef0260d7..b25dd1e4d02e373b83e64ac5edd29cacf81969be 100644 --- a/Documentation/public-inbox-init.pod +++ b/Documentation/public-inbox-init.pod @@ -95,6 +95,16 @@ default due to contention in the top-level producer process. Default: the number of online CPUs, up to 4 +=item --skip-docdata + +Do not store document data in Xapian, reducing Xapian storage +overhead by around 1.5%. + +Warning: this option prevents rollbacks to public-inbox 1.5.0 +and earlier. + +Available since public-inbox 1.6.0 (PENDING). + =back =head1 ENVIRONMENT diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index f5427af77376a6e1da40f5fe9aa187c19a53f36e..b8ead6f703984153712640cd1af6928a5c0d060c 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -48,13 +48,14 @@ # for unconfigured inboxes sub detect_indexlevel ($) { my ($ibx) = @_; + my $over = $ibx->over; + my $srch = $ibx->search; + delete @$ibx{qw(over search)}; # don't leave open FDs lying around + # brand new or never before indexed inboxes default to full - return 'full' unless $ibx->over; - delete $ibx->{over}; # don't leave open FD lying around - + return 'full' unless $over; my $l = 'basic'; - my $srch = $ibx->search or return $l; - delete $ibx->{search}; # don't leave open FD lying around + return $l unless $srch; if (my $xdb = $srch->xdb) { $l = 'full'; my $m = $xdb->get_metadata('indexlevel'); @@ -65,6 +66,7 @@ warn <<""; $ibx->{inboxdir} has unexpected indexlevel in Xapian: $m } + $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); } $l; } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5c39f3d620e3e774faab522c9de3113f995733d5..be46b2b972acd8954094930ba5ac16defd37f980 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -61,6 +61,10 @@ indexlevel => $indexlevel, }, $class; $self->xpfx_init; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; + if ($ibx->{-skip_docdata}) { + $self->{-set_skip_docdata_once} = 1; + $self->{-skip_docdata} = 1; + } $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; @@ -359,10 +363,18 @@ index_headers($self, $smsg); msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); - $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); - my $data = $smsg->to_doc_data; - $doc->set_data($data); + + # by default, we maintain compatibility with v1.5.0 and earlier + # by writing to docdata.glass, users who never exect to downgrade can + # use --skip-docdata + if (!$self->{-skip_docdata}) { + # WWW doesn't need {to} or {cc}, only NNTP + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + my $data = $smsg->to_doc_data; + $doc->set_data($data); + } + if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { my $pfx = $alt->{xprefix}; @@ -831,23 +843,28 @@ } # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) # This metadata is read by Admin::detect_indexlevel: -sub set_indexlevel { +sub set_metadata_once { my ($self) = @_; - if (!$self->{shard} && # undef or 0, not >0 - delete($self->{-set_indexlevel_once})) { - my $xdb = $self->{xdb}; + return if $self->{shard}; # only continue if undef or 0, not >0 + my $xdb = $self->{xdb}; + + if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { $xdb->set_metadata('indexlevel', 'medium'); } } + if (delete($self->{-set_skip_docdata_once})) { + $xdb->get_metadata('skip_docdata') or + $xdb->set_metadata('skip_docdata', '1'); + } } sub _commit_txn { my ($self) = @_; if (my $xdb = $self->{xdb}) { - set_indexlevel($self); + set_metadata_once($self); $xdb->commit_transaction; } $self->{over}->commit_lazy if $self->{over}; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 59b360872b3bb4db111b5354b7a5b4b1aecbd41d..20077e08fa2edc13ef17926d7bc55789559f628d 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -16,7 +16,7 @@ my $ibx = $v2w->{ibx}; my $self = $class->SUPER::new($ibx, 1, $shard); # create the DB before forking: $self->idx_acquire; - $self->set_indexlevel; + $self->set_metadata_once; $self->idx_release; $self->spawn_worker($v2w, $shard) if $v2w->{parallel}; $self; diff --git a/script/public-inbox-convert b/script/public-inbox-convert index d655dcc6e0504f1bed64795fb092c75aa3c43864..4ff198d1947e3ceeb3063b32ef0fa6aab9606ee6 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -77,7 +77,8 @@ } die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2; require PublicInbox::Admin; -$old->{indexlevel} //= PublicInbox::Admin::detect_indexlevel($old); +my $detected = PublicInbox::Admin::detect_indexlevel($old); +$old->{indexlevel} //= $detected; my $env; if ($opt->{'index'}) { my $mods = {}; diff --git a/script/public-inbox-index b/script/public-inbox-index index 30d248382a83cf0cedf252d614b577f4150cf77c..9855c67d31012ed734f3ff2c499bc14597a574a8 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -39,7 +39,7 @@ fsync|sync! xapian_only|xapian-only indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s sequential_shard|seq-shard|sequential-shard - all help|?)) + skip-docdata all help|?)) or die "bad command-line args\n$usage"; if ($opt->{help}) { print $help; exit 0 }; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; @@ -58,9 +58,11 @@ unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } my $mods = {}; foreach my $ibx (@ibxs) { + # detect_indexlevel may also set $ibx->{-skip_docdata} + my $detected = PublicInbox::Admin::detect_indexlevel($ibx); # XXX: users can shoot themselves in the foot, with opt->{indexlevel} $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? - 'full' : PublicInbox::Admin::detect_indexlevel($ibx)); + 'full' : $detected); PublicInbox::Admin::scan_ibx_modules($mods, $ibx); } @@ -75,6 +77,7 @@ if ($opt->{compact} >= 2) { PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt}); } $ibx->{-no_fsync} = 1 if !$opt->{fsync}; + $ibx->{-skip_docdata} //= $opt->{'skip-docdata'}; my $ibx_opt = $opt; if (defined(my $s = $ibx->{lc('indexSequentialShard')})) { diff --git a/script/public-inbox-init b/script/public-inbox-init index b19c2321ae22a72202f3db2d6c986b24d3b91cdc..037e8e56409bfd229aad25371f0c3461708b0e54 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -34,6 +34,7 @@ require PublicInbox::Admin; PublicInbox::Admin::require_or_die('-base'); my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs, $show_help); +my $skip_docdata; my $ng = ''; my %opts = ( 'V|version=i' => \$version, @@ -42,6 +43,7 @@ 'S|skip|skip-epoch=i' => \$skip_epoch, 'skip-artnum=i' => \$skip_artnum, 'j|jobs=i' => \$jobs, 'ng|newsgroup=s' => \$ng, + 'skip-docdata' => \$skip_docdata, 'help|?' => \$show_help, ); my $usage_cb = sub { @@ -177,6 +179,12 @@ } require PublicInbox::InboxWritable; $ibx = PublicInbox::InboxWritable->new($ibx, $creat_opt); +if ($skip_docdata) { + $ibx->{indexlevel} //= 'full'; # ensure init_inbox writes xdb + $ibx->{indexlevel} eq 'basic' and + die "--skip-docdata ignored with --indexlevel=basic\n"; + $ibx->{-skip_docdata} = $skip_docdata; +} $ibx->init_inbox(0, $skip_epoch, $skip_artnum); # needed for git prior to v2.1.0 diff --git a/t/inbox_idle.t b/t/inbox_idle.t index 61287200d9bbe65f493c1b66ad50befa23c76355..e16ee11b18970deaad33015173f84af101d0ab6d 100644 --- a/t/inbox_idle.t +++ b/t/inbox_idle.t @@ -29,7 +29,7 @@ my $im = $ibx->importer(0); if ($V == 1) { my $sidx = PublicInbox::SearchIdx->new($ibx, 1); $sidx->idx_acquire; - $sidx->set_indexlevel; + $sidx->set_metadata_once; $sidx->idx_release; # allow watching on lockfile } my $pi_config = PublicInbox::Config->new(\< $r }); } -run_script(['-index', $v1dir]) or die 'v1 index failed'; +run_script(['-index', '--skip-docdata', $v1dir]) or die 'v1 index failed'; + my $smsg; { my $cfg = PublicInbox::Config->new; my $ibx = $cfg->lookup($addr); + my $lvl = PublicInbox::Admin::detect_indexlevel($ibx); + is($lvl, 'medium', 'indexlevel detected'); + is($ibx->{-skip_docdata}, 1, '--skip-docdata flag set on -index'); $smsg = $ibx->over->get_art(1); is($smsg->{ds}, 749520000, 'datestamp from git author time'); is($smsg->{ts}, 1285977600, 'timestamp from git committer time'); @@ -70,6 +75,10 @@ my $check_v2 = sub { my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir, address => $addr}); + my $lvl = PublicInbox::Admin::detect_indexlevel($ibx); + is($lvl, 'medium', 'indexlevel detected after convert'); + is($ibx->{-skip_docdata}, 1, + '--skip-docdata preserved after convert'); my $v2smsg = $ibx->over->get_art(1); is($v2smsg->{ds}, $smsg->{ds}, 'v2 datestamp from git author time'); diff --git a/t/init.t b/t/init.t index 4d2c50496fc92b15bdf7b95521c77d0c58d10905..dad094358381a5bb4773aab3f0410962f45969f3 100644 --- a/t/init.t +++ b/t/init.t @@ -95,6 +95,19 @@ is(read_indexlevel("v2$lvl"), $lvl, "indexlevel set to '$lvl'"); my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir }); is(PublicInbox::Admin::detect_indexlevel($ibx), $lvl, 'detected expected level w/o config'); + ok(!$ibx->{-skip_docdata}, 'docdata written by default'); + } + for my $v (1, 2) { + my $name = "v$v-skip-docdata"; + my $dir = "$tmpdir/$name"; + $cmd = [ '-init', $name, "-V$v", '--skip-docdata', + $dir, "http://example.com/$name", + "$name\@example.com" ]; + ok(run_script($cmd), "-init -V$v --skip-docdata"); + my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir }); + is(PublicInbox::Admin::detect_indexlevel($ibx), 'full', + "detected default indexlevel -V$v"); + ok($ibx->{-skip_docdata}, "docdata skip set -V$v"); } # loop for idempotency