From: Eric Wong Date: Fri, 1 Jan 2021 04:51:46 +0000 (+0000) Subject: Merge tag 'v1.6.1' into eidx X-Git-Tag: v1.7.0~1472 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=8862c33ae93eea1af6246cd3c7a81e0a122186bf;hp=3bacac503f6ff0bcf19aa581151c9c89fa35fe55 Merge tag 'v1.6.1' into eidx public-inbox 1.6.1 - minor bugfix release * tag 'v1.6.1': (31 commits) public-inbox 1.6.1 - minor bugfix release import: drop X-Status in addition to Status eml: fix undefined vars on -To: meta@public-inbox.org -Subject: [ANNOUNCE] public-inbox 1.6.1 -Date: Thu, 31 Dec 2020 23:45:56 +0000 -Message-ID: <20201231234556.public-inbox-1.6.1-rele@sed> -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf-8 -Content-Disposition: inline - -A small, bugfix release on top of 1.6.0 from September 2020. - -Bug fixes: - -* MIME header decoding no longer warns on undefined variables, - with Perl <5.28. Thanks to a bug report by Ali Alnubani. - https://public-inbox.org/meta/DM6PR12MB49106F8E3BD697B63B943A22DADB0@DM6PR12MB4910.namprd12.prod.outlook.com/ - -* Fixed a message threading bug thanks to a report from Kyle Meyer. - "public-inbox-index --rethread --reindex" will be necessary - in case of certain messages arrive out-of-order. - Link: https://public-inbox.org/meta/87360nlc44.fsf@kyleam.com/ - -* WWW: per-inbox grokmirror manifests no longer return info - for all inboxes, only the root /manifest.js.gz includes all - inboxes. This regression appeared in 1.6. - -* public-inbox-mda matches List-Id headers insensitively, - matching public-inbox-watch behavior. Similarly, List-Id - is always indexed lower-cased for boolean matches to avoid - matching an incorrect term. - -* Newsgroup and Path NNTP headers are now emitted in conformance - with RFC 5536 3.1.[45]. 
Thanks to Andrey Melnikov for the report: - https://public-inbox.org/meta/CA+PODjpUN5Q4gBFQhAzUNuMasVEdmp9f=8Uo0Ej0mFumdSwi4w@mail.gmail.com/ - -* Inotify fixes for public-inbox-imapd users relying on SIGHUP - reloads and thousands of watches. - -* Read-only daemon fixes around TLS and Linux <4.5 systems - -Bugfixes with minor behavior changes: - -* The X-Status mbox header is now excluded from imports, - just like the Status: header has been for many years. - They have no place in public archives and can be privacy - concern for people sharing archives. - -* WWW prevents deep-linking to attachments to limit abuse - vectors. Noticed by Leah Neukirchen: - https://public-inbox.org/meta/87imagyap9.fsf@vuxu.org/ - -There are also several ocumentation fixes from Uwe Kleine-König -and Kyle Meyer. - -Please report bugs via plain-text mail to: meta@public-inbox.org - -See archives at https://public-inbox.org/meta/ for all history. diff --git a/Documentation/mknews.perl b/Documentation/mknews.perl index 510a4e18..a11dd5f0 100755 --- a/Documentation/mknews.perl +++ b/Documentation/mknews.perl @@ -43,7 +43,7 @@ if ($dst eq 'NEWS') { ); $ibx->{-primary_address} = $addr; my $ctx = { - -inbox => $ibx, + ibx => $ibx, -upfx => "$base_url/", -hr => 1, }; @@ -119,10 +119,10 @@ sub html_start { } sub html_end { - print $out < -EOF + for (@$PublicInbox::WwwStream::CODE_URL) { + print $out " git clone $_\n" or die; + } + print $out "\n" or die; } sub atom_start { @@ -131,7 +131,7 @@ sub atom_start { # WwwAtomStream stats this dir for mtime my $astream = PublicInbox::WwwAtomStream->new($ctx); delete $astream->{emit_header}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $title = PublicInbox::WwwAtomStream::title_tag($ibx->description); my $updated = PublicInbox::WwwAtomStream::feed_updated($mtime); print $out < +and L. It exists to facilitate +searches across multiple inboxes as well as to reduce index +space when messages are cross-posted to several existing +inboxes. 
+ +It transparently indexes messages across any combination of v1 and v2 +inboxes and data about inboxes themselves. + +=head1 DIRECTORY LAYOUT + +While inspired by v2, there is no git blob storage nor +C DB. + +Instead, there is an C (all caps) git repo which treats +every indexed v1 inbox or v2 epoch as a git alternate. + +As with v2 inboxes, it uses C and Xapian "shards" +for WWW and IMAP use. Several exclusive new tables are added +to deal with L and metadata. + +Unlike v1 and v2 inboxes, it is NOT designed to map to a NNTP +newsgroup. Thus it lacks C to enforce the +unique Message-ID requirement of NNTP. + +=head2 INDEX OVERVIEW AND DEFINITIONS + + $SCHEMA_VERSION - DB schema version (for Xapian) + $SHARD - Integer starting with 0 based on parallelism + + foo/ # "foo" is the name of the index + - ei.lock # lock file to protect global state + - ALL.git # empty, alternates for inboxes + - ei$SCHEMA_VERSION/$SHARD # per-shard Xapian DB + - ei$SCHEMA_VERSION/over.sqlite3 # overview DB for WWW, IMAP + - ei$SCHEMA_VERSION/misc # misc Xapian DB + +File and directory names are intentionally different from +analogous v2 names to ensure extindex and v2 inboxes can +easily be distinguished from each other. + +=head2 XREF3 DEDUPLICATION + +Due to cross-posted messages being the norm in the large Linux kernel +development community and Xapian indices being the primary consumer of +storage, it makes sense to deduplicate indexing as much as possible. + +The internal storage format is based on the NNTP "Xref" tuple, +but with the addition of a third element: the git blob OID. +Thus the triple is expressed in string form as: + + $NEWSGROUP_NAME:$ARTICLE_NUM:$OID + +If no C is configured for an inbox, the C +of the inbox is used. + +This data is stored in the C table of over.sqlite3. 
+ +=head2 misc XAPIAN DB + +In addition to the numeric Xapian shards for indexing messages, +there is a new, in-development Xapian index for storing data +about inboxes themselves and other non-message data. This +index allows us to speed up operations involving hundreds or +thousands of inboxes. + +=head1 BENEFITS + +In addition to providing cross-inbox search capabilities, it can +also replace per-inbox Xapian shards (but not per-inbox +over.sqlite3). This allows reduction in disk space, open file +handles, and associated memory use. + +=head1 CAVEATS + +Relocating v1 and v2 inboxes on the filesystem will require +extindex to be garbage-collected and/or reindexed. + +Configuring and maintaining stable C names before any +messages are indexed from every inbox can avoid expensive +reindexing and rely exclusively on GC. + +=head1 LOCKING + +L locking exclusively locks the empty ei.lock file +for all non-atomic operations. + +=head1 THANKS + +Thanks to the Linux Foundation for sponsoring the development +and testing. + +=head1 COPYRIGHT + +Copyright 2020 all contributors L + +License: AGPL-3.0+ L + +=head1 SEE ALSO + +L diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 0848e860..2d5df930 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -162,6 +162,23 @@ See L for description and caveats. Available in public-inbox 1.6.0+. +=item --update-extindex=EXTINDEX, -E + +Update the given external index (L. +Either the configured section name (e.g. C) or a directory name +may be specified. + +Defaults to C if C<[extindex "all"]> is configured, +otherwise no external indices are updated. + +May be specified multiple times in rare cases where multiple +external indices are configured. + +=item --no-update-extindex + +Do not update the C external index by default. This negates +all uses of C<-E> / C<--update-extindex=> on the command-line. 
+ =back =head1 FILES @@ -297,4 +314,4 @@ License: AGPL-3.0+ L =head1 SEE ALSO -L, L +L, L, L diff --git a/Documentation/standards.perl b/Documentation/standards.perl index 1c56830e..3ae64ddf 100755 --- a/Documentation/standards.perl +++ b/Documentation/standards.perl @@ -28,6 +28,9 @@ my $rfcs = [ 1036 => 'Standard for Interchange of USENET Messages', 5536 => 'Netnews Article Format', 5537 => 'Netnews Architecture and Protocols', + 1738 => 'Uniform resource locators', + 5092 => 'IMAP URL scheme', + 5538 => 'NNTP URI schemes', 6048 => 'NNTP additions to LIST command (TODO)', 8054 => 'NNTP compression', 4642 => 'NNTP TLS', diff --git a/MANIFEST b/MANIFEST index 8a47ccbf..a4cdedff 100644 --- a/MANIFEST +++ b/MANIFEST @@ -10,7 +10,6 @@ Documentation/RelNotes/v1.3.0.eml Documentation/RelNotes/v1.4.0.eml Documentation/RelNotes/v1.5.0.eml Documentation/RelNotes/v1.6.0.eml -Documentation/RelNotes/v1.6.1.eml Documentation/RelNotes/v1.7.0.wip Documentation/clients.txt Documentation/dc-dlvr-spam-flow.txt @@ -28,6 +27,7 @@ Documentation/public-inbox-config.pod Documentation/public-inbox-convert.pod Documentation/public-inbox-daemon.pod Documentation/public-inbox-edit.pod +Documentation/public-inbox-extindex-format.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-imapd.pod Documentation/public-inbox-index.pod @@ -122,6 +122,8 @@ lib/PublicInbox/Emergency.pm lib/PublicInbox/Eml.pm lib/PublicInbox/EmlContentFoo.pm lib/PublicInbox/ExtMsg.pm +lib/PublicInbox/ExtSearch.pm +lib/PublicInbox/ExtSearchIdx.pm lib/PublicInbox/FakeInotify.pm lib/PublicInbox/Feed.pm lib/PublicInbox/Filter/Base.pm @@ -130,6 +132,8 @@ lib/PublicInbox/Filter/Mirror.pm lib/PublicInbox/Filter/RubyLang.pm lib/PublicInbox/Filter/SubjectTag.pm lib/PublicInbox/Filter/Vger.pm +lib/PublicInbox/Gcf2.pm +lib/PublicInbox/Gcf2Client.pm lib/PublicInbox/GetlineBody.pm lib/PublicInbox/Git.pm lib/PublicInbox/GitAsyncCat.pm @@ -153,6 +157,7 @@ lib/PublicInbox/In2Tie.pm lib/PublicInbox/Inbox.pm 
lib/PublicInbox/InboxIdle.pm lib/PublicInbox/InboxWritable.pm +lib/PublicInbox/Isearch.pm lib/PublicInbox/KQNotify.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm @@ -163,6 +168,8 @@ lib/PublicInbox/MIME.pm lib/PublicInbox/ManifestJsGz.pm lib/PublicInbox/Mbox.pm lib/PublicInbox/MboxGz.pm +lib/PublicInbox/MiscIdx.pm +lib/PublicInbox/MiscSearch.pm lib/PublicInbox/MsgIter.pm lib/PublicInbox/MsgTime.pm lib/PublicInbox/Msgmap.pm @@ -214,6 +221,7 @@ lib/PublicInbox/WwwStatic.pm lib/PublicInbox/WwwStream.pm lib/PublicInbox/WwwText.pm lib/PublicInbox/Xapcmd.pm +lib/PublicInbox/gcf2_libgit2.h sa_config/Makefile sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre @@ -221,6 +229,7 @@ sa_config/user/.spamassassin/user_prefs script/public-inbox-compact script/public-inbox-convert script/public-inbox-edit +script/public-inbox-extindex script/public-inbox-httpd script/public-inbox-imapd script/public-inbox-index @@ -267,6 +276,7 @@ t/eml.t t/eml_content_disposition.t t/eml_content_type.t t/epoll.t +t/extsearch.t t/fail-bin/spamc t/fake_inotify.t t/feed.t @@ -277,6 +287,8 @@ t/filter_mirror.t t/filter_rubylang.t t/filter_subjecttag.t t/filter_vger.t +t/gcf2.t +t/gcf2_client.t t/git-http-backend.psgi t/git.fast-import-data t/git.t @@ -311,6 +323,7 @@ t/mda.t t/mda_filter_rubylang.t t/mid.t t/mime.t +t/miscsearch.t t/msg_iter-nested.eml t/msg_iter-order.eml t/msg_iter.t @@ -381,6 +394,7 @@ t/x-unknown-alpine.eml t/xcpdb-reshard.t xt/cmp-msgstr.t xt/cmp-msgview.t +xt/create-many-inboxes.t xt/eml_check_limits.t xt/git-http-backend.t xt/git_async_cmp.t diff --git a/Makefile.PL b/Makefile.PL index be3471a6..57592378 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -10,7 +10,7 @@ my $v = {}; my $t = {}; # do not sort -my @RELEASES = qw(v1.6.1 v1.6.0 v1.5.0 v1.4.0 v1.3.0 v1.2.0 v1.1.0-pre1 v1.0.0); +my @RELEASES = qw(v1.6.0 v1.5.0 v1.4.0 v1.3.0 v1.2.0 v1.1.0-pre1 v1.0.0); $v->{news_deps} = [ map { "Documentation/RelNotes/$_.eml" } @RELEASES ]; $v->{txt} = [ qw(INSTALL 
README COPYING TODO HACKING) ]; @@ -31,9 +31,20 @@ my @syn = (@EXE_FILES, grep(m!^lib/.*\.pm$!, @manifest), @scripts); @syn = grep(!/SaPlugin/, @syn) if !eval { require Mail::SpamAssasin }; $v->{syn_files} = \@syn; $v->{my_syntax} = [map { "$_.syntax" } @syn]; -$v->{-m1} = [ map { (split('/'))[-1] } @EXE_FILES ]; +my @no_pod; +$v->{-m1} = [ map { + my $x = (split('/'))[-1]; + my $pod = "Documentation/$x.pod"; + if (-f $pod) { + $x; + } else { + warn "W: $pod missing\n"; + push @no_pod, $x; + (); + } + } @EXE_FILES ]; $v->{-m5} = [ qw(public-inbox-config public-inbox-v1-format - public-inbox-v2-format) ]; + public-inbox-v2-format public-inbox-extindex-format) ]; $v->{-m7} = [ qw(public-inbox-overview public-inbox-tuning) ]; $v->{-m8} = [ qw(public-inbox-daemon) ]; my @sections = (1, 5, 7, 8); @@ -109,12 +120,13 @@ my %man3 = map {; # semi-colon tells Perl this is a BLOCK (and not EXPR) $mod =~ s/\.\w+\z//; "lib/PublicInbox/$_" => "blib/man3/PublicInbox::$mod.\$(MAN3EXT)" } qw(Git.pm Import.pm WWW.pod SaPlugin/ListMirror.pod); +my $warn_no_pod = @no_pod ? "\n\t\@echo W: missing .pod: @no_pod\n" : ''; WriteMakefile( NAME => 'PublicInbox', # n.b. camel-case is not our choice # XXX drop "PENDING" in .pod before updating this! - VERSION => '1.6.1', + VERSION => '1.6.0', AUTHOR => 'Eric Wong ', ABSTRACT => 'public-inbox server infrastructure', @@ -172,6 +184,8 @@ $VARS -include Documentation/include.mk $TGTS +check-man ::$warn_no_pod + # syntax checks are currently GNU make only: %.syntax :: % @\$(PERL) -w -I lib -c \$< diff --git a/README b/README index ae428bcf..6396373f 100644 --- a/README +++ b/README @@ -94,6 +94,7 @@ AGPL source code is available via git: git clone https://public-inbox.org/public-inbox.git git clone https://repo.or.cz/public-inbox.git + torsocks git clone http://ou63pmih66umazou.onion/public-inbox.git torsocks git clone http://hjrcffqmbrq6wope.onion/public-inbox See below for contact info. 
diff --git a/examples/cgit.psgi b/examples/cgit.psgi index 7ad38e28..48e3798b 100644 --- a/examples/cgit.psgi +++ b/examples/cgit.psgi @@ -14,8 +14,8 @@ use warnings; use Plack::Builder; use PublicInbox::Cgit; use PublicInbox::Config; -my $pi_config = PublicInbox::Config->new; -my $cgit = PublicInbox::Cgit->new($pi_config); +my $pi_cfg = PublicInbox::Config->new; +my $cgit = PublicInbox::Cgit->new($pi_cfg); builder { eval { enable 'ReverseProxy' }; diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index fb88e621..d414e4e2 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -6,15 +6,15 @@ package PublicInbox::Admin; use strict; use parent qw(Exporter); -use Cwd qw(abs_path); -use POSIX (); -our @EXPORT_OK = qw(resolve_repo_dir setup_signals); +our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); +*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { my ($cb, $arg) = @_; # optional + require POSIX; # we call exit() here instead of _exit() so DESTROY methods # get called (e.g. File::Temp::Dir and PublicInbox::Msgmap) @@ -27,21 +27,34 @@ sub setup_signals { }; } -sub resolve_repo_dir { +sub resolve_inboxdir { my ($cd, $ver) = @_; - my $prefix = defined $cd ? 
$cd : './'; - if (-d $prefix && -f "$prefix/inbox.lock") { # v2 - $$ver = 2 if $ver; - return abs_path($prefix); + my $try = $cd // '.'; + my $root_dev_ino; + while (1) { # favor v2, first + if (-f "$try/inbox.lock") { + $$ver = 2 if $ver; + return rel2abs_collapsed($try); + } elsif (-d $try) { + my @try = stat _; + $root_dev_ino //= do { + my @root = stat('/') or die "stat /: $!\n"; + "$root[0]\0$root[1]"; + }; + last if "$try[0]\0$try[1]" eq $root_dev_ino; + $try .= '/..'; # continue, cd up + } else { + die "`$try' is not a directory\n"; + } } + # try v1 bare git dirs my $cmd = [ qw(git rev-parse --git-dir) ]; my $fh = popen_rd($cmd, undef, {-C => $cd}); my $dir = do { local $/; <$fh> }; - close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n"; + close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n"; chomp $dir; $$ver = 1 if $ver; - return abs_path($cd) if ($dir eq '.' && defined $cd); - abs_path($dir); + rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir); } # for unconfigured inboxes @@ -78,8 +91,8 @@ sub unconfigured_ibx ($$) { name => $name, address => [ "$name\@example.com" ], inboxdir => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, + # consumers (-convert) warn on this: + -unconfigured => 1, }); } @@ -95,40 +108,53 @@ sub resolve_inboxes ($;$$) { } my $min_ver = $opt->{-min_inbox_version} || 0; + # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames, + # pathnames are not unique due to symlinks and bind mounts my (@old, @ibxs); - my %dir2ibx; - if ($cfg) { + if ($opt->{all}) { $cfg->each_inbox(sub { my ($ibx) = @_; - my $path = abs_path($ibx->{inboxdir}); - if (defined($path)) { - $dir2ibx{$path} = $ibx; + if (-e $ibx->{inboxdir}) { + push(@ibxs, $ibx) if $ibx->version >= $min_ver; } else { - warn <{name} $ibx->{inboxdir}: $! 
-EOF + warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); - } - if ($opt->{all}) { - my @all = values %dir2ibx; - @all = grep { $_->version >= $min_ver } @all; - push @ibxs, @all; } else { # directories specified on the command-line - my $i = 0; my @dirs = @$argv; - push @dirs, '.' unless @dirs; - foreach (@dirs) { - my $v; - my $dir = resolve_repo_dir($_, \$v); - if ($v < $min_ver) { + push @dirs, '.' if !@dirs && $opt->{-use_cwd}; + my %s2i; # "st_dev\0st_ino" => array index + for (my $i = 0; $i <= $#dirs; $i++) { + my $dir = $dirs[$i]; + my @st = stat($dir) or die "stat($dir): $!\n"; + $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver)); + if ($ver >= $min_ver) { + $s2i{"$st[0]\0$st[1]"} //= $i; + } else { push @old, $dir; - next; } - my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); - $i++; - push @ibxs, $ibx; } + my $done = \'done'; + eval { + $cfg->each_inbox(sub { + my ($ibx) = @_; + return if $ibx->version < $min_ver; + my $dir = $ibx->{inboxdir}; + if (my @s = stat $dir) { + my $i = delete($s2i{"$s[0]\0$s[1]"}) + // return; + $ibxs[$i] = $ibx; + die $done if !keys(%s2i); + } else { + warn "W: $ibx->{name} $dir: $!\n"; + } + }); + }; + die $@ if $@ && $@ ne $done; + for my $i (sort { $a <=> $b } values %s2i) { + $ibxs[$i] = unconfigured_ibx($dirs[$i], $i); + } + @ibxs = grep { defined } @ibxs; # duplicates are undef } if (@old) { die "-V$min_ver inboxes not supported by $0\n\t", @@ -208,12 +234,20 @@ sub index_terminate { sub index_inbox { my ($ibx, $im, $opt) = @_; + require PublicInbox::InboxWritable; my $jobs = delete $opt->{jobs} if $opt; if (my $pr = $opt->{-progress}) { $pr->("indexing $ibx->{inboxdir} ...\n"); } local %SIG = %SIG; setup_signals(\&index_terminate, $ibx); + my $warn_cb = $SIG{__WARN__} // \&CORE::warn; + my $idx = { current_info => $ibx->{inboxdir} }; + my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore'); + local $SIG{__WARN__} = sub { + return if $warn_ignore->(@_); + $warn_cb->($idx->{current_info}, ': ', @_); 
+ }; if (ref($ibx) && $ibx->version == 2) { eval { require PublicInbox::V2Writable }; die "v2 requirements not met: $@\n" if $@; @@ -225,21 +259,19 @@ sub index_inbox { } else { my $n = $v2w->{shards}; if ($jobs < ($n + 1) && !$opt->{reshard}) { - warn -"Unable to respect --jobs=$jobs on index, inbox was created with $n shards\n"; + warn <($v2w->{current_info}, ': ', @_); - }; - $v2w->index_sync($opt); + $idx = $v2w; } else { require PublicInbox::SearchIdx; - my $s = PublicInbox::SearchIdx->new($ibx, 1); - $s->index_sync($opt); + $idx = PublicInbox::SearchIdx->new($ibx, 1); } + $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } sub progress_prepare ($) { diff --git a/lib/PublicInbox/Cgit.pm b/lib/PublicInbox/Cgit.pm index fb0d0e60..472509a8 100644 --- a/lib/PublicInbox/Cgit.pm +++ b/lib/PublicInbox/Cgit.pm @@ -16,9 +16,9 @@ use PublicInbox::Qspawn; use PublicInbox::WwwStatic qw(r); sub locate_cgit ($) { - my ($pi_config) = @_; - my $cgit_bin = $pi_config->{'publicinbox.cgitbin'}; - my $cgit_data = $pi_config->{'publicinbox.cgitdata'}; + my ($pi_cfg) = @_; + my $cgit_bin = $pi_cfg->{'publicinbox.cgitbin'}; + my $cgit_data = $pi_cfg->{'publicinbox.cgitdata'}; # /var/www/htdocs/cgit is the default install path from cgit.git # /usr/{lib,share}/cgit is where Debian puts cgit @@ -51,28 +51,28 @@ sub locate_cgit ($) { } sub new { - my ($class, $pi_config) = @_; - my ($cgit_bin, $cgit_data) = locate_cgit($pi_config); + my ($class, $pi_cfg) = @_; + my ($cgit_bin, $cgit_data) = locate_cgit($pi_cfg); my $self = bless { cmd => [ $cgit_bin ], cgit_data => $cgit_data, - pi_config => $pi_config, + pi_cfg => $pi_cfg, }, $class; - $pi_config->fill_all; # fill in -code_repos mapped to inboxes + $pi_cfg->fill_all; # fill in -code_repos mapped to inboxes # some cgit repos may not be mapped to inboxes, so ensure those exist: - my $code_repos = $pi_config->{-code_repos}; - foreach my $k (keys %$pi_config) { + my $code_repos = $pi_cfg->{-code_repos}; + foreach my 
$k (keys %$pi_cfg) { $k =~ /\Acoderepo\.(.+)\.dir\z/ or next; - my $dir = $pi_config->{$k}; + my $dir = $pi_cfg->{$k}; $code_repos->{$1} ||= PublicInbox::Git->new($dir); } while (my ($nick, $repo) = each %$code_repos) { $self->{"\0$nick"} = $repo; } - my $cgit_static = $pi_config->{-cgit_static}; + my $cgit_static = $pi_cfg->{-cgit_static}; my $static = join('|', map { quotemeta $_ } keys %$cgit_static); $self->{static} = qr/\A($static)\z/; $self; @@ -120,7 +120,7 @@ sub call { my $rdr = input_prepare($env) or return r(500); my $qsp = PublicInbox::Qspawn->new($self->{cmd}, $cgi_env, $rdr); - my $limiter = $self->{pi_config}->limiter('-cgit'); + my $limiter = $self->{pi_cfg}->limiter('-cgit'); $qsp->psgi_return($env, $limiter, $parse_cgi_headers); } diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index d57c361a..21f2161a 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -33,6 +33,7 @@ sub new { $self->{-by_list_id} = {}; $self->{-by_name} = {}; $self->{-by_newsgroup} = {}; + $self->{-by_eidx_key} = {}; $self->{-no_obfuscate} = {}; $self->{-limiters} = {}; $self->{-code_repos} = {}; # nick => PublicInbox::Git object @@ -89,6 +90,14 @@ sub lookup_name ($$) { $self->{-by_name}->{$name} // _fill($self, "publicinbox.$name"); } +sub lookup_ei { + my ($self, $name) = @_; + $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name"); +} + +# special case for [extindex "all"] +sub ALL { lookup_ei($_[0], 'all') } + sub each_inbox { my ($self, $cb, @arg) = @_; # may auto-vivify if config file is non-existent: @@ -123,20 +132,16 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my %rv; - my (%section_seen, @section_order); + my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i); local $/ = $rs; - while (defined(my $line = <$fh>)) { - chomp $line; - my ($k, $v) = split($fs, $line, 2); - my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); - unless (defined $section_seen{$section}) { - 
$section_seen{$section} = 1; - push @section_order, $section; - } - - my $cur = $rv{$k}; - if (defined $cur) { + while (defined($line = <$fh>)) { # perf critical with giant configs + $i = index($line, $fs); + $k = substr($line, 0, $i); + $v = substr($line, $i + 1, -1); # chop off $fs + $section = substr($k, 0, rindex($k, '.')); + $seen{$section} //= push(@section_order, $section); + + if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { push @$cur, $v; } else { @@ -154,11 +159,10 @@ sub config_fh_parse ($$$) { sub git_config_dump { my ($file) = @_; return {} unless -e $file; - my @cmd = (qw/git config -z -l --includes/, "--file=$file"); - my $cmd = join(' ', @cmd); - my $fh = popen_rd(\@cmd); + my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; + my $fh = popen_rd($cmd); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "failed to close ($cmd) pipe: $?"; + close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } @@ -360,6 +364,16 @@ sub git_bool { } } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed { + require File::Spec; + my $p = File::Spec->rel2abs($_[-1]); + return $p if substr($p, -3, 3) ne '/..' 
&& index($p, '/../') < 0; + require Cwd; + Cwd::abs_path($p); +} + sub _fill { my ($self, $pfx) = @_; my $ibx = {}; @@ -382,10 +396,10 @@ EOF } } - # backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"}; - if (($ibx->{inboxdir} // '') =~ /\n/s) { - warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; + # "mainrepo" is backwards compatibility: + my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if (index($dir, "\n") >= 0) { + warn "E: `$dir' must not contain `\\n'\n"; return; } foreach my $k (qw(obfuscate)) { @@ -406,17 +420,14 @@ EOF } } - return unless defined($ibx->{inboxdir}); - my $name = $pfx; - $name =~ s/\Apublicinbox\.//; - + my $name = substr($pfx, length('publicinbox.')); if (!valid_inbox_name($name)) { warn "invalid inbox name: '$name'\n"; return; } $ibx->{name} = $name; - $ibx->{-pi_config} = $self; + $ibx->{-pi_cfg} = $self; $ibx = PublicInbox::Inbox->new($ibx); foreach (@{$ibx->{address}}) { my $lc_addr = lc($_); @@ -429,8 +440,31 @@ EOF $self->{-by_list_id}->{lc($list_id)} = $ibx; } } - if (my $ng = $ibx->{newsgroup}) { - $self->{-by_newsgroup}->{$ng} = $ibx; + if (defined(my $ngname = $ibx->{newsgroup})) { + if (ref($ngname)) { + delete $ibx->{newsgroup}; + warn 'multiple newsgroups not supported: '. + join(', ', @$ngname). "\n"; + # Newsgroup name needs to be compatible with RFC 3977 + # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. + # Leave out a few chars likely to cause problems or conflicts: + # '|', '<', '>', ';', '#', '$', '&', + } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! 
|| + $ngname eq '') { + delete $ibx->{newsgroup}; + warn "newsgroup name invalid: `$ngname'\n"; + } else { + # PublicInbox::NNTPD does stricter ->nntp_usable + # checks, keep this lean for startup speed + $self->{-by_newsgroup}->{$ngname} = $ibx; + } + } + unless (defined $ibx->{newsgroup}) { # for ->eidx_key + my $abs = rel2abs_collapsed($dir); + if ($abs ne $dir) { + warn "W: `$dir' canonicalized to `$abs'\n"; + $ibx->{inboxdir} = $abs; + } } $self->{-by_name}->{$name} = $ibx; if ($ibx->{obfuscate}) { @@ -453,8 +487,18 @@ EOF push @$repo_objs, $repo if $repo; } } + if (my $es = ALL($self)) { + require PublicInbox::Isearch; + $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es); + } + $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx; +} - $ibx +sub _fill_ei ($$) { + my ($self, $pfx) = @_; + require PublicInbox::ExtSearch; + my $d = $self->{"$pfx.topdir"}; + defined($d) && -d $d ? PublicInbox::ExtSearch->new($d) : undef; } sub urlmatch { @@ -476,4 +520,16 @@ sub urlmatch { } } +sub json { + state $json; + $json //= do { + for my $mod (qw(Cpanel::JSON::XS JSON::MaybeXS JSON JSON::PP)) { + eval "require $mod" or next; + # ->ascii encodes non-ASCII to "\uXXXX" + $json = $mod->new->ascii(1) and last; + } + $json; + }; +} + 1; diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index a02b3bb7..97a6f6ef 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -50,7 +50,6 @@ our ( $PostLoopCallback, # subref to call at the end of each loop, if defined (global) $LoopTimeout, # timeout of event loop in milliseconds - $DoneInit, # if we've done the one-time module init yet @Timers, # timers $in_loop, ); @@ -75,12 +74,9 @@ sub Reset { @Timers = (); $PostLoopCallback = undef; - $DoneInit = 0; $_io = undef; # closes real $Epoll FD $Epoll = undef; # may call DSKQXS::DESTROY - - *EventLoop = *FirstTimeEventLoop; } =head2 C<< CLASS->SetLoopTimeout( $timeout ) >> @@ -91,9 +87,7 @@ A timeout of 0 (zero) means poll forever. 
A timeout of -1 means poll and return immediately. =cut -sub SetLoopTimeout { - return $LoopTimeout = $_[1] + 0; -} +sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 } =head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >> @@ -137,14 +131,13 @@ sub set_cloexec ($) { fcntl($_io, F_SETFD, $fl | FD_CLOEXEC); } +# caller sets return value to $Epoll sub _InitPoller { - return if $DoneInit; - $DoneInit = 1; - if (PublicInbox::Syscall::epoll_defined()) { - $Epoll = epoll_create(); - set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0); + my $fd = epoll_create(); + set_cloexec($fd) if (defined($fd) && $fd >= 0); + $fd; } else { my $cls; for (qw(DSKQXS DSPoll)) { @@ -152,9 +145,8 @@ sub _InitPoller last if eval "require $cls"; } $cls->import(qw(epoll_ctl epoll_wait)); - $Epoll = $cls->new; + $cls->new; } - *EventLoop = *EpollEventLoop; } =head2 C<< CLASS->EventLoop() >> @@ -163,13 +155,6 @@ Start processing IO events. In most daemon programs this never exits. See C below for how to exit the loop. =cut -sub FirstTimeEventLoop { - my $class = shift; - - _InitPoller(); - - EventLoop($class); -} sub now () { clock_gettime(CLOCK_MONOTONIC) } @@ -213,12 +198,7 @@ sub RunTimers { my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; # -1 is an infinite timeout, so prefer a real timeout - return $timeout if $LoopTimeout == -1; - - # otherwise pick the lower of our regular timeout and time until - # the next timer - return $LoopTimeout if $LoopTimeout < $timeout; - return $timeout; + ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout; } # We can't use waitpid(-1) safely here since it can hit ``, system(), @@ -271,21 +251,21 @@ sub PostEventLoop () { $PostLoopCallback ? 
$PostLoopCallback->(\%DescriptorMap) : 1; } -sub EpollEventLoop { +sub EventLoop { + $Epoll //= _InitPoller(); local $in_loop = 1; + my @events; do { - my @events; - my $i; my $timeout = RunTimers(); # get up to 1000 events - my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events); - for ($i=0; $i<$evcount; $i++) { + epoll_wait($Epoll, 1000, $timeout, \@events); + for my $fd (@events) { # it's possible epoll_wait returned many events, including some at the end # that ones in the front triggered unregister-interest actions. if we # can't find the %sock entry, it's because we're no longer interested # in that event. - $DescriptorMap{$events[$i]->[0]}->event_step; + $DescriptorMap{$fd}->event_step; } } while (PostEventLoop()); _run_later(); @@ -330,8 +310,7 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - _InitPoller(); - + $Epoll //= _InitPoller(); retry: if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index d1d3fe60..aa2c9168 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -134,7 +134,7 @@ sub epoll_wait { } } # caller only cares for $events[$i]->[0] - scalar(@$events); + $_ = $_->[0] for @$events; } # kqueue is close-on-fork (not exec), so we must not close it diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 1d9b51d9..a218f695 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -45,14 +45,13 @@ sub epoll_wait { my $fd = $pset[$i++]; my $revents = $pset[$i++] or next; delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; - push @$events, [ $fd ]; + push @$events, $fd; } my $nevents = scalar @$events; if ($n != $nevents) { warn "BUG? 
poll() returned $n, but got $nevents"; } } - $n; } 1; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index 5fdcba14..1762be0b 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -11,7 +11,6 @@ use IO::Socket; use POSIX qw(WNOHANG :signal_h); use Socket qw(IPPROTO_TCP SOL_SOCKET); sub SO_ACCEPTFILTER () { 0x1000 } -use Cwd qw/abs_path/; STDOUT->autoflush(1); STDERR->autoflush(1); use PublicInbox::DS qw(now); @@ -19,6 +18,7 @@ use PublicInbox::Syscall qw($SFD_NONBLOCK); require PublicInbox::Listener; use PublicInbox::EOFpipe; use PublicInbox::Sigfd; +use PublicInbox::GitAsyncCat; my @CMD; my ($set_user, $oldset); my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); @@ -201,10 +201,11 @@ sub check_absolute ($$) { sub daemonize () { if ($daemonize) { + require Cwd; foreach my $i (0..$#ARGV) { my $arg = $ARGV[$i]; next unless -e $arg; - $ARGV[$i] = abs_path($arg); + $ARGV[$i] = Cwd::abs_path($arg); } check_absolute('stdout', $stdout); check_absolute('stderr', $stderr); @@ -236,8 +237,7 @@ EOF }; if ($daemonize) { - my $pid = fork; - die "could not fork: $!\n" unless defined $pid; + my $pid = fork // die "fork: $!"; exit if $pid; open(STDIN, '+<', '/dev/null') or @@ -245,8 +245,7 @@ EOF open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - $pid = fork; - die "could not fork: $!\n" unless defined $pid; + $pid = fork // die "fork: $!"; exit if $pid; } return unless defined $pid_file; @@ -368,14 +367,12 @@ sub inherit ($) { foreach my $fd (3..$end) { my $s = IO::Handle->new_from_fd($fd, 'r'); if (my $k = sockname($s)) { - if ($s->blocking) { - $s->blocking(0); - warn <<""; + my $prev_was_blocking = $s->blocking(0); + warn <<"" if $prev_was_blocking; Inherited socket (fd=$fd) is blocking, making it non-blocking. 
Set 'NonBlocking = true' in the systemd.service unit to avoid stalled processes when multiple service instances start. - } $listener_names->{$k} = $s; push @rv, $s; } else { @@ -422,11 +419,8 @@ sub upgrade { # $_[0] = signal name or number (unused) } sub kill_workers ($) { - my ($s) = @_; - - while (my ($pid, $id) = each %pids) { - kill $s, $pid; - } + my ($sig) = @_; + kill $sig, keys(%pids); } sub upgrade_aborted ($) { @@ -648,6 +642,10 @@ sub run ($$$;$) { daemon_prepare($default); my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef; my $for_destroy = daemonize(); + + # localize GCF2C for tests: + local $PublicInbox::GitAsyncCat::GCF2C; + daemon_loop($refresh, $post_accept, $tlsd, $af_default); PublicInbox::DS->Reset; # ->DESTROY runs when $for_destroy goes out-of-scope diff --git a/lib/PublicInbox/DummyInbox.pm b/lib/PublicInbox/DummyInbox.pm index 69b0b683..981043ce 100644 --- a/lib/PublicInbox/DummyInbox.pm +++ b/lib/PublicInbox/DummyInbox.pm @@ -7,16 +7,16 @@ package PublicInbox::DummyInbox; use strict; -sub created_at { 0 } # Msgmap::created_at +sub uidvalidity { 0 } # Msgmap::created_at sub mm { shift } sub uid_range { [] } # Over::uid_range sub subscribe_unlock { undef }; no warnings 'once'; -*max = \&created_at; +*max = \&uidvalidity; *query_xover = \&uid_range; *over = \&mm; -*search = *unsubscribe_unlock = +*isrch = *search = *unsubscribe_unlock = *get_art = *description = *base_url = \&subscribe_unlock; 1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 03faf3a1..4df885ab 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -32,8 +32,8 @@ sub PARTIAL_MAX () { 100 } sub search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; - my $srch = $ibx->search or return; - my $opt = { limit => PARTIAL_MAX, mset => 2 }; + my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL + my $opt = { limit => PARTIAL_MAX, relevance => -1 }; my @try = ("m:$mid*"); my $chop 
= $mid; if ($chop =~ s/(\W+)(\w*)\z//) { @@ -76,7 +76,7 @@ sub search_partial ($$) { sub ext_msg_i { my ($other, $ctx) = @_; - return if $other->{name} eq $ctx->{-inbox}->{name} || !$other->base_url; + return if $other->{name} eq $ctx->{ibx}->{name} || !$other->base_url; my $mm = $other->mm or return; @@ -103,19 +103,48 @@ sub ext_msg_step { } } +sub ext_msg_ALL ($) { + my ($ctx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL or return; + my $by_eidx_key = $ctx->{www}->{pi_cfg}->{-by_eidx_key}; + my $cur_key = eval { $ctx->{ibx}->eidx_key } // + return partial_response($ctx); # $cur->{ibx} == $ALL + my %seen = ($cur_key => 1); + my ($id, $prev); + while (my $x = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($x->{num}); + for my $k (@$xr3) { + $k =~ s/:[0-9]+:$x->{blob}\z// or next; + next if $k eq $cur_key; + my $ibx = $by_eidx_key->{$k} // next; + my $url = $ibx->base_url or next; + push(@{$ctx->{found}}, $ibx) unless $seen{$k}++; + } + } + return exact($ctx) if $ctx->{found}; + + # fall back to partial MID matching + for my $ibxish ($ctx->{ibx}, $ALL) { + my $mids = search_partial($ibxish, $ctx->{mid}) or next; + push @{$ctx->{partial}}, [ $ibxish, $mids ]; + last if ($ctx->{n_partial} += scalar(@$mids)) >= PARTIAL_MAX; + } + partial_response($ctx); +} + sub ext_msg { my ($ctx) = @_; - sub { + ext_msg_ALL($ctx) // sub { $ctx->{-wcb} = $_[0]; # HTTP server write callback if ($ctx->{env}->{'pi-httpd.async'}) { require PublicInbox::ConfigIter; my $iter = PublicInbox::ConfigIter->new( - $ctx->{www}->{pi_config}, + $ctx->{www}->{pi_cfg}, \&ext_msg_step, $ctx); $iter->event_step; } else { - $ctx->{www}->{pi_config}->each_inbox(\&ext_msg_i, $ctx); + $ctx->{www}->{pi_cfg}->each_inbox(\&ext_msg_i, $ctx); finalize_exact($ctx); } }; @@ -141,7 +170,7 @@ sub finalize_exact { # fall back to partial MID matching my $mid = $ctx->{mid}; - my $cur = $ctx->{-inbox}; + my $cur = $ctx->{ibx}; my $mids = search_partial($cur, $mid); if ($mids) { 
$ctx->{n_partial} = scalar(@$mids); @@ -159,7 +188,7 @@ sub finalize_exact { finalize_partial($ctx); } -sub finalize_partial { +sub partial_response ($) { my ($ctx) = @_; my $mid = $ctx->{mid}; my $code = 404; @@ -172,7 +201,7 @@ sub finalize_partial { my $es = $n_partial == 1 ? '' : 'es'; $n_partial .= '+' if ($n_partial == PARTIAL_MAX); $s .= "\n$n_partial partial match$es found:\n\n"; - my $cur_name = $ctx->{-inbox}->{name}; + my $cur_name = $ctx->{ibx}->{name}; foreach my $pair (@{$ctx->{partial}}) { my ($ibx, $res) = @$pair; my $env = $ctx->{env} if $ibx->{name} eq $cur_name; @@ -192,9 +221,11 @@ sub finalize_partial { $ctx->{-html_tip} = $s .= ''; $ctx->{-title_html} = $title; $ctx->{-upfx} = '../'; - $ctx->{-wcb}->(html_oneshot($ctx, $code)); + html_oneshot($ctx, $code); } +sub finalize_partial ($) { $_[0]->{-wcb}->(partial_response($_[0])) } + sub ext_urls { my ($ctx, $mid, $href, $html) = @_; diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm new file mode 100644 index 00000000..7c9586a6 --- /dev/null +++ b/lib/PublicInbox/ExtSearch.pm @@ -0,0 +1,129 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# Read-only external (detached) index for cross inbox search. 
+# This is a read-only counterpart to PublicInbox::ExtSearchIdx +# and behaves like PublicInbox::Inbox AND PublicInbox::Search +package PublicInbox::ExtSearch; +use strict; +use v5.10.1; +use PublicInbox::Over; +use PublicInbox::Inbox; +use PublicInbox::MiscSearch; +use DBI qw(:sql_types); # SQL_BLOB + +# for ->reopen, ->mset, ->mset_to_artnums +use parent qw(PublicInbox::Search); + +sub new { + my (undef, $topdir) = @_; + bless { + topdir => $topdir, + # xpfx => 'ei15' + xpfx => "$topdir/ei".PublicInbox::Search::SCHEMA_VERSION + }, __PACKAGE__; +} + +sub misc { + my ($self) = @_; + $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); +} + +# overrides PublicInbox::Search::_xdb +sub _xdb { + my ($self) = @_; + $self->xdb_sharded; +} + +# same as per-inbox ->over, for now... +sub over { + my ($self) = @_; + $self->{over} //= PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); +} + +sub git { + my ($self) = @_; + $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); +} + +# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table +sub nntp_xref_for { # NNTP only + my ($self, $xibx, $xsmsg) = @_; + my $dbh = over($self)->dbh; + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($xibx->{newsgroup}); + my $xibx_id = $sth->fetchrow_array // do { + warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n"; + return; + }; + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? 
LIMIT 1 + + $sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB); + + # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2 + $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num}); + $sth->bind_param(3, $xibx_id); + $sth->execute; + my $docid = $sth->fetchrow_array // do { + warn <{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}" +EOF + return; + }; + + # LIMIT is number of newsgroups on server: + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ? + + $sth->execute($docid, $xibx_id); + my $rows = $sth->fetchall_arrayref; + + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 + + my %xref = map { + my ($ibx_id, $xnum) = @$_; + + $eidx_key_sth->execute($ibx_id); + my $eidx_key = $eidx_key_sth->fetchrow_array; + + # only include if there's a newsgroup name + $eidx_key && index($eidx_key, '/') >= 0 ? + () : ($eidx_key => $xnum) + } @$rows; + $xref{$xibx->{newsgroup}} = $xsmsg->{num}; + \%xref; +} + +sub mm { undef } + +sub altid_map { {} } + +sub description { + my ($self) = @_; + ($self->{description} //= + PublicInbox::Inbox::cat_desc("$self->{topdir}/description")) // + '$EXTINDEX_DIR/description missing'; +} + +sub cloneurl { [] } # TODO + +sub base_url { 'https://example.com/TODO/' } +sub nntp_url { [] } + +no warnings 'once'; +*smsg_eml = \&PublicInbox::Inbox::smsg_eml; +*smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; +*msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; +*modified = \&PublicInbox::Inbox::modified; +*recent = \&PublicInbox::Inbox::recent; + +*max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef +*isrch = *search = \&PublicInbox::Search::reopen; + +1; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm new file mode 100644 index 00000000..07e64698 --- /dev/null +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -0,0 +1,1105 @@ +# Copyright (C) 2020 all contributors +# License: 
AGPL-3.0+ + +# Detached/external index cross inbox search indexing support +# read-write counterpart to PublicInbox::ExtSearch +# +# It's based on the same ideas as public-inbox-v2-format(5) using +# over.sqlite3 for dedupe and sharded Xapian. msgmap.sqlite3 is +# missing, so there is no Message-ID conflict resolution, meaning +# no NNTP support for now. +# +# v2 has a 1:1 mapping of index:inbox or msgmap for NNTP support. +# This is intended to be an M:N index:inbox mapping, but it'll likely +# be 1:N in common practice (M==1) + +package PublicInbox::ExtSearchIdx; +use strict; +use v5.10.1; +use parent qw(PublicInbox::ExtSearch PublicInbox::Lock); +use Carp qw(croak carp); +use Sys::Hostname qw(hostname); +use POSIX qw(strftime); +use PublicInbox::Search; +use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor + is_bad_blob); +use PublicInbox::OverIdx; +use PublicInbox::MiscIdx; +use PublicInbox::MID qw(mids); +use PublicInbox::V2Writable; +use PublicInbox::InboxWritable; +use PublicInbox::ContentHash qw(content_hash); +use PublicInbox::Eml; +use PublicInbox::DS qw(now); +use DBI qw(:sql_types); # SQL_BLOB + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not yet supported\n"; + my $self = bless { + xpfx => "$dir/ei".PublicInbox::Search::SCHEMA_VERSION, + topdir => $dir, + creat => $opt->{creat}, + ibx_map => {}, # (newsgroup//inboxdir) => $ibx + ibx_list => [], + indexlevel => $l, + transact_bytes => 0, + total_bytes => 0, + current_info => '', + parallel => 1, + lock_path => "$dir/ei.lock", + }, __PACKAGE__; + $self->{shards} = $self->count_shards || nproc_shards($opt->{creat}); + my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); + $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{oidx} = $oidx; + $self +} + +sub attach_inbox { + my ($self, $ibx) = 
@_; + $self->{ibx_map}->{$ibx->eidx_key} //= do { + push @{$self->{ibx_list}}, $ibx; + $ibx; + } +} + +sub _ibx_attach { # each_inbox callback + my ($ibx, $self) = @_; + attach_inbox($self, $ibx); +} + +sub attach_config { + my ($self, $cfg) = @_; + $self->{cfg} = $cfg; + $cfg->each_inbox(\&_ibx_attach, $self); +} + +sub check_batch_limit ($) { + my ($req) = @_; + my $self = $req->{self}; + my $new_smsg = $req->{new_smsg}; + + # {raw_bytes} may be unset, so just use {bytes} + my $n = $self->{transact_bytes} += $new_smsg->{bytes}; + + # set flag for PublicInbox::V2Writable::index_todo: + ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; +} + +sub do_xpost ($$) { + my ($req, $smsg) = @_; + my $self = $req->{self}; + my $docid = $smsg->{num}; + my $idx = $self->idx_shard($docid); + my $oid = $req->{oid}; + my $xibx = $req->{ibx}; + my $eml = $req->{eml}; + my $eidx_key = $xibx->eidx_key; + if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message + my $xnum = $req->{xnum}; + $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); + $idx->shard_add_eidx_info($docid, $eidx_key, $eml); + check_batch_limit($req); + } else { # 'd' + my $rm_eidx_info; + my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key, + \$rm_eidx_info); + if ($nr == 0) { + $self->{oidx}->eidxq_del($docid); + $idx->shard_remove($docid); + } elsif ($rm_eidx_info) { + $idx->shard_remove_eidx_info($docid, $eidx_key, $eml); + $self->{oidx}->eidxq_add($docid); # yes, add + } + } +} + +# called by V2Writable::sync_prepare +sub artnum_max { $_[0]->{oidx}->eidx_max } + +sub index_unseen ($) { + my ($req) = @_; + my $new_smsg = $req->{new_smsg} or die 'BUG: {new_smsg} unset'; + my $eml = delete $req->{eml}; + $new_smsg->populate($eml, $req); + my $self = $req->{self}; + my $docid = $self->{oidx}->adj_counter('eidx_docid', '+'); + $new_smsg->{num} = $docid; + my $idx = $self->idx_shard($docid); + $self->{oidx}->add_overview($eml, $new_smsg); + my $oid = $new_smsg->{blob}; + my 
$ibx = delete $req->{ibx} or die 'BUG: {ibx} unset'; + $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key); + $idx->index_raw(undef, $eml, $new_smsg, $ibx->eidx_key); + check_batch_limit($req); +} + +sub do_finalize ($) { + my ($req) = @_; + if (my $indexed = $req->{indexed}) { + do_xpost($req, $_) for @$indexed; + } elsif (exists $req->{new_smsg}) { # totally unseen messsage + index_unseen($req); + } else { + # `d' message was already unindexed in the v1/v2 inboxes, + # so it's too noisy to warn, here. + } + # cur_cmt may be undef for unindex_oid, set by V2Writable::index_todo + if (defined(my $cur_cmt = $req->{cur_cmt})) { + ${$req->{latest_cmt}} = $cur_cmt; + } +} + +sub do_step ($) { # main iterator for adding messages to the index + my ($req) = @_; + my $self = $req->{self} // die 'BUG: {self} missing'; + while (1) { + if (my $next_arg = $req->{next_arg}) { + if (my $smsg = $self->{oidx}->next_by_mid(@$next_arg)) { + $req->{cur_smsg} = $smsg; + $self->git->cat_async($smsg->{blob}, + \&ck_existing, $req); + return; # ck_existing calls do_step + } + delete $req->{cur_smsg}; + delete $req->{next_arg}; + } + my $mid = shift(@{$req->{mids}}); + last unless defined $mid; + my ($id, $prev); + $req->{next_arg} = [ $mid, \$id, \$prev ]; + # loop again + } + do_finalize($req); +} + +sub _blob_missing ($) { # called when req->{cur_smsg}->{blob} is bad + my ($req) = @_; + my $smsg = $req->{cur_smsg} or die 'BUG: {cur_smsg} missing'; + my $self = $req->{self}; + my $xref3 = $self->{oidx}->get_xref3($smsg->{num}); + my @keep = grep(!/:$smsg->{blob}\z/, @$xref3); + if (@keep) { + $keep[0] =~ /:([a-f0-9]{40,}+)\z/ or + die "BUG: xref $keep[0] has no OID"; + my $oidhex = $1; + $self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob}); + my $upd = $self->{oidx}->update_blob($smsg, $oidhex); + my $saved = $self->{oidx}->get_art($smsg->{num}); + } else { + $self->{oidx}->delete_by_num($smsg->{num}); + } +} + +sub ck_existing { # git->cat_async callback + my 
($bref, $oid, $type, $size, $req) = @_; + my $smsg = $req->{cur_smsg} or die 'BUG: {cur_smsg} missing'; + if ($type eq 'missing') { + _blob_missing($req); + } elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) { + my $self = $req->{self} // die 'BUG: {self} missing'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $cur = PublicInbox::Eml->new($bref); + if (content_hash($cur) eq $req->{chash}) { + push @{$req->{indexed}}, $smsg; # for do_xpost + } # else { index_unseen later } + } + do_step($req); +} + +# is the messages visible in the inbox currently being indexed? +# return the number if so +sub cur_ibx_xnum ($$) { + my ($req, $bref) = @_; + my $ibx = $req->{ibx} or die 'BUG: current {ibx} missing'; + + $req->{eml} = PublicInbox::Eml->new($bref); + $req->{chash} = content_hash($req->{eml}); + $req->{mids} = mids($req->{eml}); + my @q = @{$req->{mids}}; # copy + while (defined(my $mid = shift @q)) { + my ($id, $prev); + while (my $x = $ibx->over->next_by_mid($mid, \$id, \$prev)) { + return $x->{num} if $x->{blob} eq $req->{oid}; + } + } + undef; +} + +sub index_oid { # git->cat_async callback for 'm' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $new_smsg = $req->{new_smsg} = bless { + blob => $oid, + }, 'PublicInbox::Smsg'; + $new_smsg->{bytes} = $size + crlf_adjust($$bref); + defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return; + ++${$req->{nr}}; + do_step($req); +} + +sub unindex_oid { # git->cat_async callback for 'd' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + return if defined(cur_ibx_xnum($req, $bref)); # was re-added + do_step($req); +} + +# overrides V2Writable::last_commits, called by sync_ranges via sync_prepare +sub 
last_commits { + my ($self, $sync) = @_; + my $heads = []; + my $ekey = $sync->{ibx}->eidx_key; + my $uv = $sync->{ibx}->uidvalidity; + for my $i (0..$sync->{epoch_max}) { + $heads->[$i] = $self->{oidx}->eidx_meta("lc-v2:$ekey//$uv;$i"); + } + $heads; +} + +sub _ibx_index_reject ($) { + my ($ibx) = @_; + $ibx->mm // return 'unindexed, no msgmap.sqlite3'; + $ibx->uidvalidity // return 'no UIDVALIDITY'; + $ibx->over // return 'unindexed, no over.sqlite3'; + undef; +} + +sub _sync_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + if (defined(my $err = _ibx_index_reject($ibx))) { + return "W: skipping $ekey ($err)"; + } + $sync->{ibx} = $ibx; + $sync->{nr} = \(my $nr = 0); + my $v = $ibx->version; + if ($v == 2) { + $sync->{epoch_max} = $ibx->max_git_epoch // return; + sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable + } elsif ($v == 1) { + my $uv = $ibx->uidvalidity; + my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); + my $head = $ibx->mm->last_commit // + return "E: $ibx->{inboxdir} is not indexed"; + my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); + my $unit = { stack => $stk, git => $ibx->git }; + push @{$sync->{todo}}, $unit; + } else { + return "E: $ekey unsupported inbox version (v$v)"; + } + for my $unit (@{delete($sync->{todo}) // []}) { + last if $sync->{quit}; + index_todo($self, $sync, $unit); + } + $self->{midx}->index_ibx($ibx) unless $sync->{quit}; + $ibx->git->cleanup; # done with this inbox, now + undef; +} + +sub gc_unref_doc ($$$$) { + my ($self, $ibx_id, $eidx_key, $docid) = @_; + my $dbh = $self->{oidx}->dbh; + + # for debug/info purposes, oids may no longer be accessible + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; + + $dbh->prepare_cached(<<'')->execute($docid, $ibx_id); +DELETE FROM xref3 WHERE docid = ? 
AND ibx_id = ? + + my $remain = $self->{oidx}->get_xref3($docid); + if (scalar(@$remain)) { + $self->{oidx}->eidxq_add($docid); # enqueue for reindex + for my $oid (@oid) { + warn "I: unref #$docid $eidx_key $oid\n"; + } + } else { + warn "I: remove #$docid $eidx_key @oid\n"; + $self->idx_shard($docid)->shard_remove($docid); + } +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + + my $dbh = $self->{oidx}->dbh; + my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?'); + my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes'); + my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?'); + + $ibx_ck->execute; + while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { + next if $self->{ibx_map}->{$eidx_key}; + $self->{midx}->remove_eidx_key($eidx_key); + warn "I: deleting messages for $eidx_key...\n"; + $x3_doc->execute($ibx_id); + while (defined(my $docid = $x3_doc->fetchrow_array)) { + gc_unref_doc($self, $ibx_id, $eidx_key, $docid); + } + $dbh->prepare_cached(<<'')->execute($ibx_id); +DELETE FROM inboxes WHERE ibx_id = ? + + # drop last_commit info + my $pat = $eidx_key; + $pat =~ s/([_%])/\\$1/g; + $lc_i->execute("lc-%:$pat//%"); + while (my ($key) = $lc_i->fetchrow_array) { + next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; + warn "I: removing $key\n"; + $dbh->prepare_cached(<<'')->execute($key); +DELETE FROM eidx_meta WHERE key = ? 
+ + } + + warn "I: $eidx_key removed\n"; + } + + # it's not real unless it's in `over', we use parallelism here, + # shards will be reading directly from over, so commit + $self->{oidx}->commit_lazy; + $self->{oidx}->begin_lazy; + + for my $idx (@{$self->{idx_shards}}) { + warn "I: cleaning up shard #$idx->{shard}\n"; + $idx->shard_over_check($self->{oidx}); + } + my $nr = $dbh->do(<<''); +DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) + + warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + + done($self); +} + +sub _ibx_for ($$$) { + my ($self, $sync, $smsg) = @_; + my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset'; + my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos"; + $self->{ibx_list}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" +} + +sub _fd_constrained ($) { + my ($self) = @_; + $self->{-fd_constrained} //= do { + my $soft; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($soft, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + chomp($soft = `sh -c 'ulimit -n'`); + } + if (defined($soft)) { + my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate + my $ret = $want > $soft; + if ($ret) { + warn <{sync}; + my $self = $sync->{self}; + my $by_chash = delete $req->{by_chash} or die 'BUG: no {by_chash}'; + my $nr = scalar(keys(%$by_chash)) or die 'BUG: no content hashes'; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $docid = $smsg->{num} = $orig_smsg->{num}; + $self->{oidx}->add_overview($eml, $smsg); # may rethread + check_batch_limit({ %$sync, new_smsg => $smsg }); + my $chash0 = $smsg->{chash} // die "BUG: $smsg->{blob} no {chash}"; + my $stable = delete($by_chash->{$chash0}) // + die "BUG: $smsg->{blob} chash missing"; + my $idx = $self->idx_shard($docid); + my $top_smsg = pop @$stable; + $top_smsg == $smsg or die 'BUG: top_smsg != smsg'; + my $ibx = _ibx_for($self, $sync, $smsg); + $idx->index_raw(undef, $eml, $smsg, 
$ibx->eidx_key); + for my $x (reverse @$stable) { + $ibx = _ibx_for($self, $sync, $x); + my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}'; + $idx->shard_add_eidx_info($docid, $ibx->eidx_key, $hdr); + } + return if $nr == 1; # likely, all good + + warn "W: #$docid split into $nr due to deduplication change\n"; + my @todo; + for my $ary (values %$by_chash) { + for my $x (reverse @$ary) { + warn "removing #$docid xref3 $x->{blob}\n"; + my $n = $self->{oidx}->remove_xref3($docid, $x->{blob}); + die "BUG: $x->{blob} invalidated #$docid" if $n == 0; + } + my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; + $x->{num} = delete($x->{xnum}) // die '{xnum} unset'; + $ibx = _ibx_for($self, $sync, $x); + if (my $over = $ibx->over) { + my $e = $over->get_art($x->{num}); + $e->{blob} eq $x->{blob} or die <{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); +EOF + push @todo, $ibx, $e; + $over->dbh_close if _fd_constrained($self); + } else { + die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n"; + } + } + undef $by_chash; + while (my ($ibx, $e) = splice(@todo, 0, 2)) { + reindex_unseen($self, $sync, $ibx, $e); + } +} + +sub _reindex_oid { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + my $sync = $req->{sync}; + my $self = $sync->{self}; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2]; + my $docid = $orig_smsg->{num}; + if (is_bad_blob($oid, $type, $size, $expect_oid)) { + my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid); + if ($remain == 0) { + warn "W: #$docid gone or corrupted\n"; + $self->idx_shard($docid)->shard_remove($docid); + } elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { + warn "BUG: #$docid gone (UNEXPECTED)\n"; + $self->idx_shard($docid)->shard_remove($docid); + } + return; + } + my $ci = $self->{current_info}; + local $self->{current_info} = "$ci #$docid $oid"; + my 
$re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $re_smsg->{bytes} = $size + crlf_adjust($$bref); + my $eml = PublicInbox::Eml->new($bref); + $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, + cotime => $orig_smsg->{ts} }); + my $chash = content_hash($eml); + $re_smsg->{chash} = $chash; + $re_smsg->{xnum} = $req->{xr3r}->[$req->{ix}]->[1]; + $re_smsg->{ibx_id} = $req->{xr3r}->[$req->{ix}]->[0]; + $re_smsg->{hdr} = $eml->header_obj; + push @{$req->{by_chash}->{$chash}}, $re_smsg; + if (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { # last $re_smsg is the highest priority xref3 + local $self->{current_info} = "$ci #$docid"; + _reindex_finalize($req, $re_smsg, $eml); + } +} + +sub _reindex_smsg ($$$) { + my ($self, $sync, $smsg) = @_; + my $docid = $smsg->{num}; + my $xr3 = $self->{oidx}->get_xref3($docid, 1); + if (scalar(@$xr3) == 0) { # _reindex_check_stale should've covered this + warn <<""; +BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex + + $self->{oidx}->delete_by_num($docid); + $self->idx_shard($docid)->shard_remove($docid); + return; + } + + # we sort {xr3r} in the reverse order of {ibx_list} so we can + # hit the common case in _reindex_finalize without rereading + # from git (or holding multiple messages in memory). + my $id2pos = $sync->{id2pos}; # index in {ibx_list} + @$xr3 = sort { + $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} + || + $b->[1] <=> $a->[1] # break ties with {xnum} + } @$xr3; + @$xr3 = map { [ $_->[0], $_->[1], unpack('H*', $_->[2]) ] } @$xr3; + my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 }; + $self->git->cat_async($xr3->[$req->{ix}]->[2], \&_reindex_oid, $req); +} + +sub checkpoint_due ($) { + my ($sync) = @_; + ${$sync->{need_checkpoint}} || (now() > $sync->{next_check}); +} + +sub host_ident () { + # I've copied FS images and only changed the hostname before, + # so prepend hostname. 
Use `state' since these a BOFH can change + # these while this process is running and we always want to be + # able to release locks taken by this process. + state $retval = hostname . '-' . do { + my $m; # machine-id(5) is systemd + if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> } + # (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs + chomp($m ||= `{ sysctl -n kern.hostid || + hostid || ghostid; } 2>/dev/null` + || "no-machine-id-or-hostid-on-$^O"); + $m; + }; +} + +sub eidxq_release { + my ($self) = @_; + my $expect = delete($self->{-eidxq_locked}) or return; + my ($owner_pid, undef) = split(/-/, $expect); + return if $owner_pid != $$; # shards may fork + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') // ''; + if ($cur eq $expect) { + $oidx->eidx_meta('eidxq_lock', ''); + return 1; + } elsif ($cur ne '') { + warn "E: eidxq_lock($expect) stolen by $cur\n"; + } else { + warn "E: eidxq_lock($expect) released by another process\n"; + } + undef; +} + +sub DESTROY { + my ($self) = @_; + eidxq_release($self) and $self->{oidx}->commit_lazy; +} + +sub _eidxq_take ($) { + my ($self) = @_; + my $val = "$$-${\time}-$>-".host_ident; + $self->{oidx}->eidx_meta('eidxq_lock', $val); + $self->{-eidxq_locked} = $val; +} + +sub eidxq_lock_acquire ($) { + my ($self) = @_; + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self); + if (my $locked = $self->{-eidxq_locked}) { # be lazy + return $locked if $locked eq $cur; + } + my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4); + my $t = strftime('%Y-%m-%d %k:%M:%S', gmtime($time)); + if ($euid == $> && $ident eq host_ident) { + if (kill(0, $pid)) { + warn <dbh->sqlite_db_filename; + warn <{oidx}->dbh; + my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; + ${$sync->{nr}} = 0; + local $sync->{-regen_fmt} = "%u/$tot\n"; + my $pr = $sync->{-opt}->{-progress}; + if ($pr) { + my $min = 
$dbh->selectrow_array('SELECT MIN(docid) FROM eidxq'); + my $max = $dbh->selectrow_array('SELECT MAX(docid) FROM eidxq'); + $pr->("Xapian indexing $min..$max (total=$tot)\n"); + } + $sync->{id2pos} //= do { + my %id2pos; + my $pos = 0; + $id2pos{$_->{-ibx_id}} = $pos++ for @{$self->{ibx_list}}; + \%id2pos; + }; + my ($del, $iter); +restart: + $del = $dbh->prepare('DELETE FROM eidxq WHERE docid = ?'); + $iter = $dbh->prepare('SELECT docid FROM eidxq ORDER BY docid ASC'); + $iter->execute; + while (defined(my $docid = $iter->fetchrow_array)) { + last if $sync->{quit}; + if (my $smsg = $self->{oidx}->get_art($docid)) { + _reindex_smsg($self, $sync, $smsg); + } else { + warn "E: #$docid does not exist in over\n"; + } + $del->execute($docid); + ++${$sync->{nr}}; + + if (checkpoint_due($sync)) { + $dbh = $del = $iter = undef; + reindex_checkpoint($self, $sync); # release lock + $dbh = $self->{oidx}->dbh; + goto restart; + } + } + $self->git->async_wait_all; + $pr->("reindexed ${$sync->{nr}}/$tot\n") if $pr; +} + +sub _reindex_unseen { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $self = $req->{self} // die 'BUG: {self} unset'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; + $new_smsg->{bytes} = $size + crlf_adjust($$bref); + my $eml = $req->{eml} = PublicInbox::Eml->new($bref); + $req->{new_smsg} = $new_smsg; + $req->{chash} = content_hash($eml); + $req->{mids} = mids($eml); # do_step iterates through this + do_step($req); # enter the normal indexing flow +} + +# --reindex may catch totally unseen messages, this handles them +sub reindex_unseen ($$$$) { + my ($self, $sync, $ibx, $xsmsg) = @_; + my $req = { + %$sync, # has {self} + autime => $xsmsg->{ds}, + cotime => $xsmsg->{ts}, + oid => $xsmsg->{blob}, + ibx => $ibx, + xnum => $xsmsg->{num}, + # {mids} and {chash} will be filled in at _reindex_unseen + 
}; + warn "I: reindex_unseen ${\$ibx->eidx_key}:$req->{xnum}:$req->{oid}\n"; + $self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req); +} + +sub _reindex_check_unseen ($$$) { + my ($self, $sync, $ibx) = @_; + my $ibx_id = $ibx->{-ibx_id}; + my $slice = 1000; + my ($beg, $end) = (1, $slice); + + # first, check if we missed any messages in target $ibx + my $msgs; + my $pr = $sync->{-opt}->{-progress}; + my $ekey = $ibx->eidx_key; + local $sync->{-regen_fmt} = + "$ekey checking unseen %u/".$ibx->over->max."\n"; + ${$sync->{nr}} = 0; + + while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) { + ${$sync->{nr}} = $beg; + $beg = $msgs->[-1]->{num} + 1; + $end = $beg + $slice; + if (checkpoint_due($sync)) { + reindex_checkpoint($self, $sync); # release lock + } + + my $inx3 = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1); +SELECT DISTINCT(docid) FROM xref3 WHERE +ibx_id = ? AND xnum = ? AND oidbin = ? + + for my $xsmsg (@$msgs) { + my $oidbin = pack('H*', $xsmsg->{blob}); + $inx3->bind_param(1, $ibx_id); + $inx3->bind_param(2, $xsmsg->{num}); + $inx3->bind_param(3, $oidbin, SQL_BLOB); + $inx3->execute; + my $docids = $inx3->fetchall_arrayref; + # index messages which were totally missed + # the first time around ASAP: + if (scalar(@$docids) == 0) { + reindex_unseen($self, $sync, $ibx, $xsmsg); + } else { # already seen, reindex later + for my $r (@$docids) { + $self->{oidx}->eidxq_add($r->[0]); + } + } + last if $sync->{quit}; + } + last if $sync->{quit}; + } +} + +sub _reindex_check_stale ($$$) { + my ($self, $sync, $ibx) = @_; + my $min = 0; + my $pr = $sync->{-opt}->{-progress}; + my $fetching; + my $ekey = $ibx->eidx_key; + local $sync->{-regen_fmt} = + "$ekey check stale/missing %u/".$ibx->over->max."\n"; + ${$sync->{nr}} = 0; + do { + if (checkpoint_due($sync)) { + reindex_checkpoint($self, $sync); # release lock + } + # now, check if there's stale xrefs + my $iter = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1); +SELECT docid,xnum,oidbin 
FROM xref3 WHERE ibx_id = ? AND docid > ? +ORDER BY docid,xnum ASC LIMIT 10000 + + $iter->execute($ibx->{-ibx_id}, $min); + $fetching = undef; + + while (my ($docid, $xnum, $oidbin) = $iter->fetchrow_array) { + return if $sync->{quit}; + ${$sync->{nr}} = $xnum; + + $fetching = $min = $docid; + my $smsg = $ibx->over->get_art($xnum); + my $oidhex = unpack('H*', $oidbin); + my $err; + if (!$smsg) { + $err = 'stale'; + } elsif ($smsg->{blob} ne $oidhex) { + $err = "mismatch (!= $smsg->{blob})"; + } else { + next; # likely, all good + } + # current_info already has eidx_key + warn "$xnum:$oidhex (#$docid): $err\n"; + my $del = $self->{oidx}->dbh->prepare_cached(<<''); +DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ? + + $del->bind_param(1, $ibx->{-ibx_id}); + $del->bind_param(2, $xnum); + $del->bind_param(3, $oidbin, SQL_BLOB); + $del->execute; + + # get_xref3 over-fetches, but this is a rare path: + my $xr3 = $self->{oidx}->get_xref3($docid); + my $idx = $self->idx_shard($docid); + if (scalar(@$xr3) == 0) { # all gone + $self->{oidx}->delete_by_num($docid); + $self->{oidx}->eidxq_del($docid); + $idx->shard_remove($docid); + } else { # enqueue for reindex of remaining messages + $idx->shard_remove_eidx_info($docid, + $ibx->eidx_key); + $self->{oidx}->eidxq_add($docid); # yes, add + } + } + } while (defined $fetching); +} + +sub _reindex_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + local $self->{current_info} = $ekey; + if (defined(my $err = _ibx_index_reject($ibx))) { + warn "W: cannot reindex $ekey ($err)\n"; + } else { + _reindex_check_unseen($self, $sync, $ibx); + _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + } + delete @$ibx{qw(over mm search git)}; # won't need these for a bit +} + +sub eidx_reindex { + my ($self, $sync) = @_; + + # acquire eidxq_lock early because full reindex takes forever + # and incremental -extindex processes can run during our checkpoints + if (!eidxq_lock_acquire($self)) { + 
warn "E: aborting --reindex\n"; + return; + } + for my $ibx (@{$self->{ibx_list}}) { + _reindex_inbox($self, $sync, $ibx); + last if $sync->{quit}; + } + $self->git->async_wait_all; # ensure eidxq gets filled completely + eidxq_process($self, $sync) unless $sync->{quit}; +} + +sub sync_inbox { + my ($self, $sync, $ibx) = @_; + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); +} + +sub eidx_sync { # main entry point + my ($self, $opt) = @_; + + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + local $self->{current_info} = ''; + local $SIG{__WARN__} = sub { + $warn_cb->($self->{current_info}, ': ', @_); + }; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + $self->{oidx}->rethread_prepare($opt); + my $sync = { + need_checkpoint => \(my $need_checkpoint = 0), + check_intvl => 10, + next_check => now() + 10, + -opt => $opt, + # DO NOT SET {reindex} here, it's incompatible with reused + # V2Writable code, reindex is totally different here + # compared to v1/v2 inboxes because we have multiple histories + self => $self, + -regen_fmt => "%u/?\n", + }; + local $SIG{USR1} = sub { $need_checkpoint = 1 }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + for my $ibx (@{$self->{ibx_list}}) { + $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); + } + if (delete($opt->{reindex})) { + local $sync->{checkpoint_unlocks} = 1; + eidx_reindex($self, $sync); + } + + # don't use $_ here, it'll get clobbered by reindex_checkpoint + if ($opt->{scan} // 1) { + for my $ibx (@{$self->{ibx_list}}) { + last if $sync->{quit}; + sync_inbox($self, $sync, $ibx); + } + } + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; + eidxq_process($self, $sync) unless $sync->{quit}; + + eidxq_release($self); + done($self); + $sync; # for eidx_watch +} + +sub update_last_commit { # overrides V2Writable + my ($self, $sync, $stk) = @_; 
+ my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $ibx = $sync->{ibx} or die 'BUG: {ibx} missing'; + my $ekey = $ibx->eidx_key; + my $uv = $ibx->uidvalidity; + my $epoch = $unit->{epoch}; + my $meta_key; + my $v = $ibx->version; + if ($v == 2) { + die 'No {epoch} for v2 unit' unless defined $epoch; + $meta_key = "lc-v2:$ekey//$uv;$epoch"; + } elsif ($v == 1) { + die 'Unexpected {epoch} for v1 unit' if defined $epoch; + $meta_key = "lc-v1:$ekey//$uv"; + } else { + die "Unsupported inbox version: $v"; + } + my $last = $self->{oidx}->eidx_meta($meta_key); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); + return if $n ne '' && $n == 0; + } + $self->{oidx}->eidx_meta($meta_key, $latest_cmt); +} + +sub _idx_init { # with_umask callback + my ($self, $opt) = @_; + PublicInbox::V2Writable::_idx_init($self, $opt); + $self->{midx} = PublicInbox::MiscIdx->new($self); +} + +sub idx_init { # similar to V2Writable + my ($self, $opt) = @_; + return if $self->{idx_shards}; + + $self->git->cleanup; + + my $ALL = $self->git->{git_dir}; # ALL.git + PublicInbox::Import::init_bare($ALL) unless -d $ALL; + my $info_dir = "$ALL/objects/info"; + my $alt = "$info_dir/alternates"; + my $mode = 0644; + my (@old, @new, %seen); # seen: st_dev + st_ino + if (-e $alt) { + open(my $fh, '<', $alt) or die "open $alt: $!"; + $mode = (stat($fh))[2] & 07777; + while (my $line = <$fh>) { + chomp(my $d = $line); + if (my @st = stat($d)) { + next if $seen{"$st[0]\0$st[1]"}++; + } else { + warn "W: stat($d) failed (from $alt): $!\n"; + next if $opt->{-idx_gc}; + } + push @old, $line; + } + } + for my $ibx (@{$self->{ibx_list}}) { + my $line = $ibx->git->{git_dir} . 
"/objects\n"; + chomp(my $d = $line); + if (my @st = stat($d)) { + next if $seen{"$st[0]\0$st[1]"}++; + } else { + warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n"; + next if $opt->{-idx_gc}; + } + push @new, $line; + } + if (scalar @new) { + push @old, @new; + my $o = \@old; + PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o); + } + $self->parallel_init($self->{indexlevel}); + $self->with_umask(\&_idx_init, $self, $opt); + $self->{oidx}->begin_lazy; + $self->{oidx}->eidx_prep; + $self->{midx}->begin_txn; +} + +sub _watch_commit { # PublicInbox::DS::add_timer callback + my ($self) = @_; + delete $self->{-commit_timer}; + eidxq_process($self, $self->{-watch_sync}); + eidxq_release($self); + delete local $self->{-watch_sync}->{-regen_fmt}; + reindex_checkpoint($self, $self->{-watch_sync}); + + # call event_step => done unless commit_timer is armed + PublicInbox::DS::requeue($self); +} + +sub on_inbox_unlock { # called by PublicInbox::InboxIdle + my ($self, $ibx) = @_; + my $opt = $self->{-watch_sync}->{-opt}; + my $pr = $opt->{-progress}; + my $ekey = $ibx->eidx_key; + local $0 = "sync $ekey"; + $pr->("indexing $ekey\n") if $pr; + $self->idx_init($opt); + sync_inbox($self, $self->{-watch_sync}, $ibx); + $self->{-commit_timer} //= PublicInbox::DS::add_timer( + $opt->{'commit-interval'} // 10, + \&_watch_commit, $self); +} + +sub eidx_reload { # -extindex --watch SIGHUP handler + my ($self, $idler) = @_; + if ($self->{cfg}) { + my $pr = $self->{-watch_sync}->{-opt}->{-progress}; + $pr->('reloading ...') if $pr; + delete $self->{-resync_queue}; + @{$self->{ibx_list}} = (); + %{$self->{ibx_map}} = (); + delete $self->{-watch_sync}->{id2pos}; + my $cfg = PublicInbox::Config->new; + attach_config($self, $cfg); + $idler->refresh($cfg); + $pr->(" done\n") if $pr; + } else { + warn "reload not supported without --all\n"; + } +} + +sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler + my ($self) = @_; + $self->{-resync_queue} //= [ 
@{$self->{ibx_list}} ]; + PublicInbox::DS::requeue($self); # trigger our ->event_step +} + +sub event_step { # PublicInbox::DS::requeue callback + my ($self) = @_; + if (my $resync_queue = $self->{-resync_queue}) { + if (my $ibx = shift(@$resync_queue)) { + on_inbox_unlock($self, $ibx); + PublicInbox::DS::requeue($self); + } else { + delete $self->{-resync_queue}; + _watch_commit($self); + } + } else { + done($self) unless $self->{-commit_timer}; + } +} + +sub eidx_watch { # public-inbox-extindex --watch main loop + my ($self, $opt) = @_; + local %SIG = %SIG; + for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) { + $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" }; + } + require PublicInbox::InboxIdle; + require PublicInbox::DS; + require PublicInbox::Syscall; + require PublicInbox::Sigfd; + my $idler = PublicInbox::InboxIdle->new($self->{cfg}); + if (!$self->{cfg}) { + $idler->watch_inbox($_) for @{$self->{ibx_list}}; + } + $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $pr = $opt->{-progress}; + $pr->("performing initial scan ...\n") if $pr; + my $sync = eidx_sync($self, $opt); # initial sync + return if $sync->{quit}; + my $oldset = PublicInbox::Sigfd::block_signals(); + local $self->{current_info} = ''; + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { $cb->($self->{current_info}, ': ', @_) }; + my $sig = { + HUP => sub { eidx_reload($self, $idler) }, + USR1 => sub { eidx_resync_start($self) }, + TSTP => sub { kill('STOP', $$) }, + }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; + my $sigfd = PublicInbox::Sigfd->new($sig, + $PublicInbox::Syscall::SFD_NONBLOCK); + %SIG = (%SIG, %$sig) if !$sigfd; + local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock + if (!$sigfd) { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + PublicInbox::Sigfd::sig_setmask($oldset); + PublicInbox::DS->SetLoopTimeout(1000); + } 
+ PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + $pr->("initial scan complete, entering event loop\n") if $pr; + PublicInbox::DS->EventLoop; # calls InboxIdle->event_step + done($self); +} + +no warnings 'once'; +*done = \&PublicInbox::V2Writable::done; +*with_umask = \&PublicInbox::InboxWritable::with_umask; +*parallel_init = \&PublicInbox::V2Writable::parallel_init; +*nproc_shards = \&PublicInbox::V2Writable::nproc_shards; +*sync_prepare = \&PublicInbox::V2Writable::sync_prepare; +*index_todo = \&PublicInbox::V2Writable::index_todo; +*count_shards = \&PublicInbox::V2Writable::count_shards; +*atfork_child = \&PublicInbox::V2Writable::atfork_child; +*idx_shard = \&PublicInbox::V2Writable::idx_shard; +*reindex_checkpoint = \&PublicInbox::V2Writable::reindex_checkpoint; + +1; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 805076f0..f570a25d 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -24,7 +24,7 @@ sub generate { sub generate_thread_atom { my ($ctx) = @_; - my $msgs = $ctx->{msgs} = $ctx->{-inbox}->over->get_thread($ctx->{mid}); + my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid}); return _no_thread() unless @$msgs; PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); } @@ -34,7 +34,7 @@ sub generate_html_index { # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: my $qp = $ctx->{qp}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if ($qp && !$qp->{r} && $ibx->over) { return PublicInbox::View::index_topics($ctx); } @@ -79,8 +79,8 @@ sub _no_thread () { sub recent_msgs { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; - my $max = $ibx->{feedmax}; + my $ibx = $ctx->{ibx}; + my $max = $ibx->{feedmax} // 25; return PublicInbox::View::paginate_recent($ctx, $max) if $ibx->over; # only for rare v1 inboxes which aren't indexed at all diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index 
06e4ea75..62cf5d20 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -16,7 +16,7 @@ sub new { my ($class, %opts) = @_; my $altid = delete $opts{-altid}; my $self = $class->SUPER::new(%opts); - my $ibx = $self->{-inbox}; + my $ibx = $self->{ibx}; # altid = serial:ruby-core:file=msgmap.sqlite3 if (!$altid && $ibx && $ibx->{altid}) { $altid ||= $ibx->{altid}->[0]; diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm new file mode 100644 index 00000000..fe6afef2 --- /dev/null +++ b/lib/PublicInbox/Gcf2.pm @@ -0,0 +1,110 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# backend for a git-cat-file-workalike based on libgit2, +# other libgit2 stuff may go here, too. +package PublicInbox::Gcf2; +use strict; +use PublicInbox::Spawn qw(which popen_rd); +use Fcntl qw(LOCK_EX); +use IO::Handle; # autoflush +my (%CFG, $c_src, $lockfh); +BEGIN { + # PublicInbox::Spawn will set PERL_INLINE_DIRECTORY + # to ~/.cache/public-inbox/inline-c if it exists + my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // + die 'PERL_INLINE_DIRECTORY not defined'; + my $f = "$inline_dir/.public-inbox.lock"; + open $lockfh, '>', $f or die "failed to open $f: $!\n"; + my $pc = which($ENV{PKG_CONFIG} // 'pkg-config'); + my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!); + my $rdr = {}; + open $rdr->{2}, '>', '/dev/null' or die "open /dev/null: $!"; + for my $x (qw(libgit2)) { + my $l = popen_rd([$pc, '--libs', $x], undef, $rdr); + $l = do { local $/; <$l> }; + next if $?; + my $c = popen_rd([$pc, '--cflags', $x], undef, $rdr); + $c = do { local $/; <$c> }; + next if $?; + + # note: we name C source files .h to prevent + # ExtUtils::MakeMaker from automatically trying to + # build them. 
+ my $f = "$dir/gcf2_$x.h"; + if (open(my $fh, '<', $f)) { + chomp($l, $c); + local $/; + defined($c_src = <$fh>) or die "read $f: $!\n"; + $CFG{LIBS} = $l; + $CFG{CCFLAGSEX} = $c; + last; + } else { + die "E: $f: $!\n"; + } + } + die "E: libgit2 not installed\n" unless $c_src; + + # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking + flock($lockfh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; +} + +# we use Capitalized and ALLCAPS for compatibility with old Inline::C +use Inline C => Config => %CFG, BOOT => 'git_libgit2_init();'; +use Inline C => $c_src; +undef $c_src; +undef %CFG; +undef $lockfh; + +sub add_alt ($$) { + my ($gcf2, $objdir) = @_; + + # libgit2 (tested 0.27.7+dfsg.1-0.2 and 0.28.3+dfsg.1-1~bpo10+1 + # in Debian) doesn't handle relative epochs properly when nested + # multiple levels. Add all the absolute paths to workaround it, + # since $EXTINDEX_DIR/ALL.git/objects/info/alternates uses absolute + # paths to reference $V2INBOX_DIR/all.git/objects and + # $V2INBOX_DIR/all.git/objects/info/alternates uses relative paths + # to refer to $V2INBOX_DIR/git/$EPOCH.git/objects + # + # See https://bugs.debian.org/975607 + if (open(my $fh, '<', "$objdir/info/alternates")) { + chomp(my @abs_alt = grep(m!^/!, <$fh>)); + $gcf2->add_alternate($_) for @abs_alt; + } + $gcf2->add_alternate($objdir); +} + +# Usage: $^X -MPublicInbox::Gcf2 -e 'PublicInbox::Gcf2::loop()' +# (see lib/PublicInbox/Gcf2Client.pm) +sub loop { + my $gcf2 = new(); + my %seen; + STDERR->autoflush(1); + STDOUT->autoflush(1); + + while (<STDIN>) { + chomp; + my ($oid, $git_dir) = split(/ /, $_, 2); + $seen{$git_dir}++ or add_alt($gcf2, "$git_dir/objects"); + if (!$gcf2->cat_oid(1, $oid)) { + # retry once if missing. 
We only get unabbreviated OIDs + # from SQLite or Xapian DBs, here, so malicious clients + # can't trigger excessive retries: + warn "I: $$ $oid missing, retrying in $git_dir\n"; + + $gcf2 = new(); + %seen = ($git_dir => 1); + add_alt($gcf2, "$git_dir/objects"); + + if ($gcf2->cat_oid(1, $oid)) { + warn "I: $$ $oid found after retry\n"; + } else { + warn "W: $$ $oid missing after retry\n"; + print "$oid missing\n"; # mimic git-cat-file + } + } + } +} + +1; diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm new file mode 100644 index 00000000..ab486de5 --- /dev/null +++ b/lib/PublicInbox/Gcf2Client.pm @@ -0,0 +1,69 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# connects public-inbox processes to PublicInbox::Gcf2::loop() +package PublicInbox::Gcf2Client; +use strict; +use parent qw(PublicInbox::DS); +use PublicInbox::Git; +use PublicInbox::Spawn qw(popen_rd); +use IO::Handle (); +use PublicInbox::Syscall qw(EPOLLONESHOT); +# fields: +# async_cat => GitAsyncCat ref (read-only pipe) +# sock => writable pipe to Gcf2::loop + + +sub new { + my ($rdr) = @_; + my $self = bless {}, __PACKAGE__; + # ensure the child process has the same @INC we do: + my $env = { PERL5LIB => join(':', @INC) }; + my ($out_r, $out_w); + pipe($out_r, $out_w) or die "pipe failed: $!"; + $rdr //= {}; + $rdr->{0} = $out_r; + my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop()]]; + @$self{qw(in pid)} = popen_rd($cmd, $env, $rdr); + fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ + $out_w->autoflush(1); + $out_w->blocking(0); + $self->{inflight} = []; + $self->SUPER::new($out_w, EPOLLONESHOT); # detect errors once +} + +sub fail { + my $self = shift; + $self->close; # PublicInbox::DS::close + PublicInbox::Git::fail($self, @_); +} + +sub cat_async ($$$;$) { + my ($self, $req, $cb, $arg) = @_; + my $inflight = $self->{inflight}; + + # {wbuf} is rare, I hope: + cat_async_step($self, $inflight) if $self->{wbuf}; + + if 
(!$self->write(\"$req\n")) { + $self->fail("gcf2c write: $!") if !$self->{sock}; + } + push @$inflight, $req, $cb, $arg; +} + +# ensure PublicInbox::Git::cat_async_step never calls cat_async_retry +sub alternates_changed {} + +# this is the write-only end of a pipe, DS->EventLoop will call this +sub event_step { + my ($self) = @_; + $self->flush_write; + $self->close if !$self->{in}; # process died +} + +no warnings 'once'; + +# used by GitAsyncCat +*cat_async_step = \&PublicInbox::Git::cat_async_step; + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index a7ba57f9..73dc7d3e 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -12,15 +12,19 @@ use v5.10.1; use parent qw(Exporter); use POSIX (); use IO::Handle; # ->autoflush -use Errno qw(EINTR); +use Errno qw(EINTR EAGAIN); use File::Glob qw(bsd_glob GLOB_NOSORT); +use File::Spec (); use Time::HiRes qw(stat); use PublicInbox::Spawn qw(popen_rd); use PublicInbox::Tmpfile; +use IO::Poll qw(POLLIN); use Carp qw(croak); +use Digest::SHA (); our @EXPORT_OK = qw(git_unquote git_quote); our $PIPE_BUFSIZ = 65536; # Linux default our $in_cleanup; +our $RDTIMEO = 60_000; # milliseconds use constant MAX_INFLIGHT => (($^O eq 'linux' ? 
4096 : POSIX::_POSIX_PIPE_BUF()) * 3) @@ -92,9 +96,9 @@ sub alternates_changed { sub last_check_err { my ($self) = @_; my $fh = $self->{err_c} or return; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); + sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); defined(sysread($fh, my $buf, -s $fh)) or - fail($self, "sysread failed: $!"); + $self->fail("sysread failed: $!"); $buf; } @@ -103,19 +107,19 @@ sub _bidi_pipe { if ($self->{$pid}) { if (defined $err) { # "err_c" my $fh = $self->{$err}; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); - truncate($fh, 0) or fail($self, "truncate failed: $!"); + sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); + truncate($fh, 0) or $self->fail("truncate failed: $!"); } return; } my ($out_r, $out_w); - pipe($out_r, $out_w) or fail($self, "pipe failed: $!"); + pipe($out_r, $out_w) or $self->fail("pipe failed: $!"); my @cmd = (qw(git), "--git-dir=$self->{git_dir}", qw(-c core.abbrev=40 cat-file), $batch); my $redir = { 0 => $out_r }; if ($err) { my $id = "git.$self->{git_dir}$batch.err"; - my $fh = tmpfile($id) or fail($self, "tmpfile($id): $!"); + my $fh = tmpfile($id) or $self->fail("tmpfile($id): $!"); $self->{$err} = $fh; $redir->{2} = $fh; } @@ -130,6 +134,8 @@ sub _bidi_pipe { $self->{$in} = $in_r; } +sub poll_in ($) { IO::Poll::_poll($RDTIMEO, fileno($_[0]), my $ev = POLLIN) } + sub my_read ($$$) { my ($fh, $rbuf, $len) = @_; my $left = $len - length($$rbuf); @@ -138,9 +144,12 @@ sub my_read ($$$) { $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); if ($r) { $left -= $r; + } elsif (defined($r)) { # EOF + return 0; } else { - next if (!defined($r) && $! == EINTR); - return $r; + next if ($! == EAGAIN and poll_in($fh)); + next if $! 
== EINTR; # may be set by sysread or poll_in + return; # unrecoverable error } } \substr($$rbuf, 0, $len, ''); @@ -152,9 +161,15 @@ sub my_readline ($$) { if ((my $n = index($$rbuf, "\n")) >= 0) { return substr($$rbuf, 0, $n + 1, ''); } - my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); - next if $r || (!defined($r) && $! == EINTR); - return defined($r) ? '' : undef; # EOF or error + my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)) + and next; + + # return whatever's left on EOF + return substr($$rbuf, 0, length($$rbuf)+1, '') if defined($r); + + next if ($! == EAGAIN and poll_in($fh)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error } } @@ -172,7 +187,7 @@ sub cat_async_retry ($$$$$) { for (my $i = 0; $i < @$inflight; $i += 3) { $buf .= "$inflight->[$i]\n"; } - print { $self->{out} } $buf or fail($self, "write error: $!"); + print { $self->{out} } $buf or $self->fail("write error: $!"); unshift(@$inflight, \$req, $cb, $arg); # \$ref to indicate retried cat_async_step($self, $inflight); # take one step @@ -185,30 +200,34 @@ sub cat_async_step ($$) { my $rbuf = delete($self->{cat_rbuf}) // \(my $new = ''); my ($bref, $oid, $type, $size); my $head = my_readline($self->{in}, $rbuf); + # ->fail may be called via Gcf2Client.pm if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) { ($oid, $type, $size) = ($1, $2, $3 + 0); $bref = my_read($self->{in}, $rbuf, $size + 1) or - fail($self, defined($bref) ? 'read EOF' : "read: $!"); - chop($$bref) eq "\n" or fail($self, 'LF missing after blob'); - } elsif ($head =~ / missing$/) { + $self->fail(defined($bref) ? 
'read EOF' : "read: $!"); + chop($$bref) eq "\n" or $self->fail('LF missing after blob'); + } elsif ($head =~ s/ missing\n//s) { + $oid = $head; # ref($req) indicates it's already been retried - if (!ref($req) && !$in_cleanup && alternates_changed($self)) { + # -gcf2 retries internally, so it never hits this path: + if (!ref($req) && !$in_cleanup && $self->alternates_changed) { return cat_async_retry($self, $inflight, $req, $cb, $arg); } $type = 'missing'; - $oid = ref($req) ? $$req : $req; + $oid = ref($req) ? $$req : $req if $oid eq ''; } else { - fail($self, "Unexpected result from async git cat-file: $head"); + my $err = $! ? " ($!)" : ''; + $self->fail("bad result from async cat-file: $head$err"); } - eval { $cb->($bref, $oid, $type, $size, $arg) }; $self->{cat_rbuf} = $rbuf if $$rbuf ne ''; + eval { $cb->($bref, $oid, $type, $size, $arg) }; warn "E: $oid: $@\n" if $@; } sub cat_async_wait ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or return; + my $inflight = $self->{inflight} or return; while (scalar(@$inflight)) { cat_async_step($self, $inflight); } @@ -236,7 +255,7 @@ sub check_async_step ($$) { my ($self, $inflight_c) = @_; die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3; my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3); - my $rbuf = delete($self->{rbuf_c}) // \(my $new = ''); + my $rbuf = delete($self->{chk_rbuf}) // \(my $new = ''); chomp(my $line = my_readline($self->{in_c}, $rbuf)); my ($hex, $type, $size) = split(/ /, $line); @@ -246,16 +265,16 @@ sub check_async_step ($$) { # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { my $ret = my_read($self->{in_c}, $rbuf, $type + 1); - fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret; + $self->fail(defined($ret) ? 
'read EOF' : "read: $!") if !$ret; } + $self->{chk_rbuf} = $rbuf if $$rbuf ne ''; eval { $cb->($hex, $type, $size, $arg, $self) }; warn "E: check($req) $@\n" if $@; - $self->{rbuf_c} = $rbuf if $$rbuf ne ''; } sub check_async_wait ($) { my ($self) = @_; - my $inflight_c = delete $self->{inflight_c} or return; + my $inflight_c = $self->{inflight_c} or return; while (scalar(@$inflight_c)) { check_async_step($self, $inflight_c); } @@ -272,10 +291,10 @@ sub check_async_begin ($) { sub check_async ($$$$) { my ($self, $oid, $cb, $arg) = @_; my $inflight_c = $self->{inflight_c} // check_async_begin($self); - if (scalar(@$inflight_c) >= MAX_INFLIGHT) { + while (scalar(@$inflight_c) >= MAX_INFLIGHT) { check_async_step($self, $inflight_c); } - print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!"); + print { $self->{out_c} } $oid, "\n" or $self->fail("write error: $!"); push(@$inflight_c, $oid, $cb, $arg); } @@ -302,10 +321,12 @@ sub check { sub _destroy { my ($self, $rbuf, $in, $out, $pid, $err) = @_; - my $p = delete $self->{$pid} or return; delete @$self{($rbuf, $in, $out)}; delete $self->{$err} if $err; # `err_c' + # GitAsyncCat::event_step may delete {pid} + my $p = delete $self->{$pid} or return; + # PublicInbox::DS may not be loaded eval { PublicInbox::DS::dwaitpid($p, undef, undef) }; waitpid($p, 0) if $@; # wait synchronously if not in event loop @@ -313,14 +334,23 @@ sub _destroy { sub cat_async_abort ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or die 'BUG: not in async'; + if (my $inflight = $self->{inflight}) { + while (@$inflight) { + my ($req, $cb, $arg) = splice(@$inflight, 0, 3); + $req =~ s/ .*//; # drop git_dir for Gcf2Client + eval { $cb->(undef, $req, undef, undef, $arg) }; + warn "E: $req: $@ (in abort)\n" if $@; + } + delete $self->{cat_rbuf}; + delete $self->{inflight}; + } cleanup($self); } -sub fail { +sub fail { # may be augmented in subclasses my ($self, $msg) = @_; - $self->{inflight} ? 
cat_async_abort($self) : cleanup($self); - croak("git $self->{git_dir}: $msg"); + cat_async_abort($self); + croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); } sub popen { @@ -332,10 +362,19 @@ sub popen { sub qx { my ($self, @cmd) = @_; my $fh = $self->popen(@cmd); - local $/ = "\n"; - return <$fh> if wantarray; - local $/; - <$fh> + local $/ = wantarray ? "\n" : undef; + <$fh>; +} + +# check_async and cat_async may trigger the other, so ensure they're +# both completely done by using this: +sub async_wait_all ($) { + my ($self) = @_; + while (scalar(@{$self->{inflight_c} // []}) || + scalar(@{$self->{inflight} // []})) { + $self->check_async_wait; + $self->cat_async_wait; + } } # returns true if there are pending "git cat-file" processes @@ -343,13 +382,15 @@ sub cleanup { my ($self) = @_; local $in_cleanup = 1; delete $self->{async_cat}; - check_async_wait($self); - cat_async_wait($self); + async_wait_all($self); + delete $self->{inflight}; + delete $self->{inflight_c}; _destroy($self, qw(cat_rbuf in out pid)); _destroy($self, qw(chk_rbuf in_c out_c pid_c err_c)); !!($self->{pid} || $self->{pid_c}); } + # assuming a well-maintained repo, this should be a somewhat # accurate estimation of its size # TODO: show this in the WWW UI as a hint to potential cloners @@ -394,8 +435,8 @@ sub pub_urls { sub cat_async_begin { my ($self) = @_; - cleanup($self) if alternates_changed($self); - batch_prepare($self); + cleanup($self) if $self->alternates_changed; + $self->batch_prepare; die 'BUG: already in async' if $self->{inflight}; $self->{inflight} = []; } @@ -403,24 +444,21 @@ sub cat_async_begin { sub cat_async ($$$;$) { my ($self, $oid, $cb, $arg) = @_; my $inflight = $self->{inflight} // cat_async_begin($self); - if (scalar(@$inflight) >= MAX_INFLIGHT) { + while (scalar(@$inflight) >= MAX_INFLIGHT) { cat_async_step($self, $inflight); } - - print { $self->{out} } $oid, "\n" or fail($self, "write error: $!"); + print { $self->{out} } $oid, "\n" or 
$self->fail("write error: $!"); push(@$inflight, $oid, $cb, $arg); } -# this is safe to call inside $cb, but not guaranteed to enqueue -# returns true if successful, undef if not. sub async_prefetch { my ($self, $oid, $cb, $arg) = @_; - if (defined($self->{async_cat}) && (my $inflight = $self->{inflight})) { + if (my $inflight = $self->{inflight}) { # we could use MAX_INFLIGHT here w/o the halving, # but lets not allow one client to monopolize a git process if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) { print { $self->{out} } $oid, "\n" or - fail($self, "write error: $!"); + $self->fail("write error: $!"); return push(@$inflight, $oid, $cb, $arg); } } @@ -451,6 +489,57 @@ sub modified ($) { $modified || time; } +# for grokmirror, which doesn't read gitweb.description +# templates/hooks--update.sample and git-multimail in git.git +# only match "Unnamed repository", not the full contents of +# templates/this--description in git.git +sub manifest_entry { + my ($self, $epoch, $default_desc) = @_; + my ($fh, $pid) = $self->popen('show-ref'); + my $dig = Digest::SHA->new(1); + while (read($fh, my $buf, 65536)) { + $dig->add($buf); + } + close $fh; + waitpid($pid, 0); + return if $?; # empty, uninitialized git repo + my $git_dir = $self->{git_dir}; + my $ent = { + fingerprint => $dig->hexdigest, + reference => undef, + modified => modified($self), + }; + chomp(my $owner = $self->qx('config', 'gitweb.owner')); + utf8::decode($owner); + $ent->{owner} = $owner eq '' ? 
undef : $owner; + my $desc = ''; + if (open($fh, '<', "$git_dir/description")) { + local $/ = "\n"; + chomp($desc = <$fh>); + utf8::decode($desc); + } + $desc = 'Unnamed repository' if $desc eq ''; + if (defined $epoch && $desc =~ /\AUnnamed repository/) { + $desc = "$default_desc [epoch $epoch]"; + } + $ent->{description} = $desc; + if (open($fh, '<', "$git_dir/objects/info/alternates")) { + # n.b.: GitPython doesn't seem to handle comments or C-quoted + # strings like native git does; and we don't for now, either. + local $/ = "\n"; + chomp(my @alt = <$fh>); + + # grokmirror only supports 1 alternate for "reference", + if (scalar(@alt) == 1) { + my $objdir = "$git_dir/objects"; + my $ref = File::Spec->rel2abs($alt[0], $objdir); + $ref =~ s!/[^/]+/?\z!!; # basename + $ent->{reference} = $ref; + } + } + $ent; +} + 1; __END__ =pod diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index 5f785df7..dc97af16 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -3,40 +3,92 @@ # # internal class used by PublicInbox::Git + PublicInbox::DS # This parses the output pipe of "git cat-file --batch" -# -# Note: this does NOT set the non-blocking flag, we expect `git cat-file' -# to be a local process, and git won't start writing a blob until it's -# fully read. So minimize context switching and read as much as possible -# and avoid holding a buffer in our heap any longer than it has to live. 
package PublicInbox::GitAsyncCat; use strict; use parent qw(PublicInbox::DS Exporter); +use POSIX qw(WNOHANG); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(git_async_cat); - -sub _add { - my ($class, $git) = @_; - $git->batch_prepare; - my $self = bless { git => $git }, $class; - $self->SUPER::new($git->{in}, EPOLLIN|EPOLLET); - \undef; # this is a true ref() +our @EXPORT = qw(git_async_cat git_async_prefetch); +use PublicInbox::Git (); + +our $GCF2C; # singleton PublicInbox::Gcf2Client + +sub close { + my ($self) = @_; + + if (my $gitish = delete $self->{gitish}) { + PublicInbox::Git::cat_async_abort($gitish); + } + $self->SUPER::close; # PublicInbox::DS::close } sub event_step { my ($self) = @_; - my $git = $self->{git}; - return $self->close if ($git->{in} // 0) != ($self->{sock} // 1); - my $inflight = $git->{inflight}; + my $gitish = $self->{gitish} or return; + return $self->close if ($gitish->{in} // 0) != ($self->{sock} // 1); + my $inflight = $gitish->{inflight}; if ($inflight && @$inflight) { - $git->cat_async_step($inflight); - $self->requeue if @$inflight || exists $git->{cat_rbuf}; + $gitish->cat_async_step($inflight); + + # child death? 
+ if (($gitish->{in} // 0) != ($self->{sock} // 1)) { + $self->close; + } elsif (@$inflight || exists $gitish->{cat_rbuf}) { + # ok, more to do, requeue for fairness + $self->requeue; + } + } elsif ((my $pid = waitpid($gitish->{pid}, WNOHANG)) > 0) { + # May happen if the child process is killed by a BOFH + # (or segfaults) + delete $gitish->{pid}; + warn "E: gitish $pid exited with \$?=$?\n"; + $self->close; } } sub git_async_cat ($$$$) { my ($git, $oid, $cb, $arg) = @_; - $git->cat_async($oid, $cb, $arg); - $git->{async_cat} //= _add(__PACKAGE__, $git); + my $gitish = $GCF2C //= eval { + require PublicInbox::Gcf2; + require PublicInbox::Gcf2Client; + PublicInbox::Gcf2Client::new(); + } // 0; # 0: do not retry if libgit2 or Inline::C are missing + if ($gitish) { # Gcf2 active, {inflight} may be unset due to errors + $GCF2C->{inflight} or + $gitish = $GCF2C = PublicInbox::Gcf2Client::new(); + $oid .= " $git->{git_dir}"; + } else { + $gitish = $git; + } + $gitish->cat_async($oid, $cb, $arg); + $gitish->{async_cat} //= do { + # read-only end of pipe (Gcf2Client is write-only end) + my $self = bless { gitish => $gitish }, __PACKAGE__; + $gitish->{in}->blocking(0); + $self->SUPER::new($gitish->{in}, EPOLLIN|EPOLLET); + \undef; # this is a true ref() + }; +} + +# this is safe to call inside $cb, but not guaranteed to enqueue +# returns true if successful, undef if not. 
+sub git_async_prefetch { + my ($git, $oid, $cb, $arg) = @_; + if ($GCF2C) { + if ($GCF2C->{async_cat} && !$GCF2C->{wbuf}) { + $oid .= " $git->{git_dir}"; + return $GCF2C->cat_async($oid, $cb, $arg); + } + } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) { + # we could use MAX_INFLIGHT here w/o the halving, + # but lets not allow one client to monopolize a git process + if (@$inflight < int(PublicInbox::Git::MAX_INFLIGHT/2)) { + print { $git->{out} } $oid, "\n" or + $git->fail("write error: $!"); + return push(@$inflight, $oid, $cb, $arg); + } + } + undef; } 1; diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index 20030433..5f701673 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -169,7 +169,7 @@ sub async_blob_cb { # git->cat_async callback if (!defined($oid)) { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $self->{-inbox}->{inboxdir}\n"; + warn "E: $smsg->{blob} missing in $self->{ibx}->{inboxdir}\n"; return $http->next_step($self->can('async_next')); } $smsg->{blob} eq $oid or bail($self, "BUG: $smsg->{blob} != $oid"); @@ -180,7 +180,7 @@ sub async_blob_cb { # git->cat_async callback sub smsg_blob { my ($self, $smsg) = @_; - git_async_cat($self->{-inbox}->git, $smsg->{blob}, + git_async_cat($self->{ibx}->git, $smsg->{blob}, \&async_blob_cb, $self); } diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index c9a024d6..2af5ab0c 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -627,7 +627,7 @@ sub fetch_blob_cb { # called by git->cat_async via git_async_cat } my $pre; if (!$self->{wbuf} && (my $nxt = $msgs->[0])) { - $pre = $ibx->git->async_prefetch($nxt->{blob}, + $pre = git_async_prefetch($ibx->git, $nxt->{blob}, \&fetch_blob_cb, $fetch_arg); } fetch_run_ops($self, $smsg, $bref, $ops, $partial); @@ -1110,7 +1110,7 @@ sub search_uid_range { # long_response 1; 
# more } -sub parse_query ($$) { +sub parse_imap_query ($$) { my ($self, $query) = @_; my $q = PublicInbox::IMAPsearchqp::parse($self, $query); if (ref($q)) { @@ -1122,37 +1122,10 @@ sub parse_query ($$) { $q; } -sub refill_xap ($$$$) { - my ($self, $uids, $range_info, $q) = @_; - my ($beg, $end) = @$range_info; - my $srch = $self->{ibx}->search; - my $opt = { mset => 2, limit => 1000 }; - my $mset = $srch->mset("$q uid:$beg..$end", $opt); - @$uids = @{$srch->mset_to_artnums($mset)}; - if (@$uids) { - $range_info->[0] = $uids->[-1] + 1; # update $beg - return; # possibly more - } - 0; # all done -} - -sub search_xap_range { # long_response - my ($self, $tag, $q, $range_info, $want_msn) = @_; - my $uids = []; - if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) { - $err ||= 'OK Search done'; - $self->write("\r\n$tag $err\r\n"); - return; - } - msn_convert($self, $uids) if $want_msn; - $self->msg_more(join(' ', '', @$uids)); - 1; # more -} - sub search_common { my ($self, $tag, $query, $want_msn) = @_; my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n"; - my $q = parse_query($self, $query); + my $q = parse_imap_query($self, $query); return "$tag $q\r\n" if !ref($q); my ($sql, $range_info) = delete @$q{qw(sql range_info)}; if (!scalar(keys %$q)) { # overview.sqlite3 @@ -1160,11 +1133,17 @@ sub search_common { long_response($self, \&search_uid_range, $tag, $sql, $range_info, $want_msn); } elsif ($q = $q->{xap}) { - $self->{ibx}->search or + my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; - $self->msg_more('* SEARCH'); - long_response($self, \&search_xap_range, - $tag, $q, $range_info, $want_msn); + my $opt = { + relevance => -1, + limit => UID_SLICE, + uid_range => $range_info + }; + my $mset = $srch->mset($q, $opt); + my $uids = $srch->mset_to_artnums($mset, $opt); + msn_convert($self, $uids) if $want_msn; + "* SEARCH @$uids\r\n$tag OK Search done\r\n"; } else { "$tag BAD Error\r\n"; } diff 
--git a/lib/PublicInbox/IMAPD.pm b/lib/PublicInbox/IMAPD.pm index 3c211ee1..fb945847 100644 --- a/lib/PublicInbox/IMAPD.pm +++ b/lib/PublicInbox/IMAPD.pm @@ -19,33 +19,34 @@ sub new { err => \*STDERR, out => \*STDOUT, # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } - # pi_config => PublicInbox::Config + # pi_cfg => PublicInbox::Config # idler => PublicInbox::InboxIdle }, $class; } -sub imapd_refresh_ibx { # pi_config->each_inbox cb +sub imapd_refresh_ibx { # pi_cfg->each_inbox cb my ($ibx, $imapd) = @_; my $ngname = $ibx->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - return; - } elsif ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! || - $ngname =~ /\.[0-9]+\z/) { + + # We require lower-case since IMAP mailbox names are + # case-insensitive (but -nntpd matches INN in being + # case-sensitive + if ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! || + # don't confuse with 50K slices + $ngname =~ /\.[0-9]+\z/) { warn "mailbox name invalid: newsgroup=`$ngname'\n"; return; } $ibx->over or return; $ibx->{over} = undef; - my $mm = $ibx->mm or return; - $ibx->{mm} = undef; # RFC 3501 2.3.1.1 - "A good UIDVALIDITY value to use in # this case is a 32-bit representation of the creation # date/time of the mailbox" - defined($ibx->{uidvalidity} = $mm->created_at) or return; - PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max // 0); + eval { $ibx->uidvalidity }; + my $mm = delete($ibx->{mm}) or return; + defined($ibx->{uidvalidity}) or return; + PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max); # preload to avoid fragmentation: $ibx->description; @@ -59,7 +60,7 @@ sub imapd_refresh_ibx { # pi_config->each_inbox cb } sub imapd_refresh_finalize { - my ($imapd, $pi_config) = @_; + my ($imapd, $pi_cfg) = @_; my $mailboxes; if (my $next = delete $imapd->{imapd_next}) { $imapd->{mailboxes} = delete $next->{mailboxes}; @@ -77,40 +78,40 @@ sub imapd_refresh_finalize { qq[* LIST 
(\\Has${no}Children) "." $u\r\n] } keys %$mailboxes ]; - $imapd->{pi_config} = $pi_config; + $imapd->{pi_cfg} = $pi_cfg; if (my $idler = $imapd->{idler}) { - $idler->refresh($pi_config); + $idler->refresh($pi_cfg); } } -sub imapd_refresh_step { # pi_config->iterate_start cb - my ($pi_config, $section, $imapd) = @_; +sub imapd_refresh_step { # pi_cfg->iterate_start cb + my ($pi_cfg, $section, $imapd) = @_; if (defined($section)) { return if $section !~ m!\Apublicinbox\.([^/]+)\z!; - my $ibx = $pi_config->lookup_name($1) or return; + my $ibx = $pi_cfg->lookup_name($1) or return; imapd_refresh_ibx($ibx, $imapd->{imapd_next}); } else { # undef == "EOF" - imapd_refresh_finalize($imapd, $pi_config); + imapd_refresh_finalize($imapd, $pi_cfg); } } sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = PublicInbox::Config->new; + my $pi_cfg = PublicInbox::Config->new; if ($sig) { # SIGHUP is handled through the event loop $self->{imapd_next} = { dummies => {}, mailboxes => {} }; - my $iter = PublicInbox::ConfigIter->new($pi_config, + my $iter = PublicInbox::ConfigIter->new($pi_cfg, \&imapd_refresh_step, $self); $iter->event_step; } else { # initial start is synchronous $self->{dummies} = {}; - $pi_config->each_inbox(\&imapd_refresh_ibx, $self); - imapd_refresh_finalize($self, $pi_config); + $pi_cfg->each_inbox(\&imapd_refresh_ibx, $self); + imapd_refresh_finalize($self, $pi_cfg); } } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); } 1; diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm index ce75b46a..c55c5c36 100644 --- a/lib/PublicInbox/IdxStack.pm +++ b/lib/PublicInbox/IdxStack.pm @@ -6,19 +6,27 @@ package PublicInbox::IdxStack; use v5.10.1; use strict; use Fcntl qw(:seek); -use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*'; +use constant PACK_FMT => eval { pack('Q', 1) } ? 
'A1QQH*H*' : 'A1IIH*H*'; # start off in write-only mode sub new { open(my $io, '+>', undef) or die "open: $!"; + # latest_cmt is still useful when the newest revision is a `d'(elete), + # otherwise we favor $sync->{latest_cmt} for checkpoints and {quit} bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__ } # file_char = [d|m] sub push_rec { - my ($self, $file_char, $at, $ct, $blob_oid) = @_; - my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid); - $self->{rec_size} //= length($rec); + my ($self, $file_char, $at, $ct, $blob_oid, $cmt_oid) = @_; + my $rec = pack(PACK_FMT, $file_char, $at, $ct, $blob_oid, $cmt_oid); + $self->{unpack_fmt} //= do { + my $len = length($cmt_oid); + my $fmt = PACK_FMT; + $fmt =~ s/H\*/H$len/g; + $self->{rec_size} = length($rec); + $fmt; + }; print { $self->{wr} } $rec or die "print: $!"; $self->{tot_size} += length($rec); } @@ -46,7 +54,7 @@ sub pop_rec { my $r = read($io, my $buf, $sz); defined($r) or die "read: $!"; $r == $sz or die "read($r != $sz)"; - unpack(FMT, $buf); + unpack($self->{unpack_fmt}, $buf); } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2cb4896a..e0a84bfd 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -48,7 +48,7 @@ sub gfi_start { return ($self->{in}, $self->{out}) if $self->{pid}; - my (@ret, $out_r, $out_w); + my ($in_r, $pid, $out_r, $out_w); pipe($out_r, $out_w) or die "pipe failed: $!"; $self->lock_acquire; @@ -56,27 +56,28 @@ sub gfi_start { my ($git, $ref) = @$self{qw(git ref)}; local $/ = "\n"; chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?; if ($self->{path_type} ne '2/38' && $self->{tip}) { local $/ = "\0"; my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref); + die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" 
if $?; chomp @t; $self->{-tree} = { map { $_ => 1 } @t }; } my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(fast-import --quiet --done --date-format=raw)); - my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); + ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); $out_w->autoflush(1); $self->{in} = $in_r; $self->{out} = $out_w; $self->{pid} = $pid; $self->{nchg} = 0; - @ret = ($in_r, $out_w); }; if ($@) { $self->lock_release; die $@; } - @ret; + ($in_r, $out_w); } sub wfail () { die "write to fast-import failed: $!" } diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index e9efd29d..af6380a7 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -4,10 +4,10 @@ # Represents a public-inbox (which may have multiple mailing addresses) package PublicInbox::Inbox; use strict; -use warnings; use PublicInbox::Git; use PublicInbox::MID qw(mid2path); use PublicInbox::Eml; +use List::Util qw(max); # Long-running "git-cat-file --batch" processes won't notice # unlinked packs, so we need to restart those processes occasionally. 
@@ -74,18 +74,8 @@ sub _cleanup_later ($) { $CLEANUP->{"$self"} = $self; } -sub _set_uint ($$$) { - my ($opts, $field, $default) = @_; - my $val = $opts->{$field}; - if (defined $val) { - $val = $val->[-1] if ref($val) eq 'ARRAY'; - $val = undef if $val !~ /\A[0-9]+\z/; - } - $opts->{$field} = $val || $default; -} - sub _set_limiter ($$$) { - my ($self, $pi_config, $pfx) = @_; + my ($self, $pi_cfg, $pfx) = @_; my $lkey = "-${pfx}_limiter"; $self->{$lkey} ||= do { # full key is: publicinbox.$NAME.httpbackendmax @@ -96,7 +86,7 @@ sub _set_limiter ($$$) { require PublicInbox::Qspawn; $lim = PublicInbox::Qspawn::Limiter->new($val); } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { - $lim = $pi_config->limiter($val); + $lim = $pi_cfg->limiter($val); warn "$mkey limiter=$val not found\n" if !$lim; } else { warn "$mkey limiter=$val not understood\n"; @@ -110,14 +100,15 @@ sub new { my $v = $opts->{address} ||= [ 'public-inbox@example.com' ]; my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; - my $pi_config = delete $opts->{-pi_config}; - _set_limiter($opts, $pi_config, 'httpbackend'); - _set_uint($opts, 'feedmax', 25); - $opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'}; - my $dir = $opts->{inboxdir}; - if (defined $dir && -f "$dir/inbox.lock") { - $opts->{version} = 2; + my $pi_cfg = delete $opts->{-pi_cfg}; + _set_limiter($opts, $pi_cfg, 'httpbackend'); + my $fmax = $opts->{feedmax}; + if (defined($fmax) && $fmax =~ /\A[0-9]+\z/) { + $opts->{feedmax} += 0; + } else { + delete $opts->{feedmax}; } + $opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'}; # allow any combination of multi-line or comma-delimited hide entries my $hide = {}; @@ -130,16 +121,18 @@ sub new { bless $opts, $class; } -sub version { $_[0]->{version} // 1 } +sub version { + $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 
2 : 1 +} sub git_epoch { - my ($self, $epoch) = @_; - $self->version == 2 or return; + my ($self, $epoch) = @_; # v2-only, callers always supply $epoch $self->{"$epoch.git"} ||= do { my $git_dir = "$self->{inboxdir}/git/$epoch.git"; + return unless -d $git_dir; my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - # no cleanup needed, we never cat-file off this, only clone + # caller must manually cleanup when done $g; }; } @@ -160,19 +153,15 @@ sub max_git_epoch { my ($self) = @_; return if $self->version < 2; my $cur = $self->{-max_git_epoch}; - my $changed = git($self)->alternates_changed; - if (!defined($cur) || $changed) { + my $changed; + if (!defined($cur) || ($changed = git($self)->alternates_changed)) { git_cleanup($self) if $changed; my $gits = "$self->{inboxdir}/git"; if (opendir my $dh, $gits) { - my $max = -1; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! or next; - $max = $1 if $1 > $max; - } - $cur = $self->{-max_git_epoch} = $max if $max >= 0; - } else { - warn "opendir $gits failed: $!\n"; + my $max = max(map { + substr($_, 0, -4) + 0; # drop ".git" suffix + } grep(/\A[0-9]+\.git\z/, readdir($dh))) // return; + $cur = $self->{-max_git_epoch} = $max; } } $cur; @@ -191,50 +180,54 @@ sub mm { }; } -sub search ($;$$) { - my ($self, $over_only, $ctx) = @_; - my $srch = $self->{search} ||= eval { +sub search { + my ($self) = @_; + my $srch = $self->{search} //= eval { _cleanup_later($self); require PublicInbox::Search; PublicInbox::Search->new($self); }; - ($over_only || eval { $srch->xdb }) ? $srch : do { - $ctx and $ctx->{env}->{'psgi.errors'}->print(<{name}' search went away unexpectedly -EOF - undef; - }; + (eval { $srch->xdb }) ? 
$srch : undef; } +# isrch is preferred for read-only interfaces if available since it +# reduces kernel cache and FD overhead +sub isrch { $_[0]->{isrch} // search($_[0]) } + sub over { $_[0]->{over} //= eval { - my $srch = search($_[0], 1) or return; + my $srch = $_[0]->{search} //= eval { + _cleanup_later($_[0]); + require PublicInbox::Search; + PublicInbox::Search->new($_[0]); + }; my $over = PublicInbox::Over->new("$srch->{xpfx}/over.sqlite3"); $over->dbh; # may fail $over; }; } + sub try_cat { my ($path) = @_; - my $rv = ''; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; - } - $rv; + open(my $fh, '<', $path) or return ''; + local $/; + <$fh> // ''; +} + +sub cat_desc ($) { + my $desc = try_cat($_[0]); + local $/ = "\n"; + chomp $desc; + utf8::decode($desc); + $desc =~ s/\s+/ /smg; + $desc eq '' ? undef : $desc; } sub description { my ($self) = @_; - ($self->{description} //= do { - my $desc = try_cat("$self->{inboxdir}/description"); - local $/ = "\n"; - chomp $desc; - utf8::decode($desc); - $desc =~ s/\s+/ /smg; - $desc eq '' ? 
undef : $desc; - }) // '($INBOX_DIR/description missing)'; + ($self->{description} //= cat_desc("$self->{inboxdir}/description")) // + '($INBOX_DIR/description missing)'; } sub cloneurl { @@ -331,7 +324,7 @@ sub msg_by_smsg ($$) { return unless defined $smsg; defined(my $blob = $smsg->{blob}) or return; - git($self)->cat_file($blob); + $self->git->cat_file($blob); } sub smsg_eml { @@ -342,39 +335,35 @@ sub smsg_eml { $eml; } -sub mid2num($$) { - my ($self, $mid) = @_; - my $mm = mm($self) or return; - $mm->num_for($mid); -} - sub smsg_by_mid ($$) { my ($self, $mid) = @_; - my $over = over($self) or return; - # favor the Message-ID we used for the NNTP article number: - defined(my $num = mid2num($self, $mid)) or return; - my $smsg = $over->get_art($num) or return; - PublicInbox::Smsg::psgi_cull($smsg); + my $over = $self->over or return; + my $smsg; + if (my $mm = $self->mm) { + # favor the Message-ID we used for the NNTP article number: + defined(my $num = $mm->num_for($mid)) or return; + $smsg = $over->get_art($num); + } else { + my ($id, $prev); + $smsg = $over->next_by_mid($mid, \$id, \$prev); + } + $smsg ? PublicInbox::Smsg::psgi_cull($smsg) : undef; } sub msg_by_mid ($$) { my ($self, $mid) = @_; - - over($self) or - return msg_by_path($self, mid2path($mid)); - my $smsg = smsg_by_mid($self, $mid); - $smsg ? msg_by_smsg($self, $smsg) : undef; + $smsg ? 
msg_by_smsg($self, $smsg) : msg_by_path($self, mid2path($mid)); } sub recent { my ($self, $opts, $after, $before) = @_; - over($self)->recent($opts, $after, $before); + $self->over->recent($opts, $after, $before); } sub modified { my ($self) = @_; - if (my $over = over($self)) { + if (my $over = $self->over) { my $msgs = $over->recent({limit => 1}); if (my $smsg = $msgs->[0]) { return $smsg->{ts}; @@ -428,4 +417,8 @@ sub on_unlock { } } +sub uidvalidity { $_[0]->{uidvalidity} //= eval { $_[0]->mm->created_at } } + +sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } + 1; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 60948bea..35aed696 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -2,13 +2,11 @@ # License: AGPL-3.0+ # fields: -# pi_config: PublicInbox::Config ref # inot: Linux::Inotify2-like object # pathmap => { inboxdir => [ ibx, watch1, watch2, watch3... ] } mapping package PublicInbox::InboxIdle; use strict; use parent qw(PublicInbox::DS); -use Cwd qw(abs_path); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; @@ -23,11 +21,7 @@ require PublicInbox::In2Tie if $ino_cls; sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my ($ibx, $self) = @_; - my $dir = abs_path($ibx->{inboxdir}); - if (!defined($dir)) { - warn "W: $ibx->{inboxdir} not watched: $!\n"; - return; - } + my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; my $lock = "$dir/".($ibx->version >= 2 ? 
'inbox.lock' : 'ssoma.lock'); @@ -65,12 +59,15 @@ I: consider increasing /proc/sys/fs/inotify/max_user_watches } sub refresh { - my ($self, $pi_config) = @_; - $pi_config->each_inbox(\&in2_arm, $self); + my ($self, $pi_cfg) = @_; + $pi_cfg->each_inbox(\&in2_arm, $self); } +# internal API for ease-of-use +sub watch_inbox { in2_arm($_[1], $_[0]) }; + sub new { - my ($class, $pi_config) = @_; + my ($class, $pi_cfg) = @_; my $self = bless {}, $class; my $inot; if ($ino_cls) { @@ -84,7 +81,7 @@ sub new { $self->{inot} = $inot; $self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...] $self->{on_unlock} = {}; # lock path => ibx - refresh($self, $pi_config); + refresh($self, $pi_cfg) if $pi_cfg; PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } @@ -95,7 +92,8 @@ sub event_step { my @events = $self->{inot}->read; # Linux::Inotify2::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { - if (my $ibx = $on_unlock->{$ev->fullname}) { + my $fn = $ev->fullname // next; # cancelled + if (my $ibx = $on_unlock->{$fn}) { $ibx->on_unlock; } } diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 752f1997..b1d5caf5 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -46,12 +46,13 @@ sub _init_v1 { require PublicInbox::Msgmap; my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create $sidx->begin_txn_lazy; + my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); if (defined $skip_artnum) { - my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); $mm->{dbh}->begin_work; $mm->skip_artnum($skip_artnum); $mm->{dbh}->commit; } + undef $mm; # ->created_at set $sidx->commit_txn_lazy; } else { open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or @@ -64,7 +65,6 @@ sub init_inbox { if ($self->version == 1) { my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); - $self->umask_prepare; $self->with_umask(\&_init_v1, $self, $skip_artnum); } else { my $v2w = 
importer($self); @@ -102,7 +102,7 @@ sub filter { $im->done; } - my @args = (-inbox => $self); + my @args = (ibx => $self); # basic line splitting, only # Perhaps we can have proper quote splitting one day... ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; @@ -259,7 +259,7 @@ sub _umask_for { sub with_umask { my ($self, $cb, @arg) = @_; - my $old = umask $self->{umask}; + my $old = umask($self->{umask} //= umask_prepare($self)); my $rv = eval { $cb->(@arg) }; my $err = $@; umask $old; @@ -270,8 +270,7 @@ sub with_umask { sub umask_prepare { my ($self) = @_; my $perm = _git_config_perm($self); - my $umask = _umask_for($perm); - $self->{umask} = $umask; + _umask_for($perm); } sub cleanup ($) { @@ -287,15 +286,24 @@ sub warn_ignore { # PublicInbox::MsgTime || $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/ || $s =~ /^bad Date: .+? in / + # Encode::Unicode::UTF7 + || $s =~ /^Bad UTF7 data escape at / } # this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." sub warn_ignore_cb { - my $cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $cb = $SIG{__WARN__} // \&CORE::warn; sub { return if warn_ignore(@_); $cb->(@_); } } +# v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove +sub git_dir_latest { + my ($self, $max) = @_; + defined($$max = $self->max_git_epoch) ? + "$self->{inboxdir}/git/$$max.git" : undef; +} + 1; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm new file mode 100644 index 00000000..7ca2f9e4 --- /dev/null +++ b/lib/PublicInbox/Isearch.pm @@ -0,0 +1,127 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# Provides everything the PublicInbox::Search object does; +# but uses global ExtSearch (->ALL) with an eidx_key query to +# emulate per-Inbox search using ->ALL. 
+package PublicInbox::Isearch; +use strict; +use v5.10.1; +use PublicInbox::ExtSearch; +use PublicInbox::Search; + +sub new { + my (undef, $ibx, $es) = @_; + bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; +} + +sub _ibx_id ($) { + my ($self) = @_; + my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($self->{eidx_key}); + $sth->fetchrow_array // + die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; +} + + +sub mset { + my ($self, $str, $opt) = @_; + my %opt = $opt ? %$opt : (); + $opt{eidx_key} = $self->{eidx_key}; + if (my $uid_range = $opt{uid_range}) { + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->{over}->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= 0xffffffff; + $r[0] //= 0; + } + $opt{uid_range} = \@r; + } + $self->{es}->mset($str, \%opt); +} + +sub mset_to_artnums { + my ($self, $mset, $opt) = @_; + my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $qmarks = join(',', map { '?' } @$docids); + if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING + my $range = ''; + my @r; + if (my $r = $opt->{uid_range}) { + $range = 'AND xnum >= ? AND xnum <= ?'; + @r = @$r; + } + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r); +SELECT xnum FROM xref3 WHERE ibx_id = ? 
AND docid IN ($qmarks) $range +ORDER BY xnum ASC + + return [ map { $_->[0] } @$rows ]; + } + + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids); +SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) + + my $i = -1; + my %order = map { $_ => ++$i } @$docids; + my @xnums; + for my $row (@$rows) { # @row = ($docid, $xnum) + my $idx = delete($order{$row->[0]}) // next; + $xnums[$idx] = $row->[1]; + } + if (scalar keys %order) { + warn "W: $self->{es}->{topdir} #", + join(', ', sort { $a <=> $b } keys %order), + " not mapped to `$self->{eidx_key}'\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + @xnums = grep { defined } @xnums; + } + \@xnums; +} + +sub mset_to_smsg { + my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx + my $xnums = mset_to_artnums($self, $mset); + my $i = -1; + my %order = map { $_ => ++$i } @$xnums; + my $unordered = $ibx->over->get_all(@$xnums); + my @msgs; + for my $smsg (@$unordered) { + my $idx = delete($order{$smsg->{num}}) // do { + warn "W: $ibx->{inboxdir} #$smsg->{num}\n"; + next; + }; + $msgs[$idx] = $smsg; + } + if (scalar keys %order) { + warn "W: $ibx->{inboxdir} #", + join(', ', sort { $a <=> $b } keys %order), + " no longer valid\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + } + wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs; +} + +sub has_threadid { 1 } + +sub help { $_[0]->{es}->help } + +1; diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index fa4a2ad8..0377a877 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -83,7 +83,7 @@ sub set_list_headers { } sub inboxes_for_list_id ($$) { - my ($klass, $config, $simple) = @_; + my ($klass, $pi_cfg, $simple) = @_; # newer Email::Simple allows header_raw, as does Email::MIME: my @list_ids = $simple->can('header_raw') ? 
@@ -92,7 +92,7 @@ sub inboxes_for_list_id ($$) { my @dests; for my $list_id (@list_ids) { $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next; - if (my $ibx = $config->lookup_list_id($1)) { + if (my $ibx = $pi_cfg->lookup_list_id($1)) { push @dests, $ibx; } } diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index 74820fb5..e02450fa 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -6,21 +6,12 @@ package PublicInbox::ManifestJsGz; use strict; use v5.10.1; use parent qw(PublicInbox::WwwListing); -use Digest::SHA (); -use File::Spec (); use bytes (); # length -use PublicInbox::Inbox; -use PublicInbox::Git; +use PublicInbox::Config; use IO::Compress::Gzip qw(gzip); use HTTP::Date qw(time2str); -*try_cat = \&PublicInbox::Inbox::try_cat; -our $json; -for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { - eval "require $mod" or next; - # ->ascii encodes non-ASCII to "\uXXXX" - $json = $mod->new->ascii(1) and last; -} +our $json = PublicInbox::Config::json(); # called by WwwListing sub url_regexp { @@ -30,76 +21,29 @@ sub url_regexp { $ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain'); } -sub fingerprint ($) { - my ($git) = @_; - # TODO: convert to qspawn for fairness when there's - # thousands of repos - my ($fh, $pid) = $git->popen('show-ref'); - my $dig = Digest::SHA->new(1); - while (read($fh, my $buf, 65536)) { - $dig->add($buf); - } - close $fh; - waitpid($pid, 0); - return if $?; # empty, uninitialized git repo - $dig->hexdigest; +sub inject_entry ($$$;$) { + my ($ctx, $url_path, $ent, $git_dir) = @_; + $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path; + my $modified = $ent->{modified}; + $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0); + $ctx->{manifest}->{$url_path} = $ent; } sub manifest_add ($$;$$) { my ($ctx, $ibx, $epoch, $default_desc) = @_; my $url_path = "/$ibx->{name}"; - my $git_dir = $ibx->{inboxdir}; + my $git; if (defined $epoch) { - $git_dir .= 
"/git/$epoch.git"; $url_path .= "/git/$epoch.git"; + $git = $ibx->git_epoch($epoch) or return; + } else { + $git = $ibx->git; } - return unless -d $git_dir; - my $git = PublicInbox::Git->new($git_dir); - my $fingerprint = fingerprint($git) or return; # no empty repos - - chomp(my $owner = $git->qx('config', 'gitweb.owner')); - chomp(my $desc = try_cat("$git_dir/description")); - utf8::decode($owner); - utf8::decode($desc); - $owner = undef if $owner eq ''; - $desc = 'Unnamed repository' if $desc eq ''; - - # templates/hooks--update.sample and git-multimail in git.git - # only match "Unnamed repository", not the full contents of - # templates/this--description in git.git - if ($desc =~ /\AUnnamed repository/) { - $desc = "$default_desc [epoch $epoch]" if defined($epoch); - } - - my $reference; - chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); - if ($alt) { - # n.b.: GitPython doesn't seem to handle comments or C-quoted - # strings like native git does; and we don't for now, either. 
- my @alt = split(/\n+/, $alt); - - # grokmirror only supports 1 alternate for "reference", - if (scalar(@alt) == 1) { - my $objdir = "$git_dir/objects"; - $reference = File::Spec->rel2abs($alt[0], $objdir); - $reference =~ s!/[^/]+/?\z!!; # basename - } - } - $ctx->{-abs2urlpath}->{$git_dir} = $url_path; - my $modified = $git->modified; - if ($modified > ($ctx->{-mtime} // 0)) { - $ctx->{-mtime} = $modified; - } - $ctx->{manifest}->{$url_path} = { - owner => $owner, - reference => $reference, - description => $desc, - modified => $modified, - fingerprint => $fingerprint, - }; + my $ent = $git->manifest_entry($epoch, $default_desc) or return; + inject_entry($ctx, $url_path, $ent, $git->{git_dir}); } -sub ibx_entry { +sub slow_manifest_add ($$) { my ($ctx, $ibx) = @_; eval { if (defined(my $max = $ibx->max_git_epoch)) { @@ -111,6 +55,29 @@ sub ibx_entry { manifest_add($ctx, $ibx); } }; +} + +sub eidx_manifest_add ($$$) { + my ($ctx, $ALL, $ibx) = @_; + if (my $data = $ALL->misc->inbox_data($ibx)) { + $data = $json->decode($data); + delete $data->{''}; # private + while (my ($url_path, $ent) = each %$data) { + inject_entry($ctx, $url_path, $ent); + } + } else { + warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n"; + } +} + +sub ibx_entry { + my ($ctx, $ibx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL; + if ($ALL) { + eidx_manifest_add($ctx, $ALL, $ibx); + } else { + slow_manifest_add($ctx, $ibx); + } warn "E: $@" if $@; } @@ -134,7 +101,8 @@ sub psgi_triple { sub per_inbox { my ($ctx) = @_; - ibx_entry($ctx, $ctx->{-inbox}); + # only one inbox, slow is probably OK + slow_manifest_add($ctx, $ctx->{ibx}); psgi_triple($ctx); } diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 47025891..83fa7d8a 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -17,10 +17,10 @@ use PublicInbox::Eml; sub getline { my ($ctx) = @_; # ctx my $smsg = $ctx->{smsg} or return; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $eml = 
$ibx->smsg_eml($smsg) or return; my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); + $ctx->zmore(msg_hdr($ctx, $eml)); if ($n) { $ctx->translate(msg_body($eml)); } else { # last message @@ -44,9 +44,9 @@ sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; my $smsg = delete $ctx->{smsg}; # next message - $ctx->{smsg} = $ctx->{-inbox}->over->next_by_mid(@{$ctx->{next_arg}}); + $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); + $ctx->zmore(msg_hdr($ctx, $eml)); $ctx->{http_out}->write($ctx->translate(msg_body($eml))); } @@ -56,7 +56,7 @@ sub res_hdr ($$) { $fn =~ s/^re:\s+//i; $fn = to_filename($fn) // 'no-subject'; my @hdr = ('Content-Type'); - if ($ctx->{-inbox}->{obfuscate}) { + if ($ctx->{ibx}->{obfuscate}) { # obfuscation is stupid, but maybe scrapers are, too... push @hdr, 'application/mbox'; $fn .= '.mbox'; @@ -71,17 +71,17 @@ sub res_hdr ($$) { # for rare cases where v1 inboxes aren't indexed w/ ->over at all sub no_over_raw ($) { my ($ctx) = @_; - my $mref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; + my $mref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; my $eml = PublicInbox::Eml->new($mref); [ 200, res_hdr($ctx, $eml->header_str('Subject')), - [ msg_hdr($ctx, $eml, $ctx->{mid}) . msg_body($eml) ] ] + [ msg_hdr($ctx, $eml) . 
msg_body($eml) ] ] } # /$INBOX/$MESSAGE_ID/raw sub emit_raw { my ($ctx) = @_; - $ctx->{base_url} = $ctx->{-inbox}->base_url($ctx->{env}); - my $over = $ctx->{-inbox}->over or return no_over_raw($ctx); + $ctx->{base_url} = $ctx->{ibx}->base_url($ctx->{env}); + my $over = $ctx->{ibx}->over or return no_over_raw($ctx); my ($id, $prev); my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; my $smsg = $ctx->{smsg} = $over->next_by_mid(@$mip) or return; @@ -90,8 +90,8 @@ sub emit_raw { $ctx->psgi_response(200, $res_hdr); } -sub msg_hdr ($$;$) { - my ($ctx, $eml, $mid) = @_; +sub msg_hdr ($$) { + my ($ctx, $eml) = @_; my $header_obj = $eml->header_obj; # drop potentially confusing headers, ssoma already should've dropped @@ -99,34 +99,11 @@ sub msg_hdr ($$;$) { foreach my $d (qw(Lines Bytes Content-Length Status)) { $header_obj->header_set($d); } - my $ibx = $ctx->{-inbox}; - my $base = $ctx->{base_url}; - $mid = $ctx->{mid} unless defined $mid; - $mid = mid_escape($mid); - my @append = ( - 'Archived-At', "<$base$mid/>", - 'List-Archive', "<$base>", - 'List-Post', "{-primary_address}>", - ); my $crlf = $header_obj->crlf; my $buf = $header_obj->as_string; # fixup old bug from import (pre-a0c07cba0e5d8b6a) $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf; - - for (my $i = 0; $i < @append; $i += 2) { - my $k = $append[$i]; - my $v = $append[$i + 1]; - my @v = $header_obj->header_raw($k); - foreach (@v) { - if ($v eq $_) { - $v = undef; - last; - } - } - $buf .= "$k: $v$crlf" if defined $v; - } - $buf .= $crlf; + "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . 
$crlf; } sub msg_body ($) { @@ -190,7 +167,7 @@ sub all_ids_cb { sub mbox_all_ids { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $prev = 0; my $mm = $ctx->{mm} = $ibx->mm; my $ids = $mm->ids_after(\$prev) or return @@ -203,27 +180,33 @@ sub mbox_all_ids { PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all'); } +sub gone ($$) { + my ($ctx, $what) = @_; + warn "W: `$ctx->{ibx}->{inboxdir}' $what went away unexpectedly\n"; + undef; +} + sub results_cb { my ($ctx) = @_; - my $over = $ctx->{-inbox}->over or return; + my $over = $ctx->{ibx}->over or return gone($ctx, 'over'); while (1) { while (defined(my $num = shift(@{$ctx->{ids}}))) { my $smsg = $over->get_art($num) or next; return $smsg; } # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; + my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } sub results_thread_cb { my ($ctx) = @_; - my $over = $ctx->{-inbox}->over or return; + my $over = $ctx->{ibx}->over or return gone($ctx, 'over'); while (1) { while (defined(my $num = shift(@{$ctx->{xids}}))) { my $smsg = $over->get_art($num) or next; @@ -234,11 +217,11 @@ sub results_thread_cb { next if $over->expand_thread($ctx); # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; + my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } @@ -247,19 +230,19 @@ sub mbox_all { my ($ctx, $q) = @_; my $q_string = $q->{'q'}; return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{-inbox}->search or + my $srch 
= $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); - my $over = $ctx->{-inbox}->over or + my $over = $ctx->{ibx}->over or return PublicInbox::WWW::need($ctx, 'Overview'); - my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid + my $qopts = $ctx->{qopts} = { relevance => -1 }; # ORDER BY docid ASC $qopts->{thread} = 1 if $q->{t}; my $mset = $srch->mset($q_string, $qopts); $qopts->{offset} = $mset->size or return [404, [qw(Content-Type text/plain)], ["No results found\n"]]; $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts); require PublicInbox::MboxGz; my $fn; if ($q->{t} && $srch->has_threadid) { diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm index 913be6e4..7b054845 100644 --- a/lib/PublicInbox/MboxGz.pm +++ b/lib/PublicInbox/MboxGz.pm @@ -22,7 +22,7 @@ sub async_next ($) { sub mbox_gz { my ($self, $cb, $fn) = @_; $self->{cb} = $cb; - $self->{base_url} = $self->{-inbox}->base_url($self->{env}); + $self->{base_url} = $self->{ibx}->base_url($self->{env}); $self->{gz} = PublicInbox::GzipFilter::gzip_or_die(); $fn = to_filename($fn // '') // 'no-subject'; # http://www.iana.org/assignments/media-types/application/gzip @@ -37,8 +37,8 @@ sub getline { my ($self) = @_; my $cb = $self->{cb} or return; while (my $smsg = $cb->($self)) { - my $eml = $self->{-inbox}->smsg_eml($smsg) or next; - $self->zmore(msg_hdr($self, $eml, $smsg->{mid})); + my $eml = $self->{ibx}->smsg_eml($smsg) or next; + $self->zmore(msg_hdr($self, $eml)); return $self->translate(msg_body($eml)); } # signal that we're done and can return undef next call: diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm new file mode 100644 index 00000000..a04dd1c5 --- /dev/null +++ b/lib/PublicInbox/MiscIdx.pm @@ -0,0 +1,151 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# like PublicInbox::SearchIdx, but for searching for non-mail messages. 
+# Things indexed include: +# * inboxes themselves +# * epoch information +# * (maybe) git code repository information +# Expect ~100K-1M documents with no parallelism opportunities, +# so no sharding, here. +# +# See MiscSearch for read-only counterpart +package PublicInbox::MiscIdx; +use strict; +use v5.10.1; +use PublicInbox::InboxWritable; +use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat +use PublicInbox::SearchIdx qw(index_text term_generator add_val); +use PublicInbox::Spawn qw(nodatacow_dir); +use Carp qw(croak); +use File::Path (); +use PublicInbox::MiscSearch; +use PublicInbox::Config; +my $json; + +sub new { + my ($class, $eidx) = @_; + PublicInbox::SearchIdx::load_xapian_writable(); + my $mi_dir = "$eidx->{xpfx}/misc"; + File::Path::mkpath($mi_dir); + nodatacow_dir($mi_dir); + my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; + $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $json //= PublicInbox::Config::json(); + bless { + mi_dir => $mi_dir, + flags => $flags, + indexlevel => 'full', # small DB, no point in medium? + }, $class; +} + +sub begin_txn { + my ($self) = @_; + croak 'BUG: already in txn' if $self->{xdb}; # XXX make lazy? + my $wdb = $PublicInbox::Search::X{WritableDatabase}; + my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) }; + croak "Failed opening $self->{mi_dir}: $@" if $@; + $self->{xdb} = $xdb; + $xdb->begin_transaction; +} + +sub commit_txn { + my ($self) = @_; + croak 'BUG: not in txn' unless $self->{xdb}; # XXX make lazy? 
+ delete($self->{xdb})->commit_transaction; +} + +sub remove_eidx_key { + my ($self, $eidx_key) = @_; + my $xdb = $self->{xdb}; + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my @docids; # only one, unless we had bugs + for (; $head != $tail; $head++) { + push @docids, $head->get_docid; + } + for my $docid (@docids) { + $xdb->delete_document($docid); + warn "I: remove inbox docid #$docid ($eidx_key)\n"; + } +} + +# adds or updates according to $eidx_key +sub index_ibx { + my ($self, $ibx) = @_; + my $eidx_key = $ibx->eidx_key; + my $xdb = $self->{xdb}; + # Q = uniQue in Xapian terminology + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my ($docid, @drop); + for (; $head != $tail; $head++) { + if (defined $docid) { + my $i = $head->get_docid; + push @drop, $i; + warn <get_docid; + } + } + $xdb->delete_document($_) for @drop; # just in case + + my $doc = $PublicInbox::Search::X{Document}->new; + term_generator($self)->set_document($doc); + + # allow sorting by modified and uidvalidity (created at) + add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); + + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + } + + # force reread from disk, {description} could be loaded from {misc} + delete $ibx->{description}; + my $desc = $ibx->description; + + # description = S/Subject (or title) + # address = A/Author + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); + my %map = ( + address => 'A', + listid => 'XLISTID', + infourl => 'XINFOURL', + url => 'XURL' + ); + while (my ($f, $pfx) = each %map) { + for my $v (@{$ibx->{$f} // []}) { + index_text($self, $v, 1, $pfx); + } + } + my $data = {}; 
+ if (defined(my $max = $ibx->max_git_epoch)) { # v2 + my $pfx = "/$ibx->{name}/git/"; + for my $epoch (0..$max) { + my $git = $ibx->git_epoch($epoch) or return; + if (my $ent = $git->manifest_entry($epoch, $desc)) { + $data->{"$pfx$epoch.git"} = $ent; + $ent->{git_dir} = $git->{git_dir}; + } + $git->cleanup; # ->modified starts cat-file --batch + } + } elsif (my $ent = $ibx->git->manifest_entry) { # v1 + $ent->{git_dir} = $ibx->{inboxdir}; + $data->{"/$ibx->{name}"} = $ent; + } + $doc->set_data($json->encode($data)); + if (defined $docid) { + $xdb->replace_document($docid, $doc); + } else { + $xdb->add_document($doc); + } +} + +1; diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm new file mode 100644 index 00000000..6683d564 --- /dev/null +++ b/lib/PublicInbox/MiscSearch.pm @@ -0,0 +1,191 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# read-only counterpart to MiscIdx +package PublicInbox::MiscSearch; +use strict; +use v5.10.1; +use PublicInbox::Search qw(retry_reopen int_val); +my $json; + +# Xapian value columns: +our $MODIFIED = 0; +our $UIDVALIDITY = 1; # (created time) + +# avoid conflicting with message Search::prob_prefix for UI/UX reasons +my %PROB_PREFIX = ( + description => 'S', # $INBOX_DIR/description + address => 'A', + listid => 'XLISTID', + url => 'XURL', + infourl => 'XINFOURL', + name => 'XNAME', + '' => 'S A XLISTID XNAME XURL XINFOURL' +); + +sub new { + my ($class, $dir) = @_; + PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); + bless { + xdb => $PublicInbox::Search::X{Database}->new($dir) + }, $class; +} + +# read-only +sub mi_qp_new ($) { + my ($self) = @_; + my $xdb = $self->{xdb}; + my $qp = $PublicInbox::Search::X{QueryParser}->new; + $qp->set_default_op(PublicInbox::Search::OP_AND()); + $qp->set_database($xdb); + $qp->set_stemmer(PublicInbox::Search::stemmer($self)); + $qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME()); + my $cb = 
$qp->can('set_max_wildcard_expansion') // + $qp->can('set_max_expansion'); # Xapian 1.5.0+ + $cb->($qp, 100); + $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + while (my ($name, $prefix) = each %PROB_PREFIX) { + $qp->add_prefix($name, $_) for split(/ /, $prefix); + } + $qp->add_boolean_prefix('type', 'T'); + $qp; +} + +sub misc_enquire_once { # retry_reopen callback + my ($self, $qr, $opt) = @_; + my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $eq->set_query($qr); + my $desc = !$opt->{asc}; + my $rel = $opt->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + $eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + } elsif ($rel) { + $eq->set_sort_by_relevance_then_value($MODIFIED, $desc); + } else { + $eq->set_sort_by_value_then_relevance($MODIFIED, $desc); + } + $eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200); +} + +sub mset { + my ($self, $qs, $opt) = @_; + $opt ||= {}; + reopen($self); + my $qp = $self->{qp} //= mi_qp_new($self); + $qs = 'type:inbox' if $qs eq ''; + my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); + $opt->{relevance} = 1 unless exists $opt->{relevance}; + retry_reopen($self, \&misc_enquire_once, $qr, $opt); +} + +sub ibx_matches_once { # retry_reopen callback + my ($self, $qr, $by_newsgroup) = @_; + # double in case no newsgroups are configured: + my $limit = scalar(keys %$by_newsgroup) * 2; + my $opt = { limit => $limit, offset => 0, relevance => -1 }; + my $ret = {}; # newsgroup => $ibx of matches + while (1) { + my $mset = misc_enquire_once($self, $qr, $opt); + for my $mi ($mset->items) { + my $doc = $mi->get_document; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + if ($cur != $end) { + my $ng = $cur->get_termname; # eidx_key + $ng =~ s/\AQ// or warn "BUG: no `Q': $ng"; + if (my $ibx = $by_newsgroup->{$ng}) { + $ret->{$ng} = 
$ibx; + } + } else { + warn <get_docid} has no `Q' (eidx_key) term +EOF + } + } + my $nr = $mset->size; + return $ret if $nr < $limit; + $opt->{offset} += $nr; + } +} + +# returns a newsgroup => PublicInbox::Inbox mapping +sub newsgroup_matches { + my ($self, $qs, $pi_cfg) = @_; + my $qp = $self->{qp} //= mi_qp_new($self); + $qs .= ' type:inbox'; + my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); + retry_reopen($self, \&ibx_matches_once, $qr, $pi_cfg->{-by_newsgroup}); +} + +sub ibx_data_once { + my ($self, $ibx) = @_; + my $xdb = $self->{xdb}; + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); + if ($head != $tail) { + my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); + $doc->get_data; + } else { + undef; + } +} + +sub inbox_data { + my ($self, $ibx) = @_; + retry_reopen($self, \&ibx_data_once, $ibx); +} + +sub ibx_cache_load { + my ($doc, $cache) = @_; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + return if $cur == $end; + my $eidx_key = $cur->get_termname; + $eidx_key =~ s/\AQ// or return; # expired + my $ce = $cache->{$eidx_key} = {}; + $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY); + $ce->{-modified} = int_val($doc, $MODIFIED); + $ce->{description} = do { + # extract description from manifest.js.gz epoch description + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + $d; + } +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} + +# 
returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + +no warnings 'once'; +*reopen = \&PublicInbox::Search::reopen; + +1; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index f15875e3..a8c874af 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -36,8 +36,7 @@ sub new_file { create_tables($dbh); $self->created_at(time) unless $self->created_at; - my $max = $self->max // 0; - $self->num_highwater($max); + $self->num_highwater(max($self)); $dbh->commit; } $self; @@ -144,7 +143,7 @@ sub max { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MAX(num) FROM msgmap', undef, 1); $sth->execute; - $sth->fetchrow_array; + $sth->fetchrow_array // 0; } sub minmax { @@ -153,7 +152,7 @@ sub minmax { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1); $sth->execute; - ($sth->fetchrow_array, max($_[0])); + ($sth->fetchrow_array // 0, max($_[0])); } sub mid_delete { diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 2f821fa6..11a7ffb8 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -5,7 +5,7 @@ # fields: # nntpd: PublicInbox::NNTPD ref # article: per-session current article number -# ng: PublicInbox::Inbox ref +# ibx: PublicInbox::Inbox ref # long_cb: long_response private data package PublicInbox::NNTP; use strict; @@ -17,6 +17,8 @@ use PublicInbox::DS qw(now); use Digest::SHA qw(sha1_hex); use Time::Local qw(timegm timelocal); use PublicInbox::GitAsyncCat; +use PublicInbox::Address; + use constant { LINE_MAX => 512, # RFC 977 section 2.3 r501 => '501 command syntax error', @@ -31,9 +33,9 @@ use Errno qw(EAGAIN); my $ONE_MSGID = qr/\A$MID_EXTRACT\z/; my @OVERVIEW = qw(Subject From Date Message-ID References); my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes 
Lines), '') . - "Xref:full\r\n"; + "Xref:full\r\n."; my $LIST_HEADERS = join("\r\n", @OVERVIEW, - qw(:bytes :lines Xref To Cc)) . "\r\n"; + qw(:bytes :lines Xref To Cc)) . "\r\n."; my $CAPABILITIES = <<""; 101 Capability list:\r VERSION 2\r @@ -92,8 +94,7 @@ sub process_line ($$) { err($self, 'error from: %s (%s)', $l, $err); $res = '503 program fault - command not performed'; } - return 0 unless defined $res; - res($self, $res); + defined($res) ? res($self, $res) : 0; } # The keyword argument is not used (rfc3977 5.2.2) @@ -109,9 +110,7 @@ sub cmd_capabilities ($;$) { sub cmd_mode ($$) { my ($self, $arg) = @_; - $arg = uc $arg; - return r501 unless $arg eq 'READER'; - '201 Posting prohibited'; + uc($arg) eq 'READER' ? '201 Posting prohibited' : r501; } sub cmd_slave ($) { '202 slave status noted' } @@ -120,46 +119,66 @@ sub cmd_xgtitle ($;$) { my ($self, $wildmat) = @_; more($self, '282 list of groups and descriptions follows'); list_newsgroups($self, $wildmat); - '.' } -sub list_overview_fmt ($) { - my ($self) = @_; - $self->msg_more($OVERVIEW_FMT); -} +sub list_overview_fmt ($) { $OVERVIEW_FMT } -sub list_headers ($;$) { - my ($self) = @_; - $self->msg_more($LIST_HEADERS); +sub list_headers ($;$) { $LIST_HEADERS } + +sub list_active_i { # "LIST ACTIVE" and also just "LIST" (no args) + my ($self, $groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $ibx; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + for my $ngname (@window) { + $ibx = $groups->{$ngname} and group_line($self, $ibx); + } + scalar(@$groupnames); # continue if there's more } -sub list_active ($;$) { +sub list_active ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - group_line($self, $ng); + long_response($self, \&list_active_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); +} + +sub list_active_times_i { + my ($self, 
$groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + for my $ngname (@window) { + my $ibx = $groups->{$ngname} or next; + my $c = eval { $ibx->uidvalidity } // time; + more($self, "$ngname $c <$ibx->{-primary_address}>"); } + scalar(@$groupnames); # continue if there's more } -sub list_active_times ($;$) { +sub list_active_times ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $c = eval { $ng->mm->created_at } || time; - more($self, "$ng->{newsgroup} $c $ng->{-primary_address}"); + long_response($self, \&list_active_times_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); +} + +sub list_newsgroups_i { + my ($self, $groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + my $ibx; + for my $ngname (@window) { + $ibx = $groups->{$ngname} and + more($self, "$ngname ".$ibx->description); } + scalar(@$groupnames); # continue if there's more } -sub list_newsgroups ($;$) { +sub list_newsgroups ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $d = $ng->description; - more($self, "$ng->{newsgroup} $d"); - } + long_response($self, \&list_newsgroups_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); } # LIST SUBSCRIPTIONS, DISTRIB.PATS are not supported @@ -168,6 +187,7 @@ sub cmd_list ($;$$) { if (scalar @args) { my $arg = shift @args; $arg =~ tr/A-Z./a-z_/; + my $ret = $arg eq 'active'; $arg = "list_$arg"; $arg = $self->can($arg); return r501 unless $arg && args_ok($arg, scalar @args); @@ -175,24 +195,22 @@ sub cmd_list ($;$$) { $arg->($self, @args); } else { more($self, '215 list of newsgroups follows'); - foreach my $ng 
(@{$self->{nntpd}->{grouplist}}) { - group_line($self, $ng); - } + long_response($self, \&list_active_i, [ # copy array + @{$self->{nntpd}->{groupnames}} ]); } - '.' } sub listgroup_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end, 'num'); + my $r = $self->{ibx}->mm->msg_range($beg, $end, 'num'); scalar(@$r) or return; - more($self, join("\r\n", map { $_->[0] } @$r)); + $self->msg_more(join('', map { "$_->[0]\r\n" } @$r)); 1; } sub listgroup_all_i { my ($self, $num) = @_; - my $ary = $self->{ng}->mm->ids_after($num); + my $ary = $self->{ibx}->mm->ids_after($num); scalar(@$ary) or return; more($self, join("\r\n", @$ary)); 1; @@ -205,7 +223,7 @@ sub cmd_listgroup ($;$$) { return $res if ($res !~ /\A211 /); more($self, $res); } - $self->{ng} or return '412 no newsgroup selected'; + $self->{ibx} or return '412 no newsgroup selected'; if (defined $range) { my $r = get_range($self, $range); return $r unless ref $r; @@ -242,9 +260,22 @@ sub parse_time ($$;$) { } sub group_line ($$) { - my ($self, $ng) = @_; - my ($min, $max) = $ng->mm->minmax; - more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max; + my ($self, $ibx) = @_; + my ($min, $max) = $ibx->mm->minmax; + more($self, "$ibx->{newsgroup} $max $min n"); +} + +sub newgroups_i { + my ($self, $ts, $i, $groupnames) = @_; + my $end = $$i + 100; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + while ($$i < $end) { + my $ngname = $groupnames->[$$i++] // return; + my $ibx = $groups->{$ngname} or next; # expired on reload + next unless (eval { $ibx->uidvalidity } // 0) > $ts; + group_line($self, $ibx); + } + 1; } sub cmd_newgroups ($$$;$$) { @@ -254,12 +285,8 @@ sub cmd_newgroups ($$$;$$) { # TODO dists more($self, '231 list of new newsgroups follows'); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - my $c = eval { $ng->mm->created_at } || 0; - next unless $c > $ts; - group_line($self, $ng); - } - '.' 
+ long_response($self, \&newgroups_i, $ts, \(my $i = 0), + $self->{nntpd}->{groupnames}); } sub wildmat2re (;$) { @@ -294,23 +321,27 @@ sub ngpat2re (;$) { } sub newnews_i { - my ($self, $overs, $ts, $prev) = @_; - my $over = $overs->[0]; - my $msgs = $over->query_ts($ts, $$prev); - if (scalar @$msgs) { - more($self, '<' . - join(">\r\n<", map { $_->{mid} } @$msgs ). - '>'); - $$prev = $msgs->[-1]->{num}; - } else { - shift @$overs; - if (@$overs) { # continue onto next newsgroup - $$prev = 0; - return 1; - } else { # break out of the long response. - return; + my ($self, $names, $ts, $prev) = @_; + my $ngname = $names->[0]; + if (my $ibx = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}->{$ngname}) { + if (my $over = $ibx->over) { + my $msgs = $over->query_ts($ts, $$prev); + if (scalar @$msgs) { + $self->msg_more(join('', map { + "<$_->{mid}>\r\n"; + } @$msgs)); + $$prev = $msgs->[-1]->{num}; + return 1; # continue on current group + } } } + shift @$names; + if (@$names) { # continue onto next newsgroup + $$prev = 0; + 1; + } else { # all done, break out of the long_response + undef; + } } sub cmd_newnews ($$$$;$$) { @@ -321,30 +352,22 @@ sub cmd_newnews ($$$$;$$) { my ($keep, $skip) = split('!', $newsgroups, 2); ngpat2re($keep); ngpat2re($skip); - my @overs; - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $keep or next; - $ng->{newsgroup} =~ $skip and next; - my $over = $ng->over or next; - push @overs, $over; - }; - return '.' unless @overs; - + my @names = grep(!/$skip/, grep(/$keep/, + @{$self->{nntpd}->{groupnames}})); + return '.' 
unless scalar(@names); my $prev = 0; - long_response($self, \&newnews_i, \@overs, $ts, \$prev); + long_response($self, \&newnews_i, \@names, $ts, \$prev); } sub cmd_group ($$) { my ($self, $group) = @_; - my $no_such = '411 no such news group'; my $nntpd = $self->{nntpd}; - my $ng = $nntpd->{groups}->{$group} or return $no_such; + my $ibx = $nntpd->{pi_cfg}->{-by_newsgroup}->{$group} or + return '411 no such news group'; $nntpd->idler_start; - $self->{ng} = $ng; - my ($min, $max) = $ng->mm->minmax; - $min ||= 0; - $max ||= 0; + $self->{ibx} = $ibx; + my ($min, $max) = $ibx->mm->minmax; $self->{article} = $min; my $est_size = $max - $min; "211 $est_size $min $max $group"; @@ -352,13 +375,13 @@ sub cmd_group ($$) { sub article_adj ($$) { my ($self, $off) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; + my $ibx = $self->{ibx} or return '412 no newsgroup selected'; my $n = $self->{article}; defined $n or return '420 no current article has been selected'; $n += $off; - my $mid = $ng->mm->mid_for($n); + my $mid = $ibx->mm->mid_for($n); unless ($mid) { $n = $off > 0 ? 'next' : 'previous'; return "421 no $n article in this group"; @@ -374,8 +397,8 @@ sub cmd_last ($) { article_adj($_[0], -1) } # the single-point-of-failure a single server provides. sub cmd_post ($) { my ($self) = @_; - my $ng = $self->{ng}; - $ng ? "440 mailto:$ng->{-primary_address} to post" + my $ibx = $self->{ibx}; + $ibx ? "440 mailto:$ibx->{-primary_address} to post" : '440 posting not allowed' } @@ -395,19 +418,41 @@ sub header_append ($$$) { $hdr->header_set($k, @v, $v); } -sub xref ($$$$) { - my ($self, $ng, $n, $mid) = @_; - my $ret = $self->{nntpd}->{servername} . 
" $ng->{newsgroup}:$n"; +sub xref_by_tc ($$$) { + my ($xref, $pi_cfg, $smsg) = @_; + my $by_addr = $pi_cfg->{-by_addr}; + my $mid = $smsg->{mid}; + for my $f (qw(to cc)) { + my @ibxs = map { + $by_addr->{lc($_)} // () + } (PublicInbox::Address::emails($smsg->{$f} // '')); + for my $ibx (@ibxs) { + my $ngname = $ibx->{newsgroup} // next; + next if defined $xref->{$ngname}; + $xref->{$ngname} = eval { $ibx->mm->num_for($mid) }; + } + } +} - # num_for is pretty cheap and sometimes we'll lookup the existence - # of an article without getting even the OVER info. In other words, - # I'm not sure if its worth optimizing by scanning To:/Cc: and - # PublicInbox::ExtMsg on the PSGI end is just as expensive - foreach my $other (@{$self->{nntpd}->{grouplist}}) { - next if $ng eq $other; - my $num = eval { $other->mm->num_for($mid) } or next; - $ret .= " $other->{newsgroup}:$num"; +sub xref ($$$) { + my ($self, $cur_ibx, $smsg) = @_; + my $nntpd = $self->{nntpd}; + my $cur_ng = $cur_ibx->{newsgroup}; + my $xref; + if (my $ALL = $nntpd->{pi_cfg}->ALL) { + $xref = $ALL->nntp_xref_for($cur_ibx, $smsg); + xref_by_tc($xref, $nntpd->{pi_cfg}, $smsg); + } else { # slow path + $xref = { $cur_ng => $smsg->{num} }; + my $mid = $smsg->{mid}; + for my $ibx (values %{$nntpd->{pi_cfg}->{-by_newsgroup}}) { + next if defined($xref->{$ibx->{newsgroup}}); + my $num = eval { $ibx->mm->num_for($mid) } // next; + $xref->{$ibx->{newsgroup}} = $num; + } } + my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng}); + $ret .= " $_:$xref->{$_}" for (sort keys %$xref); $ret; } @@ -430,7 +475,7 @@ sub set_nntp_headers ($$) { # clobber some existing headers my $ibx = $smsg->{-ibx}; - my $xref = xref($smsg->{nntp}, $ibx, $smsg->{num}, $mid); + my $xref = xref($smsg->{nntp}, $ibx, $smsg); $hdr->header_set('Xref', $xref); # RFC 5536 3.1.4 @@ -442,53 +487,34 @@ sub set_nntp_headers ($$) { # *something* here is required for leafnode, try to follow # RFC 5536 3.1.5... 
$hdr->header_set('Path', $server_name . '!not-for-mail'); - - header_append($hdr, 'List-Post', "{-primary_address}>"); - if (my $url = $ibx->base_url) { - $mid = mid_escape($mid); - header_append($hdr, 'Archived-At', "<$url$mid/>"); - header_append($hdr, 'List-Archive', "<$url>"); - } } sub art_lookup ($$$) { my ($self, $art, $code) = @_; - my $ng = $self->{ng}; - my ($n, $mid); + my ($ibx, $n); my $err; if (defined $art) { if ($art =~ /\A[0-9]+\z/) { $err = '423 no such article number in this group'; $n = int($art); - goto find_mid; + goto find_ibx; } elsif ($art =~ $ONE_MSGID) { - $mid = $1; - $err = r430; - $n = $ng->mm->num_for($mid) if $ng; - goto found if defined $n; - foreach my $g (values %{$self->{nntpd}->{groups}}) { - $n = $g->mm->num_for($mid); - if (defined $n) { - $ng = $g; - goto found; - } - } - return $err; + ($ibx, $n) = mid_lookup($self, $1); + goto found if $ibx; + return r430; } else { return r501; } } else { $err = '420 no current article has been selected'; - $n = $self->{article}; - defined $n or return $err; -find_mid: - $ng or return '412 no newsgroup has been selected'; - $mid = $ng->mm->mid_for($n); - defined $mid or return $err; + $n = $self->{article} // return $err; +find_ibx: + $ibx = $self->{ibx} or + return '412 no newsgroup has been selected'; } found: - my $smsg = $ng->over->get_art($n) or return $err; - $smsg->{-ibx} = $ng; + my $smsg = $ibx->over->get_art($n) or return $err; + $smsg->{-ibx} = $ibx; if ($code == 223) { # STAT set_art($self, $n); "223 $n <$smsg->{mid}> article retrieved - " . 
@@ -498,7 +524,7 @@ found: $smsg->{nntp_code} = $code; set_art($self, $art); # this dereferences to `undef' - ${git_async_cat($ng->git, $smsg->{blob}, \&blob_cb, $smsg)}; + ${git_async_cat($ibx->git, $smsg->{blob}, \&blob_cb, $smsg)}; } } @@ -598,10 +624,10 @@ sub cmd_help ($) { sub get_range ($$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no news group has been selected'; + my $ibx = $self->{ibx} or return '412 no news group has been selected'; defined $range or return '420 No article(s) selected'; my ($beg, $end); - my ($min, $max) = $ng->mm->minmax; + my ($min, $max) = $ibx->mm->minmax; if ($range =~ /\A([0-9]+)\z/) { $beg = $end = $1; } elsif ($range =~ /\A([0-9]+)-\z/) { @@ -671,9 +697,9 @@ sub long_response ($$;@) { sub hdr_msgid_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end); + my $r = $self->{ibx}->mm->msg_range($beg, $end); @$r or return; - more($self, join("\r\n", map { "$_->[0] <$_->[1]>" } @$r)); + $self->msg_more(join('', map { "$_->[0] <$_->[1]>\r\n" } @$r)); 1; } @@ -681,9 +707,9 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. my ($self, $xhdr, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, $range); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $range); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -695,28 +721,54 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. 
sub mid_lookup ($$) { my ($self, $mid) = @_; - my $self_ng = $self->{ng}; - if ($self_ng) { - my $n = $self_ng->mm->num_for($mid); - return ($self_ng, $n) if defined $n; + my $cur_ibx = $self->{ibx}; + if ($cur_ibx) { + my $n = $cur_ibx->mm->num_for($mid); + return ($cur_ibx, $n) if defined $n; } - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - next if defined $self_ng && $ng eq $self_ng; - my $n = $ng->mm->num_for($mid); - return ($ng, $n) if defined $n; + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + if (my @x = grep(/:$smsg->{blob}\z/, @$xr3)) { + my ($ngname, $xnum) = split(/:/, $x[0]); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + # fall through to trying all xref3s + } else { + warn < ($smsg->{blob}) in $ALL->{topdir}, -extindex bug? +EOF + } + # try all xref3s + for my $x (@$xr3) { + my ($ngname, $xnum) = split(/:/, $x); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + warn "W: `$ngname' does not exist for #$xnum\n"; + } + } + # no warning here, $mid is just invalid + } else { # slow path for non-ALL users + for my $ibx (values %{$pi_cfg->{-by_newsgroup}}) { + next if defined $cur_ibx && $ibx eq $cur_ibx; + my $n = $ibx->mm->num_for($mid); + return ($ibx, $n) if defined $n; + } } (undef, undef); } sub xref_range_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $r = $ng->mm->msg_range($beg, $end); - @$r or return; - more($self, join("\r\n", map { - my $num = $_->[0]; - "$num ".xref($self, $ng, $num, $_->[1]); - } @$r)); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over->query_xover($$beg, $end); + scalar(@$msgs) or return; + $$beg = $msgs->[-1]->{num} + 1; + $self->msg_more(join('', map { + "$_->{num} ".xref($self, $ibx, $_) . 
"\r\n"; + } @$msgs)); 1; } @@ -725,10 +777,11 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin if (defined $range && $range =~ $ONE_MSGID) { my $mid = $1; - my ($ng, $n) = mid_lookup($self, $mid); + my ($ibx, $n) = mid_lookup($self, $mid); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, - xref($self, $ng, $n, $mid)); + my $smsg = $ibx->over->get_art($n) or return; + hdr_mid_response($self, $xhdr, $ibx, $n, $range, + xref($self, $ibx, $smsg)); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -747,7 +800,7 @@ sub over_header_for { sub smsg_range_i { my ($self, $beg, $end, $field) = @_; - my $over = $self->{ng}->over; + my $over = $self->{ibx}->over; my $msgs = $over->query_xover($$beg, $end); scalar(@$msgs) or return; my $tmp = ''; @@ -770,10 +823,10 @@ sub smsg_range_i { sub hdr_smsg ($$$$) { my ($self, $xhdr, $field, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless defined $n; - my $v = over_header_for($ng->over, $n, $field); - hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); + my $v = over_header_for($ibx->over, $n, $field); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $v); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -813,26 +866,26 @@ sub cmd_xhdr ($$;$) { } sub hdr_mid_prefix ($$$$$) { - my ($self, $xhdr, $ng, $n, $mid) = @_; + my ($self, $xhdr, $ibx, $n, $mid) = @_; return $mid if $xhdr; # HDR for RFC 3977 users - if (my $self_ng = $self->{ng}) { - ($self_ng eq $ng) ? $n : '0'; + if (my $cur_ibx = $self->{ibx}) { + ($cur_ibx eq $ibx) ? $n : '0'; } else { '0'; } } sub hdr_mid_response ($$$$$$) { - my ($self, $xhdr, $ng, $n, $mid, $v) = @_; + my ($self, $xhdr, $ibx, $n, $mid, $v) = @_; my $res = ''; if ($xhdr) { $res .= r221 . "\r\n"; $res .= "$mid $v\r\n"; } else { $res .= r225 . 
"\r\n"; - my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid); + my $pfx = hdr_mid_prefix($self, $xhdr, $ibx, $n, $mid); $res .= "$pfx $v\r\n"; } res($self, $res .= '.'); @@ -841,14 +894,14 @@ sub hdr_mid_response ($$$$$$) { sub xrover_i { my ($self, $beg, $end) = @_; - my $h = over_header_for($self->{ng}->over, $$beg, 'references'); + my $h = over_header_for($self->{ibx}->over, $$beg, 'references'); more($self, "$$beg $h") if defined($h); $$beg++ < $end; } sub cmd_xrover ($;$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; + my $ibx = $self->{ibx} or return '412 no newsgroup selected'; (defined $range && $range =~ /[<>]/) and return '420 No article(s) selected'; # no message IDs @@ -859,11 +912,11 @@ sub cmd_xrover ($;$) { long_response($self, \&xrover_i, @$r); } -sub over_line ($$$$) { - my ($self, $ng, $num, $smsg) = @_; +sub over_line ($$$) { + my ($self, $ibx, $smsg) = @_; # n.b. field access and procedural calls can be # 10%-15% faster than OO method calls: - my $s = join("\t", $num, + my $s = join("\t", $smsg->{num}, $smsg->{subject}, $smsg->{from}, PublicInbox::Smsg::date($smsg), @@ -871,23 +924,28 @@ sub over_line ($$$$) { $smsg->{references}, $smsg->{bytes}, $smsg->{lines}, - "Xref: " . xref($self, $ng, $num, $smsg->{mid})); + "Xref: " . 
xref($self, $ibx, $smsg)); utf8::encode($s); - $s + $s .= "\r\n"; } sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); defined $n or return r430; - my $smsg = $ng->over->get_art($n) or return r430; + my $smsg = $ibx->over->get_art($n) or return r430; more($self, '224 Overview information follows (multi-line)'); # Only set article number column if it's the current group - my $self_ng = $self->{ng}; - $n = 0 if (!$self_ng || $self_ng ne $ng); - more($self, over_line($self, $ng, $n, $smsg)); + # (RFC 3977 8.3.2) + my $cur_ibx = $self->{ibx}; + if (!$cur_ibx || $cur_ibx ne $ibx) { + # set {-orig_num} for nntp_xref_for + $smsg->{-orig_num} = $smsg->{num}; + $smsg->{num} = 0; + } + $self->msg_more(over_line($self, $ibx, $smsg)); '.'; } else { cmd_xover($self, $range); @@ -896,13 +954,13 @@ sub cmd_over ($;$) { sub xover_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $msgs = $ng->over->query_xover($$beg, $end); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over->query_xover($$beg, $end); my $nr = scalar @$msgs or return; # OVERVIEW.FMT - more($self, join("\r\n", map { - over_line($self, $ng, $_->{num}, $_); + $self->msg_more(join('', map { + over_line($self, $ibx, $_); } @$msgs)); $$beg = $msgs->[-1]->{num} + 1; } @@ -949,12 +1007,28 @@ sub cmd_xpath ($$) { return r501 unless $mid =~ $ONE_MSGID; $mid = $1; my @paths; - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - my $n = $ng->mm->num_for($mid); - push @paths, "$ng->{newsgroup}/$n" if defined $n; + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + my $groups = $pi_cfg->{-by_newsgroup}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev, %seen); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + for my $x (@$xr3) { + my ($ngname, $n) = split(/:/, $x); + $x = "$ngname/$n"; + if ($groups->{$ngname} && !$seen{$x}++) { + push(@paths, 
$x); + } + } + } + } else { # slow path, no point in using long_response + for my $ibx (values %$groups) { + my $n = $ibx->mm->num_for($mid) // next; + push @paths, "$ibx->{newsgroup}/$n"; + } } return '430 no such article on server' unless @paths; - '223 '.join(' ', @paths); + '223 '.join(' ', sort(@paths)); } sub res ($$) { do_write($_[0], $_[1] . "\r\n") } diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 6b762d89..6907a03c 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -12,8 +12,8 @@ use PublicInbox::InboxIdle; sub new { my ($class) = @_; - my $pi_config = PublicInbox::Config->new; - my $name = $pi_config->{'publicinbox.nntpserver'}; + my $pi_cfg = PublicInbox::Config->new; + my $name = $pi_cfg->{'publicinbox.nntpserver'}; if (!defined($name) or $name eq '') { $name = hostname; } elsif (ref($name) eq 'ARRAY') { @@ -24,8 +24,7 @@ sub new { groups => {}, err => \*STDERR, out => \*STDOUT, - grouplist => [], - pi_config => $pi_config, + pi_cfg => $pi_cfg, servername => $name, greet => \"201 $name ready - post via email\r\n", # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } @@ -35,40 +34,33 @@ sub new { sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = $sig ? PublicInbox::Config->new : $self->{pi_config}; - my $new = {}; - my @list; - $pi_config->each_inbox(sub { - my ($ng) = @_; - my $ngname = $ng->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - # Newsgroup name needs to be compatible with RFC 3977 - # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. - # Leave out a few chars likely to cause problems or conflicts: - # '|', '<', '>', ';', '#', '$', '&', - } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) { - warn "newsgroup name invalid: `$ngname'\n"; - } elsif ($ng->nntp_usable) { - # Only valid if msgmap and search works - $new->{$ngname} = $ng; - push @list, $ng; - + my $pi_cfg = $sig ? 
PublicInbox::Config->new : $self->{pi_cfg}; + my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; + $pi_cfg->each_inbox(sub { + my ($ibx) = @_; + my $ngname = $ibx->{newsgroup} // return; + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { + # only valid if msgmap and over works # preload to avoid fragmentation: - $ng->description; - $ng->base_url; + $ibx->description; + $ibx->base_url; + } else { + delete $groups->{$ngname}; + delete $ibx->{newsgroup}; + # Note: don't be tempted to delete more for memory + # savings just yet: NNTP, IMAP, and WWW may all + # run in the same process someday. } }); - @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list; - $self->{grouplist} = \@list; - $self->{pi_config} = $pi_config; + $self->{groupnames} = [ sort(keys %$groups) ]; # this will destroy old groups that got deleted - %{$self->{groups}} = %$new; + $self->{pi_cfg} = $pi_cfg; } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); } 1; diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index 6bed0103..61d9ae7c 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -13,9 +13,8 @@ use PublicInbox::MID qw(mid_escape); use PublicInbox::Hval qw(prurl); sub new { - my ($class, $pi_config) = @_; - $pi_config ||= PublicInbox::Config->new; - bless { pi_config => $pi_config }, $class; + my ($class, $pi_cfg) = @_; + bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class; } sub redirect ($$) { @@ -47,8 +46,8 @@ sub call { # /inbox.foo.bar/123456 my (undef, @parts) = split(m!/!, $env->{PATH_INFO}); my ($ng, $article) = @parts; - my $pi_config = $self->{pi_config}; - if (my $ibx = $pi_config->lookup_newsgroup($ng)) { + my $pi_cfg = $self->{pi_cfg}; + if (my $ibx = $pi_cfg->lookup_newsgroup($ng)) { my $url = 
prurl($env, $ibx->{url}); my $code = 301; if (defined $article && $article =~ /\A[0-9]+\z/) { @@ -63,7 +62,6 @@ sub call { return redirect($code, $url); } - my $res; my @try = (join('/', @parts)); # trailing slash is in the rest of our WWW, so maybe some users @@ -72,13 +70,30 @@ sub call { pop @parts; push @try, join('/', @parts); } - - foreach my $mid (@try) { - my $arg = [ $mid ]; - $pi_config->each_inbox(\&try_inbox, $arg); - defined($res = $arg->[1]) and last; + my $ALL = $pi_cfg->ALL; + if (my $over = $ALL ? $ALL->over : undef) { + my $by_eidx_key = $pi_cfg->{-by_eidx_key}; + for my $mid (@try) { + my ($id, $prev); + while (my $x = $over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $over->get_xref3($x->{num}); + for (@$xr3) { + s/:[0-9]+:$x->{blob}\z// or next; + my $ibx = $by_eidx_key->{$_} // next; + my $url = $ibx->base_url or next; + $url .= mid_escape($mid) . '/'; + return redirect(302, $url); + } + } + } + } else { # slow path, scan every inbox + for my $mid (@try) { + my $arg = [ $mid ]; # [1] => result + $pi_cfg->each_inbox(\&try_inbox, $arg); + return $arg->[1] if $arg->[1]; + } } - $res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; + [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; } 1; diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 08112386..49ba180b 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -260,6 +260,27 @@ SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 $smsg ? load_from_row($smsg) : undef; } +sub get_xref3 { + my ($self, $num, $raw) = @_; + my $dbh = dbh($self); + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum,oidbin FROM xref3 WHERE docid = ? ORDER BY ibx_id,xnum ASC + + $sth->execute($num); + my $rows = $sth->fetchall_arrayref; + return $rows if $raw; + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? 
+ + [ map { + my $r = $_; + $eidx_key_sth->execute($r->[0]); + my $eidx_key = $eidx_key_sth->fetchrow_array; + $eidx_key //= "missing://ibx_id=$r->[0]"; + "$eidx_key:$r->[1]:".unpack('H*', $r->[2]); + } @$rows ]; +} + sub next_by_mid { my ($self, $mid, $id, $prev) = @_; my $dbh = dbh($self); diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 840e2c2a..dcc2cff3 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -79,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -267,6 +272,13 @@ sub subject_path ($) { lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; @@ -278,10 +290,7 @@ sub add_overview { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, $smsg, $mids, $refs, $xpath, $dd); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -385,13 +394,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, /* NNTP article number == IMAP UID */ + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ ds INTEGER, /* RFC-2822 sent Date: header, git author time */ - ddd VARBINARY, /* doc-data-deflated (->to_doc_data, ->load_from_data) */ - UNIQUE (num) + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS 
idx_tid ON over (tid)'); @@ -465,10 +473,14 @@ sub dbh_close { sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: PublicInbox::Over::dbh($self); @@ -518,4 +530,162 @@ EOM $pr->("I: rethread culled $total ghosts\n") if $pr && $total; } +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<""); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. + 'xref3 (oidbin,xnum,ibx_id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. 
+ $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq ( + docid INTEGER PRIMARY KEY NOT NULL +) + + $dbh; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# returns remaining reference count to $docid +sub remove_xref3 { + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; + begin_lazy($self); + my $oidbin = pack('H*', $oidhex); + my ($sth, $ibx_id); + if (defined $eidx_key) { + $ibx_id = ibx_id($self, $eidx_key); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $oidbin, SQL_BLOB); + } else { + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + } + $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? 
+ + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? 
+ +} + 1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm index 88b6d390..2aa2042a 100644 --- a/lib/PublicInbox/Qspawn.pm +++ b/lib/PublicInbox/Qspawn.pm @@ -359,12 +359,12 @@ sub new { } sub setup_rlimit { - my ($self, $name, $config) = @_; + my ($self, $name, $cfg) = @_; foreach my $rlim (@PublicInbox::Spawn::RLIMITS) { my $k = lc($rlim); $k =~ tr/_//d; $k = "publicinboxlimiter.$name.$k"; - defined(my $v = $config->{$k}) or next; + defined(my $v = $cfg->{$k}) or next; my @rlimit = split(/\s*,\s*/, $v); if (scalar(@rlimit) == 1) { push @rlimit, $rlimit[0]; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index fb35b747..fb3e9975 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(mdocid); +our @EXPORT_OK = qw(retry_reopen int_val); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -54,11 +54,15 @@ use constant { use PublicInbox::Smsg; use PublicInbox::Over; -my $QP_FLAGS; -our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem); +our $QP_FLAGS; +our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); our $Xap; # 'Search::Xapian' or 'Xapian' -my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -my $ENQ_ASCENDING; +our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') + +# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, +# let's hope the ABI is stable +our $ENQ_DESCENDING = 0; +our $ENQ_ASCENDING = 1; sub load_xapian () { return 1 if defined $Xap; @@ -84,15 +88,8 @@ sub load_xapian () { 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); - # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, - # so lets hope this part of the ABI is stable because it's - # just an integer: - $ENQ_ASCENDING = $x eq 'Xapian' ? 
- 1 : Search::Xapian::ENQ_ASCENDING(); - - # for Smsg: - *PublicInbox::Smsg::sortable_unserialise = - $Xap.'::sortable_unserialise'; + *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -193,38 +190,41 @@ sub xdir ($;$) { } } -sub _xdb ($) { +sub xdb_sharded { + my ($self) = @_; + opendir(my $dh, $self->{xpfx}) or return; # not initialized yet + + # We need numeric sorting so shard[0] is first for reading + # Xapian metadata, if needed + my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return; + my (@xdb, $slow_phrase); + for (0..$last) { + my $shard_dir = "$self->{xpfx}/$_"; + if (-d $shard_dir && -r _) { + push @xdb, $X{Database}->new($shard_dir); + $slow_phrase ||= -f "$shard_dir/iamchert"; + } else { # gaps from missing epochs throw off mdocid() + warn "E: $shard_dir missing or unreadable\n"; + return; + } + } + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; + $self->{nshard} = scalar(@xdb); + my $xdb = shift @xdb; + $xdb->add_database($_) for @xdb; + $xdb; +} + +sub _xdb { my ($self) = @_; my $dir = xdir($self, 1); - my ($xdb, $slow_phrase); - my $qpf = \($self->{qp_flags} ||= $QP_FLAGS); + $self->{qp_flags} //= $QP_FLAGS; if ($self->{ibx_ver} >= 2) { - my @xdb; - opendir(my $dh, $dir) or return; # not initialized yet - - # We need numeric sorting so shard[0] is first for reading - # Xapian metadata, if needed - my $last = max(grep(/\A[0-9]+\z/, readdir($dh))); - return if !defined($last); - for (0..$last) { - my $shard_dir = "$dir/$_"; - if (-d $shard_dir && -r _) { - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } else { # gaps from missing epochs throw off mdocid() - warn "E: $shard_dir missing or unreadable\n"; - return; - } - } - $self->{nshard} = scalar(@xdb); - $xdb = shift 
@xdb; - $xdb->add_database($_) for @xdb; + xdb_sharded($self); } else { - $slow_phrase = -f "$dir/iamchert"; - $xdb = $X{Database}->new($dir); + $self->{qp_flags} |= FLAG_PHRASE() if !-f "$dir/iamchert"; + $X{Database}->new($dir); } - $$qpf |= FLAG_PHRASE() unless $slow_phrase; - $xdb; } # v2 Xapian docids don't conflict, so they're identical to @@ -244,9 +244,9 @@ sub mset_to_artnums { sub xdb ($) { my ($self) = @_; - $self->{xdb} ||= do { + $self->{xdb} //= do { load_xapian(); - _xdb($self); + $self->_xdb; }; } @@ -285,20 +285,19 @@ sub mset { $opts ||= {}; my $qp = $self->{qp} //= qparse_new($self); my $query = $qp->parse_query($query_string, $self->{qp_flags}); - $opts->{relevance} = 1 unless exists $opts->{relevance}; _do_enquire($self, $query, $opts); } sub retry_reopen { - my ($self, $cb, $arg) = @_; + my ($self, $cb, @arg) = @_; for my $i (1..10) { if (wantarray) { my @ret; - eval { @ret = $cb->($arg) }; + eval { @ret = $cb->($self, @arg) }; return @ret unless $@; } else { my $ret; - eval { $ret = $cb->($arg) }; + eval { $ret = $cb->($self, @arg) }; return $ret unless $@; } # Exception: The revision being read has been discarded - @@ -318,7 +317,7 @@ sub retry_reopen { sub _do_enquire { my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); + retry_reopen($self, \&_enquire_once, $query, $opts); } # returns true if all docs have the THREADID value @@ -328,19 +327,32 @@ sub has_threadid ($) { } sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @{$_[0]}; + my ($self, $query, $opts) = @_; my $xdb = xdb($self); + if (defined(my $eidx_key = $opts->{eidx_key})) { + $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opts->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $query = $X{Query}->new(OP_FILTER(), $query, $range); + } my $enquire = 
$X{Enquire}->new($xdb); $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; - if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID + my $rel = $opts->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $enquire->set_weighting_scheme($X{BoolWeight}->new); $enquire->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enquire->set_sort_by_value_then_relevance(TS, $desc); + } elsif ($rel == -2) { $enquire->set_weighting_scheme($X{BoolWeight}->new); - } elsif ($opts->{relevance}) { + $enquire->set_docid_order($ENQ_DESCENDING); + } else { # rel > 0 $enquire->set_sort_by_relevance_then_value(TS, $desc); - } else { - $enquire->set_sort_by_value_then_relevance(TS, $desc); } # `mairix -t / --threads' or JMAP collapseThreads @@ -426,4 +438,10 @@ sub help { \@ret; } +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c36fc6c7..b3361e05 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -15,15 +15,17 @@ use PublicInbox::InboxWritable; use PublicInbox::MID qw(mids_for_index mids); use PublicInbox::MsgIter; use PublicInbox::IdxStack; -use Carp qw(croak); +use Carp qw(croak carp); use POSIX qw(strftime); +use Time::Local qw(timegm); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 
0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -31,11 +33,11 @@ use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; +our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; sub new { my ($class, $ibx, $creat, $shard) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $levels = qr/\A(?:full|medium|basic)\z/; my $inboxdir = $ibx->{inboxdir}; my $version = $ibx->version; my $indexlevel = 'full'; @@ -45,7 +47,7 @@ sub new { $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; } if ($ibx->{indexlevel}) { - if ($ibx->{indexlevel} =~ $levels) { + if ($ibx->{indexlevel} =~ $INDEXLEVELS) { $indexlevel = $ibx->{indexlevel}; } else { die("Invalid indexlevel $ibx->{indexlevel}\n"); @@ -65,7 +67,6 @@ sub new { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; @@ -135,7 +136,7 @@ sub idx_acquire { } } return unless defined $flag; - $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync}; + $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -152,7 +153,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } @@ -323,6 +324,16 @@ sub index_xapian { # msg_iter callback } } +sub index_list_id ($$$) { + my ($self, $doc, $hdr) = @_; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + $doc->add_boolean_term('G' . 
$lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } +} + sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; for my $mid (@$mids) { @@ -336,16 +347,12 @@ sub index_ids ($$$$) { } } $doc->add_boolean_term('Q' . $_) for @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = lc $1; - $doc->add_boolean_term('G' . $lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } + index_list_id($self, $doc, $hdr); } -sub add_xapian ($$$$) { +sub eml2doc ($$$;$) { my ($self, $eml, $smsg, $mids) = @_; + $mids //= mids_for_index($eml); my $doc = $X->{Document}->new; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); @@ -361,6 +368,9 @@ sub add_xapian ($$$$) { $tg->set_document($doc); index_headers($self, $smsg); + if (defined(my $eidx_key = $smsg->{eidx_key})) { + $doc->add_boolean_term('O'.$eidx_key); + } msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); @@ -385,6 +395,12 @@ sub add_xapian ($$$$) { } } } + $doc; +} + +sub add_xapian ($$$$) { + my ($self, $eml, $smsg, $mids) = @_; + my $doc = eml2doc($self, $eml, $smsg, $mids); $self->{xdb}->replace_document($smsg->{num}, $doc); } @@ -434,32 +450,81 @@ sub add_message { $smsg->{num}; } +sub _get_doc ($$) { + my ($self, $docid) = @_; + my $doc = eval { $self->{xdb}->get_document($docid) }; + $doc // do { + warn "E: $@\n" if $@; + warn "E: #$docid missing in Xapian\n"; + undef; + } +} + +sub add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + term_generator($self)->set_document($doc); + $doc->add_boolean_term('O'.$eidx_key); + index_list_id($self, $doc, $eml); + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + eval { $doc->remove_term('O'.$eidx_key) }; + warn "W: ->remove_term 
O$eidx_key: $@\n" if $@; + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + eval { $doc->remove_term('G' . $lid) }; + warn "W: ->remove_term G$lid: $@\n" if $@; + + # nb: we don't remove the XL probabilistic terms + # since terms may overlap if cross-posted. + # + # IOW, a message which has both + # and would have overlapping + # "XLexample" and "XLcom" as terms and which we + # wouldn't know if they're safe to remove if we just + # unindex while preserving + # . + # + # In any case, this entire sub is will likely never + # be needed and users using the "l:" prefix are probably + # rarer. + } + $self->{xdb}->replace_document($docid, $doc); +} + +sub smsg_from_doc ($) { + my ($doc) = @_; + my $data = $doc->get_data or return; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); + $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); + $smsg->load_from_data($data); + $smsg; +} + sub xdb_remove { - my ($self, $oid, @removed) = @_; + my ($self, @docids) = @_; my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = eval { $xdb->get_document($num) }; - unless ($doc) { - warn "E: $@\n" if $@; - warn "E: #$num $oid missing in Xapian\n"; - next; - } - my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->load_expand($doc); - my $blob = $smsg->{blob} // '(unset)'; - if ($blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; - } + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in in Xapian? 
$@\n" if $@; } } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{oidx}; +sub remove_by_docid { + my ($self, $num) = @_; + die "BUG: remove_by_docid is v2-only\n" if $self->{oidx}; $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); + xdb_remove($self, $num) if need_xapian($self); } sub index_git_blob_id { @@ -484,8 +549,8 @@ sub unindex_eml { $tmp{$_}++ for @removed; } if (!$nr) { - $mids = join('> <', @$mids); - warn "W: <$mids> missing for removal from overview\n"; + my $m = join('> <', @$mids); + warn "W: <$m> missing for removal from overview\n"; } while (my ($num, $nr) = each %tmp) { warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; @@ -495,7 +560,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -526,34 +591,63 @@ sub crlf_adjust ($) { } } +sub is_bad_blob ($$$$) { + my ($oid, $type, $size, $expect_oid) = @_; + if ($type ne 'blob') { + carp "W: $expect_oid is not a blob (type=$type)"; + return 1; + } + croak "BUG: $oid != $expect_oid" if $oid ne $expect_oid; + $size == 0 ? 
1 : 0; # size == 0 means purged +} + sub index_both { # git->cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; $size += crlf_adjust($$bref); my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; + my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + ${$sync->{latest_cmt}} = $cur_cmt; } sub unindex_both { # git->cat_async callback - my ($bref, $oid, $type, $size, $self) = @_; + my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); + my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); + # may be undef if leftover + if (defined(my $cur_cmt = $sync->{cur_cmt})) { + ${$sync->{latest_cmt}} = $cur_cmt; + } + ++$self->{nidx}; +} + +sub with_umask { + my $self = shift; + ($self->{ibx} // $self->{eidx})->with_umask(@_); } # called by public-inbox-index sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; - $self->{ibx}->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + $self->with_umask(\&_index_sync, $self, $opt); + if ($opt->{reindex} && !$opt->{quit}) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } @@ -569,46 +663,44 @@ sub check_size { # check_async cb for -index --max-size=... 
sub v1_checkpoint ($$;$) { my ($self, $sync, $stk) = @_; - $self->{ibx}->git->check_async_wait; - $self->{ibx}->git->cat_async_wait; + $self->{ibx}->git->async_wait_all; - # latest_cmt may be undef - my $newest = $stk ? $stk->{latest_cmt} : undef; - if ($newest) { + # $newest may be undef + my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + if (defined($newest)) { my $cur = $self->{mm}->last_commit || ''; if (need_update($self, $cur, $newest)) { $self->{mm}->last_commit($newest); } - } else { - ${$sync->{max}} = $self->{batch_bytes}; } + ${$sync->{max}} = $self->{batch_bytes}; $self->{mm}->{dbh}->commit; - if ($newest && need_xapian($self)) { - my $xdb = $self->{xdb}; + my $xdb = need_xapian($self) ? $self->{xdb} : undef; + if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } - + } + if ($stk) { # all done if $stk is passed # let SearchView know a full --reindex was done so it can # generate ->has_threadid-dependent links - if ($sync->{reindex} && !ref($sync->{reindex})) { + if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) { my $n = $xdb->get_metadata('has_threadid'); $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } + $self->{oidx}->rethread_done($sync->{-opt}); # all done } - - $self->{oidx}->rethread_done($sync->{-opt}) if $newest; # all done commit_txn_lazy($self); - $self->{ibx}->git->cleanup; + $sync->{ibx}->git->cleanup; my $nr = ${$sync->{nr}}; idx_release($self, $nr); # let another process do some work... 
if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -617,27 +709,32 @@ sub v1_checkpoint ($$;$) { # only for v1 sub process_stack { my ($self, $sync, $stk) = @_; - my $git = $self->{ibx}->git; + my $git = $sync->{ibx}->git; my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; $sync->{sidx} = $self; + $sync->{latest_cmt} = \(my $latest_cmt); $self->{mm}->{dbh}->begin_work; if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $sync); } } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { + my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid }; + last if $sync->{quit}; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; + $arg->{autime} = $at; + $arg->{cotime} = $ct; if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { @@ -645,17 +742,17 @@ sub process_stack { } v1_checkpoint($self, $sync) if $max <= 0; } elsif ($f eq 'd') { - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? 
undef : $stk); } -sub log2stack ($$$$) { - my ($sync, $git, $range, $ibx) = @_; +sub log2stack ($$$) { + my ($sync, $git, $range) = @_; my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise) my ($add, $del); - if ($ibx->version == 1) { + if ($sync->{ibx}->version == 1) { my $path = $hex.'{2}/'.$hex.'{38}'; $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!; $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!; @@ -669,17 +766,18 @@ sub log2stack ($$$$) { my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H --no-notes --no-color --no-renames --no-abbrev), $range); - my ($at, $ct, $stk); + my ($at, $ct, $stk, $cmt); while (<$fh>) { + return if $sync->{quit}; if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); + ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3); + $stk //= PublicInbox::IdxStack->new($cmt); } elsif (/$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: - $stk->push_rec('d', $at, $ct, $oid); + $stk->push_rec('d', $at, $ct, $oid, $cmt); } } elsif (/$add/) { my $oid = $1; @@ -687,12 +785,10 @@ sub log2stack ($$$$) { my $oid_bin = pack('H*', $oid); my $nr = --$D->{$oid_bin}; delete($D->{$oid_bin}) if $nr <= 0; - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); + next if $nr >= 0; } + $stk->push_rec('m', $at, $ct, $oid, $cmt); } } close $fh or die "git log failed: \$?=$?"; @@ -700,9 +796,9 @@ sub log2stack ($$$$) { $stk->read_prepare; } -sub prepare_stack ($$$) { - my ($self, $sync, $range) = @_; - my $git = $self->{ibx}->git; +sub prepare_stack ($$) { + my ($sync, $range) = @_; + my $git = $sync->{ibx}->git; if (index($range, '..') < 0) { # don't show annoying git errors to users who run -index @@ -711,7 +807,7 @@ sub prepare_stack ($$$) { return PublicInbox::IdxStack->new->read_prepare if $?; } $sync->{D} = $sync->{reindex} ? 
{} : undef; # OID_BIN => NR - log2stack($sync, $git, $range, $self->{ibx}); + log2stack($sync, $git, $range); } # --is-ancestor requires git 1.8.0+ @@ -759,15 +855,30 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? $reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; my $tip = $opt->{ref} || 'HEAD'; - my $git = $self->{ibx}->git; + my $ibx = $self->{ibx}; + local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $git->batch_prepare; + $ibx->git->batch_prepare; my $pr = $opt->{-progress}; - my $sync = { reindex => $opt->{reindex}, -opt => $opt }; + my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); @@ -785,10 +896,10 @@ sub _index_sync { my $lx = reindex_from($sync->{reindex}, $last_commit); my $range = $lx eq '' ? $tip : "$lx..$tip"; $pr->("counting changes\n\t$range ... ") if $pr; - my $stk = prepare_stack($self, $sync, $range); + my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? 
$stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -808,7 +919,7 @@ sub _begin_txn { sub begin_txn_lazy { my ($self) = @_; - $self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn}; + $self->with_umask(\&_begin_txn, $self) if !$self->{txn}; } # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) @@ -836,6 +947,10 @@ sub set_metadata_once { sub _commit_txn { my ($self) = @_; + if (my $eidx = $self->{eidx}) { + $eidx->git->async_wait_all; + $eidx->{transact_bytes} = 0; + } if (my $xdb = $self->{xdb}) { set_metadata_once($self); $xdb->commit_transaction; @@ -846,7 +961,7 @@ sub _commit_txn { sub commit_txn_lazy { my ($self) = @_; delete($self->{txn}) and - $self->{ibx}->with_umask(\&_commit_txn, $self); + $self->with_umask(\&_commit_txn, $self); } sub worker_done { @@ -857,4 +972,39 @@ sub worker_done { die "$$ $0 still in transaction\n" if $self->{txn}; } +sub eidx_shard_new { + my ($class, $eidx, $shard) = @_; + my $self = bless { + eidx => $eidx, + xpfx => $eidx->{xpfx}, + indexlevel => $eidx->{indexlevel}, + -skip_docdata => 1, + shard => $shard, + creat => 1, + }, $class; + $self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium'; + $self; +} + +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? 
+ + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); + } +} + 1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index f23d23d0..2e654769 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -7,13 +7,16 @@ package PublicInbox::SearchIdxShard; use strict; use v5.10.1; use parent qw(PublicInbox::SearchIdx); +use bytes qw(length); use IO::Handle (); # autoflush use PublicInbox::Eml; +use PublicInbox::Sigfd; sub new { - my ($class, $v2w, $shard) = @_; + my ($class, $v2w, $shard) = @_; # v2w may be ExtSearchIdx my $ibx = $v2w->{ibx}; - my $self = $class->SUPER::new($ibx, 1, $shard); + my $self = $ibx ? $class->SUPER::new($ibx, 1, $shard) + : $class->eidx_shard_new($v2w, $shard); # create the DB before forking: $self->idx_acquire; $self->set_metadata_once; @@ -27,9 +30,13 @@ sub spawn_worker { my ($r, $w); pipe($r, $w) or die "pipe failed: $!\n"; $w->autoflush(1); + my $oldset = PublicInbox::Sigfd::block_signals(); my $pid = fork; defined $pid or die "fork failed: $!\n"; if ($pid == 0) { + # these signals are localized in parent + $SIG{$_} = 'IGNORE' for (qw(TERM INT QUIT)); + PublicInbox::Sigfd::sig_setmask($oldset); my $bnote = $v2w->atfork_child; close $w or die "failed to close: $!"; @@ -42,71 +49,122 @@ sub spawn_worker { die "unexpected MM $self->{mm}" if $self->{mm}; exit; } + PublicInbox::Sigfd::sig_setmask($oldset); $self->{pid} = $pid; $self->{w} = $w; close $r or die "failed to close: $!"; } +sub eml ($$) { + my ($r, $len) = @_; + return if $len == 0; + my $n = read($r, my $bref, $len) or die "read: $!\n"; + $n == $len or die "short read: $n != $len\n"; + PublicInbox::Eml->new(\$bref); +} + # 
this reads all the writes to $self->{w} from the parent process sub shard_worker_loop ($$$$$) { my ($self, $v2w, $r, $shard, $bnote) = @_; - $0 = "pi-v2-shard[$shard]"; + $0 = "shard[$shard]"; $self->begin_txn_lazy; while (my $line = readline($r)) { + chomp $line; $v2w->{current_info} = "[$shard] $line"; - if ($line eq "commit\n") { + if ($line eq 'commit') { $self->commit_txn_lazy; - } elsif ($line eq "close\n") { + } elsif ($line eq 'close') { $self->idx_release; - } elsif ($line eq "barrier\n") { + } elsif ($line eq 'barrier') { $self->commit_txn_lazy; # no need to lock < 512 bytes is atomic under POSIX print $bnote "barrier $shard\n" or die "write failed for barrier $!\n"; - } elsif ($line =~ /\AD ([a-f0-9]{40,}) ([0-9]+)\n\z/s) { - $self->remove_by_oid($1, $2 + 0); + } elsif ($line =~ /\AD ([0-9]+)\z/s) { + $self->remove_by_docid($1 + 0); + } elsif ($line =~ s/\A\+X //) { + my ($len, $docid, $eidx_key) = split(/ /, $line, 3); + $self->add_eidx_info($docid, $eidx_key, eml($r, $len)); + } elsif ($line =~ s/\A-X //) { + my ($len, $docid, $eidx_key) = split(/ /, $line, 3); + $self->remove_eidx_info($docid, $eidx_key, + eml($r, $len)); + } elsif ($line =~ s/\AO ([^\n]+)//) { + my $over_fn = $1; + $over_fn =~ tr/\0/\n/; + $self->over_check(PublicInbox::Over->new($over_fn)); } else { - chomp $line; + my $eidx_key; + if ($line =~ s/\AX=(.+)\0//) { + $eidx_key = $1; + $v2w->{current_info} =~ s/\0/\\0 /; + } # n.b. $mid may contain spaces(!) 
- my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid) + my ($len, $bytes, $num, $oid, $ds, $ts, $tid, $mid) = split(/ /, $line, 8); $self->begin_txn_lazy; - my $n = read($r, my $msg, $to_read) or die "read: $!\n"; - $n == $to_read or die "short read: $n != $to_read\n"; - my $mime = PublicInbox::Eml->new(\$msg); my $smsg = bless { bytes => $bytes, num => $num + 0, - blob => $blob, + blob => $oid, mid => $mid, tid => $tid, ds => $ds, ts => $ts, }, 'PublicInbox::Smsg'; - $self->add_message($mime, $smsg); + $smsg->{eidx_key} = $eidx_key if defined($eidx_key); + $self->add_message(eml($r, $len), $smsg); } } $self->worker_done; } sub index_raw { - my ($self, $msgref, $eml, $smsg) = @_; + my ($self, $msgref, $eml, $smsg, $eidx_key) = @_; if (my $w = $self->{w}) { + my @ekey = defined($eidx_key) ? ("X=$eidx_key\0") : (); + $msgref //= \($eml->as_string); + $smsg->{raw_bytes} //= length($$msgref); # mid must be last, it can contain spaces (but not LF) - print $w join(' ', @$smsg{qw(raw_bytes bytes + print $w @ekey, join(' ', @$smsg{qw(raw_bytes bytes num blob ds ts tid mid)}), "\n", $$msgref or die "failed to write shard $!\n"; } else { if ($eml) { - undef $$msgref; + undef($$msgref) if $msgref; } else { # --xapian-only + --sequential-shard: $eml = PublicInbox::Eml->new($msgref); } $self->begin_txn_lazy; + $smsg->{eidx_key} = $eidx_key if defined $eidx_key; $self->add_message($eml, $smsg); } } +sub shard_add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + if (my $w = $self->{w}) { + my $hdr = $eml->header_obj->as_string; + my $len = length($hdr); + print $w "+X $len $docid $eidx_key\n", $hdr or + die "failed to write shard: $!"; + } else { + $self->add_eidx_info($docid, $eidx_key, $eml); + } +} + +sub shard_remove_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + if (my $w = $self->{w}) { + my $hdr = $eml ? 
$eml->header_obj->as_string : ''; + my $len = length($hdr); + print $w "-X $len $docid $eidx_key\n", $hdr or + die "failed to write shard: $!"; + } else { + $self->remove_eidx_info($docid, $eidx_key, $eml); + } +} + sub atfork_child { close $_[0]->{w} or die "failed to close write pipe: $!\n"; } @@ -144,11 +202,22 @@ sub shard_close { } sub shard_remove { - my ($self, $oid, $num) = @_; - if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child - print $w "D $oid $num\n" or die "failed to write remove $!"; + my ($self, $num) = @_; + if (my $w = $self->{w}) { # triggers remove_by_docid in a shard child + print $w "D $num\n" or die "failed to write remove $!"; } else { # same process - $self->remove_by_oid($oid, $num); + $self->remove_by_docid($num); + } +} + +sub shard_over_check { + my ($self, $over) = @_; + if (my $w = $self->{w}) { # triggers remove_by_docid in a shard child + my ($over_fn) = $over->{dbh}->sqlite_db_filename; + $over_fn =~ tr/\n/\0/; + print $w "O $over_fn\n" or die "failed to write over $!"; + } else { + $self->over_check($over); } } diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 60f692b2..8fb3a030 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -42,7 +42,7 @@ sub thread { # We'll trust the client Date: header here instead of the Received: # time since this is for display (and not retrieval) _set_parent(\%id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $rootset = [ grep { !delete($_->{parent}) && $_->visible($ibx) } values %id_table ]; @@ -166,7 +166,7 @@ sub order_children { my %seen = ($cur => 1); # self-referential loop prevention my @q = ($cur); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; while (defined($cur = shift @q)) { my $c = $cur->{children}; # The hashref here... 
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index c482f1c9..f568f31c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::SearchThread; use PublicInbox::SearchQuery; -use PublicInbox::Search qw(mdocid); +use PublicInbox::Search; my %rmap_inc; sub mbox_results { @@ -30,7 +30,7 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{-inbox}->search or + my $srch = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $x = $q->{x}; @@ -93,9 +93,9 @@ sub mset_summary { my $pad = length("$total"); my $pfx = ' ' x $pad; my $res = \($ctx->{-html_tip}); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - my @nums = @{$ibx->search->mset_to_artnums($mset)}; + my @nums = @{$ibx->isrch->mset_to_artnums($mset)}; my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)}; my ($min, $max); @@ -156,7 +156,7 @@ sub path2inc ($) { sub err_txt { my ($ctx, $err) = @_; - my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/'; + my $u = $ctx->{ibx}->base_url($ctx->{env}) . '_/text/help/'; $err =~ s/^\s*Exception:\s*//; # bad word to show users :P $err =~ s!(\S+)!path2inc($1)!sge; $err = ascii_html($err); @@ -201,7 +201,7 @@ sub search_nav_top { } my $A = $q->qs_html(x => 'A', r => undef); $rv .= qq{|Atom feed]}; - if ($ctx->{-inbox}->search->has_threadid) { + if ($ctx->{ibx}->isrch->has_threadid) { $rv .= qq{\n\t\t\tdownload mbox.gz: } . # we set name=z w/o using it since it seems required for # lynx (but works fine for w3m). 
@@ -286,14 +286,13 @@ sub get_pct ($) { sub mset_thread { my ($ctx, $mset, $q) = @_; - my $ibx = $ctx->{-inbox}; - my $nshard = $ibx->search->{nshard} // 1; - my %pct = map { mdocid($nshard, $_) => get_pct($_) } $mset->items; - my $msgs = $ibx->over->get_all(keys %pct); - $_->{pct} = $pct{$_->{num}} for @$msgs; + my $ibx = $ctx->{ibx}; + my @pct = map { get_pct($_) } $mset->items; + my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset); + my $i = 0; + $_->{pct} = $pct[$i++] for @$msgs; my $r = $q->{r}; if ($r) { # for descriptions in search_nav_bot - my @pct = values %pct; $q->{-min_pct} = min(@pct); $q->{-max_pct} = max(@pct); } @@ -354,7 +353,7 @@ sub ctx_prepare { sub adump { my ($cb, $mset, $q, $ctx) = @_; - $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset); + $ctx->{ids} = $ctx->{ibx}->isrch->mset_to_artnums($mset); $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i); } @@ -363,7 +362,7 @@ sub adump { sub adump_i { my ($ctx) = @_; while (my $num = shift @{$ctx->{ids}}) { - my $smsg = eval { $ctx->{-inbox}->over->get_art($num) } or next; + my $smsg = eval { $ctx->{ibx}->over->get_art($num) } or next; return $smsg; } } diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 171e0a00..14086538 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -15,13 +15,6 @@ our @EXPORT_OK = qw(subject_normalized); use PublicInbox::MID qw(mids); use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -use Time::Local qw(timegm); - -sub get_val ($$) { - my ($doc, $col) = @_; - # sortable_unserialise is defined by PublicInbox::Search::load_xapian() - sortable_unserialise($doc->get_value($col)); -} sub to_doc_data { my ($self) = @_; @@ -61,17 +54,6 @@ sub load_from_data ($$) { ) = split(/\n/, $_[1]); } -sub load_expand { - my ($self, $doc) = @_; - my $data = $doc->get_data or return; - $self->{ts} = get_val($doc, PublicInbox::Search::TS()); - my 
$dt = get_val($doc, PublicInbox::Search::DT()); - my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); - $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); - load_from_data($self, $data); - $self; -} - sub psgi_cull ($) { my ($self) = @_; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 83f7a4ee..a53f28b1 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -216,7 +216,7 @@ sub filename_query ($) { sub find_smsgs ($$$) { my ($self, $ibx, $want) = @_; - my $srch = $ibx->search or return; + my $srch = $ibx->isrch or return; my $post = $want->{oid_b} or die 'BUG: no {oid_b}'; $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post"; diff --git a/lib/PublicInbox/Spamcheck.pm b/lib/PublicInbox/Spamcheck.pm index ffebb3cf..218fcc01 100644 --- a/lib/PublicInbox/Spamcheck.pm +++ b/lib/PublicInbox/Spamcheck.pm @@ -7,8 +7,8 @@ use strict; use warnings; sub get { - my ($config, $key, $default) = @_; - my $spamcheck = $config->{$key}; + my ($cfg, $key, $default) = @_; + my $spamcheck = $cfg->{$key}; $spamcheck = $default unless $spamcheck; return if !$spamcheck || $spamcheck eq 'none'; diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index e4f00a2a..c403f78a 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -227,38 +227,46 @@ sub epoll_ctl_mod8 { our $epoll_wait_events; our $epoll_wait_size = 0; sub epoll_wait_mod4 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 12 x $epoll_wait_size; - } - my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - for (0..$ct-1) { - @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 12*$_, 8)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > 
$epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec); + for (0..$ct - 1) { + # 12-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 8 bytes: epoll_data_t union (first 4 bytes are the fd) + # So we skip the first 4 bytes and take the middle 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 12 * $_ + 4, 4)); + } } sub epoll_wait_mod8 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 16 x $epoll_wait_size; - } - my $ct; - if ($no_deprecated) { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef); - } else { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - } - for (0..$ct-1) { - # 16 byte epoll_event structs, with format: - # 4 byte mask [idx 1] - # 4 byte padding (we put it into idx 2, useless) - # 8 byte data (first 4 bytes are fd, into idx 0) - @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec, + $no_deprecated ? 
undef : ()); + for (0..$ct - 1) { + # 16-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 4 bytes padding (skipped, useless) + # 8 bytes epoll_data_t union (first 4 bytes are the fd) + # So skip the first 8 bytes, take 4, and ignore the last 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 16 * $_ + 8, 4)); + } } sub signalfd ($$$) { diff --git a/lib/PublicInbox/Tmpfile.pm b/lib/PublicInbox/Tmpfile.pm index 25bb3a52..eb0fce00 100644 --- a/lib/PublicInbox/Tmpfile.pm +++ b/lib/PublicInbox/Tmpfile.pm @@ -2,8 +2,8 @@ # License: AGPL-3.0+ package PublicInbox::Tmpfile; use strict; -use warnings; -use base qw(Exporter); +use v5.10.1; +use parent qw(Exporter); our @EXPORT = qw(tmpfile); use Fcntl qw(:DEFAULT); use Errno qw(EEXIST); @@ -13,6 +13,9 @@ use File::Spec; # unlinked filename which makes sense when viewed with lsof # (at least on Linux) # And if we ever stop caring to have debuggable filenames, O_TMPFILE :) +# +# This is also for Perl <5.32 which lacks: open(..., '+>>', undef) +# sub tmpfile ($;$$) { my ($id, $sock, $append) = @_; if (defined $sock) { diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm index 945e7ae7..ae0b0679 100644 --- a/lib/PublicInbox/Unsubscribe.pm +++ b/lib/PublicInbox/Unsubscribe.pm @@ -12,7 +12,8 @@ use warnings; use Crypt::CBC; use Plack::Util; use MIME::Base64 qw(decode_base64url); -my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CODE_URL = qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git); my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); sub new { @@ -38,13 +39,15 @@ sub new { my $unsubscribe = $opt{unsubscribe} or die "`unsubscribe' callback not given\n"; + my $code_url = $opt{code_url} || \@CODE_URL; + $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY'; bless { - pi_config => $opt{pi_config}, # PublicInbox::Config + pi_cfg => $opt{pi_config}, # PublicInbox::Config owner_email => 
$opt{owner_email}, cipher => $cipher, unsubscribe => $unsubscribe, contact => qq($e), - code_url => $opt{code_url} || $CODE_URL, + code_url => $code_url, confirm => $opt{confirm}, }, $class; } @@ -138,7 +141,7 @@ sub r { "$title
".
 		join("\n", "$title\n", @body) . '

'. "
This page is available under AGPL-3.0+\n" .
-		"git clone $self->{code_url}\n" .
+		join('', map { "git clone $_\n" } @{$self->{code_url}}) .
 		qq(Email $self->{contact} if you have any questions).
 		'
' ] ]; @@ -149,9 +152,9 @@ sub archive_info { my $archive_url = $self->{archive_urls}->{$list_addr}; unless ($archive_url) { - if (my $config = $self->{pi_config}) { + if (my $cfg = $self->{pi_cfg}) { # PublicInbox::Config::lookup - my $ibx = $config->lookup($list_addr); + my $ibx = $cfg->lookup($list_addr); # PublicInbox::Inbox::base_url $archive_url = $ibx->base_url if $ibx; } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b8abfa94..567582c5 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -17,7 +17,8 @@ use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size); +use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size + is_bad_blob); use IO::Handle; # ->autoflush use File::Temp (); @@ -65,11 +66,21 @@ sub nproc_shards ($) { sub count_shards ($) { my ($self) = @_; - # always load existing shards in case core count changes: - # Also, shard count may change while -watch is running - my $srch = $self->{ibx}->search or return 0; - delete $self->{ibx}->{search}; - $srch->{nshard} // 0 + if (my $ibx = $self->{ibx}) { + # always load existing shards in case core count changes: + # Also, shard count may change while -watch is running + my $srch = $ibx->search or return 0; + delete $ibx->{search}; + $srch->{nshard} // 0 + } else { # ExtSearchIdx + $self->{nshard} // do { + if ($self->xdb_sharded) { + $self->{nshard} // die 'BUG: {nshard} unset'; + } else { + 0; + } + } + } } sub new { @@ -86,8 +97,6 @@ sub new { die "$dir does not exist\n"; } } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . 
PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, @@ -117,12 +126,9 @@ sub init_inbox { } $self->idx_init; $self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum; - my $epoch_max = -1; - git_dir_latest($self, \$epoch_max); - if (defined $skip_epoch && $epoch_max == -1) { - $epoch_max = $skip_epoch; - } - $self->git_init($epoch_max >= 0 ? $epoch_max : 0); + my $max = $self->{ibx}->max_git_epoch; + $max = $skip_epoch if (defined($skip_epoch) && !defined($max)); + $self->git_init($max // 0); $self->done; } @@ -133,12 +139,17 @@ sub add { $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb); } +sub idx_shard ($$) { + my ($self, $num) = @_; + $self->{idx_shards}->[$num % scalar(@{$self->{idx_shards}})]; +} + # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); $self->{oidx}->add_overview($mime, $smsg); - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); + my $idx = idx_shard($self, $smsg->{num}); $idx->index_raw($msgref, $mime, $smsg); my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; $n >= $self->{batch_bytes}; @@ -249,11 +260,6 @@ sub v2_num_for_harder { ($num, $mid0); } -sub idx_shard { - my ($self, $shard_i) = @_; - $self->{idx_shards}->[$shard_i]; -} - sub _idx_init { # with_umask callback my ($self, $opt) = @_; $self->lock_acquire unless $opt && $opt->{-skip_lock}; @@ -264,7 +270,6 @@ sub _idx_init { # with_umask callback $self->{shards} = $nshards if $nshards && $nshards != $self->{shards}; $self->{batch_bytes} = $opt->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; - $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; # need to create all shards before initializing msgmap FD # idx_shards must be visible to all forked processes @@ -272,14 +277,34 @@ sub _idx_init { # with_umask callback my $idx = $self->{idx_shards} = []; push @$idx, PublicInbox::SearchIdxShard->new($self, $_) 
for (0..$max); + # SearchIdxShard may do their own flushing, so don't scale + # until after forking + $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; + + my $ibx = $self->{ibx} or return; # ExtIdxSearch + # Now that all subprocesses are up, we can open the FDs # for SQLite: my $mm = $self->{mm} = PublicInbox::Msgmap->new_file( - "$self->{ibx}->{inboxdir}/msgmap.sqlite3", - $self->{ibx}->{-no_fsync} ? 2 : 1); + "$ibx->{inboxdir}/msgmap.sqlite3", + $ibx->{-no_fsync} ? 2 : 1); $mm->{dbh}->begin_work; } +sub parallel_init ($$) { + my ($self, $indexlevel) = @_; + if (($indexlevel // 'full') eq 'basic') { + $self->{parallel} = 0; + } else { + pipe(my ($r, $w)) or die "pipe failed: $!"; + # pipe for barrier notifications doesn't need to be big, + # 1031: F_SETPIPE_SZ + fcntl($w, 1031, 4096) if $^O eq 'linux'; + $self->{bnote} = [ $r, $w ]; + $w->autoflush(1); + } +} + # idempotent sub idx_init { my ($self, $opt) = @_; @@ -292,17 +317,7 @@ sub idx_init { delete @$ibx{qw(mm search)}; $ibx->git->cleanup; - $self->{parallel} = 0 if ($ibx->{indexlevel}//'') eq 'basic'; - if ($self->{parallel}) { - pipe(my ($r, $w)) or die "pipe failed: $!"; - # pipe for barrier notifications doesn't need to be big, - # 1031: F_SETPIPE_SZ - fcntl($w, 1031, 4096) if $^O eq 'linux'; - $self->{bnote} = [ $r, $w ]; - $w->autoflush(1); - } - - $ibx->umask_prepare; + parallel_init($self, $ibx->{indexlevel}); $ibx->with_umask(\&_idx_init, $self, $opt); } @@ -312,14 +327,10 @@ sub idx_init { sub _replace_oids ($$$) { my ($self, $mime, $replace_map) = @_; $self->done; - my $pfx = "$self->{ibx}->{inboxdir}/git"; + my $ibx = $self->{ibx}; + my $pfx = "$ibx->{inboxdir}/git"; my $rewrites = []; # epoch => commit - my $max = $self->{epoch_max}; - - unless (defined($max)) { - defined(my $latest = git_dir_latest($self, \$max)) or return; - $self->{epoch_max} = $max; - } + my $max = $self->{epoch_max} //= $ibx->max_git_epoch // return; foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; @@ 
-414,7 +425,7 @@ sub rewrite_internal ($$;$$$) { } else { # ->purge or ->remove $self->{mm}->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } } @@ -467,7 +478,7 @@ sub git_hash_raw ($$) { my ($self, $raw) = @_; # grab the expected OID we have to reindex: pipe(my($in, $w)) or die "pipe: $!"; - my $git_dir = $self->{ibx}->git->{git_dir}; + my $git_dir = $self->git->{git_dir}; my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; my $r = popen_rd($cmd, undef, { 0 => $in }); print $w $$raw or die "print \$w: $!"; @@ -531,11 +542,11 @@ W: $list } # make sure we really got the OID: - my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid); + my ($blob, $type, $bytes) = $self->git->check($expect_oid); $blob eq $expect_oid or die "BUG: $expect_oid not found after replace"; # don't leak FDs to Xapian: - $self->{ibx}->git->cleanup; + $self->git->cleanup; # reindex modified messages: for my $smsg (@$need_reindex) { @@ -558,7 +569,7 @@ sub last_epoch_commit ($$;$) { $self->{mm}->last_commit_xap($v, $i, $cmt); } -sub set_last_commits ($) { +sub set_last_commits ($) { # this is NOT for ExtSearchIdx my ($self) = @_; defined(my $epoch_max = $self->{epoch_max}) or return; my $last_commit = $self->{last_commit}; @@ -600,34 +611,40 @@ sub checkpoint ($;$) { } my $shards = $self->{idx_shards}; if ($shards) { - my $dbh = $self->{mm}->{dbh}; + my $mm = $self->{mm}; + my $dbh = $mm->{dbh} if $mm; # SQLite msgmap data is second in importance - $dbh->commit; + $dbh->commit if $dbh; # SQLite overview is third $self->{oidx}->commit_lazy; # Now deal with Xapian if ($wait) { - my $barrier = $self->barrier_init(scalar @$shards); + my $barrier = barrier_init($self, scalar @$shards); # each shard needs to issue a barrier command $_->shard_barrier for @$shards; # wait for each Xapian shard - $self->barrier_wait($barrier); + barrier_wait($self, $barrier); } else { $_->shard_commit for @$shards; } + my $midx = $self->{midx}; # 
misc index + $midx->commit_txn if $midx; + # last_commit is special, don't commit these until - # remote shards are done: - $dbh->begin_work; + # Xapian shards are done: + $dbh->begin_work if $dbh; set_last_commits($self); - $dbh->commit; - - $dbh->begin_work; + if ($dbh) { + $dbh->commit; + $dbh->begin_work; + } + $midx->begin_txn if $midx; } $self->{total_bytes} += $self->{transact_bytes}; $self->{transact_bytes} = 0; @@ -667,14 +684,27 @@ sub done { } eval { $self->{oidx}->dbh_close }; $err .= "over close: $@\n" if $@; + delete $self->{midx}; delete $self->{bnote}; my $nbytes = $self->{total_bytes}; $self->{total_bytes} = 0; $self->lock_release(!!$nbytes) if $shards; - $self->{ibx}->git->cleanup; + $self->git->cleanup; die $err if $err; } +sub write_alternates ($$$) { + my ($info_dir, $mode, $out) = @_; + my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); + my $tmp = $fh->filename; + print $fh @$out or die "print $tmp: $!\n"; + chmod($mode, $fh) or die "fchmod $tmp: $!\n"; + close $fh or die "close $tmp $!\n"; + my $alt = "$info_dir/alternates"; + rename($tmp, $alt) or die "rename $tmp => $alt: $!\n"; + $fh->unlink_on_destroy(0); +} + sub fill_alternates ($$) { my ($self, $epoch) = @_; @@ -713,15 +743,8 @@ sub fill_alternates ($$) { } } return unless $new; - - my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); - my $tmp = $fh->filename; - print $fh join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n" - or die "print $tmp: $!\n"; - chmod($mode, $fh) or die "fchmod $tmp: $!\n"; - close $fh or die "close $tmp $!\n"; - rename($tmp, $alt) or die "rename $tmp => $alt: $!\n"; - $fh->unlink_on_destroy(0); + write_alternates($info_dir, $mode, + [join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n"]); } sub git_init { @@ -735,23 +758,6 @@ sub git_init { $git_dir } -sub git_dir_latest { - my ($self, $max) = @_; - $$max = -1; - my $pfx = "$self->{ibx}->{inboxdir}/git"; - return unless -d $pfx; - my $latest; - opendir my 
$dh, $pfx or die "opendir $pfx: $!\n"; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! or next; - if ($1 > $$max) { - $$max = $1; - $latest = "$pfx/$git_dir"; - } - } - $latest; -} - sub importer { my ($self) = @_; my $im = $self->{im}; @@ -770,7 +776,7 @@ sub importer { } my $epoch = 0; my $max; - my $latest = git_dir_latest($self, \$max); + my $latest = $self->{ibx}->git_dir_latest(\$max); if (defined $latest) { my $git = PublicInbox::Git->new($latest); my $packed_bytes = $git->packed_bytes; @@ -861,29 +867,50 @@ sub atfork_child { sub reindex_checkpoint ($$) { my ($self, $sync) = @_; - $self->{ibx}->git->cleanup; # *async_wait + $self->git->async_wait_all; + $self->update_last_commit($sync); ${$sync->{need_checkpoint}} = 0; my $mm_tmp = $sync->{mm_tmp}; $mm_tmp->atfork_prepare if $mm_tmp; - $self->done; # release lock + die 'BUG: {im} during reindex' if $self->{im}; + if ($self->{ibx_map} && !$sync->{checkpoint_unlocks}) { + checkpoint($self, 1); # no need to release lock on pure index + } else { + $self->done; # release lock + } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } # allow -watch or -mda to write... 
$self->idx_init($sync->{-opt}); # reacquire lock + if (my $intvl = $sync->{check_intvl}) { # eidx + $sync->{next_check} = PublicInbox::DS::now() + $intvl; + } $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if $size == 0; # purged + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); my $chash = content_hash($eml); - my $self = $arg->{v2w}; if (scalar(@$mids) == 0) { warn "E: $oid has no Message-ID, skipping\n"; @@ -950,36 +977,39 @@ sub index_oid { # cat_async callback if (do_idx($self, $bref, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current -sub update_last_commit ($$$$) { - my ($self, $git, $i, $cmt) = @_; - my $last = last_epoch_commit($self, $i); - if (defined $last && is_ancestor($git, $last, $cmt)) { - my @cmd = (qw(rev-list --count), "$last..$cmt"); - chomp(my $n = $git->qx(@cmd)); +sub update_last_commit { + my ($self, $sync, $stk) = @_; + my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? 
$stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $last = last_epoch_commit($self, $unit->{epoch}); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); return if $n ne '' && $n == 0; } - last_epoch_commit($self, $i, $cmt); + last_epoch_commit($self, $unit->{epoch}, $latest_cmt); } -sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" } - -sub last_commits ($$) { - my ($self, $epoch_max) = @_; +sub last_commits { + my ($self, $sync) = @_; my $heads = []; - for (my $i = $epoch_max; $i >= 0; $i--) { + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { $heads->[$i] = last_epoch_commit($self, $i); } $heads; } # returns a revision range for git-log(1) -sub log_range ($$$$$) { - my ($self, $sync, $git, $i, $tip) = @_; +sub log_range ($$$) { + my ($sync, $unit, $tip) = @_; my $opt = $sync->{-opt}; my $pr = $opt->{-progress} if (($opt->{verbose} || 0) > 1); + my $i = $unit->{epoch}; my $cur = $sync->{ranges}->[$i] or do { $pr->("$i.git indexing all of $tip\n") if $pr; return $tip; # all of it @@ -993,7 +1023,8 @@ sub log_range ($$$$$) { my $range = "$cur..$tip"; $pr->("$i.git checking contiguity... ") if $pr; - if (is_ancestor($git, $cur, $tip)) { # common case + my $git = $unit->{git}; + if (is_ancestor($sync->{self}->git, $cur, $tip)) { # common case $pr->("OK\n") if $pr; my $n = $git->qx(qw(rev-list --count), $range); chomp($n); @@ -1018,63 +1049,103 @@ Rewritten history? 
(in $git->{git_dir}) warn "discarding history at $cur\n"; } warn <<""; -reindexing $git->{git_dir} starting at -$range - - $sync->{unindex_range}->{$i} = "$base..$cur"; +reindexing $git->{git_dir} +starting at $range + + # $cur^0 may no longer exist if pruned by git + if ($git->qx(qw(rev-parse -q --verify), "$cur^0")) { + $unit->{unindex_range} = "$base..$cur"; + } elsif ($base && $git->qx(qw(rev-parse -q --verify), $base)) { + $unit->{unindex_range} = "$base.."; + } else { + warn "W: unable to unindex before $range\n"; + } } $range; } -sub sync_prepare ($$$) { - my ($self, $sync, $epoch_max) = @_; +# overridden by ExtSearchIdx +sub artnum_max { $_[0]->{mm}->num_highwater } + +sub sync_prepare ($$) { + my ($self, $sync) = @_; + $sync->{ranges} = sync_ranges($self, $sync); my $pr = $sync->{-opt}->{-progress}; my $regen_max = 0; - my $head = $self->{ibx}->{ref_head} || 'refs/heads/master'; - - # reindex stops at the current heads and we later rerun index_sync - # without {reindex} - my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex}; - - for (my $i = $epoch_max; $i >= 0; $i--) { - my $git_dir = git_dir_n($self, $i); + my $head = $sync->{ibx}->{ref_head} || 'HEAD'; + my $pfx; + if ($pr) { + ($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g); + $pfx //= $sync->{ibx}->{inboxdir}; + } + + my $reindex_heads; + if ($self->{ibx_map}) { + # ExtSearchIdx won't index messages unless they're in + # over.sqlite3 for a given inbox, so don't read beyond + # what's in the per-inbox index. 
+ $reindex_heads = []; + my $v = PublicInbox::Search::SCHEMA_VERSION; + my $mm = $sync->{ibx}->mm; + for my $i (0..$sync->{epoch_max}) { + $reindex_heads->[$i] = $mm->last_commit_xap($v, $i); + } + } elsif ($sync->{reindex}) { # V2 inbox + # reindex stops at the current heads and we later + # rerun index_sync without {reindex} + $reindex_heads = $self->last_commits($sync); + } + if ($sync->{max_size} = $sync->{-opt}->{max_size}) { + $sync->{index_oid} = $self->can('index_oid'); + } + my $git_pfx = "$sync->{ibx}->{inboxdir}/git"; + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { + my $git_dir = "$git_pfx/$i.git"; -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); + my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - - next if $?; # new repo - my $range = log_range($self, $sync, $git, $i, $tip) or next; + my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter - $pr->("$i.git counting $range ... ") if $pr; + $pr->("$pfx $i.git counting $range ... ") if $pr; # Don't bump num_highwater on --reindex by using {D}. # We intentionally do NOT use {D} in the non-reindex case # because we want NNTP article number gaps from unindexed # messages to show up in mirrors, too. $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR - my $stk = log2stack($sync, $git, $range, $self->{ibx}); + my $stk = log2stack($sync, $git, $range); + return 0 if $sync->{quit}; my $nr = $stk ? 
$stk->num_records : 0; $pr->("$nr\n") if $pr; - $sync->{stacks}->[$i] = $stk if $stk; + $unit->{stack} = $stk; # may be undef + unshift @{$sync->{todo}}, $unit; $regen_max += $nr; } + return 0 if $sync->{quit}; # XXX this should not happen unless somebody bypasses checks in # our code and blindly injects "d" file history into git repos if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); - my $arg = { v2w => $self }; - my $all = $self->{ibx}->git; + local $self->{current_info} = 'leftover '; + my $unindex_oid = $self->can('unindex_oid'); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $self->{current_info} = "leftover $oid"; - $all->cat_async($oid, \&unindex_oid, $arg); + my $req = { %$sync, oid => $oid }; + $self->git->cat_async($oid, $unindex_oid, $req); } - $all->cat_async_wait; + $self->git->cat_async_wait; } - if (!$regen_max && !keys(%{$self->{unindex_range}})) { + return 0 if $sync->{quit}; + if (!$regen_max) { $sync->{-regen_fmt} = "%u/?\n"; return 0; } @@ -1085,22 +1156,25 @@ sub sync_prepare ($$$) { $sync->{-regen_fmt} = "% ${pad}u/$regen_max\n"; $sync->{nr} = \(my $nr = 0); return -1 if $sync->{reindex}; - $regen_max + $self->{mm}->num_highwater() || 0; + $regen_max + $self->artnum_max || 0; } -sub unindex_oid_remote ($$$) { +sub unindex_oid_aux ($$$) { my ($self, $oid, $mid) = @_; my @removed = $self->{oidx}->remove_oid($oid, $mid); for my $num (@removed) { - my $idx = idx_shard($self, $num % $self->{shards}); - $idx->shard_remove($oid, $num); + my $idx = idx_shard($self, $num); + $idx->shard_remove($num); } } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - my $self = $sync->{v2w}; - my $unindexed = $sync->{in_unindex} ? 
$sync->{unindexed} : undef; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1123,43 +1197,46 @@ sub unindex_oid ($$;$) { # git->cat_async callback } $mm->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } +sub git { $_[0]->{ibx}->git } + # this is rare, it only happens when we get discontiguous history in # a mirror because the source used -purge or -edit -sub unindex ($$$$) { - my ($self, $sync, $git, $unindex_range) = @_; +sub unindex_todo ($$$) { + my ($self, $sync, $unit) = @_; + my $unindex_range = delete($unit->{unindex_range}) // return; my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num my $before = scalar keys %$unindexed; # order does not matter, here: - my @cmd = qw(log --raw -r - --no-notes --no-color --no-abbrev --no-renames); - my $fh = $git->popen(@cmd, $unindex_range); - my $all = $self->{ibx}->git; + my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color + --no-abbrev --no-renames), $unindex_range); local $sync->{in_unindex} = 1; + my $unindex_oid = $self->can('unindex_oid'); while (<$fh>) { /\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next; - $all->cat_async($1, \&unindex_oid, $sync); + $self->git->cat_async($1, $unindex_oid, { %$sync, oid => $1 }); } close $fh or die "git log failed: \$?=$?"; - $all->cat_async_wait; + $self->git->cat_async_wait; return unless $sync->{-opt}->{prune}; my $after = scalar keys %$unindexed; return if $before == $after; # ensure any blob can not longer be accessed via dumb HTTP - PublicInbox::Import::run_die(['git', "--git-dir=$git->{git_dir}", + PublicInbox::Import::run_die(['git', + 
"--git-dir=$unit->{git}->{git_dir}", qw(-c gc.reflogExpire=now gc --prune=all --quiet)]); } -sub sync_ranges ($$$) { - my ($self, $sync, $epoch_max) = @_; +sub sync_ranges ($$) { + my ($self, $sync) = @_; my $reindex = $sync->{reindex}; - - return last_commits($self, $epoch_max) unless $reindex; + return $self->last_commits($sync) unless $reindex; return [] if ref($reindex) ne 'HASH'; my $ranges = $reindex->{from}; # arrayref; @@ -1171,8 +1248,8 @@ sub sync_ranges ($$$) { sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; - my $self = $smsg->{v2w}; - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); + my $self = $smsg->{self}; + my $idx = idx_shard($self, $smsg->{num}); $smsg->{raw_bytes} = $size; $idx->index_raw($bref, undef, $smsg); $self->{transact_bytes} += $size; @@ -1190,8 +1267,9 @@ sub index_xap_step ($$$;$) { "$beg..$end (% $step)\n"); } for (my $num = $beg; $num <= $end; $num += $step) { + last if $sync->{quit}; my $smsg = $ibx->over->get_art($num) or next; - $smsg->{v2w} = $self; + $smsg->{self} = $self; $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg); if ($self->{transact_bytes} >= $self->{batch_bytes}) { ${$sync->{nr}} = $num; @@ -1200,37 +1278,53 @@ sub index_xap_step ($$$;$) { } } -sub index_epoch ($$$) { - my ($self, $sync, $i) = @_; - - my $git_dir = git_dir_n($self, $i); - -d $git_dir or return; # missing epochs are fine - my $git = PublicInbox::Git->new($git_dir); - if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare - unindex($self, $sync, $git, $unindex_range); - } - defined(my $stk = $sync->{stacks}->[$i]) or return; - $sync->{stacks}->[$i] = undef; - my $all = $self->{ibx}->git; - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { - $self->{current_info} = "$i.git $oid"; +sub index_todo ($$$) { + my ($self, $sync, $unit) = @_; + return if $sync->{quit}; + unindex_todo($self, $sync, $unit); + my $stk = delete($unit->{stack}) or return; + my $all = $self->git; + my 
$index_oid = $self->can('index_oid'); + my $unindex_oid = $self->can('unindex_oid'); + my $pfx; + if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) { + $pfx = "$1 $2"; # v2 + } else { # v1 + ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g); + $pfx //= $unit->{git}->{git_dir}; + } + local $self->{current_info} = "$pfx "; + local $sync->{latest_cmt} = \(my $latest_cmt); + local $sync->{unit} = $unit; + while (my ($f, $at, $ct, $oid, $cmt) = $stk->pop_rec) { + if ($sync->{quit}) { + warn "waiting to quit...\n"; + $all->async_wait_all; + $self->update_last_commit($sync); + return; + } + my $req = { + %$sync, + autime => $at, + cotime => $ct, + oid => $oid, + cur_cmt => $cmt + }; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; if ($sync->{max_size}) { - $all->check_async($oid, \&check_size, $arg); + $all->check_async($oid, \&check_size, $req); } else { - $all->cat_async($oid, \&index_oid, $arg); + $all->cat_async($oid, $index_oid, $req); } } elsif ($f eq 'd') { - $all->cat_async($oid, \&unindex_oid, $sync); + $all->cat_async($oid, $unindex_oid, $req); } if (${$sync->{need_checkpoint}}) { reindex_checkpoint($self, $sync); } } - $all->check_async_wait; - $all->cat_async_wait; - update_last_commit($self, $git, $i, $stk->{latest_cmt}); + $all->async_wait_all; + $self->update_last_commit($sync, $stk); } sub xapian_only { @@ -1243,7 +1337,7 @@ sub xapian_only { $sync //= { need_checkpoint => \(my $bool = 0), -opt => $opt, - v2w => $self, + self => $self, nr => \(my $nr = 0), -regen_fmt => "%u/?\n", }; @@ -1251,6 +1345,7 @@ sub xapian_only { if ($seq || !$self->{parallel}) { my $shard_end = $self->{shards} - 1; for my $i (0..$shard_end) { + last if $sync->{quit}; index_xap_step($self, $sync, $art_beg + $i); if ($i != $shard_end) { reindex_checkpoint($self, $sync); @@ -1260,7 +1355,7 @@ sub xapian_only { index_xap_step($self, $sync, $art_beg, 1); } } - $self->{ibx}->git->cat_async_wait; + $self->git->cat_async_wait; $self->done; } @@ 
-1270,11 +1365,19 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = git_dir_latest($self, \$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } + my $pr = $opt->{-progress}; my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; @@ -1285,13 +1388,18 @@ sub index_sync { $self->{oidx}->rethread_prepare($opt); my $sync = { need_checkpoint => \(my $bool = 0), - unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, -opt => $opt, - v2w => $self, + self => $self, + ibx => $self->{ibx}, + epoch_max => $epoch_max, }; - $sync->{ranges} = sync_ranges($self, $sync, $epoch_max); - if (sync_prepare($self, $sync, $epoch_max)) { + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + + if (sync_prepare($self, $sync)) { # tmp_clone seems to fail if inside a transaction, so # we rollback here (because we opened {mm} for reading) # Note: we do NOT rely on DBI transactions for atomicity; @@ -1303,16 +1411,13 @@ sub index_sync { # xapian_only works incrementally w/o --reindex if ($seq && !$opt->{reindex}) { - $art_beg = $sync->{mm_tmp}->max; - $art_beg++ if defined($art_beg); + $art_beg = $sync->{mm_tmp}->max || -1; + $art_beg++; } } - if ($sync->{max_size} = $opt->{max_size}) { - $sync->{index_oid} = \&index_oid; - } # work forwards through history - index_epoch($self, $sync, $_) for (0..$epoch_max); - 
$self->{oidx}->rethread_done($opt); + index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []}; + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; $self->done; if (my $nr = $sync->{nr}) { @@ -1320,14 +1425,21 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } + my $quit_warn; # deal with Xapian shards sequentially if ($seq && delete($sync->{mm_tmp})) { - $self->{ibx}->{indexlevel} = $idxlevel; - xapian_only($self, $opt, $sync, $art_beg); + if ($sync->{quit}) { + $quit_warn = 1; + } else { + $self->{ibx}->{indexlevel} = $idxlevel; + xapian_only($self, $opt, $sync, $art_beg); + $quit_warn = 1 if $sync->{quit}; + } } # --reindex on the command-line - if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') { + if (!$sync->{quit} && $opt->{reindex} && + !ref($opt->{reindex}) && $idxlevel ne 'basic') { $self->lock_acquire; my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0); if (my $xdb = $s0->idx_acquire) { @@ -1339,12 +1451,16 @@ sub index_sync { } # reindex does not pick up new changes, so we rerun w/o it: - if ($opt->{reindex}) { + if ($opt->{reindex} && !$sync->{quit}) { my %again = %$opt; $sync = undef; delete @again{qw(rethread reindex -skip_lock)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } + warn <{-inbox}->msg_by_mid($ctx->{mid}) or return; # 404 + my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; # 404 my $eml = PublicInbox::Eml->new($bref); $ctx->{mhref} = ''; PublicInbox::WwwStream::init($ctx); @@ -64,7 +64,7 @@ sub no_over_html ($) { sub msg_page { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? 
$ibx : undef; my $over = $ctx->{over} = $ibx->over or return no_over_html($ctx); my ($id, $prev); @@ -88,7 +88,7 @@ sub msg_reply ($$) { 'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style'; my $info = ''; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if (my $url = $ibx->{infourl}) { $url = prurl($ctx->{env}, $url); $info = qq(\n List information: $url\n); @@ -421,7 +421,7 @@ sub stream_thread ($$) { sub thread_html { my ($ctx) = @_; my $mid = $ctx->{mid}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); return missing_thread($ctx) if $nr == 0; @@ -554,7 +554,7 @@ EOF sub add_text_body { # callback for each_part my ($p, $ctx) = @_; my $upfx = $ctx->{mhref}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; # $p - from each_part: [ Email::MIME-like, depth, $idx ] my ($part, $depth, $idx) = @$p; @@ -639,7 +639,7 @@ sub add_text_body { # callback for each_part sub _msg_page_prepare_obuf { my ($eml, $ctx) = @_; - my $over = $ctx->{-inbox}->over; + my $over = $ctx->{ibx}->over; my $obfs_ibx = $ctx->{-obfs_ibx}; my $rv = ''; my $mids = mids_for_index($eml); @@ -729,7 +729,7 @@ sub SKEL_EXPAND () { sub thread_skel ($$$) { my ($skel, $ctx, $hdr) = @_; my $mid = mids($hdr)->[0]; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); my $parent = in_reply_to($hdr); $$skel .= "\nThread overview: "; @@ -800,7 +800,7 @@ sub _parent_headers { # returns a string buffer sub html_footer { my ($ctx, $hdr) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $upfx = '../'; my $skel; my $rv = '
';
@@ -1072,7 +1072,7 @@ sub acc_topic { # walk_thread callback
 	my ($ctx, $level, $smsg) = @_;
 	my $mid = $smsg->{mid};
 	my $has_blob = $smsg->{blob} // do {
-		if (my $by_mid = $ctx->{-inbox}->smsg_by_mid($mid)) {
+		if (my $by_mid = $ctx->{ibx}->smsg_by_mid($mid)) {
 			%$smsg = (%$smsg, %$by_mid);
 			1;
 		}
@@ -1116,7 +1116,7 @@ sub dump_topics {
 	}
 
 	my @out;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
 
 	# sort by recency, this allows new posts to "bump" old topics...
@@ -1194,7 +1194,7 @@ sub paginate_recent ($$) {
 	$t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1);
 	$t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1);
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $msgs = $ibx->recent($opts, $after, $before);
 	my $nr = scalar @$msgs;
 	if ($nr < $lim && defined($after)) {
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index 87927d5e..3f34ea82 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -197,7 +197,7 @@ sub show ($$;$) {
 
 	$ctx->{'log'} = tmpfile("solve.$oid_b");
 	$ctx->{fn} = $fn;
-	my $solver = PublicInbox::SolverGit->new($ctx->{-inbox},
+	my $solver = PublicInbox::SolverGit->new($ctx->{ibx},
 						\&solve_result, $ctx);
 	# PSGI server will call this immediately and give us a callback (-wcb)
 	sub {
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 37f55347..52630ae3 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -32,9 +32,8 @@ our $ATTACH_RE = qr!([0-9][0-9\.]*)-($PublicInbox::Hval::FN)!;
 our $OID_RE = qr![a-f0-9]{7,}!;
 
 sub new {
-	my ($class, $pi_config) = @_;
-	$pi_config ||= PublicInbox::Config->new;
-	bless { pi_config => $pi_config }, $class;
+	my ($class, $pi_cfg) = @_;
+	bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class;
 }
 
 # backwards compatibility, do not use
@@ -169,14 +168,14 @@ sub preload {
 		eval "require PublicInbox::$_;";
 	}
 	if (ref($self)) {
-		my $pi_config = $self->{pi_config};
-		if (defined($pi_config->{'publicinbox.cgitrc'})) {
-			$pi_config->limiter('-cgit');
+		my $pi_cfg = $self->{pi_cfg};
+		if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
+			$pi_cfg->limiter('-cgit');
 		}
 		$self->cgit;
 		$self->stylesheets_prepare($_) for ('', '../', '../../');
 		$self->news_www;
-		$pi_config->each_inbox(\&preload_inbox);
+		$pi_cfg->each_inbox(\&preload_inbox);
 	}
 }
 
@@ -210,9 +209,10 @@ sub news_cgit_fallback ($) {
 # returns undef if valid, array ref response if invalid
 sub invalid_inbox ($$) {
 	my ($ctx, $inbox) = @_;
-	my $ibx = $ctx->{www}->{pi_config}->lookup_name($inbox);
+	my $ibx = $ctx->{www}->{pi_cfg}->lookup_name($inbox) //
+			$ctx->{www}->{pi_cfg}->lookup_ei($inbox);
 	if (defined $ibx) {
-		$ctx->{-inbox} = $ibx;
+		$ctx->{ibx} = $ibx;
 		return;
 	}
 
@@ -230,11 +230,11 @@ sub invalid_inbox_mid {
 	return $ret if $ret;
 
 	my $mid = $ctx->{mid} = uri_unescape($mid_ue);
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) {
 		my ($x2, $x38) = ($1, $2);
 		# this is horrifically wasteful for legacy URLs:
-		my $str = $ctx->{-inbox}->msg_by_path("$x2/$x38") or return;
+		my $str = $ctx->{ibx}->msg_by_path("$x2/$x38") or return;
 		my $s = PublicInbox::Eml->new($str);
 		$mid = PublicInbox::MID::mid_clean($s->header_raw('Message-ID'));
 		return r301($ctx, $inbox, mid_escape($mid));
@@ -285,7 +285,7 @@ sub get_mid_html {
 # /$INBOX/$MESSAGE_ID/t/
 sub get_thread {
 	my ($ctx, $flat) = @_;
-	$ctx->{-inbox}->over or return need($ctx, 'Overview');
+	$ctx->{ibx}->over or return need($ctx, 'Overview');
 	$ctx->{flat} = $flat;
 	require PublicInbox::View;
 	PublicInbox::View::thread_html($ctx);
@@ -338,7 +338,7 @@ EOF
 # especially on older systems.  Stick to zlib since that's what git uses.
 sub get_thread_mbox {
 	my ($ctx, $sfx) = @_;
-	my $over = $ctx->{-inbox}->over or return need($ctx, 'Overview');
+	my $over = $ctx->{ibx}->over or return need($ctx, 'Overview');
 	require PublicInbox::Mbox;
 	PublicInbox::Mbox::thread_mbox($ctx, $over, $sfx);
 }
@@ -347,7 +347,7 @@ sub get_thread_mbox {
 # /$INBOX/$MESSAGE_ID/t.atom		  -> thread as Atom feed
 sub get_thread_atom {
 	my ($ctx) = @_;
-	$ctx->{-inbox}->over or return need($ctx, 'Overview');
+	$ctx->{ibx}->over or return need($ctx, 'Overview');
 	require PublicInbox::Feed;
 	PublicInbox::Feed::generate_thread_atom($ctx);
 }
@@ -412,11 +412,11 @@ sub legacy_redirects {
 
 sub r301 {
 	my ($ctx, $inbox, $mid_ue, $suffix) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	unless ($ibx) {
 		my $r404 = invalid_inbox($ctx, $inbox);
 		return $r404 if $r404;
-		$ibx = $ctx->{-inbox};
+		$ibx = $ctx->{ibx};
 	}
 	my $url = $ibx->base_url($ctx->{env});
 	my $qs = $ctx->{env}->{QUERY_STRING};
@@ -453,7 +453,7 @@ sub msg_page {
 sub serve_git {
 	my ($ctx, $epoch, $path) = @_;
 	my $env = $ctx->{env};
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $git = defined $epoch ? $ibx->git_epoch($epoch) : $ibx->git;
 	$git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404();
 }
@@ -461,7 +461,7 @@ sub serve_git {
 sub mbox_results {
 	my ($ctx) = @_;
 	if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
-		$ctx->{-inbox}->search or return need($ctx, 'search');
+		$ctx->{ibx}->isrch or return need($ctx, 'search');
 		require PublicInbox::SearchView;
 		return PublicInbox::SearchView::mbox_results($ctx);
 	}
@@ -480,18 +480,18 @@ sub news_www {
 	my ($self) = @_;
 	$self->{news_www} ||= do {
 		require PublicInbox::NewsWWW;
-		PublicInbox::NewsWWW->new($self->{pi_config});
+		PublicInbox::NewsWWW->new($self->{pi_cfg});
 	}
 }
 
 sub cgit {
 	my ($self) = @_;
 	$self->{cgit} ||= do {
-		my $pi_config = $self->{pi_config};
+		my $pi_cfg = $self->{pi_cfg};
 
-		if (defined($pi_config->{'publicinbox.cgitrc'})) {
+		if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
 			require PublicInbox::Cgit;
-			PublicInbox::Cgit->new($pi_config);
+			PublicInbox::Cgit->new($pi_cfg);
 		} else {
 			require Plack::Util;
 			Plack::Util::inline_object(call => sub { r404() });
@@ -537,7 +537,7 @@ sub stylesheets_prepare ($$) {
 	} || sub { $_[0] };
 
 	my $css_map = {};
-	my $stylesheets = $self->{pi_config}->{css} || [];
+	my $stylesheets = $self->{pi_cfg}->{css} || [];
 	my $links = [];
 	my $inline_ok = 1;
 
@@ -641,7 +641,7 @@ sub get_css ($$$) {
 	my $css = $css_map->{$key};
 	if (!defined($css) && $key eq 'userContent') {
 		my $env = $ctx->{env};
-		$css = PublicInbox::UserContent::sample($ctx->{-inbox}, $env);
+		$css = PublicInbox::UserContent::sample($ctx->{ibx}, $env);
 	}
 	defined $css or return r404();
 	my $h = [ 'Content-Length', bytes::length($css),
@@ -653,7 +653,7 @@ sub get_css ($$$) {
 sub get_description {
 	my ($ctx, $inbox) = @_;
 	invalid_inbox($ctx, $inbox) || do {
-		my $d = $ctx->{-inbox}->description . "\n";
+		my $d = $ctx->{ibx}->description . "\n";
 		[ 200, [ 'Content-Length', bytes::length($d),
 			'Content-Type', 'text/plain' ], [ $d ] ];
 	};
diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index 8bbce929..bc296e01 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -41,7 +41,7 @@ sub compile_watchheaders ($) {
 }
 
 sub new {
-	my ($class, $config) = @_;
+	my ($class, $cfg) = @_;
 	my (%mdmap, $spamc);
 	my (%imap, %nntp); # url => [inbox objects] or 'watchspam'
 
@@ -50,7 +50,7 @@ sub new {
 	# indefinitely...
 	foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) {
 		my $k = "$pfx.watchspam";
-		defined(my $dirs = $config->{$k}) or next;
+		defined(my $dirs = $cfg->{$k}) or next;
 		$dirs = PublicInbox::Config::_array($dirs);
 		for my $dir (@$dirs) {
 			my $url;
@@ -69,10 +69,10 @@ sub new {
 
 	my $k = 'publicinboxwatch.spamcheck';
 	my $default = undef;
-	my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default);
+	my $spamcheck = PublicInbox::Spamcheck::get($cfg, $k, $default);
 	$spamcheck = _spamcheck_cb($spamcheck) if $spamcheck;
 
-	$config->each_inbox(sub {
+	$cfg->each_inbox(sub {
 		# need to make all inboxes writable for spam removal:
 		my $ibx = $_[0] = PublicInbox::InboxWritable->new($_[0]);
 
@@ -113,7 +113,7 @@ sub new {
 		spamcheck => $spamcheck,
 		mdmap => \%mdmap,
 		mdre => $mdre,
-		config => $config,
+		pi_cfg => $cfg,
 		imap => scalar keys %imap ? \%imap : undef,
 		nntp => scalar keys %nntp? \%nntp : undef,
 		importers => {},
@@ -175,7 +175,7 @@ sub _remove_spam {
 	$path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return;
 	my $eml = eml_from_path($path) or return;
 	local $SIG{__WARN__} = warn_ignore_cb();
-	$self->{config}->each_inbox(\&remove_eml_i, $self, $eml, $path);
+	$self->{pi_cfg}->each_inbox(\&remove_eml_i, $self, $eml, $path);
 }
 
 sub import_eml ($$$) {
@@ -217,7 +217,7 @@ sub _try_path {
 		warn "unmappable dir: $1\n";
 		return;
 	}
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
 		$warn_cb->($pfx, "path: $path\n", @_);
@@ -316,7 +316,7 @@ sub cfg_bool ($$$) {
 # flesh out common IMAP-specific data structures
 sub imap_common_init ($) {
 	my ($self) = @_;
-	my $cfg = $self->{config};
+	my $cfg = $self->{pi_cfg};
 	my $mic_args = {}; # scheme://authority => Mail:IMAPClient arg
 	for my $url (sort keys %{$self->{imap}}) {
 		my $uri = PublicInbox::URIimap->new($url);
@@ -418,7 +418,7 @@ sub imap_import_msg ($$$$$) {
 		if ($flags =~ /\\Seen\b/) {
 			local $SIG{__WARN__} = warn_ignore_cb();
 			my $eml = PublicInbox::Eml->new($raw);
-			$self->{config}->each_inbox(\&remove_eml_i,
+			$self->{pi_cfg}->each_inbox(\&remove_eml_i,
 						$self, $eml, "$url UID:$uid");
 		}
 	} else {
@@ -467,7 +467,7 @@ sub imap_fetch_all ($$$) {
 	my $key = $req;
 	$key =~ s/\.PEEK//;
 	my ($uids, $batch);
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
 		$batch //= '?';
@@ -775,7 +775,7 @@ sub watch_imap_init ($$) {
 # flesh out common NNTP-specific data structures
 sub nntp_common_init ($) {
 	my ($self) = @_;
-	my $cfg = $self->{config};
+	my $cfg = $self->{pi_cfg};
 	my $nn_args = {}; # scheme://authority => Net::NNTP->new arg
 	for my $url (sort keys %{$self->{nntp}}) {
 		my $sec = uri_section(uri_new($url));
@@ -929,7 +929,7 @@ sub nntp_fetch_all ($$$) {
 	$beg = $l_art + 1;
 
 	warn "I: $url fetching ARTICLE $beg..$end\n";
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	my ($err, $art);
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
@@ -966,7 +966,7 @@ sub nntp_fetch_all ($$$) {
 			}
 		} elsif ($inboxes eq 'watchspam') {
 			my $eml = PublicInbox::Eml->new(\$raw);
-			$self->{config}->each_inbox(\&remove_eml_i,
+			$self->{pi_cfg}->each_inbox(\&remove_eml_i,
 					$self, $eml, "$url ARTICLE $art");
 		} else {
 			die "BUG: destination unknown $inboxes";
diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm
index 2818400e..204e2f82 100644
--- a/lib/PublicInbox/WwwAltId.pm
+++ b/lib/PublicInbox/WwwAltId.pm
@@ -30,7 +30,7 @@ sub check_output {
 sub sqldump ($$) {
 	my ($ctx, $altid_pfx) = @_;
 	my $env = $ctx->{env};
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $altid_map = $ibx->altid_map;
 	my $fn = $altid_map->{$altid_pfx};
 	unless (defined $fn) {
diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm
index 388def12..912f860e 100644
--- a/lib/PublicInbox/WwwAtomStream.pm
+++ b/lib/PublicInbox/WwwAtomStream.pm
@@ -15,7 +15,7 @@ use PublicInbox::MsgTime qw(msg_timestamp);
 
 sub new {
 	my ($class, $ctx, $cb) = @_;
-	$ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env});
+	$ctx->{feed_base_url} = $ctx->{ibx}->base_url($ctx->{env});
 	$ctx->{cb} = $cb || \&PublicInbox::GzipFilter::close;
 	$ctx->{emit_header} = 1;
 	bless $ctx, $class;
@@ -53,7 +53,7 @@ sub getline {
 	my ($self) = @_;
 	my $cb = $self->{cb} or return;
 	while (my $smsg = $cb->($self)) {
-		my $eml = $self->{-inbox}->smsg_eml($smsg) or next;
+		my $eml = $self->{ibx}->smsg_eml($smsg) or next;
 		return $self->translate(feed_entry($self, $smsg, $eml));
 	}
 	delete $self->{cb};
@@ -82,7 +82,7 @@ sub to_uuid ($) {
 
 sub atom_header {
 	my ($ctx, $title) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $base_url = $ctx->{feed_base_url};
 	my $search_q = $ctx->{search_query};
 	my $self_url = $base_url;
@@ -136,10 +136,10 @@ sub feed_entry {
 	$title = title_tag($title);
 
 	my $from = $eml->header('From') // $eml->header('Sender') //
-		$ctx->{-inbox}->{-primary_address};
+		$ctx->{ibx}->{-primary_address};
 	my ($email) = PublicInbox::Address::emails($from);
 	my $name = ascii_html(join(', ', PublicInbox::Address::names($from)));
-	$email = ascii_html($email // $ctx->{-inbox}->{-primary_address});
+	$email = ascii_html($email // $ctx->{ibx}->{-primary_address});
 
 	my $s = delete($ctx->{emit_header}) ? atom_header($ctx, $title) : '';
 	$s .= "$name$email" .
diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm
index 09c66d02..0fe63e42 100644
--- a/lib/PublicInbox/WwwAttach.pm
+++ b/lib/PublicInbox/WwwAttach.pm
@@ -16,7 +16,7 @@ sub referer_match ($) {
 	return 1 if $referer eq ''; # no referer is always OK for wget/curl
 
 	# prevent deep-linking from other domains on some browsers (Firefox)
-	# n.b.: $ctx->{-inbox}->base_url($env) with INBOX_URL won't work
+	# n.b.: $ctx->{ibx}->base_url($env) with INBOX_URL won't work
 	# with dillo, we can only match "$url_scheme://$HTTP_HOST/" without
 	# path components
 	my $base_url = $env->{'psgi.url_scheme'} . '://' .
@@ -88,15 +88,15 @@ sub get_attach ($$$) {
 	$ctx->{idx} = $idx;
 	bless $ctx, __PACKAGE__;
 	my $eml;
-	if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) {
+	if ($ctx->{smsg} = $ctx->{ibx}->smsg_by_mid($ctx->{mid})) {
 		return sub { # public-inbox-httpd-only
 			$ctx->{wcb} = $_[0];
 			scan_attach($ctx);
 		} if $ctx->{env}->{'pi-httpd.async'};
 		# generic PSGI:
-		$eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg});
-	} elsif (!$ctx->{-inbox}->over) {
-		if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) {
+		$eml = $ctx->{ibx}->smsg_eml($ctx->{smsg});
+	} elsif (!$ctx->{ibx}->over) {
+		if (my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid})) {
 			$eml = PublicInbox::Eml->new($bref);
 		}
 	}
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index bda2761c..4b3f1674 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -44,7 +44,7 @@ sub url_regexp {
 	my ($ctx, $key, $default) = @_;
 	$key //= 'publicInbox.wwwListing';
 	$default //= '404';
-	my $v = $ctx->{www}->{pi_config}->{lc $key} // $default;
+	my $v = $ctx->{www}->{pi_cfg}->{lc $key} // $default;
 again:
 	if ($v eq 'match=domain') {
 		my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME};
@@ -69,8 +69,11 @@ sub hide_key { 'www' }
 sub response {
 	my ($class, $ctx) = @_;
 	bless $ctx, $class;
+	if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) {
+		$ALL->misc->reopen;
+	}
 	my $re = $ctx->url_regexp or return $ctx->psgi_triple;
-	my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_config},
+	my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg},
 						\&list_match_i, $re, $ctx);
 	sub {
 		$ctx->{-wcb} = $_[0]; # HTTP server callback
diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm
index 638f4e27..958251a3 100644
--- a/lib/PublicInbox/WwwStream.pm
+++ b/lib/PublicInbox/WwwStream.pm
@@ -12,11 +12,12 @@ our @EXPORT_OK = qw(html_oneshot);
 use bytes (); # length
 use PublicInbox::Hval qw(ascii_html prurl ts2str);
 our $TOR_URL = 'https://www.torproject.org/';
-our $CODE_URL = 'https://public-inbox.org/public-inbox.git';
+our $CODE_URL = [ qw(http://ou63pmih66umazou.onion/public-inbox.git
+	https://public-inbox.org/public-inbox.git) ];
 
 sub base_url ($) {
 	my $ctx = shift;
-	my $base_url = $ctx->{-inbox}->base_url($ctx->{env});
+	my $base_url = $ctx->{ibx}->base_url($ctx->{env});
 	chop $base_url; # no trailing slash for clone
 	$base_url;
 }
@@ -35,7 +36,7 @@ sub async_eml { # for async_blob_cb
 
 sub html_top ($) {
 	my ($ctx) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $desc = ascii_html($ibx->description);
 	my $title = delete($ctx->{-title_html}) // $desc;
 	my $upfx = $ctx->{-upfx} || '';
@@ -54,7 +55,7 @@ sub html_top ($) {
 			qq(color / ).
 			qq(mirror / ).
 			qq(Atom feed);
-	if ($ibx->search) {
+	if ($ibx->isrch) {
 		my $q_val = delete($ctx->{-q_value_html}) // '';
 		$q_val = qq(\nvalue="$q_val") if $q_val ne '';
 		# XXX gross, for SearchView.pm
@@ -78,22 +79,24 @@ sub html_top ($) {
 
 sub coderepos ($) {
 	my ($ctx) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $cr = $ctx->{ibx}->{coderepo} // return ();
+	my $cfg = $ctx->{www}->{pi_cfg};
+	my $upfx = ($ctx->{-upfx} // ''). '../';
 	my @ret;
-	if (defined(my $cr = $ibx->{coderepo})) {
-		my $cfg = $ctx->{www}->{pi_config};
-		my $env = $ctx->{env};
-		for my $cr_name (@$cr) {
-			my $urls = $cfg->{"coderepo.$cr_name.cgiturl"};
-			if ($urls) {
-				$ret[0] //= <{"coderepo.$cr_name.cgiturl"} // next;
+		$ret[0] //= <{env}, $u));
+			$ret[0] .= qq(\n\t$u);
 		}
 	}
-	@ret; # may be empty
+	@ret; # may be empty, this sub is called as an arg for join()
 }
 
 sub code_footer ($) {
@@ -109,7 +112,7 @@ sub _html_end {
 id=mirror>This inbox may be cloned and mirrored by anyone:
 EOF
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $desc = ascii_html($ibx->description);
 
 	my @urls;
@@ -143,10 +146,10 @@ EOF
 	}
 
 	$urls .= "\n" . join('', map { "\tgit clone --mirror $_\n" } @urls);
-	my $addrs = $ibx->{address};
-	$addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
-	my $v = defined $max ? '-V2' : '-V1';
-	$urls .= <{address}) {
+		$addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
+		my $v = defined $max ? '-V2' : '-V1';
+		$urls .= <{-upfx} // '').'_/text/config/raw';
 	$urls .= <{cb} or return;
 	while (defined(my $x = $cb->($ctx))) { # x = smsg or scalar non-ref
 		if (ref($x)) { # smsg
-			my $eml = $ctx->{-inbox}->smsg_eml($x) or next;
+			my $eml = $ctx->{ibx}->smsg_eml($x) or next;
 			$ctx->{smsg} = $x;
 			return $ctx->translate($cb->($ctx, $eml));
 		} else { # scalar
diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm
index 04c9b1c4..a8560916 100644
--- a/lib/PublicInbox/WwwText.pm
+++ b/lib/PublicInbox/WwwText.pm
@@ -49,7 +49,7 @@ sub get_text {
 
 	# enforce trailing slash for "wget -r" compatibility
 	if (!$have_tslash && $code == 200) {
-		my $url = $ctx->{-inbox}->base_url($env);
+		my $url = $ctx->{ibx}->base_url($env);
 		$url .= "_/text/$key/";
 
 		return [ 302, [ 'Content-Type', 'text/plain',
@@ -100,7 +100,7 @@ sub _srch_prefix ($$) {
 
 sub _colors_help ($$) {
 	my ($ctx, $txt) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $env = $ctx->{env};
 	my $base_url = $ibx->base_url($env);
 	$$txt .= "color customization for $base_url\n";
@@ -135,7 +135,7 @@ sub URI_PATH () { '^A-Za-z0-9\-\._~/' }
 # n.b. this is a perfect candidate for memoization
 sub inbox_config ($$$) {
 	my ($ctx, $hdr, $txt) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	push @$hdr, 'Content-Disposition', 'inline; filename=inbox.config';
 	my $name = dq_escape($ibx->{name});
 	my $inboxdir = '/path/to/top-level-inbox';
@@ -189,9 +189,9 @@ EOF
 ; line number ranges in `[PATCH]' emails link to /$INBOX_NAME/$OID/s/,
 ; an HTTP endpoint which reconstructs git blobs via git-apply(1).
 EOF
-		my $pi_config = $ctx->{www}->{pi_config};
+		my $pi_cfg = $ctx->{www}->{pi_cfg};
 		for my $cr_name (@$cr) {
-			my $urls = $pi_config->{"coderepo.$cr_name.cgiturl"};
+			my $urls = $pi_cfg->{"coderepo.$cr_name.cgiturl"};
 			my $path = "/path/to/$cr_name";
 			$cr_name = dq_escape($cr_name);
 
@@ -221,7 +221,7 @@ sub _default_text ($$$$) {
 	return inbox_config($ctx, $hdr, $txt) if $key eq 'config';
 	return if $key ne 'help'; # TODO more keys?
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $base_url = $ibx->base_url($ctx->{env});
 	$$txt .= "public-inbox help for $base_url\n";
 	$$txt .= <search;
+	my $srch = $ibx->isrch;
 	if ($srch) {
 		$$txt .= <{reindex}->{from} = $lc;
 		}
 	} else { # v2
-		my $max;
-		$im->git_dir_latest(\$max) or return;
+		my $max = $ibx->max_git_epoch // return;
 		my $from = $opt->{reindex}->{from};
 		my $mm = $ibx->mm;
 		my $v = PublicInbox::Search::SCHEMA_VERSION();
@@ -271,7 +270,6 @@ sub run {
 
 	local %SIG = %SIG;
 	setup_signals();
-	$ibx->umask_prepare;
 	$ibx->with_umask(\&_run, $ibx, $cb, $opt);
 }
 
diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h
new file mode 100644
index 00000000..800c6bad
--- /dev/null
+++ b/lib/PublicInbox/gcf2_libgit2.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2020 all contributors <meta@public-inbox.org>
+ * License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+ *
+ * libgit2 for Inline::C
+ * Avoiding Git::Raw since it doesn't guarantee a stable API,
+ * while libgit2 itself seems reasonably stable.
+ */
+#include <git2.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <poll.h>
+
+static void croak_if_err(int rc, const char *msg)
+{ /* croak() with rc, msg and the last libgit2 error unless rc is GIT_OK */
+	if (rc != GIT_OK) {
+		const git_error *e = giterr_last(); /* NULL-checked below */
+
+		croak("%d %s (%s)", rc, msg, e ? e->message : "unknown");
+	}
+}
+
+SV *new()
+{ /* returns a new git_odb handle blessed into PublicInbox::Gcf2 */
+	git_odb *odb;
+	SV *ref, *self;
+	int rc = git_odb_new(&odb);
+	croak_if_err(rc, "git_odb_new"); /* croaks unless rc is GIT_OK */
+
+	ref = newSViv((IV)odb); /* the odb pointer is kept as an integer SV */
+	self = newRV_noinc(ref);
+	sv_bless(self, gv_stashpv("PublicInbox::Gcf2", GV_ADD));
+	SvREADONLY_on(ref); /* odb_ptr() reads this value back as a pointer */
+
+	return self;
+}
+
+static git_odb *odb_ptr(SV *self)
+{ /* inverse of the (IV) cast in new(): integer SV back to git_odb pointer */
+	return (git_odb *)SvIV(SvRV(self));
+}
+
+void DESTROY(SV *self)
+{ /* releases the git_odb referenced by self */
+	git_odb_free(odb_ptr(self));
+}
+
+/* needs "$GIT_DIR/objects", not $GIT_DIR */
+void add_alternate(SV *self, const char *objects_path)
+{ /* adds objects_path as a disk alternate of self's odb; croaks on error */
+	int rc = git_odb_add_disk_alternate(odb_ptr(self), objects_path);
+	croak_if_err(rc, "git_odb_add_disk_alternate");
+}
+
+#define CAPA(v) (sizeof(v) / sizeof((v)[0]))
+
+/*
+ * returns true on success, false on failure; croaks on writev errors
+ * this requires an unabbreviated git OID
+ */
+int cat_oid(SV *self, int fd, SV *oidsv)
+{ /* writes "$OID $TYPE $SIZE\n", the object data and "\n" to fd */
+	/*
+	 * adjust when libgit2 gets SHA-256 support, we return the
+	 * same header as git-cat-file --batch "$OID $TYPE $SIZE\n"
+	 */
+	char hdr[GIT_OID_HEXSZ + sizeof(" commit 18446744073709551615")]; /* NOTE(review): a 20-digit size leaves no room for "\n" -- confirm unreachable */
+	struct iovec vec[3]; /* [0] = header, [1] = object data, [2] = "\n" */
+	size_t nvec = CAPA(vec); /* number of iovecs still to be written */
+	git_oid oid;
+	git_odb_object *object = NULL;
+	int rc, err = 0;
+	STRLEN oidlen;
+	char *oidptr = SvPV(oidsv, oidlen);
+
+	/* same trailer as git-cat-file --batch */
+	vec[2].iov_len = 1;
+	vec[2].iov_base = "\n";
+
+	rc = git_oid_fromstrn(&oid, oidptr, oidlen);
+	if (rc == GIT_OK)
+		rc = git_odb_read(&object, odb_ptr(self), &oid);
+	if (rc == GIT_OK) {
+		vec[0].iov_base = hdr;
+		vec[1].iov_base = (void *)git_odb_object_data(object);
+		vec[1].iov_len = git_odb_object_size(object);
+
+		git_oid_nfmt(hdr, GIT_OID_HEXSZ, git_odb_object_id(object));
+		vec[0].iov_len = GIT_OID_HEXSZ + /* hex OID plus formatted " $TYPE $SIZE\n" */
+				snprintf(hdr + GIT_OID_HEXSZ,
+					sizeof(hdr) - GIT_OID_HEXSZ,
+					" %s %zu\n",
+					git_object_type2string(
+						git_odb_object_type(object)),
+					vec[1].iov_len);
+	} else { /* caller retries */
+		nvec = 0; /* skips the write loop below entirely */
+	}
+	while (nvec && !err) {
+		ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec); /* resume at the first unfinished iovec */
+
+		if (w > 0) {
+			size_t done = 0; /* iovecs completed by this writev */
+			size_t i;
+
+			for (i = CAPA(vec) - nvec; i < CAPA(vec); i++) {
+				if (w >= vec[i].iov_len) {
+					/* fully written vec */
+					w -= vec[i].iov_len;
+					done++;
+				} else { /* partially written vec */
+					char *p = vec[i].iov_base;
+					vec[i].iov_base = p + w; /* advance past the bytes already written */
+					vec[i].iov_len -= w;
+					break;
+				}
+			}
+			nvec -= done;
+		} else if (w < 0) {
+			err = errno;
+			switch (err) {
+			case EAGAIN: {
+				struct pollfd pfd;
+				pfd.events = POLLOUT;
+				pfd.fd = fd;
+				poll(&pfd, 1, -1); /* block (no timeout) until fd is writable; result is not checked */
+			}
+				/* fall-through */
+			case EINTR:
+				err = 0; /* EAGAIN and EINTR both retry the writev */
+			}
+		} else { /* w == 0 */
+			err = ENOSPC; /* a zero-byte write is reported as ENOSPC */
+		}
+	}
+	if (object)
+		git_odb_object_free(object);
+	if (err)
+		croak("writev error: %s", strerror(err));
+
+	return rc == GIT_OK; /* false when the OID was not parsed or the object not read */
+}
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index b61c743f..e6ee6529 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -47,34 +47,21 @@ die $help if (scalar(@ARGV) || $new_dir eq '' || $old_dir eq '');
 die "$new_dir exists\n" if -d $new_dir;
 die "$old_dir not a directory\n" unless -d $old_dir;
 
-require Cwd;
-Cwd->import('abs_path');
+require PublicInbox::Admin;
 require PublicInbox::Config;
 require PublicInbox::InboxWritable;
 
-my $abs = abs_path($old_dir);
-die "failed to resolve $old_dir: $!\n" if (!defined($abs));
-
 my $cfg = PublicInbox::Config->new;
-my $old;
-$cfg->each_inbox(sub {
-	$old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir;
-});
-if ($old) {
-	$old = PublicInbox::InboxWritable->new($old);
-} else {
+my @old = PublicInbox::Admin::resolve_inboxes([$old_dir], undef, $cfg);
+@old > 1 and die "BUG: resolved several inboxes from $old_dir:\n",
+		map { "\t$_->{inboxdir}\n" } @old;
+my $old = PublicInbox::InboxWritable->new($old[0]);
+if (delete $old->{-unconfigured}) {
 	warn "W: $old_dir not configured in " .
 		PublicInbox::Config::default_file() . "\n";
-	$old = PublicInbox::InboxWritable->new({
-		inboxdir => $old_dir,
-		name => 'ignored',
-		-primary_address => 'old@example.com',
-		address => [ 'old@example.com' ],
-	});
 }
 die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2;
 
-require File::Spec;
 require PublicInbox::Admin;
 my $detected = PublicInbox::Admin::detect_indexlevel($old);
 $old->{indexlevel} //= $detected;
@@ -88,12 +75,11 @@ if ($opt->{'index'}) {
 }
 local %ENV = (%$env, %ENV) if $env;
 my $new = { %$old };
-$new->{inboxdir} = File::Spec->canonpath($new_dir);
+$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir);
 $new->{version} = 2;
 $new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} });
 $new->{-no_fsync} = 1 if !$opt->{fsync};
 my $v2w;
-$old->umask_prepare;
 
 sub link_or_copy ($$) {
 	my ($src, $dst) = @_;
diff --git a/script/public-inbox-edit b/script/public-inbox-edit
index a70614fc..81f023bc 100755
--- a/script/public-inbox-edit
+++ b/script/public-inbox-edit
@@ -183,7 +183,8 @@ retry_edit:
 	# rename/relink $edit_fn
 	open my $new_fh, '<', $edit_fn or
 		die "can't read edited file ($edit_fn): $!\n";
-	my $new_raw = do { local $/; <$new_fh> };
+	defined(my $new_raw = do { local $/; <$new_fh> }) or die
+		"read $edit_fn: $!\n";
 
 	if (!$opt->{raw}) {
 		# get rid of the From we added
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
new file mode 100644
index 00000000..5f27988f
--- /dev/null
+++ b/script/public-inbox-extindex
@@ -0,0 +1,81 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# Basic tool to create or update an external (multi-inbox) Xapian search index.
+use strict;
+use v5.10.1;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = < -1, compact => 0, fsync => 1, scan => 1 };
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
+		fsync|sync!
+		indexlevel|index-level|L=s max_size|max-size=s
+		batch_size|batch-size=s
+		gc commit-interval=i watch scan!
+		all help|h))
+	or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+my $cfg = PublicInbox::Config->new;
+my $eidx_dir = shift(@ARGV);
+unless (defined $eidx_dir) {
+	if ($opt->{all} && $cfg->ALL) {
+		$eidx_dir = $cfg->ALL->{topdir};
+	} else {
+		die "E: $help";
+	}
+}
+my @ibxs;
+if ($opt->{gc}) {
+	die "E: inbox paths must not be specified with --gc\n" if @ARGV;
+	die "E: --all not compatible with --gc\n" if $opt->{all};
+	die "E: --watch is not compatible with --gc\n" if $opt->{watch};
+} else {
+	@ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+}
+PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env;
+require PublicInbox::ExtSearchIdx;
+my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
+if ($opt->{gc}) {
+	$eidx->attach_config($cfg);
+	$eidx->eidx_gc($opt);
+} else {
+	if ($opt->{all}) {
+		$eidx->attach_config($cfg);
+	} else {
+		$eidx->attach_inbox($_) for @ibxs;
+	}
+	if ($opt->{watch}) {
+		$cfg = undef; # save memory only after SIGHUP
+		$eidx->eidx_watch($opt);
+	} else {
+		$eidx->eidx_sync($opt);
+	}
+}
diff --git a/script/public-inbox-httpd b/script/public-inbox-httpd
index b8159f3a..3befdab8 100755
--- a/script/public-inbox-httpd
+++ b/script/public-inbox-httpd
@@ -13,6 +13,7 @@ BEGIN {
 	require PublicInbox::HTTP;
 	require PublicInbox::HTTPD;
 }
+
 my %httpds;
 my $app;
 my $refresh = sub {
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 5dad6ecb..0fdfddc0 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -11,12 +11,13 @@ use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $help = < -1, compact => 0, max_size => undef, fsync => 1 };
+my $opt = {
+	quiet => -1, compact => 0, max_size => undef, fsync => 1,
+	'update-extindex' => [], # ":s@" optional arg sets '' if no arg given
+};
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
 		fsync|sync! xapian_only|xapian-only
 		indexlevel|index-level|L=s max_size|max-size=s
 		batch_size|batch-size=s
 		sequential_shard|seq-shard|sequential-shard
-		skip-docdata all help|h))
+		no-update-extindex update-extindex|E=s@
+		fast-noop|F skip-docdata all help|h))
 	or die $help;
 if ($opt->{help}) { print $help; exit 0 };
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
 if ($opt->{xapian_only} && !$opt->{reindex}) {
 	die "--xapian-only requires --reindex\n";
 }
+if ($opt->{reindex} && delete($opt->{'fast-noop'})) {
+	warn "--fast-noop ignored with --reindex\n";
+}
 
 # require lazily to speed up --help
 require PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-index');
 
 my $cfg = PublicInbox::Config->new; # Config is loaded by Admin
+$opt->{-use_cwd} = 1;
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR $help; exit 1 }
 
+my (@eidx, %eidx_seen);
+my $update_extindex = $opt->{'update-extindex'};
+if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) {
+	# extindex and normal inboxes may have different owners
+	push(@$update_extindex, 'all') if -w $ALL->{topdir};
+}
+@$update_extindex = () if $opt->{'no-update-extindex'};
+if (scalar @$update_extindex) {
+	PublicInbox::Admin::require_or_die('-search');
+	require PublicInbox::ExtSearchIdx;
+}
+for my $ei_name (@$update_extindex) {
+	my $es = $cfg->lookup_ei($ei_name);
+	my $topdir;
+	if (!$es && -d $ei_name) { # allow dirname or config section name
+		$topdir = $ei_name;
+	} elsif ($es) {
+		$topdir = $es->{topdir};
+	} else {
+		die "extindex `$ei_name' not configured or found\n";
+	}
+	my $o = { %$opt };
+	delete $o->{indexlevel} if ($o->{indexlevel}//'') eq 'basic';
+	$eidx_seen{$topdir} //=
+		push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $o));
+}
 my $mods = {};
+my @eidx_unconfigured;
 foreach my $ibx (@ibxs) {
 	# detect_indexlevel may also set $ibx->{-skip_docdata}
 	my $detected = PublicInbox::Admin::detect_indexlevel($ibx);
@@ -62,7 +98,14 @@ foreach my $ibx (@ibxs) {
 	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
 			'full' : $detected);
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
+	if (@eidx && $ibx->{-unconfigured}) {
+		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
+	}
 }
+warn <{compact} = 0 if !$mods->{'Search::Xapian'};
@@ -88,9 +131,21 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean
 EOL
 		$ibx_opt = { %$opt, sequential_shard => $v };
 	}
-	PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	last if $ibx_opt->{quit};
 	if (my $copt = $opt->{compact_opt}) {
 		local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard};
 		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
 	}
+	last if $ibx_opt->{quit};
+	next if $ibx->{-unconfigured} || !$nidx;
+	for my $eidx (@eidx) {
+		$eidx->attach_inbox($ibx);
+	}
+}
+my $pr = $opt->{-progress};
+for my $eidx (@eidx) {
+	$pr->("indexing $eidx->{topdir} ...\n") if $pr;
+	$eidx->eidx_sync($opt);
+	last if $opt->{quit};
 }
diff --git a/script/public-inbox-init b/script/public-inbox-init
index c775eb31..7ac77830 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -100,11 +100,7 @@ if (-e $pi_config) {
 	defined $perm or die "(f)stat failed on $pi_config: $!\n";
 	chmod($perm & 07777, $fh) or
 		die "(f)chmod failed on future $pi_config: $!\n";
-	my $old;
-	{
-		local $/;
-		$old = <$oh>;
-	}
+	defined(my $old = do { local $/; <$oh> }) or die "read $pi_config: $!\n";
 	print $fh $old or die "failed to write: $!\n";
 	close $oh or die "failed to close $pi_config: $!\n";
 
@@ -138,10 +134,9 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n";
 my $pfx = "publicinbox.$name";
 my @x = (qw/git config/, "--file=$pi_config_tmp");
 
-require File::Spec;
-$inboxdir = File::Spec->canonpath($inboxdir);
+$inboxdir = PublicInbox::Config::rel2abs_collapsed($inboxdir);
+die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0;
 
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
 if (-f "$inboxdir/inbox.lock") {
 	if (!defined $version) {
 		$version = 2;
@@ -186,11 +181,6 @@ if ($skip_docdata) {
 	$ibx->{-skip_docdata} = $skip_docdata;
 }
 $ibx->init_inbox(0, $skip_epoch, $skip_artnum);
-require Cwd;
-my $tmp = Cwd::abs_path($inboxdir);
-defined($tmp) or die "failed to resolve $inboxdir: $!\n";
-$inboxdir = $tmp;
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
 
 # needed for git prior to v2.1.0
 umask(0077) if defined $perm;
diff --git a/script/public-inbox-learn b/script/public-inbox-learn
index fb2d86ec..1731a4ba 100755
--- a/script/public-inbox-learn
+++ b/script/public-inbox-learn
@@ -36,11 +36,10 @@ if ($train !~ /\A(?:ham|spam|rm)\z/) {
 die "--all only works with `rm'\n" if $opt{all} && $train ne 'rm';
 
 my $spamc = PublicInbox::Spamcheck::Spamc->new;
-my $pi_config = PublicInbox::Config->new;
+my $pi_cfg = PublicInbox::Config->new;
 my $err;
 my $mime = PublicInbox::Eml->new(do{
-	local $/;
-	my $data = <STDIN>;
+	defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
 	$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 
 	if ($train ne 'rm') {
@@ -87,7 +86,7 @@ sub remove_or_add ($$$$) {
 
 # spam is removed from all known inboxes since it is often Bcc:-ed
 if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
-	$pi_config->each_inbox(sub {
+	$pi_cfg->each_inbox(sub {
 		my ($ibx) = @_;
 		$ibx = PublicInbox::InboxWritable->new($ibx);
 		my $im = $ibx->importer(0);
@@ -102,7 +101,7 @@ if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
 	for ($mime->header('Cc'), $mime->header('To')) {
 		foreach my $addr (PublicInbox::Address::emails($_)) {
 			$addr = lc($addr);
-			$dests{$addr} //= $pi_config->lookup($addr) // 0;
+			$dests{$addr} //= $pi_cfg->lookup($addr) // 0;
 		}
 	}
 
@@ -113,7 +112,7 @@ if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
 		next if $seen{"$ibx"}++;
 		remove_or_add($ibx, $train, $mime, $addr);
 	}
-	my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime);
+	my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_cfg, $mime);
 	for my $ibx (@$dests) {
 		next if $seen{"$ibx"}++;
 		remove_or_add($ibx, $train, $mime, $ibx->{-primary_address});
diff --git a/script/public-inbox-mda b/script/public-inbox-mda
index 3ed5abb6..40963f8e 100755
--- a/script/public-inbox-mda
+++ b/script/public-inbox-mda
@@ -42,18 +42,18 @@ my $str = do { local $/; <STDIN> };
 $str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 $ems->prepare(\$str);
 my $eml = PublicInbox::Eml->new(\$str);
-my $config = PublicInbox::Config->new;
+my $cfg = PublicInbox::Config->new;
 my $key = 'publicinboxmda.spamcheck';
 my $default = 'PublicInbox::Spamcheck::Spamc';
-my $spamc = PublicInbox::Spamcheck::get($config, $key, $default);
+my $spamc = PublicInbox::Spamcheck::get($cfg, $key, $default);
 my $dests = [];
 my $recipient = $ENV{ORIGINAL_RECIPIENT};
 if (defined $recipient) {
-	my $ibx = $config->lookup($recipient); # first check
+	my $ibx = $cfg->lookup($recipient); # first check
 	push @$dests, $ibx if $ibx;
 }
 if (!scalar(@$dests)) {
-	$dests = PublicInbox::MDA->inboxes_for_list_id($config, $eml);
+	$dests = PublicInbox::MDA->inboxes_for_list_id($cfg, $eml);
 	if (!scalar(@$dests) && !defined($recipient)) {
 		die "ORIGINAL_RECIPIENT not defined in ENV\n";
 	}
diff --git a/script/public-inbox-purge b/script/public-inbox-purge
index 7bca11ea..52f1f18a 100755
--- a/script/public-inbox-purge
+++ b/script/public-inbox-purge
@@ -32,7 +32,7 @@ if ($opt->{help}) { print $help; exit 0 };
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt);
 PublicInbox::AdminEdit::check_editable(\@ibxs);
 
-my $data = do { local $/; <STDIN> };
+defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
 $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 my $n_purged = 0;
 
diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool
index bdcc605c..0acffc1f 100755
--- a/scripts/import_slrnspool
+++ b/scripts/import_slrnspool
@@ -22,8 +22,8 @@ $SIG{TERM} = $sighandler;
 my $spool = shift @ARGV or die usage();
 my $recipient = $ENV{ORIGINAL_RECIPIENT};
 defined $recipient or die usage();
-my $config = PublicInbox::Config->new;
-my $ibx = $config->lookup($recipient);
+my $cfg = PublicInbox::Config->new;
+my $ibx = $cfg->lookup($recipient);
 my $git = $ibx->git;
 my $im;
 if ($ibx->version == 2) {
diff --git a/t/admin.t b/t/admin.t
index c25667b2..60c6037d 100644
--- a/t/admin.t
+++ b/t/admin.t
@@ -5,24 +5,25 @@ use warnings;
 use Test::More;
 use PublicInbox::TestCommon;
 use PublicInbox::Import;
-use_ok 'PublicInbox::Admin', qw(resolve_repo_dir);
+use_ok 'PublicInbox::Admin';
 my ($tmpdir, $for_destroy) = tmpdir();
 my $git_dir = "$tmpdir/v1";
 my $v2_dir = "$tmpdir/v2";
 my ($res, $err, $v);
 
 PublicInbox::Import::init_bare($git_dir);
+*resolve_inboxdir = \&PublicInbox::Admin::resolve_inboxdir;
 
 # v1
-is(resolve_repo_dir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
-is(resolve_repo_dir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
+is(resolve_inboxdir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
+is(resolve_inboxdir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
 
 ok(chdir($git_dir), 'chdir GIT_DIR works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
 
 ok(chdir("$git_dir/objects"), 'chdir GIT_DIR/objects works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
-$res = resolve_repo_dir(undef, \$v);
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
+$res = resolve_inboxdir(undef, \$v);
 is($v, 1, 'version 1 detected');
 is($res, $git_dir, 'detects directory along with version');
 
@@ -36,13 +37,13 @@ SKIP: {
 
 	ok(chdir($no_vcs_dir), 'chdir to a non-inbox');
 	open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
-	$res = eval { resolve_repo_dir() };
+	$res = eval { resolve_inboxdir() };
 	open STDERR, '>&', $olderr or die "restore stderr: $!";
 	is($res, undef, 'fails inside non-version-controlled dir');
 
 	ok(chdir($tmpdir), 'back to test-specific $tmpdir');
 	open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
-	$res = eval { resolve_repo_dir($no_vcs_dir) };
+	$res = eval { resolve_inboxdir($no_vcs_dir) };
 	$err = $@;
 	open STDERR, '>&', $olderr or die "restore stderr: $!";
 	is($res, undef, 'fails on non-version-controlled dir');
@@ -66,18 +67,25 @@ SKIP: {
 	PublicInbox::V2Writable->new($ibx, 1)->idx_init;
 
 	ok(-e "$v2_dir/inbox.lock", 'exists');
-	is(resolve_repo_dir($v2_dir), $v2_dir,
-		'resolve_repo_dir works on v2_dir');
-	ok(chdir($v2_dir), 'chdir v2_dir OK');
-	is(resolve_repo_dir(), $v2_dir, 'resolve_repo_dir works inside v2_dir');
-	$res = resolve_repo_dir(undef, \$v);
+	is(resolve_inboxdir($v2_dir), $v2_dir,
+		'resolve_inboxdir works on v2_dir');
+	chdir($v2_dir) or BAIL_OUT "chdir v2_dir: $!";
+	is(resolve_inboxdir(), $v2_dir, 'resolve_inboxdir works inside v2_dir');
+	$res = resolve_inboxdir(undef, \$v);
 	is($v, 2, 'version 2 detected');
 	is($res, $v2_dir, 'detects directory along with version');
 
 	# TODO: should work from inside Xapian dirs, and git dirs, here...
+	PublicInbox::Import::init_bare("$v2_dir/git/0.git");
+	my $objdir = "$v2_dir/git/0.git/objects";
+	is($v2_dir, resolve_inboxdir($objdir, \$v), 'at $objdir');
+	is($v, 2, 'version 2 detected at $objdir');
+	chdir($objdir) or BAIL_OUT "chdir objdir: $!";
+	is(resolve_inboxdir(undef, \$v), $v2_dir, 'inside $objdir');
+	is($v, 2, 'version 2 detected inside $objdir');
 }
 
-chdir '/';
+chdir '/' or BAIL_OUT "chdir: $!";
 
 my @pairs = (
 	'1g' => 1024 ** 3,
diff --git a/t/config.t b/t/config.t
index 204fc790..7fb44acc 100644
--- a/t/config.t
+++ b/t/config.t
@@ -41,7 +41,6 @@ my ($tmpdir, $for_destroy) = tmpdir();
 		'url' => [ 'http://example.com/meta' ],
 		-primary_address => 'meta@public-inbox.org',
 		'name' => 'meta',
-		feedmax => 25,
 		-httpbackend_limiter => undef,
 		nntpserver => undef,
 	}, "lookup matches expected output");
@@ -58,7 +57,6 @@ my ($tmpdir, $for_destroy) = tmpdir();
 		'inboxdir' => '/home/pi/test-main.git',
 		'domain' => 'public-inbox.org',
 		'name' => 'test',
-		feedmax => 25,
 		'url' => [ 'http://example.com/test' ],
 		-httpbackend_limiter => undef,
 		nntpserver => undef,
diff --git a/t/ds-poll.t b/t/ds-poll.t
index 3771059b..0ee57b69 100644
--- a/t/ds-poll.t
+++ b/t/ds-poll.t
@@ -16,35 +16,35 @@ pipe($r, $w) or die;
 pipe($x, $y) or die;
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($r), EPOLLIN), 0, 'add EPOLLIN');
 my $events = [];
-my $n = $p->epoll_wait(9, 0, $events);
+$p->epoll_wait(9, 0, $events);
 is_deeply($events, [], 'no events set');
-is($n, 0, 'nothing ready, yet');
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($w), EPOLLOUT|EPOLLONESHOT), 0,
 	'add EPOLLOUT|EPOLLONESHOT');
-$n = $p->epoll_wait(9, -1, $events);
-is($n, 1, 'got POLLOUT event');
-is($events->[0]->[0], fileno($w), '$w ready');
+$p->epoll_wait(9, -1, $events);
+is(scalar(@$events), 1, 'got POLLOUT event');
+is($events->[0], fileno($w), '$w ready');
 
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after oneshot');
+$p->epoll_wait(9, 0, $events);
+is(scalar(@$events), 0, 'nothing ready after oneshot');
 is_deeply($events, [], 'no events set after oneshot');
 
 syswrite($w, '1') == 1 or die;
 for my $t (0..1) {
-	$n = $p->epoll_wait(9, $t, $events);
-	is($events->[0]->[0], fileno($r), "level-trigger POLLIN ready #$t");
-	is($n, 1, "only event ready #$t");
+	$p->epoll_wait(9, $t, $events);
+	is($events->[0], fileno($r), "level-trigger POLLIN ready #$t");
+	is(scalar(@$events), 1, "only event ready #$t");
 }
 syswrite($y, '1') == 1 or die;
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($x), EPOLLIN|EPOLLONESHOT), 0,
 	'EPOLLIN|EPOLLONESHOT add');
-is($p->epoll_wait(9, -1, $events), 2, 'epoll_wait has 2 ready');
-my @fds = sort(map { $_->[0] } @$events);
+$p->epoll_wait(9, -1, $events);
+is(scalar @$events, 2, 'epoll_wait has 2 ready');
+my @fds = sort @$events;
 my @exp = sort((fileno($r), fileno($x)));
 is_deeply(\@fds, \@exp, 'got both ready FDs');
 
 is($p->epoll_ctl(EPOLL_CTL_DEL, fileno($r), 0), 0, 'EPOLL_CTL_DEL OK');
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after EPOLL_CTL_DEL');
+$p->epoll_wait(9, 0, $events);
+is(scalar @$events, 0, 'nothing ready after EPOLL_CTL_DEL');
 
 done_testing;
diff --git a/t/epoll.t b/t/epoll.t
index b47650e3..a1e73e07 100644
--- a/t/epoll.t
+++ b/t/epoll.t
@@ -12,11 +12,11 @@ is(epoll_ctl($epfd, EPOLL_CTL_ADD, fileno($w), EPOLLOUT), 0,
     'epoll_ctl socket EPOLLOUT');
 
 my @events;
-is(epoll_wait($epfd, 100, 10000, \@events), 1, 'epoll_wait returns');
+epoll_wait($epfd, 100, 10000, \@events);
 is(scalar(@events), 1, 'got one event');
-is($events[0]->[0], fileno($w), 'got expected FD');
-is($events[0]->[1], EPOLLOUT, 'got expected event');
+is($events[0], fileno($w), 'got expected FD');
 close $w;
-is(epoll_wait($epfd, 100, 0, \@events), 0, 'epoll_wait timeout');
+epoll_wait($epfd, 100, 0, \@events);
+is(@events, 0, 'epoll_wait timeout');
 
 done_testing;
diff --git a/t/extsearch.t b/t/extsearch.t
new file mode 100644
index 00000000..fb31b0ab
--- /dev/null
+++ b/t/extsearch.t
@@ -0,0 +1,370 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Config;
+use PublicInbox::Search;
+use PublicInbox::InboxWritable;
+use Fcntl qw(:seek);
+my $json = PublicInbox::Config::json() or plan skip_all => 'JSON missing';
+require_git(2.6);
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::ExtSearch';
+use_ok 'PublicInbox::ExtSearchIdx';
+use_ok 'PublicInbox::OverIdx';
+my $sock = tcp_server();
+my $host_port = $sock->sockhost . ':' . $sock->sockport;
+my ($home, $for_destroy) = tmpdir();
+local $ENV{HOME} = $home;
+mkdir "$home/.public-inbox" or BAIL_OUT $!;
+my $cfg_path = "$home/.public-inbox/config";
+open my $fh, '>', $cfg_path or BAIL_OUT $!;
+print $fh < $v2addr };
+my $eml = eml_load('t/utf8.eml');
+
+$eml->header_set('List-Id', '');
+open($fh, '+>', undef) or BAIL_OUT $!;
+$fh->autoflush(1);
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
+	'http://example.com/v1test', $v1addr ]), 'v1test init');
+
+$eml->header_set('List-Id', '');
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+truncate($fh, 0) or BAIL_OUT $!;
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+$env = { ORIGINAL_RECIPIENT => $v1addr };
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?";
+
+ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
+{
+	my $es = PublicInbox::ExtSearch->new("$home/extindex");
+	ok($es->has_threadid, '->has_threadid');
+}
+
+{ # TODO: -extindex should write this to config
+	open $fh, '>>', $cfg_path or BAIL_OUT $!;
+	print $fh <ALL
+[extindex "all"]
+	topdir = $home/extindex
+EOF
+	close $fh or BAIL_OUT $!;
+
+	my $pi_cfg = PublicInbox::Config->new;
+	$pi_cfg->fill_all;
+	ok($pi_cfg->ALL, '->ALL');
+	my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
+	my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
+	is_deeply($ret, { 'v1.example' => 1, 'v2.example' => 1 },
+		'->nntp_xref_for');
+}
+
+SKIP: {
+	require_mods(qw(Net::NNTP), 1);
+	my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
+	my $cmd = [ '-nntpd', '-W0', "--stdout=$out", "--stderr=$err" ];
+	my $td = start_script($cmd, undef, { 3 => $sock });
+	my $n = Net::NNTP->new($host_port);
+	my @xp = $n->xpath('');
+	is_deeply(\@xp, [ qw(v1.example/1 v2.example/1) ]);
+	$n->group('v1.example');
+	my $res = $n->head(1);
+	@$res = grep(/^Xref: /, @$res);
+	like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works');
+}
+
+my $es = PublicInbox::ExtSearch->new("$home/extindex");
+{
+	my $smsg = $es->over->get_art(1);
+	ok($smsg, 'got first article');
+	is($es->over->get_art(2), undef, 'only one added');
+	my $xref3 = $es->over->get_xref3(1);
+	like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1');
+	like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2');
+	is(scalar(@$xref3), 2, 'only to entries');
+}
+
+if ('inbox edited') {
+	my ($in, $out, $err);
+	$in = $out = $err = '';
+	my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+	my $env = { MAIL_EDITOR => "$^X -i -p -e 's/test message/BEST MSG/'" };
+	my $cmd = [ qw(-edit -Ft/utf8.eml), "$home/v2test" ];
+	ok(run_script($cmd, $env, $opt), '-edit');
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+		'extindex again');
+	like($err, qr/discontiguous range/, 'warned about discontiguous range');
+	my $msg1 = $es->over->get_art(1) or BAIL_OUT 'msg1 missing';
+	my $msg2 = $es->over->get_art(2) or BAIL_OUT 'msg2 missing';
+	is($msg1->{mid}, $msg2->{mid}, 'edited message indexed');
+	isnt($msg1->{blob}, $msg2->{blob}, 'blobs differ');
+	my $eml2 = $es->smsg_eml($msg2);
+	like($eml2->body, qr/BEST MSG/, 'edited body in #2');
+	unlike($eml2->body, qr/test message/, 'old body discarded in #2');
+	my $eml1 = $es->smsg_eml($msg1);
+	like($eml1->body, qr/test message/, 'original body in #1');
+	my $x1 = $es->over->get_xref3(1);
+	my $x2 = $es->over->get_xref3(2);
+	is(scalar(@$x1), 1, 'original only has one xref3');
+	is(scalar(@$x2), 1, 'new message has one xref3');
+	isnt($x1->[0], $x2->[0], 'xref3 differs');
+
+	my $mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 1, 'new message found');
+	$mset = $es->mset('b:"test message"');
+	is($mset->size, 1, 'old message found');
+	delete @$es{qw(git over xdb)}; # fork preparation
+
+	my $pi_cfg = PublicInbox::Config->new;
+	$pi_cfg->fill_all;
+	is(scalar($pi_cfg->ALL->mset('s:Testing')->items), 2,
+		'2 results in ->ALL');
+	my $res = {};
+	my $nr = 0;
+	$pi_cfg->each_inbox(sub {
+		$nr++;
+		my ($ibx) = @_;
+		local $SIG{__WARN__} = sub {}; # FIXME support --reindex
+		my $mset = $ibx->isrch->mset('s:Testing');
+		$res->{$ibx->eidx_key} = $ibx->isrch->mset_to_smsg($ibx, $mset);
+	});
+	is($nr, 2, 'two inboxes');
+	my $exp = {};
+	for my $v (qw(v1 v2)) {
+		my $ibx = $pi_cfg->lookup_newsgroup("$v.example");
+		my $smsg = $ibx->over->get_art(1);
+		$smsg->psgi_cull;
+		$exp->{"$v.example"} = [ $smsg ];
+	}
+	is_deeply($res, $exp, 'isearch limited results');
+	$pi_cfg = $res = $exp = undef;
+
+	open my $rmfh, '+>', undef or BAIL_OUT $!;
+	$rmfh->autoflush(1);
+	print $rmfh $eml2->as_string or BAIL_OUT $!;
+	seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
+	$opt->{0} = $rmfh;
+	ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
+
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
+		'extindex after rm');
+	is($es->over->get_art(2), undef, 'doc #2 gone');
+	$mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 0, 'new message gone');
+}
+
+my $misc = $es->misc;
+my @it = $misc->mset('')->items;
+is(scalar(@it), 2, 'two inboxes');
+like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
+like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
+
+my $cfg = PublicInbox::Config->new;
+my $schema_version = PublicInbox::Search::SCHEMA_VERSION();
+my $f = "$home/extindex/ei$schema_version/over.sqlite3";
+my $oidx = PublicInbox::OverIdx->new($f);
+if ('inject w/o indexing') {
+	use PublicInbox::Import;
+	my $v1ibx = $cfg->lookup_name('v1test');
+	my $last_v1_commit = $v1ibx->mm->last_commit;
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $last_v2_commit = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $git0 = PublicInbox::Git->new("$v2ibx->{inboxdir}/git/0.git");
+	chomp(my $cmt = $git0->qx(qw(rev-parse HEAD^0)));
+	is($last_v2_commit, $cmt, 'v2 index up-to-date');
+
+	my $v2im = PublicInbox::Import->new($git0, undef, undef, $v2ibx);
+	$v2im->{lock_path} = undef;
+	$v2im->{path_type} = 'v2';
+	$v2im->add(eml_load('t/mda-mime.eml'));
+	$v2im->done;
+	chomp(my $tip = $git0->qx(qw(rev-parse HEAD^0)));
+	isnt($tip, $cmt, '0.git v2 updated');
+
+	# inject a message w/o updating index
+	rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or
+		BAIL_OUT $!;
+	open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!;
+	run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or
+		BAIL_OUT '-mda';
+	rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or
+		BAIL_OUT $!;
+
+	my ($in, $out, $err);
+	$in = $out = $err = '';
+	my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+	ok(run_script([qw(-extindex -v -v --all), "$home/extindex"],
+		undef, undef), 'extindex noop');
+	$es->{xdb}->reopen;
+	my $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 0, 'did not attempt to index unindexed v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 0, 'did not attempt to index unindexed v2 message');
+	ok(run_script([qw(-index --all)]), 'indexed v1 and v2 inboxes');
+
+	isnt($v1ibx->mm->last_commit, $last_v1_commit, '-index v1 worked');
+	isnt($v2ibx->mm->last_commit_xap($schema_version, 0),
+		$last_v2_commit, '-index v2 worked');
+	ok(run_script([qw(-extindex --all), "$home/extindex"]),
+		'extindex updates');
+
+	$es->{xdb}->reopen;
+	$mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 1, 'got v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 1, 'got v2 message');
+}
+
+if ('reindex catches missed messages') {
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+	my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $eml = eml_load('t/data/0001.patch');
+	$im->add($eml);
+	$im->done;
+	my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	isnt($cmt_a, $cmt_b, 'v2 0.git HEAD updated');
+	$oidx->dbh;
+	my $uv = $v2ibx->uidvalidity;
+	my $lc_key = "lc-v2:v2.example//$uv;0";
+	is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a,
+		'update lc-v2 meta, old is as expected');
+	my $max = $oidx->max;
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex), "$home/extindex", $v2ibx->{inboxdir}]),
+		'-extindex noop');
+	is($oidx->max, $max, '->max unchanged');
+	is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 unchanged');
+	$oidx->dbh_close;
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --reindex), "$home/extindex",
+			$v2ibx->{inboxdir}], undef, $opt),
+			'--reindex for unseen');
+	is($oidx->max, $max + 1, '->max bumped');
+	is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 stays unchanged');
+	my @err = split(/^/, $err);
+	is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+	like($err[0], qr/I: reindex_unseen/, 'got reindex_unseen message');
+	my $new = $oidx->get_art($max + 1);
+	is($new->{subject}, $eml->header('Subject'), 'new message added');
+
+	$es->{xdb}->reopen;
+	my $mset = $es->mset("mid:$new->{mid}");
+	is($mset->size, 1, 'previously unseen, now indexed in Xapian');
+
+	ok($im->remove($eml), 'remove new message from v2 inbox');
+	$im->done;
+	my $cmt_c = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	is($oidx->eidx_meta($lc_key, $cmt_c), $cmt_b,
+		'bump lc-v2 meta again to skip v2 remove');
+	$err = '';
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex --reindex), "$home/extindex",
+			$v2ibx->{inboxdir}], undef, $opt),
+			'--reindex for stale');
+	@err = split(/^/, $err);
+	is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+	like($err[0], qr/\(#$new->{num}\): stale/, 'got stale message warning');
+	is($oidx->get_art($new->{num}), undef,
+		'stale message gone from over');
+	is_deeply($oidx->get_xref3($new->{num}), [],
+		'stale message has no xref3');
+	$es->{xdb}->reopen;
+	$mset = $es->mset("mid:$new->{mid}");
+	is($mset->size, 0, 'stale mid gone Xapian');
+}
+
+if ('reindex catches content bifurcation') {
+	use PublicInbox::MID qw(mids);
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+	my $eml = eml_load('t/data/message_embed.eml');
+	my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	$im->add($eml);
+	$im->done;
+	my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $uv = $v2ibx->uidvalidity;
+	my $lc_key = "lc-v2:v2.example//$uv;0";
+	$oidx->dbh;
+	is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a,
+		'update lc-v2 meta, old is as expected');
+	my $mid = mids($eml)->[0];
+	my $smsg = $v2ibx->over->next_by_mid($mid, \(my $id), \(my $prev));
+	my $oldmax = $oidx->max;
+	my $x3_orig = $oidx->get_xref3(3);
+	is(scalar(@$x3_orig), 1, '#3 has one xref');
+	$oidx->add_xref3(3, $smsg->{num}, $smsg->{blob}, 'v2.example');
+	my $x3 = $oidx->get_xref3(3);
+	is(scalar(@$x3), 2, 'injected xref3');
+	$oidx->commit_lazy;
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+		'extindex --all is noop');
+	is($err, '', 'no warnings in index');
+	$oidx->dbh;
+	is($oidx->max, $oldmax, 'oidx->max unchanged');
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
+		undef, $opt), 'extindex --reindex');
+	$oidx->dbh;
+	ok($oidx->max > $oldmax, 'oidx->max bumped');
+	like($err, qr/split into 2 due to deduplication change/,
+		'bifurcation noted');
+	my $added = $oidx->get_art($oidx->max);
+	is($added->{blob}, $smsg->{blob}, 'new blob indexed');
+	is_deeply(["v2.example:$smsg->{num}:$smsg->{blob}"],
+		$oidx->get_xref3($added->{num}),
+		'xref3 corrected for bifurcated message');
+	is_deeply($oidx->get_xref3(3), $x3_orig, 'xref3 restored for #3');
+}
+
+if ('--reindex --rethread') {
+	my $before = $oidx->dbh->selectrow_array(<<'');
+SELECT MAX(tid) FROM over WHERE num > 0
+
+	my $opt = {};
+	ok(run_script([qw(-extindex --reindex --rethread --all),
+			"$home/extindex"], undef, $opt),
+			'--rethread');
+	my $after = $oidx->dbh->selectrow_array(<<'');
+SELECT MIN(tid) FROM over WHERE num > 0
+
+	# actual rethread logic is identical to v1/v2 and tested elsewhere
+	ok($after > $before, '--rethread updates MIN(tid)');
+}
+
+if ('remove v1test and test gc') {
+	xsys([qw(git config --unset publicinbox.v1test.inboxdir)],
+		{ GIT_CONFIG => $cfg_path });
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
+		'extindex --gc');
+	like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
+	is(scalar(grep(!/^I:/, split(/^/m, $err))), 0,
+		'no non-informational messages');
+	$misc->{xdb}->reopen;
+	@it = $misc->mset('')->items;
+	is(scalar(@it), 1, 'only one inbox left');
+}
+
+done_testing;
diff --git a/t/feed.t b/t/feed.t
index 5ad90a07..9f6a987b 100644
--- a/t/feed.t
+++ b/t/feed.t
@@ -75,7 +75,7 @@ EOF
 {
 	# check initial feed
 	{
-		my $feed = string_feed({ -inbox => $ibx });
+		my $feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 3 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($feed);
@@ -109,7 +109,7 @@ EOF
 
 	# check spam shows up
 	{
-		my $spammy_feed = string_feed({ -inbox => $ibx });
+		my $spammy_feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($spammy_feed);
@@ -127,7 +127,7 @@ EOF
 
 	# spam no longer shows up
 	{
-		my $feed = string_feed({ -inbox => $ibx });
+		my $feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($feed);
diff --git a/t/filter_rubylang.t b/t/filter_rubylang.t
index e6c53f98..6d639c00 100644
--- a/t/filter_rubylang.t
+++ b/t/filter_rubylang.t
@@ -35,7 +35,7 @@ SKIP: {
 	];
 	my $ibx = PublicInbox::Inbox->new({ inboxdir => $git_dir,
 						altid => $altid });
-	$f = PublicInbox::Filter::RubyLang->new(-inbox => $ibx);
+	$f = PublicInbox::Filter::RubyLang->new(ibx => $ibx);
 	$msg = <<'EOF';
 X-Mail-Count: 12
 Message-ID: 
diff --git a/t/gcf2.t b/t/gcf2.t
new file mode 100644
index 00000000..35b2f113
--- /dev/null
+++ b/t/gcf2.t
@@ -0,0 +1,162 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Fcntl qw(:seek);
+use IO::Handle ();
+use POSIX qw(_exit);
+use Cwd qw(abs_path);
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2';
+use PublicInbox::Import;
+my ($tmpdir, $for_destroy) = tmpdir();
+
+my $gcf2 = PublicInbox::Gcf2::new();
+is(ref($gcf2), 'PublicInbox::Gcf2', '::new works');
+my $COPYING = 'dba13ed2ddf783ee8118c6a581dbf75305f816a3'; # OID requested via cat_oid below; expected to be a 34520-byte blob
+open my $agpl, '<', 'COPYING' or BAIL_OUT "AGPL-3 missing: $!";
+$agpl = do { local $/; <$agpl> }; # slurp: $agpl now holds the file contents, not the handle
+
+PublicInbox::Import::init_bare($tmpdir);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; # presumably becomes fd 0 (stdin) of the git command below
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr);
+is($?, 0, 'fast-import succeeded');
+$gcf2->add_alternate("$tmpdir/objects");
+
+{ # cat a known tree object into a pipe and check the header and payload length
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
+	$gcf2->cat_oid(fileno($w), $tree); # cat_oid takes a raw file descriptor number, not a handle
+	close $w;
+	is("$tree tree 30\n", <$r>, 'tree header ok'); # first line read is "<oid> <type> <size>\n"
+	$r = do { local $/; <$r> }; # slurp the rest; $r is reused to hold the data
+	is(chop($r), "\n", 'got trailing newline'); # chop removes and returns the final character
+	is(length($r), 30, 'tree length matches');
+}
+
+chomp(my $objdir = xqx([qw(git rev-parse --git-path objects)])); # object dir of the repository the test runs from
+if ($objdir =~ /\A--git-path\n/) { # git <2.5
+	chomp($objdir = xqx([qw(git rev-parse --git-dir)])); # fallback: derive it from --git-dir instead
+	$objdir .= '/objects';
+}
+if ($objdir && -d $objdir) {
+	$objdir = abs_path($objdir);
+	open my $alt, '>>', "$tmpdir/objects/info/alternates" or
+							BAIL_OUT $!;
+	print $alt $objdir, "\n" or BAIL_OUT $!;
+	close $alt or BAIL_OUT $!;
+
+	# calling gcf2->add_alternate on an already-added path won't
+	# cause alternates to be reloaded, so we do
+	# $gcf2->add_alternate($objdir) later on instead of
+	# $gcf2->add_alternate("$tmpdir/objects");
+	# $objdir = "$tmpdir/objects";
+} else {
+	$objdir = undef # undef marks "not in a git worktree"; tests needing $objdir are skipped
+}
+
+my $nr = $ENV{TEST_LEAK_NR}; # opt-in: iteration count for the leak checks; unset or 0 disables them
+my $cat = $ENV{TEST_LEAK_CAT} // 10; # cat_oid rounds per Gcf2 object in the leak loop, default 10
+diag "checking for leaks... (TEST_LEAK_NR=$nr TEST_LEAK_CAT=$cat)" if $nr;
+
+SKIP: {
+	skip 'not in git worktree', 21 unless defined($objdir); # NOTE(review): 21 looks lower than the assertions in this block (27 by count) - harmless with done_testing, but confirm
+	$gcf2->add_alternate($objdir);
+	eval { $gcf2->add_alternate($objdir) };
+	ok(!$@, 'no error adding alternate redundantly');
+	if ($nr) {
+		diag "adding alternate $nr times redundantly";
+		$gcf2->add_alternate($objdir) for (1..$nr);
+		diag 'done adding redundant alternates';
+	}
+
+	open my $fh, '+>', undef or BAIL_OUT "open: $!"; # undef filename: anonymous read/write temp file
+	$fh->autoflush(1);
+
+	ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails');
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	is(do { local $/; <$fh> }, '', 'nothing written');
+
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails');
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	is(do { local $/; <$fh> }, '', 'nothing written for z40');
+
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	my $ck_copying = sub { # rewinds $fh and runs 3 assertions: header line, trailing "\n", content equals $agpl
+		my ($desc) = @_;
+		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+		is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc");
+		my $buf = do { local $/; <$fh> };
+		is(chop($buf), "\n", 'got trailing \\n');
+		is($buf, $agpl, "AGPL matches ($desc)");
+	};
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal');
+	$ck_copying->('regular file');
+
+	$gcf2 = PublicInbox::Gcf2::new(); # fresh object that is only told about $tmpdir/objects
+	$gcf2->add_alternate("$tmpdir/objects");
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate');
+	$ck_copying->('alternates after reopen');
+
+	$^O eq 'linux' or skip('pipe tests are Linux-only', 14);
+	for my $blk (1, 0) { # run once with a blocking write end, once non-blocking
+		my ($r, $w);
+		pipe($r, $w) or BAIL_OUT $!;
+		fcntl($w, 1031, 4096) or # 1031 is presumably F_SETPIPE_SZ (see the skip message); requests a 4096-byte pipe
+			skip('Linux too old for F_SETPIPE_SZ', 14);
+		$w->blocking($blk);
+		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+		truncate($fh, 0) or BAIL_OUT "truncate: $!";
+		defined(my $pid = fork) or BAIL_OUT "fork: $!";
+		if ($pid == 0) { # child: drain the pipe into $fh, report success via exit status
+			close $w;
+			tick; # wait for parent to block on writev
+			my $buf = do { local $/; <$r> };
+			print $fh $buf or _exit(1);
+			_exit(0);
+		}
+		ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk");
+		close $w or BAIL_OUT "close: $!";
+		is(waitpid($pid, 0), $pid, 'child exited');
+		is($?, 0, 'no error in child');
+		$ck_copying->("pipe blocking($blk)");
+
+		pipe($r, $w) or BAIL_OUT $!;
+		fcntl($w, 1031, 4096) or BAIL_OUT $!;
+		$w->blocking($blk);
+		close $r; # no reader left: the write below is expected to fail
+		local $SIG{PIPE} = 'IGNORE'; # ignore SIGPIPE so the failure surfaces as an exception instead
+		eval { $gcf2->cat_oid(fileno($w), $COPYING) };
+		like($@, qr/writev error:/, 'got writev error');
+	}
+}
+
+if ($nr) { # optional leak check, enabled by TEST_LEAK_NR; makes no assertions
+	open my $null, '>', '/dev/null' or BAIL_OUT "open /dev/null: $!";
+	my $fd = fileno($null);
+	local $SIG{PIPE} = 'IGNORE'; # writes to the broken pipe must not kill the test
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	close $r; # write end with no reader: every write to it fails
+	my $broken = fileno($w);
+	for (1..$nr) {
+		my $obj = PublicInbox::Gcf2::new(); # new object per iteration, dropped at end of scope
+		if (defined($objdir)) {
+			$obj->add_alternate($objdir);
+			for (1..$cat) { # exercise success, write-error, missing-OID and invalid-OID paths
+				$obj->cat_oid($fd, $COPYING);
+				eval { $obj->cat_oid($broken, $COPYING) };
+				$obj->cat_oid($fd, '0'x40);
+				$obj->cat_oid($fd, 'invalid');
+			}
+		}
+	}
+}
+done_testing;
diff --git a/t/gcf2_client.t b/t/gcf2_client.t
new file mode 100644
index 00000000..f1302a54
--- /dev/null
+++ b/t/gcf2_client.t
@@ -0,0 +1,90 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Cwd qw(getcwd);
+use PublicInbox::Import;
+use PublicInbox::DS;
+
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2Client';
+my ($tmpdir, $for_destroy) = tmpdir();
+my $git_a = "$tmpdir/a.git"; # receives the fast-import data below
+my $git_b = "$tmpdir/b.git"; # initialized bare but left without imported objects
+PublicInbox::Import::init_bare($git_a);
+PublicInbox::Import::init_bare($git_b);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; # presumably becomes fd 0 (stdin) of the git command below
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr);
+is($?, 0, 'fast-import succeeded');
+
+my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c'; # OID of a 30-byte tree, as asserted in the first callback
+my $called = 0; # incremented by each cat_async callback; checked at the end
+my $err_f = "$tmpdir/err"; # file passed to the client as its fd 2
+{ # each request string is "<oid> <git_dir>"; callbacks receive ($bref, $oid, $type, $size, $arg)
+	PublicInbox::DS->Reset;
+	open my $err, '>>', $err_f or BAIL_OUT $!;
+	my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->cat_async("$tree $git_a", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'got expected OID');
+		is($size, 30, 'got expected length');
+		is($type, 'tree', 'got tree type');
+		is(length($$bref), 30, 'got a tree');
+		is($arg, 'hi', 'arg passed');
+		$called++;
+	}, 'hi');
+	$gcf2c->cat_async_step($gcf2c->{inflight}); # presumably processes the pending request so the callback has run
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	my $estr = do { local $/; <$err> };
+	is($estr, '', 'nothing in stderr');
+
+	my $trunc = substr($tree, 0, 39); # 39 hex digits: one short of the full OID
+	$gcf2c->cat_async("$trunc $git_a", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is(undef, $bref, 'missing bref is undef');
+		is($oid, $trunc, 'truncated OID printed');
+		is($type, 'missing', 'type is "missing"');
+		is($size, undef, 'size is undef');
+		is($arg, 'bye', 'arg passed when missing');
+		$called++;
+	}, 'bye');
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry');
+
+	# try failed alternates lookup
+	PublicInbox::DS->Reset;
+	open $err, '>', $err_f or BAIL_OUT $!; # '>' truncates, so earlier warnings are gone
+	$gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->cat_async("$tree $git_b", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is(undef, $bref, 'missing bref from alt is undef');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry before alt update');
+
+	# now try successful alternates lookup
+	open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!;
+	print $alt "$git_a/objects\n" or BAIL_OUT $!;
+	close $alt or BAIL_OUT $!;
+	my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]);
+	$gcf2c->cat_async("$tree $git_a", sub { # NOTE(review): requests $git_a, not $git_b whose alternates were just updated - confirm intended
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'oid match on alternates retry');
+		is($$bref, $expect, 'tree content matched');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+}
+is($called, 4, 'cat_async callbacks hit'); # one increment per cat_async call above
+done_testing;
diff --git a/t/git.t b/t/git.t
index dfd7173a..2cfff248 100644
--- a/t/git.t
+++ b/t/git.t
@@ -76,12 +76,17 @@ if (1) {
 	is(length($$x), $size, 'read correct number of bytes');
 
 	my $ref = $gcf->qx(qw(cat-file blob), $buf);
+	is($?, 0, 'no error on scalar success');
 	my @ref = $gcf->qx(qw(cat-file blob), $buf);
+	is($?, 0, 'no error on wantarray success');
 	my $nl = scalar @ref;
 	ok($nl > 1, "qx returned array length of $nl");
+	is(join('', @ref), $ref, 'qx array and scalar context both work');
 
 	$gcf->qx(qw(repack -adq));
 	ok($gcf->packed_bytes > 0, 'packed size is positive');
+	$gcf->qx(qw(rev-parse --verify bogus));
+	isnt($?, 0, '$? set on failure'.$?);
 }
 
 SKIP: {
diff --git a/t/idx_stack.t b/t/idx_stack.t
index 35aff37b..e0474fa4 100644
--- a/t/idx_stack.t
+++ b/t/idx_stack.t
@@ -6,6 +6,8 @@ use Test::More;
 use_ok 'PublicInbox::IdxStack';
 my $oid_a = '03c21563cf15c241687966b5b2a3f37cdc193316';
 my $oid_b = '963caad026055ab9bcbe3ee9550247f9d8840feb';
+my $cmt_a = 'df8e4a0612545d53672036641e9f076efc94c2f6';
+my $cmt_b = '3ba7c9fa4a083c439e768882c571c2026a981ca5';
 
 my $stk = PublicInbox::IdxStack->new;
 is($stk->read_prepare, $stk, 'nothing');
@@ -13,19 +15,19 @@ is($stk->num_records, 0, 'no records');
 is($stk->pop_rec, undef, 'undef on empty');
 
 $stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
 is($stk->read_prepare, $stk, 'read_prepare');
 is($stk->num_records, 1, 'num_records');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop once');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop once');
 is($stk->pop_rec, undef, 'undef on empty');
 
 $stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
-$stk->push_rec('d', 1234, 5678, $oid_b);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
+$stk->push_rec('d', 1234, 5678, $oid_b, $cmt_b);
 is($stk->read_prepare, $stk, 'read_prepare');
 is($stk->num_records, 2, 'num_records');
-is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b], 'pop');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop-pop');
+is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b, $cmt_b], 'pop');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop-pop');
 is($stk->pop_rec, undef, 'empty');
 
 SKIP: {
@@ -37,11 +39,11 @@ SKIP: {
 	while (<$fh>) {
 		chomp;
 		my ($at, $ct, $H) = split(/\./);
-		$stk //= PublicInbox::IdxStack->new($H);
+		$stk //= PublicInbox::IdxStack->new;
 		# not bothering to parse blobs here, just using commit OID
 		# as a blob OID since they're the same size + format
-		$stk->push_rec('m', $at + 0, $ct + 0, $H);
-		push(@expect, [ 'm', $at, $ct, $H ]);
+		$stk->push_rec('m', $at + 0, $ct + 0, $H, $H);
+		push(@expect, [ 'm', $at, $ct, $H, $H ]);
 	}
 	$stk or skip('nothing from git log', 3);
 	is($stk->read_prepare, $stk, 'read_prepare');
diff --git a/t/imapd.t b/t/imapd.t
index a464ad86..63a86e71 100644
--- a/t/imapd.t
+++ b/t/imapd.t
@@ -251,8 +251,8 @@ ok($mic->logout, 'logout works');
 
 my $have_inotify = eval { require Linux::Inotify2; 1 };
 
-my $pi_config = PublicInbox::Config->new;
-$pi_config->each_inbox(sub {
+my $pi_cfg = PublicInbox::Config->new;
+$pi_cfg->each_inbox(sub {
 	my ($ibx) = @_;
 	my $env = { ORIGINAL_RECIPIENT => $ibx->{-primary_address} };
 	my $name = $ibx->{name};
diff --git a/t/inbox_idle.t b/t/inbox_idle.t
index e16ee11b..198856bd 100644
--- a/t/inbox_idle.t
+++ b/t/inbox_idle.t
@@ -32,14 +32,14 @@ for my $V (1, 2) {
 		$sidx->set_metadata_once;
 		$sidx->idx_release; # allow watching on lockfile
 	}
-	my $pi_config = PublicInbox::Config->new(\<new(\<each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
-	my $ii = PublicInbox::InboxIdle->new($pi_config);
+	$pi_cfg->each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
+	my $ii = PublicInbox::InboxIdle->new($pi_cfg);
 	ok($ii, 'InboxIdle created');
 	SKIP: {
 		skip('inotify or kqueue missing', 1) unless $ii->{sock};
@@ -50,7 +50,7 @@ EOF
 	PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
 	$ii->event_step;
 	is(scalar @{$obj->{called}}, 1, 'called on unlock');
-	$pi_config->each_inbox(sub { shift->unsubscribe_unlock($ident) });
+	$pi_cfg->each_inbox(sub { shift->unsubscribe_unlock($ident) });
 	ok($im->add(eml_load('t/data/0001.patch')), "$V added #2");
 	$im->done;
 	PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
diff --git a/t/mda_filter_rubylang.t b/t/mda_filter_rubylang.t
index 754d52f7..489ea223 100644
--- a/t/mda_filter_rubylang.t
+++ b/t/mda_filter_rubylang.t
@@ -44,8 +44,8 @@ something
 EOF
 		ok(run_script(['-mda'], $env, $opt), 'message delivered');
 	}
-	my $config = PublicInbox::Config->new;
-	my $ibx = $config->lookup_name($v);
+	my $cfg = PublicInbox::Config->new;
+	my $ibx = $cfg->lookup_name($v);
 
 	# make sure all serials are searchable:
 	for my $i (1..2) {
diff --git a/t/miscsearch.t b/t/miscsearch.t
new file mode 100644
index 00000000..0ba79194
--- /dev/null
+++ b/t/miscsearch.t
@@ -0,0 +1,57 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::InboxWritable;
+require_mods(qw(Search::Xapian DBD::SQLite));
+use_ok 'PublicInbox::MiscSearch';
+use_ok 'PublicInbox::MiscIdx';
+
+my ($tmp, $for_destroy) = tmpdir();
+my $eidx = { xpfx => "$tmp/eidx", -no_fsync => 1 }; # mock ExtSearchIdx
+{ # create the v1 inbox directory with a description file; the phrase is searched for below
+	mkdir "$tmp/v1" or BAIL_OUT "mkdir $!";
+	open my $fh, '>', "$tmp/v1/description" or BAIL_OUT "open: $!";
+	print $fh "Everything sucks this year\n" or BAIL_OUT "print $!";
+	close $fh or BAIL_OUT "close $!";
+}
+{ # index one v1 inbox through MiscIdx inside a single transaction
+	my $v1 = PublicInbox::InboxWritable->new({
+		inboxdir => "$tmp/v1",
+		name => 'hope', # searched for as 'hope' below
+		address => [ 'nope@example.com' ], # searched for in full and as 'nope' below
+		indexlevel => 'basic',
+		version => 1,
+	});
+	$v1->init_inbox;
+	my $mi = PublicInbox::MiscIdx->new($eidx);
+	$mi->begin_txn;
+	$mi->index_ibx($v1);
+	$mi->commit_txn;
+}
+
+my $ms = PublicInbox::MiscSearch->new("$tmp/eidx/misc"); # presumably where MiscIdx wrote, given xpfx "$tmp/eidx" - confirm
+my $mset = $ms->mset('"everything sucks today"'); # quoted phrase that differs from the description
+is(scalar($mset->items), 0, 'no match on description phrase');
+
+$mset = $ms->mset('"everything sucks this year"'); # the description text in lower case
+is(scalar($mset->items), 1, 'match phrase on description');
+
+$mset = $ms->mset('everything sucks');
+is(scalar($mset->items), 1, 'match words in description');
+
+$mset = $ms->mset('nope@example.com');
+is(scalar($mset->items), 1, 'match full address');
+
+$mset = $ms->mset('nope');
+is(scalar($mset->items), 1, 'match partial address');
+
+$mset = $ms->mset('hope');
+is(scalar($mset->items), 1, 'match name');
+my $mi = ($mset->items)[0]; # first (and only) hit of the 'hope' query
+my $doc = $mi->get_document;
+is($doc->get_data, '{}', 'stored empty data');
+
+done_testing;
diff --git a/t/msgmap.t b/t/msgmap.t
index 437e106e..2d31f1de 100644
--- a/t/msgmap.t
+++ b/t/msgmap.t
@@ -12,7 +12,7 @@ my $d = PublicInbox::Msgmap->new($tmpdir, 1);
 my %mid2num;
 my %num2mid;
 my @mids = qw(a@b c@d e@f g@h aa@bb aa@cc);
-is_deeply([$d->minmax], [undef,undef], "empty min max on new DB");
+is_deeply([$d->minmax], [0,0], "zero min max on new DB");
 
 foreach my $mid (@mids) {
 	my $n = $d->mid_insert($mid);
diff --git a/t/nntp.t b/t/nntp.t
index 9a482acb..36eb6945 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -8,6 +8,7 @@ use PublicInbox::Eml;
 require_mods(qw(DBD::SQLite Data::Dumper));
 use_ok 'PublicInbox::NNTP';
 use_ok 'PublicInbox::Inbox';
+use PublicInbox::Config;
 
 {
 	sub quote_str {
@@ -98,44 +99,38 @@ use_ok 'PublicInbox::Inbox';
 
 { # test setting NNTP headers in HEAD and ARTICLE requests
 	my $u = 'https://example.com/a/';
-	my $ng = PublicInbox::Inbox->new({ name => 'test',
+	my $ibx = PublicInbox::Inbox->new({ name => 'test',
 					inboxdir => 'test.git',
 					address => 'a@example.com',
 					-primary_address => 'a@example.com',
 					newsgroup => 'test',
 					domain => 'example.com',
 					url => [ '//example.com/a' ]});
-	is($ng->base_url, $u, 'URL expanded');
+	is($ibx->base_url, $u, 'URL expanded');
 	my $mid = 'a@b';
 	my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n");
 	my $hdr = $mime->header_obj;
 	my $mock_self = {
-		nntpd => { grouplist => [], servername => 'example.com' },
-		ng => $ng,
+		nntpd => {
+			servername => 'example.com',
+			pi_cfg => bless {}, 'PublicInbox::Config',
+		},
+		ibx => $ibx,
 	};
-	my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };
+	my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ibx };
 	PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
-	is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
-		'Archived-At: set');
-	is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ],
-		'List-Archive: set');
-	is_deeply([ $mime->header('List-Post') ], [ '' ],
-		'List-Post: set');
 	is_deeply([ $mime->header('Newsgroups') ], [ 'test' ],
 		'Newsgroups: set');
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:1' ],
 		'Xref: set');
 
-	$ng->{-base_url} = 'http://mirror.example.com/m/';
+	$ibx->{-base_url} = 'http://mirror.example.com/m/';
 	$smsg->{num} = 2;
 	PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
-	is_deeply([ $mime->header('Archived-At') ],
-		[ "<${u}a\@b/>", '' ],
-		'Archived-At: appended');
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ],
 		'Old Xref: clobbered');
 }
diff --git a/t/over.t b/t/over.t
index 4c8f8098..22061249 100644
--- a/t/over.t
+++ b/t/over.t
@@ -74,4 +74,28 @@ SKIP: {
 		'WAL journal_mode not clobbered if manually set');
 }
 
+# ext index additions
+$over->eidx_prep;
+{ # xref3 entries read back as "<key>:<num>:<oid>" strings, per the expectations below
+	my @arg = qw(1349 2019 adeadba7cafe example.key); # presumably (docid, xnum, oid, key) - confirm against add_xref3
+	ok($over->add_xref3(@arg), 'first add');
+	ok($over->add_xref3(@arg), 'add idempotent');
+	my $xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe' ], 'xref3 works');
+
+	@arg = qw(1349 2018 deadbeefcafe example.kee);
+	ok($over->add_xref3(@arg), 'add another xref3');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe',
+			'example.kee:2018:deadbeefcafe' ],
+			'xref3 works for two');
+
+	@arg = qw(1349 adeadba7cafe example.key); # remove_xref3 is called without the 2019 number used when adding
+	is($over->remove_xref3(@arg), 1, 'remove first');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
+		'confirm removal successful');
+	$over->rollback_lazy; # presumably discards the uncommitted xref3 changes made above
+}
+
 done_testing();
diff --git a/t/psgi_mount.t b/t/psgi_mount.t
index b4de8274..48d8e5c0 100644
--- a/t/psgi_mount.t
+++ b/t/psgi_mount.t
@@ -17,7 +17,7 @@ use_ok 'PublicInbox::WWW';
 use PublicInbox::Import;
 use PublicInbox::Git;
 use PublicInbox::Config;
-my $config = PublicInbox::Config->new(\<new(\<done;
 }
 
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
 my $app = builder(sub {
 	enable('Head');
 	mount('/a' => builder(sub { sub { $www->call(@_) } }));
@@ -67,11 +67,9 @@ test_psgi($app, sub {
 
 	$res = $cb->(GET('/a/test/blah%40example.com/raw'));
 	is($res->code, 200, 'OK with URLMap mount');
-	like($res->content, qr!^List-Archive: !m,
-		'List-Archive set in /raw mboxrd');
 	like($res->content,
-		qr!^Archived-At: !m,
-		'Archived-At set in /raw mboxrd');
+		qr/^Message-Id: \n/sm,
+		'headers appear in /raw');
 
 	# redirects
 	$res = $cb->(GET('/a/test/m/blah%40example.com.html'));
@@ -85,7 +83,7 @@ test_psgi($app, sub {
 
 SKIP: {
 	require_mods(qw(DBD::SQLite Search::Xapian IO::Uncompress::Gunzip), 3);
-	my $ibx = $config->lookup_name('test');
+	my $ibx = $cfg->lookup_name('test');
 	require_ok 'PublicInbox::SearchIdx';
 	PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
 	test_psgi($app, sub {
@@ -94,12 +92,8 @@ SKIP: {
 		my $gz = $res->content;
 		my $raw;
 		IO::Uncompress::Gunzip::gunzip(\$gz => \$raw);
-		like($raw, qr!^List-Archive: !m,
-			'List-Archive set in /t.mbox.gz mboxrd');
-		like($raw,
-			qr!^Archived-At:\x20
-				!mx,
-			'Archived-At set in /t.mbox.gz mboxrd');
+		like($raw, qr!^Message-Id:\x20\n!sm,
+			'headers appear in /t.mbox.gz mboxrd');
 	});
 }
 
diff --git a/t/psgi_search.t b/t/psgi_search.t
index c1677eb3..07fb4846 100644
--- a/t/psgi_search.t
+++ b/t/psgi_search.t
@@ -67,11 +67,11 @@ $im->done;
 PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
 
 my $cfgpfx = "publicinbox.test";
-my $config = PublicInbox::Config->new(\<new(\<new($config);
+my $www = PublicInbox::WWW->new($cfg);
 test_psgi(sub { $www->call(@_) }, sub {
 	my ($cb) = @_;
 	my $res;
@@ -144,7 +144,7 @@ test_psgi(sub { $www->call(@_) }, sub {
 		$xdb->set_metadata('has_threadid', '0');
 		$sidx->idx_release;
 	}
-	$config->each_inbox(sub { delete $_[0]->{search} });
+	$cfg->each_inbox(sub { delete $_[0]->{search} });
 	$res = $cb->(GET('/test/?q=s:test'));
 	is($res->code, 200, 'successful search w/o has_threadid');
 	unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
index c13f5e71..bdc1a3c4 100644
--- a/t/psgi_v2.t
+++ b/t/psgi_v2.t
@@ -87,12 +87,11 @@ like($$msg, qr/\AFrom oldbug/s,
 	'"From_" line stored to test old bug workaround');
 
 my $cfgpfx = "publicinbox.v2test";
-my $cfg = <new(\<{-primary_address}
 $cfgpfx.inboxdir=$inboxdir
 EOF
-my $config = PublicInbox::Config->new(\$cfg);
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
 my ($res, $raw, @from_);
 my $client0 = sub {
 	my ($cb) = @_;
@@ -154,7 +153,7 @@ my $client1 = sub {
 	like($raw, qr/^hello ghosts$/m, 'got third message');
 	@from_ = ($raw =~ m/^From /mg);
 	is(scalar(@from_), 3, 'three From_ lines');
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 
 	SKIP: {
 		eval { require IO::Uncompress::Gunzip };
@@ -244,7 +243,7 @@ $run_httpd->($client1, 38);
 	$im->done;
 	my @h = $mime->header('Message-ID');
 	is_deeply($exp, \@h, 'reused existing Message-ID');
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 }
 
 my $client2 = sub {
@@ -283,7 +282,7 @@ $run_httpd->($client2, 8);
 		ok($im->add($mime), "added attachment $body");
 	}
 	$im->done;
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 }
 
 my $client3 = sub {
diff --git a/t/search.t b/t/search.t
index 8df8a202..3754717d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -332,13 +332,13 @@ $ibx->with_umask(sub {
 		like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears');
 		my $doc = $m->get_document;
 		my $col = PublicInbox::Search::BYTES();
-		my $bytes = PublicInbox::Smsg::get_val($doc, $col);
+		my $bytes = PublicInbox::Search::int_val($doc, $col);
 		like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit');
 		ok($bytes > 0, '$bytes is > 0');
 		is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over');
 
 		$col = PublicInbox::Search::UID();
-		my $uid = PublicInbox::Smsg::get_val($doc, $col);
+		my $uid = PublicInbox::Search::int_val($doc, $col);
 		is($uid, $smsg->{num}, 'UID column matches {num}');
 		is($uid, $m->get_docid, 'UID column matches docid');
 	}
@@ -535,5 +535,3 @@ $ibx->with_umask(sub {
 });
 
 done_testing();
-
-1;
diff --git a/t/v2writable.t b/t/v2writable.t
index 2f71fafa..358a2bb7 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -274,14 +274,13 @@ EOF
 	$mime->header_set('Message-ID', "<$y>");
 	$mime->header_set('References', "<$x>");
 	ok($im->add($mime), 'add excessively long References');
-	$im->barrier;
+	$im->done;
 
 	my $msgs = $ibx->over->get_thread('x'x244);
 	is(2, scalar(@$msgs), 'got both messages');
 	is($msgs->[0]->{mid}, 'x'x244, 'stored truncated mid');
 	is($msgs->[1]->{references}, '<'.('x'x244).'>', 'stored truncated ref');
 	is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)');
-	$im->done;
 }
 
 my $tmp = {
diff --git a/t/watch_filter_rubylang.t b/t/watch_filter_rubylang.t
index 6513f30b..9c70b4ea 100644
--- a/t/watch_filter_rubylang.t
+++ b/t/watch_filter_rubylang.t
@@ -72,11 +72,11 @@ $cfgpfx.filter=PublicInbox::Filter::RubyLang
 $cfgpfx.altid=serial:alerts:file=msgmap.sqlite3
 publicinboxwatch.watchspam=maildir:$spamdir
 EOF
-	my $config = PublicInbox::Config->new(\$orig);
-	my $ibx = $config->lookup_name($v);
+	my $cfg = PublicInbox::Config->new(\$orig);
+	my $ibx = $cfg->lookup_name($v);
 	ok($ibx, 'found inbox by name');
 
-	my $w = PublicInbox::Watch->new($config);
+	my $w = PublicInbox::Watch->new($cfg);
 	for my $i (1..2) {
 		$w->scan('full');
 	}
@@ -101,8 +101,8 @@ EOF
 	}
 	$w->scan('full');
 
-	$config = PublicInbox::Config->new(\$orig);
-	$ibx = $config->lookup_name($v);
+	$cfg = PublicInbox::Config->new(\$orig);
+	$ibx = $cfg->lookup_name($v);
 	is($ibx->search->reopen->mset('b:spam')->size, 0, 'spam removed');
 
 	is_deeply([], \@warn, 'no warnings');
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index ae53caf9..c948b41b 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -34,13 +34,13 @@ my $sem = PublicInbox::Emergency->new($spamdir); # create dirs
 {
 	my @w;
 	local $SIG{__WARN__} = sub { push @w, @_ };
-	my $config = PublicInbox::Config->new(\<new(\<new($config);
+	my $wm = PublicInbox::Watch->new($cfg);
 	is(scalar grep(/is a spam folder/, @w), 1, 'got warning about spam');
 	is_deeply($wm->{mdmap}, { "$spamdir/cur" => 'watchspam' },
 		'only got the spam folder to watch');
@@ -61,8 +61,8 @@ EOF
 	close $fh or BAIL_OUT $!;
 }
 
-my $config = PublicInbox::Config->new($cfg_path);
-PublicInbox::Watch->new($config)->scan('full');
+my $cfg = PublicInbox::Config->new($cfg_path);
+PublicInbox::Watch->new($cfg)->scan('full');
 my $git = PublicInbox::Git->new($git_dir);
 my @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 1, 'one revision in rev-list');
@@ -79,7 +79,7 @@ my $write_spam = sub {
 };
 $write_spam->();
 is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 2, 'two revisions in rev-list');
 @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
@@ -93,7 +93,7 @@ To unsubscribe from this list: send the line "unsubscribe git" in
 the body of a message to majordomo\@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file');
 	my $mref = $git->cat_file('HEAD:'.$list[0]);
@@ -101,7 +101,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 
 	is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
 	$write_spam->();
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree is empty');
 	@list = $git->qx(qw(rev-list refs/heads/master));
@@ -115,10 +115,10 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $fail_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	{
 		local $SIG{__WARN__} = sub {}; # quiet spam check warning
-		PublicInbox::Watch->new($config)->scan('full');
+		PublicInbox::Watch->new($cfg)->scan('full');
 	}
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree has no files spamc checked');
@@ -131,9 +131,9 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $main_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file after spamc checked');
 
@@ -166,9 +166,9 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 		$delivered++;
 	};
 	PublicInbox::DS->Reset;
-	my $ii = PublicInbox::InboxIdle->new($config);
+	my $ii = PublicInbox::InboxIdle->new($cfg);
 	my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup';
-	$config->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
+	$cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
 	PublicInbox::DS->SetPostLoopCallback(sub { $delivered == 0 });
 
 	# wait for -watch to setup inotify watches
diff --git a/t/watch_maildir_v2.t b/t/watch_maildir_v2.t
index 12546418..532e5c7c 100644
--- a/t/watch_maildir_v2.t
+++ b/t/watch_maildir_v2.t
@@ -44,11 +44,11 @@ $cfgpfx.watch=maildir:$maildir
 $cfgpfx.filter=PublicInbox::Filter::Vger
 publicinboxlearn.watchspam=maildir:$spamdir
 EOF
-my $config = PublicInbox::Config->new(\$orig);
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$orig);
+my $ibx = $cfg->lookup_name('test');
 ok($ibx, 'found inbox by name');
 
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 my $total = scalar @{$ibx->over->recent};
 is($total, 1, 'got one revision');
 
@@ -68,7 +68,7 @@ my $write_spam = sub {
 };
 $write_spam->();
 is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 is_deeply($ibx->over->recent, [], 'deleted file');
 is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
 
@@ -79,7 +79,7 @@ To unsubscribe from this list: send the line "unsubscribe git" in
 the body of a message to majordomo\@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 1, 'got one file back');
 	my $mref = $ibx->msg_by_smsg($msgs->[0]);
@@ -87,7 +87,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 
 	is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
 	$write_spam->();
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	$msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 0, 'inbox is empty again');
 	is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
@@ -99,10 +99,10 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $fail_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	{
 		local $SIG{__WARN__} = sub {}; # quiet spam check warning
-		PublicInbox::Watch->new($config)->scan('full');
+		PublicInbox::Watch->new($cfg)->scan('full');
 	}
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 0, 'inbox is still empty');
@@ -115,13 +115,13 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $main_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
-	PublicInbox::Watch->new($config)->scan('full');
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 1, 'inbox has one mail after spamc OK-ed a message');
 	my $mref = $ibx->msg_by_smsg($msgs->[0]);
 	like($$mref, qr/something\n\z/s, 'message scrubbed on import');
-	delete $config->{'publicinboxwatch.spamcheck'};
+	delete $cfg->{'publicinboxwatch.spamcheck'};
 }
 
 {
@@ -129,7 +129,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	open my $fh, '<', $patch or die "failed to open $patch: $!\n";
 	$msg = do { local $/; <$fh> };
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $post = $ibx->search->reopen->mset('dfpost:6e006fd7');
 	is($post->size, 1, 'diff postimage found');
 	my $pre = $ibx->search->mset('dfpre:090d998b6c2c');
@@ -146,12 +146,12 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $v1pfx = "publicinbox.v1";
 	my $v1addr = 'v1-public@example.com';
 	PublicInbox::Import::init_bare($v1repo);
-	my $cfg2 = <new(\$cfg2);
+	my $cfg = PublicInbox::Config->new(\$raw);
 	my $both = <new($maildir)->prepare(\$both);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $mset = $ibx->search->reopen->mset('m:both@b.com');
 	my $msgs = $ibx->search->mset_to_smsg($ibx, $mset);
-	my $v1 = $config->lookup_name('v1');
+	my $v1 = $cfg->lookup_name('v1');
 	my $msg = $v1->git->cat_file($msgs->[0]->{blob});
 	is($both, $$msg, 'got original message back from v1');
 	$msg = $ibx->git->cat_file($msgs->[0]->{blob});
@@ -184,21 +184,21 @@ List-Id: 
 X-Mailing-List: no@example.com
 Message-ID: 
 EOF
-	my $cfg = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
+	my $raw = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
 	PublicInbox::Emergency->new($maildir)->prepare(\$want);
 	PublicInbox::Emergency->new($maildir)->prepare(\$do_not_want);
-	my $config = PublicInbox::Config->new(\$cfg);
-	PublicInbox::Watch->new($config)->scan('full');
-	$ibx = $config->lookup_name('test');
+	my $cfg = PublicInbox::Config->new(\$raw);
+	PublicInbox::Watch->new($cfg)->scan('full');
+	$ibx = $cfg->lookup_name('test');
 	my $num = $ibx->mm->num_for('do.want@example.com');
 	ok(defined $num, 'List-ID matched for watch');
 	$num = $ibx->mm->num_for('do.not.want@example.com');
 	is($num, undef, 'unaccepted List-ID matched for watch');
 
-	$cfg = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
-	$config = PublicInbox::Config->new(\$cfg);
-	PublicInbox::Watch->new($config)->scan('full');
-	$ibx = $config->lookup_name('test');
+	$raw = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
+	$cfg = PublicInbox::Config->new(\$raw);
+	PublicInbox::Watch->new($cfg)->scan('full');
+	$ibx = $cfg->lookup_name('test');
 	$num = $ibx->mm->num_for('do.not.want@example.com');
 	ok(defined $num, 'X-Mailing-List matched');
 }
diff --git a/t/watch_multiple_headers.t b/t/watch_multiple_headers.t
index a0813532..1fe392d4 100644
--- a/t/watch_multiple_headers.t
+++ b/t/watch_multiple_headers.t
@@ -54,16 +54,16 @@ PublicInbox::Emergency->new($maildir)->prepare(\$msg_to);
 PublicInbox::Emergency->new($maildir)->prepare(\$msg_cc);
 PublicInbox::Emergency->new($maildir)->prepare(\$msg_none);
 
-my $cfg = <new(\$cfg);
-PublicInbox::Watch->new($config)->scan('full');
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$raw);
+PublicInbox::Watch->new($cfg)->scan('full');
+my $ibx = $cfg->lookup_name('test');
 ok($ibx, 'found inbox by name');
 
 my $num = $ibx->mm->num_for('to@a.com');
diff --git a/t/www_listing.t b/t/www_listing.t
index 4309a5e1..63613371 100644
--- a/t/www_listing.t
+++ b/t/www_listing.t
@@ -21,8 +21,7 @@ use_ok 'PublicInbox::Git';
 my ($tmpdir, $for_destroy) = tmpdir();
 my $bare = PublicInbox::Git->new("$tmpdir/bare.git");
 PublicInbox::Import::init_bare($bare->{git_dir});
-is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
-	'empty repo has no fingerprint');
+is($bare->manifest_entry, undef, 'empty repo has no manifest entry');
 {
 	my $fi_data = './t/git.fast-import-data';
 	open my $fh, '<', $fi_data or die "open $fi_data: $!";
@@ -31,7 +30,7 @@ is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
 		'fast-import');
 }
 
-like(PublicInbox::ManifestJsGz::fingerprint($bare), qr/\A[a-f0-9]{40}\z/,
+like($bare->manifest_entry->{fingerprint}, qr/\A[a-f0-9]{40}\z/,
 	'got fingerprint with non-empty repo');
 
 sub tiny_test {
diff --git a/xt/cmp-msgview.t b/xt/cmp-msgview.t
index 5bd7aa17..24151267 100644
--- a/xt/cmp-msgview.t
+++ b/xt/cmp-msgview.t
@@ -24,7 +24,7 @@ vec(my $vec = '', fileno($fh), 1) = 1;
 select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
 my $mime_ctx = {
 	env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
-	-inbox => $ibx,
+	ibx => $ibx,
 	www => Plack::Util::inline_object(style => sub {''}),
 	obuf => \(my $mime_buf = ''),
 	mhref => '../',
diff --git a/xt/create-many-inboxes.t b/xt/create-many-inboxes.t
new file mode 100644
index 00000000..c92643b2
--- /dev/null
+++ b/xt/create-many-inboxes.t
@@ -0,0 +1,99 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use File::Path qw(mkpath);
+use IO::Handle (); # autoflush
+use POSIX qw(_exit);
+use Cwd qw(getcwd abs_path);
+use File::Spec;
+# TEST_MANY_ROOT names the directory that will hold the generated inboxes;
+# the whole test is skipped when it is unset (or otherwise false).
+my $many_root = $ENV{TEST_MANY_ROOT} or
+	plan skip_all => 'TEST_MANY_ROOT not defined';
+my $cwd = getcwd();
+mkpath($many_root);
+-d $many_root or BAIL_OUT "$many_root: $!";
+$many_root = abs_path($many_root);
+# refuse to generate the inboxes anywhere below the current directory
+$many_root =~ m!\A\Q$cwd\E/! and BAIL_OUT "$many_root must not be in $cwd";
+require_git 2.6;
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::V2Writable';
+# tunables, all overridable from the environment
+my $nr_inbox = $ENV{NR_INBOX} // 10;
+# worker count: NPROC if set and non-zero, else detect_nproc(), else 2
+my $nproc = $ENV{NPROC} || PublicInbox::V2Writable::detect_nproc() || 2;
+my $indexlevel = $ENV{TEST_INDEXLEVEL} // 'basic';
+diag "NR_INBOX=$nr_inbox NPROC=$nproc TEST_INDEXLEVEL=$indexlevel";
+diag "TEST_MANY_ROOT=$many_root";
+# extra config line appended to every inbox section; empty for 'full'
+my $level_cfg = $indexlevel eq 'full' ? '' : "\tindexlevel = $indexlevel\n";
+# one directory per (inbox count, index level) combination
+my $pfx = "$many_root/$nr_inbox-$indexlevel";
+mkpath($pfx);
+# config file that receives one section per inbox; opened for append
+# and autoflushed so every print reaches the file immediately
+open my $cfg_fh, '>>', "$pfx/config" or BAIL_OUT $!;
+$cfg_fh->autoflush(1);
+# Create v2 inbox number $i under $pfx: append its [publicinbox] section
+# to the config file behind $cfg_fh, initialize the inbox and import one
+# message into it.  Dies if the config section cannot be written.
+#
+# NOTE(review): both heredoc openers (<<EOF / <<EOM) and the Date/From
+# lines of the message had been lost to markup stripping.  The openers
+# and the section header are determined by the surrounding text; the
+# Date/From lines are a best-effort reconstruction -- confirm against
+# the upstream file.
+my $v2_init_add = sub {
+	my ($i) = @_;
+	my $ibx = PublicInbox::Inbox->new({
+		inboxdir => "$pfx/test-$i",
+		name => "test-$i",
+		newsgroup => "inbox.comp.test.foo.test-$i",
+		address => [ "test-$i\@example.com" ],
+		url => [ "//example.com/test-$i" ],
+		version => 2,
+	});
+	# only set an explicit level when one is also written to the config
+	$ibx->{indexlevel} = $indexlevel if $level_cfg ne '';
+	my $entry = <<EOF;
+[publicinbox "$ibx->{name}"]
+	address = $ibx->{-primary_address}
+	url = $ibx->{url}->[0]
+	newsgroup = $ibx->{newsgroup}
+	inboxdir = $ibx->{inboxdir}
+EOF
+	$entry .= $level_cfg;
+	# $cfg_fh is inherited by every forked worker (append mode,
+	# autoflushed); the whole section goes out in a single print,
+	# presumably so sections from different workers do not interleave
+	print $cfg_fh $entry or die $!;
+	my $v2w = PublicInbox::V2Writable->new($ibx, { nproc => 0 });
+	$v2w->init_inbox(0);
+	$v2w->add(PublicInbox::Eml->new(<<EOM));
+Date: Sat, 02 Oct 2010 00:00:00 +0000
+From: Lorelei <l\@example.com>
+To: test-$i\@example.com
+Message-ID: <20101002-000000-$i\@example.com>
+Subject: hello world $i
+
+hi
+EOM
+	$v2w->done;
+};
+
+# Spawn $nproc workers.  Each worker reads inbox numbers (one per line)
+# from its own pipe and calls $v2_init_add for every number until EOF.
+# @children collects [ write end of the pipe, pid ] for each worker.
+my @children;
+for my $i (1..$nproc) {
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	my $pid = fork;
+	# fork returns undef on failure.  This must be checked before the
+	# numeric comparison below: undef == 0 is true, so the parent would
+	# otherwise take the child branch and _exit(0) without reporting
+	# the failure.
+	defined $pid or BAIL_OUT "fork: $!";
+	if ($pid == 0) { # child
+		close $w;
+		while (my $i = <$r>) {
+			chomp $i;
+			$v2_init_add->($i);
+		}
+		_exit(0); # leave without running END blocks or destructors
+	}
+	close $r or BAIL_OUT $!;
+	push @children, [ $w, $pid ];
+	$w->autoflush(1);
+}
+
+# Hand the inbox numbers to the workers round-robin, one number per line.
+# NOTE(review): the range is inclusive, so $nr_inbox + 1 inboxes are
+# created (test-0 .. test-$nr_inbox) -- confirm this is intended.
+for my $i (0..$nr_inbox) {
+	print { $children[$i % @children]->[0] } "$i\n" or BAIL_OUT $!;
+}
+
+# close the parent's write ends so the workers' read loops can reach EOF
+for my $c (@children) {
+	close $c->[0] or BAIL_OUT "close $!";
+}
+# reap every worker and require a zero wait status from each
+my $i = 0;
+for my $c (@children) {
+	my $pid = waitpid($c->[1], 0);
+	is($?, 0, ++$i.' exited ok'); # $? is the status of the child just reaped
+}
+ok(close($cfg_fh), 'config written');
+done_testing;
diff --git a/xt/perf-msgview.t b/xt/perf-msgview.t
index d99101a3..30e133d7 100644
--- a/xt/perf-msgview.t
+++ b/xt/perf-msgview.t
@@ -29,7 +29,7 @@ select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
 
 my $ctx = {
 	env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
-	-inbox => $ibx,
+	ibx => $ibx,
 	www => Plack::Util::inline_object(style => sub {''}),
 };
 my ($mime, $res, $oid, $type);
diff --git a/xt/perf-threading.t b/xt/perf-threading.t
index b27c9cbd..472c1953 100644
--- a/xt/perf-threading.t
+++ b/xt/perf-threading.t
@@ -25,7 +25,7 @@ ok($n, 'got some messages');
 diag "enquire: ".timestr($elapsed)." for $n";
 
 $elapsed = timeit(1, sub {
-	PublicInbox::View::thread_results({-inbox => $ibx}, $msgs);
+	PublicInbox::View::thread_results({ibx => $ibx}, $msgs);
 });
 diag "thread_results ".timestr($elapsed);