From: Eric Wong Date: Mon, 25 Jan 2021 19:56:21 +0000 (+0000) Subject: doc: re-add missing 1.6 release notes X-Git-Tag: v1.7.0~1301 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=9dfc0b670fc634b54998c3020f173b82de1915ac;hp=3bacac503f6ff0bcf19aa581151c9c89fa35fe55 doc: re-add missing 1.6 release notes I missed these during the merge :x --- diff --git a/Documentation/design_notes.txt b/Documentation/design_notes.txt index e871f4c8..bc668da3 100644 --- a/Documentation/design_notes.txt +++ b/Documentation/design_notes.txt @@ -149,5 +149,5 @@ problems solved. Copyright --------- -Copyright 2013-2020 all contributors +Copyright 2013-2021 all contributors License: AGPL-3.0+ diff --git a/Documentation/extman.perl b/Documentation/extman.perl index a9a830c0..c6cfb4c5 100755 --- a/Documentation/extman.perl +++ b/Documentation/extman.perl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # prints a manpage to stdout use strict; diff --git a/Documentation/flow.ge b/Documentation/flow.ge index 0cc1c333..4308989a 100644 --- a/Documentation/flow.ge +++ b/Documentation/flow.ge @@ -24,5 +24,5 @@ graph { flow: down } public-inbox-imapd\n public-inbox-nntpd] -# Copyright 2020 all contributors +# Copyright 2020-2021 all contributors # License: AGPL-3.0+ diff --git a/Documentation/flow.txt b/Documentation/flow.txt index 8225cc8f..1116a917 100644 --- a/Documentation/flow.txt +++ b/Documentation/flow.txt @@ -29,5 +29,5 @@ | public-inbox-nntpd | +--------------------+ -# Copyright 2020 all contributors +# Copyright 2020-2021 all contributors # License: AGPL-3.0+ diff --git a/Documentation/include.mk b/Documentation/include.mk index 207983f0..df6c17e0 100644 --- a/Documentation/include.mk +++ b/Documentation/include.mk @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2020 all contributors +# Copyright (C) 2013-2021 all contributors # License: AGPL-3.0+ all:: diff --git 
a/Documentation/mknews.perl b/Documentation/mknews.perl index 510a4e18..1936cea7 100755 --- a/Documentation/mknews.perl +++ b/Documentation/mknews.perl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # Generates NEWS, NEWS.atom, and NEWS.html files using release emails # this uses unstable internal APIs of public-inbox, and this script @@ -43,7 +43,7 @@ if ($dst eq 'NEWS') { ); $ibx->{-primary_address} = $addr; my $ctx = { - -inbox => $ibx, + ibx => $ibx, -upfx => "$base_url/", -hr => 1, }; @@ -119,10 +119,10 @@ sub html_start { } sub html_end { - print $out < -EOF + for (@$PublicInbox::WwwStream::CODE_URL) { + print $out " git clone $_\n" or die; + } + print $out "\n" or die; } sub atom_start { @@ -131,7 +131,7 @@ sub atom_start { # WwwAtomStream stats this dir for mtime my $astream = PublicInbox::WwwAtomStream->new($ctx); delete $astream->{emit_header}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $title = PublicInbox::WwwAtomStream::title_tag($ibx->description); my $updated = PublicInbox::WwwAtomStream::feed_updated($mtime); print $out < =head1 COPYRIGHT -Copyright 2018-2020 all contributors L +Copyright 2018-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 2d845f16..4a97fe3b 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -412,7 +412,7 @@ and L =head1 COPYRIGHT -Copyright 2016-2020 all contributors L +Copyright 2016-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-convert.pod b/Documentation/public-inbox-convert.pod index a7958cf8..f400fab8 100644 --- a/Documentation/public-inbox-convert.pod +++ b/Documentation/public-inbox-convert.pod @@ -91,7 +91,7 @@ and L =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff 
--git a/Documentation/public-inbox-daemon.pod b/Documentation/public-inbox-daemon.pod index 747c1452..7405cdf9 100644 --- a/Documentation/public-inbox-daemon.pod +++ b/Documentation/public-inbox-daemon.pod @@ -174,7 +174,7 @@ and L =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-edit.pod b/Documentation/public-inbox-edit.pod index 55d1c163..8014d7c3 100644 --- a/Documentation/public-inbox-edit.pod +++ b/Documentation/public-inbox-edit.pod @@ -114,7 +114,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-extindex-format.pod b/Documentation/public-inbox-extindex-format.pod new file mode 100644 index 00000000..52eb8e85 --- /dev/null +++ b/Documentation/public-inbox-extindex-format.pod @@ -0,0 +1,110 @@ +% public-inbox developer manual + +=head1 NAME + +public-inbox extindex format description + +=head1 DESCRIPTION + +The extindex is an index-only evolution of the per-inbox +SQLite and Xapian indices used by L +and L. It exists to facilitate +searches across multiple inboxes as well as to reduce index +space when messages are cross-posted to several existing +inboxes. + +It transparently indexes messages across any combination of v1 and v2 +inboxes and data about inboxes themselves. + +=head1 DIRECTORY LAYOUT + +While inspired by v2, there is no git blob storage nor +C DB. + +Instead, there is an C (all caps) git repo which treats +every indexed v1 inbox or v2 epoch as a git alternate. + +As with v2 inboxes, it uses C and Xapian "shards" +for WWW and IMAP use. Several exclusive new tables are added +to deal with L and metadata. + +Unlike v1 and v2 inboxes, it is NOT designed to map to a NNTP +newsgroup. Thus it lacks C to enforce the +unique Message-ID requirement of NNTP. 
+ +=head2 INDEX OVERVIEW AND DEFINITIONS + + $SCHEMA_VERSION - DB schema version (for Xapian) + $SHARD - Integer starting with 0 based on parallelism + + foo/ # "foo" is the name of the index + - ei.lock # lock file to protect global state + - ALL.git # empty, alternates for inboxes + - ei$SCHEMA_VERSION/$SHARD # per-shard Xapian DB + - ei$SCHEMA_VERSION/over.sqlite3 # overview DB for WWW, IMAP + - ei$SCHEMA_VERSION/misc # misc Xapian DB + +File and directory names are intentionally different from +analogous v2 names to ensure extindex and v2 inboxes can +easily be distinguished from each other. + +=head2 XREF3 DEDUPLICATION + +Due to cross-posted messages being the norm in the large Linux kernel +development community and Xapian indices being the primary consumer of +storage, it makes sense to deduplicate indexing as much as possible. + +The internal storage format is based on the NNTP "Xref" tuple, +but with the addition of a third element: the git blob OID. +Thus the triple is expressed in string form as: + + $NEWSGROUP_NAME:$ARTICLE_NUM:$OID + +If no C is configured for an inbox, the C +of the inbox is used. + +This data is stored in the C table of over.sqlite3. + +=head2 misc XAPIAN DB + +In addition to the numeric Xapian shards for indexing messages, +there is a new, in-development Xapian index for storing data +about inboxes themselves and other non-message data. This +index allows us to speed up operations involving hundreds or +thousands of inboxes. + +=head1 BENEFITS + +In addition to providing cross-inbox search capabilities, it can +also replace per-inbox Xapian shards (but not per-inbox +over.sqlite3). This allows reduction in disk space, open file +handles, and associated memory use. + +=head1 CAVEATS + +Relocating v1 and v2 inboxes on the filesystem will require +extindex to be garbage-collected and/or reindexed. 
+ +Configuring and maintaining stable C names before any +messages are indexed from every inbox can avoid expensive +reindexing and rely exclusively on GC. + +=head1 LOCKING + +L locking exclusively locks the empty ei.lock file +for all non-atomic operations. + +=head1 THANKS + +Thanks to the Linux Foundation for sponsoring the development +and testing. + +=head1 COPYRIGHT + +Copyright 2020-2021 all contributors L + +License: AGPL-3.0+ L + +=head1 SEE ALSO + +L diff --git a/Documentation/public-inbox-httpd.pod b/Documentation/public-inbox-httpd.pod index f4e9945a..eef3dccd 100644 --- a/Documentation/public-inbox-httpd.pod +++ b/Documentation/public-inbox-httpd.pod @@ -29,7 +29,7 @@ and L =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-imapd.pod b/Documentation/public-inbox-imapd.pod index a5c996b8..99632871 100644 --- a/Documentation/public-inbox-imapd.pod +++ b/Documentation/public-inbox-imapd.pod @@ -81,7 +81,7 @@ L =head1 COPYRIGHT -Copyright 2020 all contributors L +Copyright 2020-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 0848e860..67219a23 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -162,6 +162,23 @@ See L for description and caveats. Available in public-inbox 1.6.0+. +=item --update-extindex=EXTINDEX, -E + +Update the given external index (L. +Either the configured section name (e.g. C) or a directory name +may be specified. + +Defaults to C if C<[extindex "all"]> is configured, +otherwise no external indices are updated. + +May be specified multiple times in rare cases where multiple +external indices are configured. + +=item --no-update-extindex + +Do not update the C external index by default. This negates +all uses of C<-E> / C<--update-extindex=> on the command-line. 
+ =back =head1 FILES @@ -291,10 +308,10 @@ and L =head1 COPYRIGHT -Copyright 2016-2020 all contributors L +Copyright 2016-2021 all contributors L License: AGPL-3.0+ L =head1 SEE ALSO -L, L +L, L, L diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod index f1ec05de..771775d3 100644 --- a/Documentation/public-inbox-init.pod +++ b/Documentation/public-inbox-init.pod @@ -142,7 +142,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-learn.pod b/Documentation/public-inbox-learn.pod index 498c5092..54bc7f50 100644 --- a/Documentation/public-inbox-learn.pod +++ b/Documentation/public-inbox-learn.pod @@ -82,7 +82,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-mda.pod b/Documentation/public-inbox-mda.pod index a5e353e5..b992ca24 100644 --- a/Documentation/public-inbox-mda.pod +++ b/Documentation/public-inbox-mda.pod @@ -78,7 +78,7 @@ and L =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-nntpd.pod b/Documentation/public-inbox-nntpd.pod index 18f83c9c..0e820602 100644 --- a/Documentation/public-inbox-nntpd.pod +++ b/Documentation/public-inbox-nntpd.pod @@ -81,7 +81,7 @@ L =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-overview.pod b/Documentation/public-inbox-overview.pod index 44989e6e..6a087896 100644 --- a/Documentation/public-inbox-overview.pod +++ b/Documentation/public-inbox-overview.pod @@ -124,6 +124,6 @@ and L =head1 COPYRIGHT -Copyright 2016-2020 all contributors L +Copyright 2016-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-purge.pod 
b/Documentation/public-inbox-purge.pod index a9479657..30227422 100644 --- a/Documentation/public-inbox-purge.pod +++ b/Documentation/public-inbox-purge.pod @@ -74,7 +74,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-tuning.pod b/Documentation/public-inbox-tuning.pod index f5a25676..e9702416 100644 --- a/Documentation/public-inbox-tuning.pod +++ b/Documentation/public-inbox-tuning.pod @@ -151,6 +151,6 @@ L, and other places =head1 COPYRIGHT -Copyright 2020 all contributors L +Copyright 2020-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-v1-format.pod b/Documentation/public-inbox-v1-format.pod index da19d2c9..db223fd9 100644 --- a/Documentation/public-inbox-v1-format.pod +++ b/Documentation/public-inbox-v1-format.pod @@ -175,7 +175,7 @@ This is up to the administrators. =head1 COPYRIGHT -Copyright 2013-2020 all contributors L +Copyright 2013-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod index 3c89f13e..e93d7fc7 100644 --- a/Documentation/public-inbox-v2-format.pod +++ b/Documentation/public-inbox-v2-format.pod @@ -235,7 +235,7 @@ and testing of the v2 format. 
=head1 COPYRIGHT -Copyright 2018-2020 all contributors L +Copyright 2018-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-watch.pod b/Documentation/public-inbox-watch.pod index 38686645..dd38351a 100644 --- a/Documentation/public-inbox-watch.pod +++ b/Documentation/public-inbox-watch.pod @@ -201,7 +201,7 @@ and L =head1 COPYRIGHT -Copyright 2016-2020 all contributors L +Copyright 2016-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod index 1bc1b1df..5f99c4ab 100644 --- a/Documentation/public-inbox-xcpdb.pod +++ b/Documentation/public-inbox-xcpdb.pod @@ -129,7 +129,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox.cgi.pod b/Documentation/public-inbox.cgi.pod index 8fd6f3e7..2fd256a3 100644 --- a/Documentation/public-inbox.cgi.pod +++ b/Documentation/public-inbox.cgi.pod @@ -25,7 +25,7 @@ and L =head1 COPYRIGHT -Copyright 2019-2020 all contributors L +Copyright 2019-2021 all contributors L License: AGPL-3.0+ L diff --git a/Documentation/standards.perl b/Documentation/standards.perl index 1c56830e..32003c91 100755 --- a/Documentation/standards.perl +++ b/Documentation/standards.perl @@ -1,6 +1,6 @@ #!/usr/bin/perl -w use strict; -# Copyright 2019-2020 all contributors +# Copyright 2019-2021 all contributors # License: AGPL-3.0+ print < 'Standard for Interchange of USENET Messages', 5536 => 'Netnews Article Format', 5537 => 'Netnews Architecture and Protocols', + 1738 => 'Uniform resource locators', + 5092 => 'IMAP URL scheme', + 5538 => 'NNTP URI schemes', 6048 => 'NNTP additions to LIST command (TODO)', 8054 => 'NNTP compression', 4642 => 'NNTP TLS', diff --git a/Documentation/txt2pre b/Documentation/txt2pre index 75d2d12b..75e4725c 100755 --- a/Documentation/txt2pre +++ b/Documentation/txt2pre @@ -1,5 +1,5 @@ #!/usr/bin/env perl -# 
Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # # Stupid script to make HTML from preformatted, utf-8 text versions, diff --git a/INSTALL b/INSTALL index 9f05c3f6..de871b1a 100644 --- a/INSTALL +++ b/INSTALL @@ -2,7 +2,7 @@ public-inbox (server-side) installation --------------------------------------- This is for folks who want to setup their own public-inbox instance. -Clients should use normal git-clone/git-fetch, or NNTP clients +Clients should use normal git-clone/git-fetch, IMAP or NNTP clients if they want to import mail into their personal inboxes. public-inbox is developed on Debian GNU/Linux systems and will @@ -24,7 +24,7 @@ functionality. The core tools are, of course: * Git (1.8.0+, 2.6+ for writing v2 inboxes) * Perl 5.10.1+ -* DBD::SQLite (needed for NNTP, message threading, and v2 inboxes) +* DBD::SQLite (needed for IMAP, NNTP, message threading, and v2 inboxes) To accept incoming mail into a public inbox, you'll likely want: @@ -70,17 +70,17 @@ Numerous optional modules are likely to be useful as well: - DBD::SQLite deb: libdbd-sqlite3-perl pkg: p5-DBD-SQLite rpm: perl-DBD-SQLite - (for v2, NNTP, or gzipped mboxes) + (for v2, IMAP, NNTP, or gzipped mboxes) - Search::Xapian deb: libsearch-xapian-perl pkg: p5-Search-Xapian rpm: perl-Search-Xapian - (HTTP search) + (HTTP and IMAP search) - Net::Server deb: libnet-server-perl pkg: pkg-Net-Server rpm: perl-Net-Server - (for HTTP/NNTP background daemons, + (for HTTP/IMAP/NNTP background daemons, not needed as systemd services or foreground servers) @@ -92,7 +92,14 @@ Numerous optional modules are likely to be useful as well: - Email::Address::XS deb: libemail-address-xs-perl pkg: pkg-Email-Address-XS (correct parsing of tricky email - addresses, phrases and comments) + addresses, phrases and comments, + required for IMAP) + +- Parse::RecDescent deb: libparse-recdescent-perl + pkg: p5-Parse-RecDescent + rpm: perl-ParseRecDescent + (optional, for 
public-inbox-imapd(1)) + - Plack::Middleware::ReverseProxy deb: libplack-middleware-reverseproxy-perl pkg: p5-Plack-Middleware-ReverseProxy @@ -129,7 +136,7 @@ above, so there is no need to explicitly install them: - Linux::Inotify2 deb: liblinux-inotify2-perl rpm: perl-Linux-Inotify2 - (for public-inbox-watch on Linux) + (for public-inbox-watch and -imapd on Linux) - IO::Compress (::Gzip) deb: perl-modules (or libio-compress-perl) pkg: perl5 @@ -196,5 +203,5 @@ the installation is complete. Copyright --------- -Copyright 2013-2020 all contributors +Copyright 2013-2021 all contributors License: AGPL-3.0+ diff --git a/MANIFEST b/MANIFEST index 8a47ccbf..f981f2ac 100644 --- a/MANIFEST +++ b/MANIFEST @@ -28,6 +28,7 @@ Documentation/public-inbox-config.pod Documentation/public-inbox-convert.pod Documentation/public-inbox-daemon.pod Documentation/public-inbox-edit.pod +Documentation/public-inbox-extindex-format.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-imapd.pod Documentation/public-inbox-index.pod @@ -62,6 +63,7 @@ ci/README ci/deps.perl ci/profiles.sh ci/run.sh +contrib/completion/lei-completion.bash contrib/css/216dark.css contrib/css/216light.css contrib/css/README @@ -101,12 +103,14 @@ examples/unsubscribe-psgi@.service examples/unsubscribe.milter examples/unsubscribe.psgi examples/varnish-4.vcl +lei.sh lib/PublicInbox/Address.pm lib/PublicInbox/AddressPP.pm lib/PublicInbox/Admin.pm lib/PublicInbox/AdminEdit.pm lib/PublicInbox/AltId.pm lib/PublicInbox/Cgit.pm +lib/PublicInbox/CmdIPC4.pm lib/PublicInbox/CompressNoop.pm lib/PublicInbox/Config.pm lib/PublicInbox/ConfigIter.pm @@ -122,6 +126,8 @@ lib/PublicInbox/Emergency.pm lib/PublicInbox/Eml.pm lib/PublicInbox/EmlContentFoo.pm lib/PublicInbox/ExtMsg.pm +lib/PublicInbox/ExtSearch.pm +lib/PublicInbox/ExtSearchIdx.pm lib/PublicInbox/FakeInotify.pm lib/PublicInbox/Feed.pm lib/PublicInbox/Filter/Base.pm @@ -130,6 +136,8 @@ lib/PublicInbox/Filter/Mirror.pm lib/PublicInbox/Filter/RubyLang.pm 
lib/PublicInbox/Filter/SubjectTag.pm lib/PublicInbox/Filter/Vger.pm +lib/PublicInbox/Gcf2.pm +lib/PublicInbox/Gcf2Client.pm lib/PublicInbox/GetlineBody.pm lib/PublicInbox/Git.pm lib/PublicInbox/GitAsyncCat.pm @@ -147,13 +155,24 @@ lib/PublicInbox/IMAPD.pm lib/PublicInbox/IMAPTracker.pm lib/PublicInbox/IMAPdeflate.pm lib/PublicInbox/IMAPsearchqp.pm +lib/PublicInbox/IPC.pm lib/PublicInbox/IdxStack.pm lib/PublicInbox/Import.pm lib/PublicInbox/In2Tie.pm lib/PublicInbox/Inbox.pm lib/PublicInbox/InboxIdle.pm lib/PublicInbox/InboxWritable.pm +lib/PublicInbox/Isearch.pm lib/PublicInbox/KQNotify.pm +lib/PublicInbox/LEI.pm +lib/PublicInbox/LeiDedupe.pm +lib/PublicInbox/LeiExternal.pm +lib/PublicInbox/LeiOverview.pm +lib/PublicInbox/LeiQuery.pm +lib/PublicInbox/LeiSearch.pm +lib/PublicInbox/LeiStore.pm +lib/PublicInbox/LeiToMail.pm +lib/PublicInbox/LeiXSearch.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm lib/PublicInbox/Lock.pm @@ -163,6 +182,9 @@ lib/PublicInbox/MIME.pm lib/PublicInbox/ManifestJsGz.pm lib/PublicInbox/Mbox.pm lib/PublicInbox/MboxGz.pm +lib/PublicInbox/MboxReader.pm +lib/PublicInbox/MiscIdx.pm +lib/PublicInbox/MiscSearch.pm lib/PublicInbox/MsgIter.pm lib/PublicInbox/MsgTime.pm lib/PublicInbox/Msgmap.pm @@ -171,6 +193,8 @@ lib/PublicInbox/NNTP.pm lib/PublicInbox/NNTPD.pm lib/PublicInbox/NNTPdeflate.pm lib/PublicInbox/NewsWWW.pm +lib/PublicInbox/OnDestroy.pm +lib/PublicInbox/OpPipe.pm lib/PublicInbox/Over.pm lib/PublicInbox/OverIdx.pm lib/PublicInbox/ProcessPipe.pm @@ -184,6 +208,7 @@ lib/PublicInbox/SearchIdxShard.pm lib/PublicInbox/SearchQuery.pm lib/PublicInbox/SearchThread.pm lib/PublicInbox/SearchView.pm +lib/PublicInbox/SharedKV.pm lib/PublicInbox/Sigfd.pm lib/PublicInbox/Smsg.pm lib/PublicInbox/SolverGit.pm @@ -214,13 +239,16 @@ lib/PublicInbox/WwwStatic.pm lib/PublicInbox/WwwStream.pm lib/PublicInbox/WwwText.pm lib/PublicInbox/Xapcmd.pm +lib/PublicInbox/gcf2_libgit2.h sa_config/Makefile sa_config/README 
sa_config/root/etc/spamassassin/public-inbox.pre sa_config/user/.spamassassin/user_prefs +script/lei script/public-inbox-compact script/public-inbox-convert script/public-inbox-edit +script/public-inbox-extindex script/public-inbox-httpd script/public-inbox-imapd script/public-inbox-index @@ -251,6 +279,7 @@ t/altid.t t/altid_v2.t t/cgi.t t/check-www-inbox.perl +t/cmd_ipc.t t/config.t t/config_limiter.t t/content_hash.t @@ -267,6 +296,7 @@ t/eml.t t/eml_content_disposition.t t/eml_content_type.t t/epoll.t +t/extsearch.t t/fail-bin/spamc t/fake_inotify.t t/feed.t @@ -277,6 +307,8 @@ t/filter_mirror.t t/filter_rubylang.t t/filter_subjecttag.t t/filter_vger.t +t/gcf2.t +t/gcf2_client.t t/git-http-backend.psgi t/git.fast-import-data t/git.t @@ -302,15 +334,26 @@ t/index-git-times.t t/indexlevels-mirror-v1.t t/indexlevels-mirror.t t/init.t +t/ipc.t t/iso-2202-jp.eml t/kqnotify.t +t/lei-oneshot.t +t/lei.t +t/lei_dedupe.t +t/lei_external.t +t/lei_overview.t +t/lei_store.t +t/lei_to_mail.t +t/lei_xsearch.t t/linkify.t t/main-bin/spamc +t/mbox_reader.t t/mda-mime.eml t/mda.t t/mda_filter_rubylang.t t/mid.t t/mime.t +t/miscsearch.t t/msg_iter-nested.eml t/msg_iter-order.eml t/msg_iter.t @@ -323,6 +366,7 @@ t/nntpd-v2.t t/nntpd.t t/nodatacow.t t/nulsubject.t +t/on_destroy.t t/over.t t/plack-2-txt-bodies.eml t/plack-attached-patch.eml @@ -348,6 +392,7 @@ t/run.perl t/search-amsg.eml t/search-thr-index.t t/search.t +t/shared_kv.t t/sigfd.t t/solve/0001-simple-mod.patch t/solve/0002-rename-with-modifications.patch @@ -381,12 +426,14 @@ t/x-unknown-alpine.eml t/xcpdb-reshard.t xt/cmp-msgstr.t xt/cmp-msgview.t +xt/create-many-inboxes.t xt/eml_check_limits.t xt/git-http-backend.t xt/git_async_cmp.t xt/httpd-async-stream.t xt/imapd-mbsync-oimap.t xt/imapd-validate.t +xt/lei-sigpipe.t xt/mem-imapd-tls.t xt/mem-msgview.t xt/msgtime_cmp.t diff --git a/Makefile.PL b/Makefile.PL index be3471a6..9d0a361a 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# 
Copyright (C) 2013-2020 all contributors +# Copyright (C) 2013-2021 all contributors # License: AGPL-3.0+ use strict; use ExtUtils::MakeMaker; @@ -31,9 +31,20 @@ my @syn = (@EXE_FILES, grep(m!^lib/.*\.pm$!, @manifest), @scripts); @syn = grep(!/SaPlugin/, @syn) if !eval { require Mail::SpamAssasin }; $v->{syn_files} = \@syn; $v->{my_syntax} = [map { "$_.syntax" } @syn]; -$v->{-m1} = [ map { (split('/'))[-1] } @EXE_FILES ]; +my @no_pod; +$v->{-m1} = [ map { + my $x = (split('/'))[-1]; + my $pod = "Documentation/$x.pod"; + if (-f $pod) { + $x; + } else { + warn "W: $pod missing\n"; + push @no_pod, $x; + (); + } + } @EXE_FILES ]; $v->{-m5} = [ qw(public-inbox-config public-inbox-v1-format - public-inbox-v2-format) ]; + public-inbox-v2-format public-inbox-extindex-format) ]; $v->{-m7} = [ qw(public-inbox-overview public-inbox-tuning) ]; $v->{-m8} = [ qw(public-inbox-daemon) ]; my @sections = (1, 5, 7, 8); @@ -109,6 +120,7 @@ my %man3 = map {; # semi-colon tells Perl this is a BLOCK (and not EXPR) $mod =~ s/\.\w+\z//; "lib/PublicInbox/$_" => "blib/man3/PublicInbox::$mod.\$(MAN3EXT)" } qw(Git.pm Import.pm WWW.pod SaPlugin/ListMirror.pod); +my $warn_no_pod = @no_pod ? "\n\t\@echo W: missing .pod: @no_pod\n" : ''; WriteMakefile( NAME => 'PublicInbox', # n.b. camel-case is not our choice @@ -172,6 +184,8 @@ $VARS -include Documentation/include.mk $TGTS +check-man ::$warn_no_pod + # syntax checks are currently GNU make only: %.syntax :: % @\$(PERL) -w -I lib -c \$< @@ -209,5 +223,22 @@ Makefile.PL : MANIFEST touch -r MANIFEST \$@ \$(PERLRUN) \$@ +# Install symlinks to ~/bin (which is hopefuly in PATH) which point to +# this source tree. 
+# prefix + bindir matches git.git Makefile: +prefix = \$(HOME) +bindir = \$(prefix)/bin +symlink-install : + mkdir -p \$(bindir) + lei=\$\$(realpath lei.sh) && cd \$(bindir) && \\ + for x in \$(EXE_FILES); do \\ + ln -sf "\$\$lei" \$\$(basename "\$\$x"); \\ + done + +update-copyrights : + \@case '\$(GNULIB_PATH)' in '') echo >&2 GNULIB_PATH unset; false;; esac + git ls-files | UPDATE_COPYRIGHT_HOLDER='all contributors' \\ + UPDATE_COPYRIGHT_USE_INTERVALS=2 \\ + xargs \$(GNULIB_PATH)/build-aux/update-copyright EOF } diff --git a/README b/README index ae428bcf..5f8a1a68 100644 --- a/README +++ b/README @@ -3,7 +3,7 @@ public-inbox - an "archives first" approach to mailing lists public-inbox implements the sharing of an email inbox via git to complement or replace traditional mailing lists. Readers may -read via NNTP, Atom feeds or HTML archives. +read via NNTP, IMAP, Atom feeds or HTML archives. public-inbox spawned around three main ideas: @@ -38,7 +38,7 @@ headers. List server admins are also burdened with delivery failures. public-inbox uses the "pull" model. Casual readers may -follow the list via NNTP, Atom feed or HTML archives. +follow the list via NNTP, IMAP, Atom feed or HTML archives. If a reader loses interest, they simply stop following. @@ -56,7 +56,7 @@ Features * stores email in git, readers may have a complete archive of the inbox -* Atom feed and NNTP allows casual readers to follow via feed reader +* Atom feed, IMAP, NNTP allows casual readers to follow via local tools * uses only well-documented and easy-to-implement data formats @@ -64,7 +64,7 @@ Try it out now, see https://try.public-inbox.org/ Requirements for reading: -* any software capable of NNTP or following Atom feeds +* any software capable of IMAP, NNTP or following Atom feeds Any basic web browser will do for the HTML archives. We primarily develop on w3m to maximize accessibility. 
@@ -94,6 +94,7 @@ AGPL source code is available via git: git clone https://public-inbox.org/public-inbox.git git clone https://repo.or.cz/public-inbox.git + torsocks git clone http://ou63pmih66umazou.onion/public-inbox.git torsocks git clone http://hjrcffqmbrq6wope.onion/public-inbox See below for contact info. @@ -113,15 +114,19 @@ subscription. This also makes it easier to rope in folks of tangentially related projects we depend on (e.g. git developers on git@vger.kernel.org). -The archives are readable via NNTP or HTTP: +The archives are readable via IMAP, NNTP or HTTP: - nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta + nntps://news.public-inbox.org/inbox.comp.mail.public-inbox.meta + imaps://news.public-inbox.org/inbox.comp.mail.public-inbox.meta.0 https://public-inbox.org/meta/ +AUTH=ANONYMOUS is supported for IMAP, but any username + password works + And as Tor hidden services: http://hjrcffqmbrq6wope.onion/meta/ nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta + imap://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta.0 You may also clone all messages via git: @@ -152,7 +157,7 @@ aims to preserve the focus on content, and not presentation. 
Copyright --------- -Copyright 2013-2020 all contributors +Copyright 2013-2021 all contributors License: AGPL-3.0+ This program is free software: you can redistribute it and/or modify diff --git a/ci/deps.perl b/ci/deps.perl index 4c273337..643e86c0 100755 --- a/ci/deps.perl +++ b/ci/deps.perl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # Helper script for installing/uninstalling packages for CI use # Intended for use on non-production chroots or VMs since it diff --git a/ci/profiles.sh b/ci/profiles.sh index c891494f..3cd8fa38 100755 --- a/ci/profiles.sh +++ b/ci/profiles.sh @@ -1,5 +1,5 @@ #!/bin/sh -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # Prints OS-specific package profiles to stdout (one per-newline) to use diff --git a/ci/run.sh b/ci/run.sh index 3f36d0d9..9613943b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,5 +1,5 @@ #!/bin/sh -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ set -e SUDO=${SUDO-'sudo'} PERL=${PERL-'perl'} MAKE=${MAKE-'make'} diff --git a/contrib/completion/lei-completion.bash b/contrib/completion/lei-completion.bash new file mode 100644 index 00000000..0b82b109 --- /dev/null +++ b/contrib/completion/lei-completion.bash @@ -0,0 +1,11 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# preliminary bash completion support for lei (Local Email Interface) +# Needs a lot of work, see `lei__complete' in lib/PublicInbox::LEI.pm +_lei() { + COMPREPLY=($(compgen -W "$(lei _complete ${COMP_WORDS[@]})" \ + -- "${COMP_WORDS[COMP_CWORD]}")) + return 0 +} +complete -o default -o bashdefault -F _lei lei diff --git a/examples/README.unsubscribe b/examples/README.unsubscribe index f84e9355..3e80e838 100644 --- a/examples/README.unsubscribe +++ b/examples/README.unsubscribe @@ -36,5 +36,5 @@ in 
/etc/postfix/main.cf: # This is not needed for mlmmj since mlmmj uses SMTP: # non_smtpd_milters = local:/var/spool/postfix/unsubscribe/unsubscribe.sock -Copyright (C) 2016-2020 all contributors +Copyright (C) 2016-2021 all contributors License: AGPL-3.0+ diff --git a/examples/cgit-commit-filter.lua b/examples/cgit-commit-filter.lua index 73af9948..8f9d3eb5 100644 --- a/examples/cgit-commit-filter.lua +++ b/examples/cgit-commit-filter.lua @@ -1,4 +1,4 @@ --- Copyright (C) 2015-2020 all contributors +-- Copyright (C) 2015-2021 all contributors -- License: GPLv2 or later -- This commit filter maps a subject line to a search URL of a public-inbox -- disclaimer: written by someone who does not know Lua. diff --git a/examples/cgit-wwwhighlight-filter.lua b/examples/cgit-wwwhighlight-filter.lua index 708e8696..a622e9f8 100644 --- a/examples/cgit-wwwhighlight-filter.lua +++ b/examples/cgit-wwwhighlight-filter.lua @@ -1,4 +1,4 @@ --- Copyright (C) 2019-2020 all contributors +-- Copyright (C) 2019-2021 all contributors -- License: GPL-2.0+ -- -- This filter accesses the PublicInbox::WwwHighlight PSGI endpoint diff --git a/examples/cgit.psgi b/examples/cgit.psgi index 7ad38e28..876171b9 100644 --- a/examples/cgit.psgi +++ b/examples/cgit.psgi @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: GPL-3.0+ # # PublicInbox::Cgit may be used independently of WWW. 
@@ -14,8 +14,8 @@ use warnings; use Plack::Builder; use PublicInbox::Cgit; use PublicInbox::Config; -my $pi_config = PublicInbox::Config->new; -my $cgit = PublicInbox::Cgit->new($pi_config); +my $pi_cfg = PublicInbox::Config->new; +my $cgit = PublicInbox::Cgit->new($pi_cfg); builder { eval { enable 'ReverseProxy' }; diff --git a/examples/highlight.psgi b/examples/highlight.psgi index 23ec7861..d0f0be41 100644 --- a/examples/highlight.psgi +++ b/examples/highlight.psgi @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # # Usage: plackup [OPTIONS] /path/to/this/file diff --git a/examples/newswww.psgi b/examples/newswww.psgi index 52ad7043..44462dd3 100644 --- a/examples/newswww.psgi +++ b/examples/newswww.psgi @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: GPL-3.0+ # # NewsWWW may be used independently of WWW. This can be useful diff --git a/examples/public-inbox.psgi b/examples/public-inbox.psgi index 3537be2c..e017b2fb 100644 --- a/examples/public-inbox.psgi +++ b/examples/public-inbox.psgi @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: GPL-3.0+ # Note: this is part of our test suite, update t/plack.t if this changes # Usage: plackup [OPTIONS] /path/to/this/file diff --git a/examples/unsubscribe.milter b/examples/unsubscribe.milter index 23229511..7b126e30 100644 --- a/examples/unsubscribe.milter +++ b/examples/unsubscribe.milter @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ use strict; use warnings; diff --git a/examples/unsubscribe.psgi b/examples/unsubscribe.psgi index 7b97e253..c804b7d0 100644 --- a/examples/unsubscribe.psgi +++ b/examples/unsubscribe.psgi @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# 
Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: GPL-3.0+ # This should not require any other PublicInbox code, but may use # PublicInbox::Config if ~/.public-inbox/config exists or diff --git a/lei.sh b/lei.sh new file mode 100755 index 00000000..f1510a73 --- /dev/null +++ b/lei.sh @@ -0,0 +1,7 @@ +#!/bin/sh -e +# symlink this file to a directory in PATH to run lei (or anything in script/*) +# without needing perms to install globally. Used by "make symlink-install" +p=$(realpath "$0" || readlink "$0") # neither is POSIX, but common +p=$(dirname "$p") c=$(basename "$0") # both are POSIX +exec ${PERL-perl} -w -I"$p"/lib "$p"/script/"${c%.sh}" "$@" +: this script is too short to copyright diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm index f413c2f6..2c9c4395 100644 --- a/lib/PublicInbox/Address.pm +++ b/lib/PublicInbox/Address.pm @@ -1,8 +1,10 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ package PublicInbox::Address; use strict; -use warnings; +use v5.10.1; +use parent 'Exporter'; +our @EXPORT_OK = qw(pairs); sub xs_emails { grep { defined } map { $_->address() } parse_email_addresses($_[0]) @@ -17,17 +19,25 @@ sub xs_names { } parse_email_addresses($_[0]); } +sub xs_pairs { # for JMAP, RFC 8621 section 4.1.2.3 + [ map { # LHS (name) may be undef + [ $_->phrase // $_->comment, $_->address ] + } parse_email_addresses($_[0]) ]; +} + eval { require Email::Address::XS; Email::Address::XS->import(qw(parse_email_addresses)); *emails = \&xs_emails; *names = \&xs_names; + *pairs = \&xs_pairs; }; if ($@) { require PublicInbox::AddressPP; *emails = \&PublicInbox::AddressPP::emails; *names = \&PublicInbox::AddressPP::names; + *pairs = \&PublicInbox::AddressPP::pairs; } 1; diff --git a/lib/PublicInbox/AddressPP.pm b/lib/PublicInbox/AddressPP.pm index 74a82843..6a3ae4fe 100644 --- a/lib/PublicInbox/AddressPP.pm +++ 
b/lib/PublicInbox/AddressPP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ package PublicInbox::AddressPP; use strict; @@ -13,6 +13,7 @@ sub emails { } sub names { + # split by address and post-address comment my @p = split(/]+)\@[\w\.\-]+>?\s*(\(.*?\))?(?:,\s*|\z)/, $_[0]); my @ret; @@ -35,4 +36,24 @@ sub names { @ret; } +sub pairs { # for JMAP, RFC 8621 section 4.1.2.3 + my ($s) = @_; + [ map { + my $addr = $_; + if ($s =~ s/\A\s*(.*?)\s*<\Q$addr\E>\s*(.*?)\s*(?:,|\z)// || + $s =~ s/\A\s*(.*?)\s*\Q$addr\E\s*(.*?)\s*(?:,|\z)//) { + my ($phrase, $comment) = ($1, $2); + $phrase =~ tr/\r\n\t / /s; + $phrase =~ s/\A['"\s]*//; + $phrase =~ s/['"\s]*\z//; + $phrase =~ s/\s*<*\s*\z//; + $phrase = undef if $phrase !~ /\S/; + $comment = ($comment =~ /\((.*?)\)/) ? $1 : undef; + [ $phrase // $comment, $addr ] + } else { + (); + } + } emails($s) ]; +} + 1; diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index fb88e621..f96397ea 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # common stuff for administrative command-line tools @@ -6,15 +6,15 @@ package PublicInbox::Admin; use strict; use parent qw(Exporter); -use Cwd qw(abs_path); -use POSIX (); -our @EXPORT_OK = qw(resolve_repo_dir setup_signals); +our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); +*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { my ($cb, $arg) = @_; # optional + require POSIX; # we call exit() here instead of _exit() so DESTROY methods # get called (e.g. File::Temp::Dir and PublicInbox::Msgmap) @@ -27,21 +27,34 @@ sub setup_signals { }; } -sub resolve_repo_dir { +sub resolve_inboxdir { my ($cd, $ver) = @_; - my $prefix = defined $cd ? 
$cd : './'; - if (-d $prefix && -f "$prefix/inbox.lock") { # v2 - $$ver = 2 if $ver; - return abs_path($prefix); + my $try = $cd // '.'; + my $root_dev_ino; + while (1) { # favor v2, first + if (-f "$try/inbox.lock") { + $$ver = 2 if $ver; + return rel2abs_collapsed($try); + } elsif (-d $try) { + my @try = stat _; + $root_dev_ino //= do { + my @root = stat('/') or die "stat /: $!\n"; + "$root[0]\0$root[1]"; + }; + last if "$try[0]\0$try[1]" eq $root_dev_ino; + $try .= '/..'; # continue, cd up + } else { + die "`$try' is not a directory\n"; + } } + # try v1 bare git dirs my $cmd = [ qw(git rev-parse --git-dir) ]; my $fh = popen_rd($cmd, undef, {-C => $cd}); my $dir = do { local $/; <$fh> }; - close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n"; + close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n"; chomp $dir; $$ver = 1 if $ver; - return abs_path($cd) if ($dir eq '.' && defined $cd); - abs_path($dir); + rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir); } # for unconfigured inboxes @@ -78,8 +91,8 @@ sub unconfigured_ibx ($$) { name => $name, address => [ "$name\@example.com" ], inboxdir => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, + # consumers (-convert) warn on this: + -unconfigured => 1, }); } @@ -95,40 +108,53 @@ sub resolve_inboxes ($;$$) { } my $min_ver = $opt->{-min_inbox_version} || 0; + # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames, + # pathnames are not unique due to symlinks and bind mounts my (@old, @ibxs); - my %dir2ibx; - if ($cfg) { + if ($opt->{all}) { $cfg->each_inbox(sub { my ($ibx) = @_; - my $path = abs_path($ibx->{inboxdir}); - if (defined($path)) { - $dir2ibx{$path} = $ibx; + if (-e $ibx->{inboxdir}) { + push(@ibxs, $ibx) if $ibx->version >= $min_ver; } else { - warn <{name} $ibx->{inboxdir}: $! 
-EOF + warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); - } - if ($opt->{all}) { - my @all = values %dir2ibx; - @all = grep { $_->version >= $min_ver } @all; - push @ibxs, @all; } else { # directories specified on the command-line - my $i = 0; my @dirs = @$argv; - push @dirs, '.' unless @dirs; - foreach (@dirs) { - my $v; - my $dir = resolve_repo_dir($_, \$v); - if ($v < $min_ver) { + push @dirs, '.' if !@dirs && $opt->{-use_cwd}; + my %s2i; # "st_dev\0st_ino" => array index + for (my $i = 0; $i <= $#dirs; $i++) { + my $dir = $dirs[$i]; + my @st = stat($dir) or die "stat($dir): $!\n"; + $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver)); + if ($ver >= $min_ver) { + $s2i{"$st[0]\0$st[1]"} //= $i; + } else { push @old, $dir; - next; } - my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); - $i++; - push @ibxs, $ibx; } + my $done = \'done'; + eval { + $cfg->each_inbox(sub { + my ($ibx) = @_; + return if $ibx->version < $min_ver; + my $dir = $ibx->{inboxdir}; + if (my @s = stat $dir) { + my $i = delete($s2i{"$s[0]\0$s[1]"}) + // return; + $ibxs[$i] = $ibx; + die $done if !keys(%s2i); + } else { + warn "W: $ibx->{name} $dir: $!\n"; + } + }); + }; + die $@ if $@ && $@ ne $done; + for my $i (sort { $a <=> $b } values %s2i) { + $ibxs[$i] = unconfigured_ibx($dirs[$i], $i); + } + @ibxs = grep { defined } @ibxs; # duplicates are undef } if (@old) { die "-V$min_ver inboxes not supported by $0\n\t", @@ -208,12 +234,20 @@ sub index_terminate { sub index_inbox { my ($ibx, $im, $opt) = @_; + require PublicInbox::InboxWritable; my $jobs = delete $opt->{jobs} if $opt; if (my $pr = $opt->{-progress}) { $pr->("indexing $ibx->{inboxdir} ...\n"); } local %SIG = %SIG; setup_signals(\&index_terminate, $ibx); + my $warn_cb = $SIG{__WARN__} // \&CORE::warn; + my $idx = { current_info => $ibx->{inboxdir} }; + my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore'); + local $SIG{__WARN__} = sub { + return if $warn_ignore->(@_); + $warn_cb->($idx->{current_info}, ': ', @_); 
+ }; if (ref($ibx) && $ibx->version == 2) { eval { require PublicInbox::V2Writable }; die "v2 requirements not met: $@\n" if $@; @@ -225,21 +259,19 @@ sub index_inbox { } else { my $n = $v2w->{shards}; if ($jobs < ($n + 1) && !$opt->{reshard}) { - warn -"Unable to respect --jobs=$jobs on index, inbox was created with $n shards\n"; + warn <($v2w->{current_info}, ': ', @_); - }; - $v2w->index_sync($opt); + $idx = $v2w; } else { require PublicInbox::SearchIdx; - my $s = PublicInbox::SearchIdx->new($ibx, 1); - $s->index_sync($opt); + $idx = PublicInbox::SearchIdx->new($ibx, 1); } + $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } sub progress_prepare ($) { diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm index 4448dcc2..2f6707d8 100644 --- a/lib/PublicInbox/AdminEdit.pm +++ b/lib/PublicInbox/AdminEdit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # common stuff between -edit, -purge (and maybe -learn in the future) diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index 6d16242a..80757ceb 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Used for giving serial numbers to messages. 
This can be tied to diff --git a/lib/PublicInbox/Cgit.pm b/lib/PublicInbox/Cgit.pm index fb0d0e60..f38e8b6b 100644 --- a/lib/PublicInbox/Cgit.pm +++ b/lib/PublicInbox/Cgit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # wrapper for cgit(1) and git-http-backend(1) for browsing and @@ -16,9 +16,9 @@ use PublicInbox::Qspawn; use PublicInbox::WwwStatic qw(r); sub locate_cgit ($) { - my ($pi_config) = @_; - my $cgit_bin = $pi_config->{'publicinbox.cgitbin'}; - my $cgit_data = $pi_config->{'publicinbox.cgitdata'}; + my ($pi_cfg) = @_; + my $cgit_bin = $pi_cfg->{'publicinbox.cgitbin'}; + my $cgit_data = $pi_cfg->{'publicinbox.cgitdata'}; # /var/www/htdocs/cgit is the default install path from cgit.git # /usr/{lib,share}/cgit is where Debian puts cgit @@ -51,28 +51,28 @@ sub locate_cgit ($) { } sub new { - my ($class, $pi_config) = @_; - my ($cgit_bin, $cgit_data) = locate_cgit($pi_config); + my ($class, $pi_cfg) = @_; + my ($cgit_bin, $cgit_data) = locate_cgit($pi_cfg); my $self = bless { cmd => [ $cgit_bin ], cgit_data => $cgit_data, - pi_config => $pi_config, + pi_cfg => $pi_cfg, }, $class; - $pi_config->fill_all; # fill in -code_repos mapped to inboxes + $pi_cfg->fill_all; # fill in -code_repos mapped to inboxes # some cgit repos may not be mapped to inboxes, so ensure those exist: - my $code_repos = $pi_config->{-code_repos}; - foreach my $k (keys %$pi_config) { + my $code_repos = $pi_cfg->{-code_repos}; + foreach my $k (keys %$pi_cfg) { $k =~ /\Acoderepo\.(.+)\.dir\z/ or next; - my $dir = $pi_config->{$k}; + my $dir = $pi_cfg->{$k}; $code_repos->{$1} ||= PublicInbox::Git->new($dir); } while (my ($nick, $repo) = each %$code_repos) { $self->{"\0$nick"} = $repo; } - my $cgit_static = $pi_config->{-cgit_static}; + my $cgit_static = $pi_cfg->{-cgit_static}; my $static = join('|', map { quotemeta $_ } keys %$cgit_static); $self->{static} = qr/\A($static)\z/; $self; @@ -120,7 +120,7 @@ sub 
call { my $rdr = input_prepare($env) or return r(500); my $qsp = PublicInbox::Qspawn->new($self->{cmd}, $cgi_env, $rdr); - my $limiter = $self->{pi_config}->limiter('-cgit'); + my $limiter = $self->{pi_cfg}->limiter('-cgit'); $qsp->psgi_return($env, $limiter, $parse_cgi_headers); } diff --git a/lib/PublicInbox/CmdIPC4.pm b/lib/PublicInbox/CmdIPC4.pm new file mode 100644 index 00000000..c244f6a1 --- /dev/null +++ b/lib/PublicInbox/CmdIPC4.pm @@ -0,0 +1,36 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# callers should use PublicInbox::CmdIPC4->can('send_cmd4') (or recv_cmd4) +# first choice for script/lei front-end and 2nd choice for lei backend +# libsocket-msghdr-perl is in Debian but many other distros as of 2021. +package PublicInbox::CmdIPC4; +use strict; +use v5.10.1; +use Socket qw(SOL_SOCKET SCM_RIGHTS); +BEGIN { eval { +require Socket::MsgHdr; # XS +no warnings 'once'; + +# 3 FDs per-sendmsg(2) + buffer +*send_cmd4 = sub ($$$$) { # (sock, fds, buf, flags) = @_; + my ($sock, $fds, undef, $flags) = @_; + my $mh = Socket::MsgHdr->new(buf => $_[2]); + $mh->cmsghdr(SOL_SOCKET, SCM_RIGHTS, + pack('i' x scalar(@$fds), @$fds)); + Socket::MsgHdr::sendmsg($sock, $mh, $flags); +}; + +*recv_cmd4 = sub ($$$) { + my ($s, undef, $len) = @_; # $_[1] = destination buffer + my $mh = Socket::MsgHdr->new(buflen => $len, controllen => 256); + my $r = Socket::MsgHdr::recvmsg($s, $mh, 0) // return ($_[1] = undef); + $_[1] = $mh->buf; + return () if $r == 0; + my (undef, undef, $data) = $mh->cmsghdr; + defined($data) ? 
unpack('i' x (length($data) / 4), $data) : (); +}; + +} } # /eval /BEGIN + +1; diff --git a/lib/PublicInbox/CompressNoop.pm b/lib/PublicInbox/CompressNoop.pm index fe73c2d1..e3301473 100644 --- a/lib/PublicInbox/CompressNoop.pm +++ b/lib/PublicInbox/CompressNoop.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Provide the same methods as Compress::Raw::Zlib::Deflate but diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index d57c361a..4f63bc93 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # # Used throughout the project for reading configuration @@ -33,6 +33,7 @@ sub new { $self->{-by_list_id} = {}; $self->{-by_name} = {}; $self->{-by_newsgroup} = {}; + $self->{-by_eidx_key} = {}; $self->{-no_obfuscate} = {}; $self->{-limiters} = {}; $self->{-code_repos} = {}; # nick => PublicInbox::Git object @@ -89,6 +90,14 @@ sub lookup_name ($$) { $self->{-by_name}->{$name} // _fill($self, "publicinbox.$name"); } +sub lookup_ei { + my ($self, $name) = @_; + $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name"); +} + +# special case for [extindex "all"] +sub ALL { lookup_ei($_[0], 'all') } + sub each_inbox { my ($self, $cb, @arg) = @_; # may auto-vivify if config file is non-existent: @@ -123,20 +132,16 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my %rv; - my (%section_seen, @section_order); + my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i); local $/ = $rs; - while (defined(my $line = <$fh>)) { - chomp $line; - my ($k, $v) = split($fs, $line, 2); - my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); - unless (defined $section_seen{$section}) { - $section_seen{$section} = 1; - push @section_order, $section; - } - - my $cur = $rv{$k}; - if (defined $cur) { + while (defined($line = 
<$fh>)) { # perf critical with giant configs + $i = index($line, $fs); + $k = substr($line, 0, $i); + $v = substr($line, $i + 1, -1); # chop off $fs + $section = substr($k, 0, rindex($k, '.')); + $seen{$section} //= push(@section_order, $section); + + if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { push @$cur, $v; } else { @@ -154,11 +159,10 @@ sub config_fh_parse ($$$) { sub git_config_dump { my ($file) = @_; return {} unless -e $file; - my @cmd = (qw/git config -z -l --includes/, "--file=$file"); - my $cmd = join(' ', @cmd); - my $fh = popen_rd(\@cmd); + my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; + my $fh = popen_rd($cmd); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "failed to close ($cmd) pipe: $?"; + close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } @@ -360,6 +364,16 @@ sub git_bool { } } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed { + require File::Spec; + my $p = File::Spec->rel2abs($_[-1]); + return $p if substr($p, -3, 3) ne '/..' 
&& index($p, '/../') < 0; + require Cwd; + Cwd::abs_path($p); +} + sub _fill { my ($self, $pfx) = @_; my $ibx = {}; @@ -382,10 +396,10 @@ EOF } } - # backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"}; - if (($ibx->{inboxdir} // '') =~ /\n/s) { - warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; + # "mainrepo" is backwards compatibility: + my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if (index($dir, "\n") >= 0) { + warn "E: `$dir' must not contain `\\n'\n"; return; } foreach my $k (qw(obfuscate)) { @@ -406,17 +420,14 @@ EOF } } - return unless defined($ibx->{inboxdir}); - my $name = $pfx; - $name =~ s/\Apublicinbox\.//; - + my $name = substr($pfx, length('publicinbox.')); if (!valid_inbox_name($name)) { warn "invalid inbox name: '$name'\n"; return; } $ibx->{name} = $name; - $ibx->{-pi_config} = $self; + $ibx->{-pi_cfg} = $self; $ibx = PublicInbox::Inbox->new($ibx); foreach (@{$ibx->{address}}) { my $lc_addr = lc($_); @@ -429,8 +440,31 @@ EOF $self->{-by_list_id}->{lc($list_id)} = $ibx; } } - if (my $ng = $ibx->{newsgroup}) { - $self->{-by_newsgroup}->{$ng} = $ibx; + if (defined(my $ngname = $ibx->{newsgroup})) { + if (ref($ngname)) { + delete $ibx->{newsgroup}; + warn 'multiple newsgroups not supported: '. + join(', ', @$ngname). "\n"; + # Newsgroup name needs to be compatible with RFC 3977 + # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. + # Leave out a few chars likely to cause problems or conflicts: + # '|', '<', '>', ';', '#', '$', '&', + } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! 
|| + $ngname eq '') { + delete $ibx->{newsgroup}; + warn "newsgroup name invalid: `$ngname'\n"; + } else { + # PublicInbox::NNTPD does stricter ->nntp_usable + # checks, keep this lean for startup speed + $self->{-by_newsgroup}->{$ngname} = $ibx; + } + } + unless (defined $ibx->{newsgroup}) { # for ->eidx_key + my $abs = rel2abs_collapsed($dir); + if ($abs ne $dir) { + warn "W: `$dir' canonicalized to `$abs'\n"; + $ibx->{inboxdir} = $abs; + } } $self->{-by_name}->{$name} = $ibx; if ($ibx->{obfuscate}) { @@ -453,8 +487,18 @@ EOF push @$repo_objs, $repo if $repo; } } + if (my $es = ALL($self)) { + require PublicInbox::Isearch; + $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es); + } + $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx; +} - $ibx +sub _fill_ei ($$) { + my ($self, $pfx) = @_; + require PublicInbox::ExtSearch; + my $d = $self->{"$pfx.topdir"}; + defined($d) && -d $d ? PublicInbox::ExtSearch->new($d) : undef; } sub urlmatch { @@ -476,4 +520,16 @@ sub urlmatch { } } +sub json { + state $json; + $json //= do { + for my $mod (qw(Cpanel::JSON::XS JSON::MaybeXS JSON JSON::PP)) { + eval "require $mod" or next; + # ->ascii encodes non-ASCII to "\uXXXX" + $json = $mod->new->ascii(1) and last; + } + $json; + }; +} + 1; diff --git a/lib/PublicInbox/ConfigIter.pm b/lib/PublicInbox/ConfigIter.pm index e6fa8172..24cb09bf 100644 --- a/lib/PublicInbox/ConfigIter.pm +++ b/lib/PublicInbox/ConfigIter.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Intended for PublicInbox::DS->EventLoop in read-only daemons diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index 1fe22955..838fdd6f 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Unstable internal API. 
diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index a02b3bb7..40994fd4 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -21,19 +21,19 @@ # (tmpio = [ GLOB, offset, [ length ] ]) package PublicInbox::DS; use strict; +use v5.10.1; +use parent qw(Exporter); use bytes; -use POSIX qw(WNOHANG); +use POSIX qw(WNOHANG sigprocmask SIG_SETMASK); use IO::Handle qw(); use Fcntl qw(SEEK_SET :DEFAULT O_APPEND); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); -use parent qw(Exporter); -our @EXPORT_OK = qw(now msg_more); -use 5.010_001; use Scalar::Util qw(blessed); use PublicInbox::Syscall qw(:epoll); use PublicInbox::Tmpfile; use Errno qw(EAGAIN EINVAL); use Carp qw(confess carp); +our @EXPORT_OK = qw(now msg_more dwaitpid); my $nextq; # queue for next_tick my $wait_pids; # list of [ pid, callback, callback_arg ] @@ -50,7 +50,6 @@ our ( $PostLoopCallback, # subref to call at the end of each loop, if defined (global) $LoopTimeout, # timeout of event loop in milliseconds - $DoneInit, # if we've done the one-time module init yet @Timers, # timers $in_loop, ); @@ -67,20 +66,18 @@ Reset all state =cut sub Reset { + $in_loop = undef; # first in case DESTROY callbacks use this %DescriptorMap = (); - $in_loop = $wait_pids = $later_queue = $reap_armed = undef; + $wait_pids = $later_queue = $reap_armed = undef; $EXPMAP = {}; $nextq = $ToClose = $later_timer = $exp_timer = undef; $LoopTimeout = -1; # no timeout by default @Timers = (); $PostLoopCallback = undef; - $DoneInit = 0; $_io = undef; # closes real $Epoll FD $Epoll = undef; # may call DSKQXS::DESTROY - - *EventLoop = *FirstTimeEventLoop; } =head2 C<< CLASS->SetLoopTimeout( $timeout ) >> @@ -91,9 +88,7 @@ A timeout of 0 (zero) means poll forever. A timeout of -1 means poll and return immediately. 
=cut -sub SetLoopTimeout { - return $LoopTimeout = $_[1] + 0; -} +sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 } =head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >> @@ -137,14 +132,13 @@ sub set_cloexec ($) { fcntl($_io, F_SETFD, $fl | FD_CLOEXEC); } +# caller sets return value to $Epoll sub _InitPoller { - return if $DoneInit; - $DoneInit = 1; - if (PublicInbox::Syscall::epoll_defined()) { - $Epoll = epoll_create(); - set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0); + my $fd = epoll_create(); + set_cloexec($fd) if (defined($fd) && $fd >= 0); + $fd; } else { my $cls; for (qw(DSKQXS DSPoll)) { @@ -152,9 +146,8 @@ sub _InitPoller last if eval "require $cls"; } $cls->import(qw(epoll_ctl epoll_wait)); - $Epoll = $cls->new; + $cls->new; } - *EventLoop = *EpollEventLoop; } =head2 C<< CLASS->EventLoop() >> @@ -163,13 +156,6 @@ Start processing IO events. In most daemon programs this never exits. See C below for how to exit the loop. =cut -sub FirstTimeEventLoop { - my $class = shift; - - _InitPoller(); - - EventLoop($class); -} sub now () { clock_gettime(CLOCK_MONOTONIC) } @@ -213,12 +199,17 @@ sub RunTimers { my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; # -1 is an infinite timeout, so prefer a real timeout - return $timeout if $LoopTimeout == -1; + ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout; +} + +sub sig_setmask { sigprocmask(SIG_SETMASK, @_) or die "sigprocmask: $!" 
} - # otherwise pick the lower of our regular timeout and time until - # the next timer - return $LoopTimeout if $LoopTimeout < $timeout; - return $timeout; +sub block_signals () { + my $oldset = POSIX::SigSet->new; + my $newset = POSIX::SigSet->new; + $newset->fillset or die "fillset: $!"; + sig_setmask($newset, $oldset); + $oldset; } # We can't use waitpid(-1) safely here since it can hit ``, system(), @@ -230,17 +221,22 @@ sub reap_pids { $reap_armed = undef; my $tmp = $wait_pids or return; $wait_pids = undef; + my $oldset = block_signals(); foreach my $ary (@$tmp) { my ($pid, $cb, $arg) = @$ary; my $ret = waitpid($pid, WNOHANG); if ($ret == 0) { push @$wait_pids, $ary; # autovivifies @$wait_pids - } elsif ($cb) { - eval { $cb->($arg, $pid) }; + } elsif ($ret == $pid) { + if ($cb) { + eval { $cb->($arg, $pid) }; + warn "E: dwaitpid($pid) in_loop: $@" if $@; + } + } else { + warn "waitpid($pid, WNOHANG) = $ret, \$!=$!, \$?=$?"; } } - # we may not be done, yet, and could've missed/masked a SIGCHLD: - $reap_armed //= requeue(\&reap_pids) if $wait_pids; + sig_setmask($oldset); } # reentrant SIGCHLD handler (since reap_pids is not reentrant) @@ -271,21 +267,21 @@ sub PostEventLoop () { $PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1; } -sub EpollEventLoop { +sub EventLoop { + $Epoll //= _InitPoller(); local $in_loop = 1; + my @events; do { - my @events; - my $i; my $timeout = RunTimers(); # get up to 1000 events - my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events); - for ($i=0; $i<$evcount; $i++) { + epoll_wait($Epoll, 1000, $timeout, \@events); + for my $fd (@events) { # it's possible epoll_wait returned many events, including some at the end # that ones in the front triggered unregister-interest actions. if we # can't find the %sock entry, it's because we're no longer interested # in that event. 
- $DescriptorMap{$events[$i]->[0]}->event_step; + $DescriptorMap{$fd}->event_step; } } while (PostEventLoop()); _run_later(); @@ -330,8 +326,7 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - _InitPoller(); - + $Epoll //= _InitPoller(); retry: if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { @@ -629,13 +624,23 @@ sub shutdn ($) { } } -# must be called with eval, PublicInbox::DS may not be loaded (see t/qspawn.t) -sub dwaitpid ($$$) { - die "Not in EventLoop\n" unless $in_loop; - push @$wait_pids, [ @_ ]; # [ $pid, $cb, $arg ] - - # We could've just missed our SIGCHLD, cover it, here: - enqueue_reap(); +sub dwaitpid ($;$$) { + my ($pid, $cb, $arg) = @_; + if ($in_loop) { + push @$wait_pids, [ $pid, $cb, $arg ]; + # We could've just missed our SIGCHLD, cover it, here: + enqueue_reap(); + } else { + my $ret = waitpid($pid, 0); + if ($ret == $pid) { + if ($cb) { + eval { $cb->($arg, $pid) }; + carp "E: dwaitpid($pid) !in_loop: $@" if $@; + } + } else { + carp "waitpid($pid, 0) = $ret, \$!=$!, \$?=$?"; + } + } } sub _run_later () { diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index d1d3fe60..acc31d9b 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # @@ -18,7 +18,7 @@ use Symbol qw(gensym); use IO::KQueue; use Errno qw(EAGAIN); use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLLET - EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL $SFD_NONBLOCK); + EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL SFD_NONBLOCK); our @EXPORT_OK = qw(epoll_ctl epoll_wait); sub EV_DISPATCH () { 0x0080 } @@ -57,7 +57,7 @@ sub signalfd { sub TIEHANDLE { # similar to signalfd() my ($class, $signo, $flags) = @_; my $self = $class->new; - $self->{timeout} = ($flags & $SFD_NONBLOCK) ? 
0 : -1; + $self->{timeout} = ($flags & SFD_NONBLOCK) ? 0 : -1; my $kq = $self->{kq}; $kq->EV_SET($_, EVFILT_SIGNAL, EV_ADD) for @$signo; $self; @@ -134,7 +134,7 @@ sub epoll_wait { } } # caller only cares for $events[$i]->[0] - scalar(@$events); + $_ = $_->[0] for @$events; } # kqueue is close-on-fork (not exec), so we must not close it diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 1d9b51d9..56a400c2 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # @@ -45,14 +45,13 @@ sub epoll_wait { my $fd = $pset[$i++]; my $revents = $pset[$i++] or next; delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; - push @$events, [ $fd ]; + push @$events, $fd; } my $nevents = scalar @$events; if ($n != $nevents) { warn "BUG? poll() returned $n, but got $nevents"; } } - $n; } 1; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index 5fdcba14..b5f97d81 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -1,7 +1,9 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ -# contains common daemon code for the httpd, imapd, and nntpd servers. -# This may be used for read-only IMAP server if we decide to implement it. +# +# Contains common daemon code for the httpd, imapd, and nntpd servers +# and designed for handling thousands of untrusted clients over slow +# and/or lossy connections. 
package PublicInbox::Daemon; use strict; use warnings; @@ -11,14 +13,14 @@ use IO::Socket; use POSIX qw(WNOHANG :signal_h); use Socket qw(IPPROTO_TCP SOL_SOCKET); sub SO_ACCEPTFILTER () { 0x1000 } -use Cwd qw/abs_path/; STDOUT->autoflush(1); STDERR->autoflush(1); use PublicInbox::DS qw(now); -use PublicInbox::Syscall qw($SFD_NONBLOCK); +use PublicInbox::Syscall qw(SFD_NONBLOCK); require PublicInbox::Listener; use PublicInbox::EOFpipe; use PublicInbox::Sigfd; +use PublicInbox::GitAsyncCat; my @CMD; my ($set_user, $oldset); my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); @@ -75,7 +77,7 @@ sub accept_tls_opt ($) { sub daemon_prepare ($) { my ($default_listen) = @_; my $listener_names = {}; # sockname => IO::Handle - $oldset = PublicInbox::Sigfd::block_signals(); + $oldset = PublicInbox::DS::block_signals(); @CMD = ($0, @ARGV); my ($prog) = ($CMD[0] =~ m!([^/]+)\z!g); my $help = <&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - $pid = fork; - die "could not fork: $!\n" unless defined $pid; + $pid = fork // die "fork: $!"; exit if $pid; } return unless defined $pid_file; @@ -368,14 +369,12 @@ sub inherit ($) { foreach my $fd (3..$end) { my $s = IO::Handle->new_from_fd($fd, 'r'); if (my $k = sockname($s)) { - if ($s->blocking) { - $s->blocking(0); - warn <<""; + my $prev_was_blocking = $s->blocking(0); + warn <<"" if $prev_was_blocking; Inherited socket (fd=$fd) is blocking, making it non-blocking. Set 'NonBlocking = true' in the systemd.service unit to avoid stalled processes when multiple service instances start. 
- } $listener_names->{$k} = $s; push @rv, $s; } else { @@ -422,11 +421,8 @@ sub upgrade { # $_[0] = signal name or number (unused) } sub kill_workers ($) { - my ($s) = @_; - - while (my ($pid, $id) = each %pids) { - kill $s, $pid; - } + my ($sig) = @_; + kill $sig, keys(%pids); } sub upgrade_aborted ($) { @@ -518,8 +514,8 @@ EOF CHLD => \&reap_children, }; my $sigfd = PublicInbox::Sigfd->new($sig, 0); - local %SIG = (%SIG, %$sig) if !$sigfd; - PublicInbox::Sigfd::sig_setmask($oldset) if !$sigfd; + local @SIG{keys %$sig} = values(%$sig) unless $sigfd; + PublicInbox::DS::sig_setmask($oldset) if !$sigfd; while (1) { # main loop my $n = scalar keys %pids; unless (@listeners) { @@ -535,12 +531,15 @@ EOF } my $want = $worker_processes - 1; if ($n <= $want) { - PublicInbox::Sigfd::block_signals() if !$sigfd; + PublicInbox::DS::block_signals() if !$sigfd; for my $i ($n..$want) { + my $seed = rand(0xffffffff); my $pid = fork; if (!defined $pid) { warn "failed to fork worker[$i]: $!\n"; } elsif ($pid == 0) { + srand($seed); + eval { Net::SSLeay::randomize() }; $set_user->() if $set_user; return $p0; # run normal work code } else { @@ -548,7 +547,7 @@ EOF $pids{$pid} = $i; } } - PublicInbox::Sigfd::sig_setmask($oldset) if !$sigfd; + PublicInbox::DS::sig_setmask($oldset) if !$sigfd; } if ($sigfd) { # Linux and IO::KQueue users: @@ -631,12 +630,12 @@ sub daemon_loop ($$$$) { # this calls epoll_create: PublicInbox::Listener->new($_, $tls_cb || $post_accept) } @listeners; - my $sigfd = PublicInbox::Sigfd->new($sig, $SFD_NONBLOCK); - local %SIG = (%SIG, %$sig) if !$sigfd; + my $sigfd = PublicInbox::Sigfd->new($sig, SFD_NONBLOCK); + local @SIG{keys %$sig} = values(%$sig) unless $sigfd; if (!$sigfd) { # wake up every second to accept signals if we don't # have signalfd or IO::KQueue: - PublicInbox::Sigfd::sig_setmask($oldset); + PublicInbox::DS::sig_setmask($oldset); PublicInbox::DS->SetLoopTimeout(1000); } PublicInbox::DS->EventLoop; @@ -648,6 +647,10 @@ sub run ($$$;$) { 
daemon_prepare($default); my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef; my $for_destroy = daemonize(); + + # localize GCF2C for tests: + local $PublicInbox::GitAsyncCat::GCF2C; + daemon_loop($refresh, $post_accept, $tlsd, $af_default); PublicInbox::DS->Reset; # ->DESTROY runs when $for_destroy goes out-of-scope diff --git a/lib/PublicInbox/DirIdle.pm b/lib/PublicInbox/DirIdle.pm index 458285e2..5437190d 100644 --- a/lib/PublicInbox/DirIdle.pm +++ b/lib/PublicInbox/DirIdle.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Used by public-inbox-watch for Maildir (and possibly MH in the future) diff --git a/lib/PublicInbox/DummyInbox.pm b/lib/PublicInbox/DummyInbox.pm index 69b0b683..c516eec4 100644 --- a/lib/PublicInbox/DummyInbox.pm +++ b/lib/PublicInbox/DummyInbox.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # # An EXAMINE-able, PublicInbox::Inbox-like object for IMAP. 
Some @@ -7,16 +7,16 @@ package PublicInbox::DummyInbox; use strict; -sub created_at { 0 } # Msgmap::created_at +sub uidvalidity { 0 } # Msgmap::created_at sub mm { shift } sub uid_range { [] } # Over::uid_range sub subscribe_unlock { undef }; no warnings 'once'; -*max = \&created_at; +*max = \&uidvalidity; *query_xover = \&uid_range; *over = \&mm; -*search = *unsubscribe_unlock = +*isrch = *search = *unsubscribe_unlock = *get_art = *description = *base_url = \&subscribe_unlock; 1; diff --git a/lib/PublicInbox/EOFpipe.pm b/lib/PublicInbox/EOFpipe.pm index 489caf82..e537e2aa 100644 --- a/lib/PublicInbox/EOFpipe.pm +++ b/lib/PublicInbox/EOFpipe.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ package PublicInbox::EOFpipe; diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm index b705e776..67f27bc7 100644 --- a/lib/PublicInbox/Emergency.pm +++ b/lib/PublicInbox/Emergency.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # Emergency Maildir delivery for MDA diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 4d3fffc0..462d51fc 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # # Lazy MIME parser, it still slurps the full message but keeps short diff --git a/lib/PublicInbox/EmlContentFoo.pm b/lib/PublicInbox/EmlContentFoo.pm index c163eaf5..80fc7364 100644 --- a/lib/PublicInbox/EmlContentFoo.pm +++ b/lib/PublicInbox/EmlContentFoo.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES # This library is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. 
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 03faf3a1..5c8bf561 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # Used by the web interface to link to messages outside of the our @@ -32,8 +32,8 @@ sub PARTIAL_MAX () { 100 } sub search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; - my $srch = $ibx->search or return; - my $opt = { limit => PARTIAL_MAX, mset => 2 }; + my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL + my $opt = { limit => PARTIAL_MAX, relevance => -1 }; my @try = ("m:$mid*"); my $chop = $mid; if ($chop =~ s/(\W+)(\w*)\z//) { @@ -76,7 +76,7 @@ sub search_partial ($$) { sub ext_msg_i { my ($other, $ctx) = @_; - return if $other->{name} eq $ctx->{-inbox}->{name} || !$other->base_url; + return if $other->{name} eq $ctx->{ibx}->{name} || !$other->base_url; my $mm = $other->mm or return; @@ -103,19 +103,48 @@ sub ext_msg_step { } } +sub ext_msg_ALL ($) { + my ($ctx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL or return; + my $by_eidx_key = $ctx->{www}->{pi_cfg}->{-by_eidx_key}; + my $cur_key = eval { $ctx->{ibx}->eidx_key } // + return partial_response($ctx); # $cur->{ibx} == $ALL + my %seen = ($cur_key => 1); + my ($id, $prev); + while (my $x = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($x->{num}); + for my $k (@$xr3) { + $k =~ s/:[0-9]+:$x->{blob}\z// or next; + next if $k eq $cur_key; + my $ibx = $by_eidx_key->{$k} // next; + my $url = $ibx->base_url or next; + push(@{$ctx->{found}}, $ibx) unless $seen{$k}++; + } + } + return exact($ctx) if $ctx->{found}; + + # fall back to partial MID matching + for my $ibxish ($ctx->{ibx}, $ALL) { + my $mids = search_partial($ibxish, $ctx->{mid}) or next; + push @{$ctx->{partial}}, [ $ibxish, $mids ]; + last if ($ctx->{n_partial} += 
scalar(@$mids)) >= PARTIAL_MAX; + } + partial_response($ctx); +} + sub ext_msg { my ($ctx) = @_; - sub { + ext_msg_ALL($ctx) // sub { $ctx->{-wcb} = $_[0]; # HTTP server write callback if ($ctx->{env}->{'pi-httpd.async'}) { require PublicInbox::ConfigIter; my $iter = PublicInbox::ConfigIter->new( - $ctx->{www}->{pi_config}, + $ctx->{www}->{pi_cfg}, \&ext_msg_step, $ctx); $iter->event_step; } else { - $ctx->{www}->{pi_config}->each_inbox(\&ext_msg_i, $ctx); + $ctx->{www}->{pi_cfg}->each_inbox(\&ext_msg_i, $ctx); finalize_exact($ctx); } }; @@ -141,7 +170,7 @@ sub finalize_exact { # fall back to partial MID matching my $mid = $ctx->{mid}; - my $cur = $ctx->{-inbox}; + my $cur = $ctx->{ibx}; my $mids = search_partial($cur, $mid); if ($mids) { $ctx->{n_partial} = scalar(@$mids); @@ -159,7 +188,7 @@ sub finalize_exact { finalize_partial($ctx); } -sub finalize_partial { +sub partial_response ($) { my ($ctx) = @_; my $mid = $ctx->{mid}; my $code = 404; @@ -172,7 +201,7 @@ sub finalize_partial { my $es = $n_partial == 1 ? '' : 'es'; $n_partial .= '+' if ($n_partial == PARTIAL_MAX); $s .= "\n$n_partial partial match$es found:\n\n"; - my $cur_name = $ctx->{-inbox}->{name}; + my $cur_name = $ctx->{ibx}->{name}; foreach my $pair (@{$ctx->{partial}}) { my ($ibx, $res) = @$pair; my $env = $ctx->{env} if $ibx->{name} eq $cur_name; @@ -192,9 +221,11 @@ sub finalize_partial { $ctx->{-html_tip} = $s .= ''; $ctx->{-title_html} = $title; $ctx->{-upfx} = '../'; - $ctx->{-wcb}->(html_oneshot($ctx, $code)); + html_oneshot($ctx, $code); } +sub finalize_partial ($) { $_[0]->{-wcb}->(partial_response($_[0])) } + sub ext_urls { my ($ctx, $mid, $href, $html) = @_; diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm new file mode 100644 index 00000000..8ba4d396 --- /dev/null +++ b/lib/PublicInbox/ExtSearch.pm @@ -0,0 +1,123 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# Read-only external (detached) index for cross inbox search. 
+# This is a read-only counterpart to PublicInbox::ExtSearchIdx +# and behaves like PublicInbox::Inbox AND PublicInbox::Search +package PublicInbox::ExtSearch; +use strict; +use v5.10.1; +use PublicInbox::Over; +use PublicInbox::Inbox; +use PublicInbox::MiscSearch; +use DBI qw(:sql_types); # SQL_BLOB + +# for ->reopen, ->mset, ->mset_to_artnums +use parent qw(PublicInbox::Search); + +sub new { + my ($class, $topdir) = @_; + bless { + topdir => $topdir, + # xpfx => 'ei15' + xpfx => "$topdir/ei".PublicInbox::Search::SCHEMA_VERSION + }, $class; +} + +sub misc { + my ($self) = @_; + $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); +} + +# same as per-inbox ->over, for now... +sub over { + my ($self) = @_; + $self->{over} //= PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); +} + +sub git { + my ($self) = @_; + $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); +} + +# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table +sub nntp_xref_for { # NNTP only + my ($self, $xibx, $xsmsg) = @_; + my $dbh = over($self)->dbh; + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($xibx->{newsgroup}); + my $xibx_id = $sth->fetchrow_array // do { + warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n"; + return; + }; + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? 
LIMIT 1 + + $sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB); + + # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2 + $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num}); + $sth->bind_param(3, $xibx_id); + $sth->execute; + my $docid = $sth->fetchrow_array // do { + warn <{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}" +EOF + return; + }; + + # LIMIT is number of newsgroups on server: + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ? + + $sth->execute($docid, $xibx_id); + my $rows = $sth->fetchall_arrayref; + + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 + + my %xref = map { + my ($ibx_id, $xnum) = @$_; + + $eidx_key_sth->execute($ibx_id); + my $eidx_key = $eidx_key_sth->fetchrow_array; + + # only include if there's a newsgroup name + $eidx_key && index($eidx_key, '/') >= 0 ? + () : ($eidx_key => $xnum) + } @$rows; + $xref{$xibx->{newsgroup}} = $xsmsg->{num}; + \%xref; +} + +sub mm { undef } + +sub altid_map { {} } + +sub description { + my ($self) = @_; + ($self->{description} //= + PublicInbox::Inbox::cat_desc("$self->{topdir}/description")) // + '$EXTINDEX_DIR/description missing'; +} + +sub cloneurl { [] } # TODO + +sub base_url { 'https://example.com/TODO/' } +sub nntp_url { [] } + +no warnings 'once'; +*smsg_eml = \&PublicInbox::Inbox::smsg_eml; +*smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; +*msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; +*modified = \&PublicInbox::Inbox::modified; +*recent = \&PublicInbox::Inbox::recent; + +*max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef +*isrch = *search = \&PublicInbox::Search::reopen; + +1; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm new file mode 100644 index 00000000..c782a62a --- /dev/null +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -0,0 +1,1133 @@ +# Copyright (C) 2020-2021 all contributors +# License: 
AGPL-3.0+ + +# Detached/external index cross inbox search indexing support +# read-write counterpart to PublicInbox::ExtSearch +# +# It's based on the same ideas as public-inbox-v2-format(5) using +# over.sqlite3 for dedupe and sharded Xapian. msgmap.sqlite3 is +# missing, so there is no Message-ID conflict resolution, meaning +# no NNTP support for now. +# +# v2 has a 1:1 mapping of index:inbox or msgmap for NNTP support. +# This is intended to be an M:N index:inbox mapping, but it'll likely +# be 1:N in common practice (M==1) + +package PublicInbox::ExtSearchIdx; +use strict; +use v5.10.1; +use parent qw(PublicInbox::ExtSearch PublicInbox::Lock); +use Carp qw(croak carp); +use Sys::Hostname qw(hostname); +use POSIX qw(strftime); +use PublicInbox::Search; +use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); +use PublicInbox::OverIdx; +use PublicInbox::MiscIdx; +use PublicInbox::MID qw(mids); +use PublicInbox::V2Writable; +use PublicInbox::InboxWritable; +use PublicInbox::ContentHash qw(content_hash); +use PublicInbox::Eml; +use PublicInbox::DS qw(now); +use DBI qw(:sql_types); # SQL_BLOB + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not yet supported\n"; + my $self = bless { + xpfx => "$dir/ei".PublicInbox::Search::SCHEMA_VERSION, + topdir => $dir, + creat => $opt->{creat}, + ibx_map => {}, # (newsgroup//inboxdir) => $ibx + ibx_list => [], + indexlevel => $l, + transact_bytes => 0, + total_bytes => 0, + current_info => '', + parallel => 1, + lock_path => "$dir/ei.lock", + }, __PACKAGE__; + $self->{shards} = $self->count_shards || nproc_shards($opt->{creat}); + my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); + $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{oidx} = $oidx; + $self +} + +sub attach_inbox { + my ($self, $ibx) = @_; + 
$self->{ibx_map}->{$ibx->eidx_key} //= do { + push @{$self->{ibx_list}}, $ibx; + $ibx; + } +} + +sub _ibx_attach { # each_inbox callback + my ($ibx, $self) = @_; + attach_inbox($self, $ibx); +} + +sub attach_config { + my ($self, $cfg) = @_; + $self->{cfg} = $cfg; + $cfg->each_inbox(\&_ibx_attach, $self); +} + +sub check_batch_limit ($) { + my ($req) = @_; + my $self = $req->{self}; + my $new_smsg = $req->{new_smsg}; + my $n = $self->{transact_bytes} += $new_smsg->{bytes}; + + # set flag for PublicInbox::V2Writable::index_todo: + ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; +} + +sub do_xpost ($$) { + my ($req, $smsg) = @_; + my $self = $req->{self}; + my $docid = $smsg->{num}; + my $idx = $self->idx_shard($docid); + my $oid = $req->{oid}; + my $xibx = $req->{ibx}; + my $eml = $req->{eml}; + my $eidx_key = $xibx->eidx_key; + if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message + my $xnum = $req->{xnum}; + $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); + $idx->ipc_do('add_eidx_info', $docid, $eidx_key, $eml); + check_batch_limit($req); + } else { # 'd' + my $rm_eidx_info; + my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key, + \$rm_eidx_info); + if ($nr == 0) { + $self->{oidx}->eidxq_del($docid); + $idx->ipc_do('xdb_remove', $docid); + } elsif ($rm_eidx_info) { + $idx->ipc_do('remove_eidx_info', + $docid, $eidx_key, $eml); + $self->{oidx}->eidxq_add($docid); # yes, add + } + } +} + +# called by V2Writable::sync_prepare +sub artnum_max { $_[0]->{oidx}->eidx_max } + +sub index_unseen ($) { + my ($req) = @_; + my $new_smsg = $req->{new_smsg} or die 'BUG: {new_smsg} unset'; + my $eml = delete $req->{eml}; + $new_smsg->populate($eml, $req); + my $self = $req->{self}; + my $docid = $self->{oidx}->adj_counter('eidx_docid', '+'); + $new_smsg->{num} = $docid; + my $idx = $self->idx_shard($docid); + $self->{oidx}->add_overview($eml, $new_smsg); + my $oid = $new_smsg->{blob}; + my $ibx = delete $req->{ibx} or die 'BUG: 
{ibx} unset'; + $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key); + $idx->index_eml($eml, $new_smsg, $ibx->eidx_key); + check_batch_limit($req); +} + +sub do_finalize ($) { + my ($req) = @_; + if (my $indexed = $req->{indexed}) { + do_xpost($req, $_) for @$indexed; + } elsif (exists $req->{new_smsg}) { # totally unseen messsage + index_unseen($req); + } else { + # `d' message was already unindexed in the v1/v2 inboxes, + # so it's too noisy to warn, here. + } + # cur_cmt may be undef for unindex_oid, set by V2Writable::index_todo + if (defined(my $cur_cmt = $req->{cur_cmt})) { + ${$req->{latest_cmt}} = $cur_cmt; + } +} + +sub do_step ($) { # main iterator for adding messages to the index + my ($req) = @_; + my $self = $req->{self} // die 'BUG: {self} missing'; + while (1) { + if (my $next_arg = $req->{next_arg}) { + if (my $smsg = $self->{oidx}->next_by_mid(@$next_arg)) { + $req->{cur_smsg} = $smsg; + $self->git->cat_async($smsg->{blob}, + \&ck_existing, $req); + return; # ck_existing calls do_step + } + delete $req->{cur_smsg}; + delete $req->{next_arg}; + } + my $mid = shift(@{$req->{mids}}); + last unless defined $mid; + my ($id, $prev); + $req->{next_arg} = [ $mid, \$id, \$prev ]; + # loop again + } + do_finalize($req); +} + +sub _blob_missing ($) { # called when req->{cur_smsg}->{blob} is bad + my ($req) = @_; + my $smsg = $req->{cur_smsg} or die 'BUG: {cur_smsg} missing'; + my $self = $req->{self}; + my $xref3 = $self->{oidx}->get_xref3($smsg->{num}); + my @keep = grep(!/:$smsg->{blob}\z/, @$xref3); + if (@keep) { + $keep[0] =~ /:([a-f0-9]{40,}+)\z/ or + die "BUG: xref $keep[0] has no OID"; + my $oidhex = $1; + $self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob}); + my $upd = $self->{oidx}->update_blob($smsg, $oidhex); + my $saved = $self->{oidx}->get_art($smsg->{num}); + } else { + $self->{oidx}->delete_by_num($smsg->{num}); + } +} + +sub ck_existing { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + my $smsg = 
$req->{cur_smsg} or die 'BUG: {cur_smsg} missing'; + if ($type eq 'missing') { + _blob_missing($req); + } elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) { + my $self = $req->{self} // die 'BUG: {self} missing'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $cur = PublicInbox::Eml->new($bref); + if (content_hash($cur) eq $req->{chash}) { + push @{$req->{indexed}}, $smsg; # for do_xpost + } # else { index_unseen later } + } + do_step($req); +} + +# is the messages visible in the inbox currently being indexed? +# return the number if so +sub cur_ibx_xnum ($$) { + my ($req, $bref) = @_; + my $ibx = $req->{ibx} or die 'BUG: current {ibx} missing'; + + $req->{eml} = PublicInbox::Eml->new($bref); + $req->{chash} = content_hash($req->{eml}); + $req->{mids} = mids($req->{eml}); + my @q = @{$req->{mids}}; # copy + while (defined(my $mid = shift @q)) { + my ($id, $prev); + while (my $x = $ibx->over->next_by_mid($mid, \$id, \$prev)) { + return $x->{num} if $x->{blob} eq $req->{oid}; + } + } + undef; +} + +sub index_oid { # git->cat_async callback for 'm' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $new_smsg = $req->{new_smsg} = bless { + blob => $oid, + }, 'PublicInbox::Smsg'; + $new_smsg->set_bytes($$bref, $size); + defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return; + ++${$req->{nr}}; + do_step($req); +} + +sub unindex_oid { # git->cat_async callback for 'd' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + return if defined(cur_ibx_xnum($req, $bref)); # was re-added + do_step($req); +} + +# overrides V2Writable::last_commits, called by sync_ranges via sync_prepare +sub last_commits { + my ($self, $sync) = @_; + my $heads = []; + my $ekey = 
$sync->{ibx}->eidx_key; + my $uv = $sync->{ibx}->uidvalidity; + for my $i (0..$sync->{epoch_max}) { + $heads->[$i] = $self->{oidx}->eidx_meta("lc-v2:$ekey//$uv;$i"); + } + $heads; +} + +sub _ibx_index_reject ($) { + my ($ibx) = @_; + $ibx->mm // return 'unindexed, no msgmap.sqlite3'; + $ibx->uidvalidity // return 'no UIDVALIDITY'; + $ibx->over // return 'unindexed, no over.sqlite3'; + undef; +} + +sub _sync_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + if (defined(my $err = _ibx_index_reject($ibx))) { + return "W: skipping $ekey ($err)"; + } + $sync->{ibx} = $ibx; + $sync->{nr} = \(my $nr = 0); + my $v = $ibx->version; + if ($v == 2) { + $sync->{epoch_max} = $ibx->max_git_epoch // return; + sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable + } elsif ($v == 1) { + my $uv = $ibx->uidvalidity; + my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); + my $head = $ibx->mm->last_commit // + return "E: $ibx->{inboxdir} is not indexed"; + my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); + my $unit = { stack => $stk, git => $ibx->git }; + push @{$sync->{todo}}, $unit; + } else { + return "E: $ekey unsupported inbox version (v$v)"; + } + for my $unit (@{delete($sync->{todo}) // []}) { + last if $sync->{quit}; + index_todo($self, $sync, $unit); + } + $self->{midx}->index_ibx($ibx) unless $sync->{quit}; + $ibx->git->cleanup; # done with this inbox, now + undef; +} + +sub gc_unref_doc ($$$$) { + my ($self, $ibx_id, $eidx_key, $docid) = @_; + my $dbh = $self->{oidx}->dbh; + + # for debug/info purposes, oids may no longer be accessible + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; + + $dbh->prepare_cached(<<'')->execute($docid, $ibx_id); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? 
+ + my $remain = $self->{oidx}->get_xref3($docid); + if (scalar(@$remain)) { + $self->{oidx}->eidxq_add($docid); # enqueue for reindex + for my $oid (@oid) { + warn "I: unref #$docid $eidx_key $oid\n"; + } + } else { + warn "I: remove #$docid $eidx_key @oid\n"; + $self->idx_shard($docid)->ipc_do('xdb_remove', $docid); + } +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + + my $dbh = $self->{oidx}->dbh; + my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?'); + my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes'); + my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?'); + + $ibx_ck->execute; + while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { + next if $self->{ibx_map}->{$eidx_key}; + $self->{midx}->remove_eidx_key($eidx_key); + warn "I: deleting messages for $eidx_key...\n"; + $x3_doc->execute($ibx_id); + while (defined(my $docid = $x3_doc->fetchrow_array)) { + gc_unref_doc($self, $ibx_id, $eidx_key, $docid); + } + $dbh->prepare_cached(<<'')->execute($ibx_id); +DELETE FROM inboxes WHERE ibx_id = ? + + # drop last_commit info + my $pat = $eidx_key; + $pat =~ s/([_%])/\\$1/g; + $lc_i->execute("lc-%:$pat//%"); + while (my ($key) = $lc_i->fetchrow_array) { + next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; + warn "I: removing $key\n"; + $dbh->prepare_cached(<<'')->execute($key); +DELETE FROM eidx_meta WHERE key = ? 
+ + } + + warn "I: $eidx_key removed\n"; + } + + # it's not real unless it's in `over', we use parallelism here, + # shards will be reading directly from over, so commit + $self->{oidx}->commit_lazy; + $self->{oidx}->begin_lazy; + + for my $idx (@{$self->{idx_shards}}) { + warn "I: cleaning up shard #$idx->{shard}\n"; + $idx->shard_over_check($self->{oidx}); + } + my $nr = $dbh->do(<<''); +DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) + + warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + + done($self); +} + +sub _ibx_for ($$$) { + my ($self, $sync, $smsg) = @_; + my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset'; + my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos"; + $self->{ibx_list}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" +} + +sub _fd_constrained ($) { + my ($self) = @_; + $self->{-fd_constrained} //= do { + my $soft; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($soft, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + chomp($soft = `sh -c 'ulimit -n'`); + } + if (defined($soft)) { + my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate + my $ret = $want > $soft; + if ($ret) { + warn <{sync}; + my $self = $sync->{self}; + my $by_chash = delete $req->{by_chash} or die 'BUG: no {by_chash}'; + my $nr = scalar(keys(%$by_chash)) or die 'BUG: no content hashes'; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $docid = $smsg->{num} = $orig_smsg->{num}; + $self->{oidx}->add_overview($eml, $smsg); # may rethread + check_batch_limit({ %$sync, new_smsg => $smsg }); + my $chash0 = $smsg->{chash} // die "BUG: $smsg->{blob} no {chash}"; + my $stable = delete($by_chash->{$chash0}) // + die "BUG: $smsg->{blob} chash missing"; + my $idx = $self->idx_shard($docid); + my $top_smsg = pop @$stable; + $top_smsg == $smsg or die 'BUG: top_smsg != smsg'; + my $ibx = _ibx_for($self, $sync, $smsg); + $idx->index_eml($eml, $smsg, $ibx->eidx_key); + 
for my $x (reverse @$stable) { + $ibx = _ibx_for($self, $sync, $x); + my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}'; + $idx->ipc_do('add_eidx_info', $docid, $ibx->eidx_key, $hdr); + } + return if $nr == 1; # likely, all good + + warn "W: #$docid split into $nr due to deduplication change\n"; + my @todo; + for my $ary (values %$by_chash) { + for my $x (reverse @$ary) { + warn "removing #$docid xref3 $x->{blob}\n"; + my $n = $self->{oidx}->remove_xref3($docid, $x->{blob}); + die "BUG: $x->{blob} invalidated #$docid" if $n == 0; + } + my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; + $x->{num} = delete($x->{xnum}) // die '{xnum} unset'; + $ibx = _ibx_for($self, $sync, $x); + if (my $over = $ibx->over) { + my $e = $over->get_art($x->{num}); + $e->{blob} eq $x->{blob} or die <{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); +EOF + push @todo, $ibx, $e; + $over->dbh_close if _fd_constrained($self); + } else { + die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n"; + } + } + undef $by_chash; + while (my ($ibx, $e) = splice(@todo, 0, 2)) { + reindex_unseen($self, $sync, $ibx, $e); + } +} + +sub _reindex_oid { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + my $sync = $req->{sync}; + my $self = $sync->{self}; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2]; + my $docid = $orig_smsg->{num}; + if (is_bad_blob($oid, $type, $size, $expect_oid)) { + my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid); + if ($remain == 0) { + warn "W: #$docid gone or corrupted\n"; + $self->idx_shard($docid)->ipc_do('xdb_remove', $docid); + } elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { + warn "BUG: #$docid gone (UNEXPECTED)\n"; + $self->idx_shard($docid)->ipc_do('xdb_remove', $docid); + } + return; + } + my $ci = $self->{current_info}; + local $self->{current_info} = "$ci #$docid $oid"; + my 
$re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $re_smsg->set_bytes($$bref, $size); + my $eml = PublicInbox::Eml->new($bref); + $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, + cotime => $orig_smsg->{ts} }); + my $chash = content_hash($eml); + $re_smsg->{chash} = $chash; + $re_smsg->{xnum} = $req->{xr3r}->[$req->{ix}]->[1]; + $re_smsg->{ibx_id} = $req->{xr3r}->[$req->{ix}]->[0]; + $re_smsg->{hdr} = $eml->header_obj; + push @{$req->{by_chash}->{$chash}}, $re_smsg; + if (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { # last $re_smsg is the highest priority xref3 + local $self->{current_info} = "$ci #$docid"; + _reindex_finalize($req, $re_smsg, $eml); + } +} + +sub _reindex_smsg ($$$) { + my ($self, $sync, $smsg) = @_; + my $docid = $smsg->{num}; + my $xr3 = $self->{oidx}->get_xref3($docid, 1); + if (scalar(@$xr3) == 0) { # _reindex_check_stale should've covered this + warn <<""; +BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex + + $self->{oidx}->delete_by_num($docid); + $self->idx_shard($docid)->ipc_do('xdb_remove', $docid); + return; + } + + # we sort {xr3r} in the reverse order of {ibx_list} so we can + # hit the common case in _reindex_finalize without rereading + # from git (or holding multiple messages in memory). + my $id2pos = $sync->{id2pos}; # index in {ibx_list} + @$xr3 = sort { + $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} + || + $b->[1] <=> $a->[1] # break ties with {xnum} + } @$xr3; + @$xr3 = map { [ $_->[0], $_->[1], unpack('H*', $_->[2]) ] } @$xr3; + my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 }; + $self->git->cat_async($xr3->[$req->{ix}]->[2], \&_reindex_oid, $req); +} + +sub checkpoint_due ($) { + my ($sync) = @_; + ${$sync->{need_checkpoint}} || (now() > $sync->{next_check}); +} + +sub host_ident () { + # I've copied FS images and only changed the hostname before, + # so prepend hostname. 
Use `state' since these a BOFH can change + # these while this process is running and we always want to be + # able to release locks taken by this process. + state $retval = hostname . '-' . do { + my $m; # machine-id(5) is systemd + if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> } + # (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs + chomp($m ||= `{ sysctl -n kern.hostid || + hostid || ghostid; } 2>/dev/null` + || "no-machine-id-or-hostid-on-$^O"); + $m; + }; +} + +sub eidxq_release { + my ($self) = @_; + my $expect = delete($self->{-eidxq_locked}) or return; + my ($owner_pid, undef) = split(/-/, $expect); + return if $owner_pid != $$; # shards may fork + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') // ''; + if ($cur eq $expect) { + $oidx->eidx_meta('eidxq_lock', ''); + return 1; + } elsif ($cur ne '') { + warn "E: eidxq_lock($expect) stolen by $cur\n"; + } else { + warn "E: eidxq_lock($expect) released by another process\n"; + } + undef; +} + +sub DESTROY { + my ($self) = @_; + eidxq_release($self) and $self->{oidx}->commit_lazy; +} + +sub _eidxq_take ($) { + my ($self) = @_; + my $val = "$$-${\time}-$>-".host_ident; + $self->{oidx}->eidx_meta('eidxq_lock', $val); + $self->{-eidxq_locked} = $val; +} + +sub eidxq_lock_acquire ($) { + my ($self) = @_; + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self); + if (my $locked = $self->{-eidxq_locked}) { # be lazy + return $locked if $locked eq $cur; + } + my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4); + my $t = strftime('%Y-%m-%d %k:%M:%S', gmtime($time)); + if ($euid == $> && $ident eq host_ident) { + if (kill(0, $pid)) { + warn <dbh->sqlite_db_filename; + warn <{oidx}->dbh; + my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; + ${$sync->{nr}} = 0; + local $sync->{-regen_fmt} = "%u/$tot\n"; + my $pr = $sync->{-opt}->{-progress}; + if ($pr) { + my $min = 
$dbh->selectrow_array('SELECT MIN(docid) FROM eidxq'); + my $max = $dbh->selectrow_array('SELECT MAX(docid) FROM eidxq'); + $pr->("Xapian indexing $min..$max (total=$tot)\n"); + } + $sync->{id2pos} //= do { + my %id2pos; + my $pos = 0; + $id2pos{$_->{-ibx_id}} = $pos++ for @{$self->{ibx_list}}; + \%id2pos; + }; + my ($del, $iter); +restart: + $del = $dbh->prepare('DELETE FROM eidxq WHERE docid = ?'); + $iter = $dbh->prepare('SELECT docid FROM eidxq ORDER BY docid ASC'); + $iter->execute; + while (defined(my $docid = $iter->fetchrow_array)) { + last if $sync->{quit}; + if (my $smsg = $self->{oidx}->get_art($docid)) { + _reindex_smsg($self, $sync, $smsg); + } else { + warn "E: #$docid does not exist in over\n"; + } + $del->execute($docid); + ++${$sync->{nr}}; + + if (checkpoint_due($sync)) { + $dbh = $del = $iter = undef; + reindex_checkpoint($self, $sync); # release lock + $dbh = $self->{oidx}->dbh; + goto restart; + } + } + $self->git->async_wait_all; + $pr->("reindexed ${$sync->{nr}}/$tot\n") if $pr; +} + +sub _reindex_unseen { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $self = $req->{self} // die 'BUG: {self} unset'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; + $new_smsg->set_bytes($$bref, $size); + my $eml = $req->{eml} = PublicInbox::Eml->new($bref); + $req->{new_smsg} = $new_smsg; + $req->{chash} = content_hash($eml); + $req->{mids} = mids($eml); # do_step iterates through this + do_step($req); # enter the normal indexing flow +} + +# --reindex may catch totally unseen messages, this handles them +sub reindex_unseen ($$$$) { + my ($self, $sync, $ibx, $xsmsg) = @_; + my $req = { + %$sync, # has {self} + autime => $xsmsg->{ds}, + cotime => $xsmsg->{ts}, + oid => $xsmsg->{blob}, + ibx => $ibx, + xnum => $xsmsg->{num}, + # {mids} and {chash} will be filled in at _reindex_unseen + }; + warn "I: 
reindex_unseen ${\$ibx->eidx_key}:$req->{xnum}:$req->{oid}\n"; + $self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req); +} + +sub _reindex_check_unseen ($$$) { + my ($self, $sync, $ibx) = @_; + my $ibx_id = $ibx->{-ibx_id}; + my $slice = 1000; + my ($beg, $end) = (1, $slice); + + # first, check if we missed any messages in target $ibx + my $msgs; + my $pr = $sync->{-opt}->{-progress}; + my $ekey = $ibx->eidx_key; + local $sync->{-regen_fmt} = + "$ekey checking unseen %u/".$ibx->over->max."\n"; + ${$sync->{nr}} = 0; + + while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) { + ${$sync->{nr}} = $beg; + $beg = $msgs->[-1]->{num} + 1; + $end = $beg + $slice; + if (checkpoint_due($sync)) { + reindex_checkpoint($self, $sync); # release lock + } + + my $inx3 = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1); +SELECT DISTINCT(docid) FROM xref3 WHERE +ibx_id = ? AND xnum = ? AND oidbin = ? + + for my $xsmsg (@$msgs) { + my $oidbin = pack('H*', $xsmsg->{blob}); + $inx3->bind_param(1, $ibx_id); + $inx3->bind_param(2, $xsmsg->{num}); + $inx3->bind_param(3, $oidbin, SQL_BLOB); + $inx3->execute; + my $docids = $inx3->fetchall_arrayref; + # index messages which were totally missed + # the first time around ASAP: + if (scalar(@$docids) == 0) { + reindex_unseen($self, $sync, $ibx, $xsmsg); + } else { # already seen, reindex later + for my $r (@$docids) { + $self->{oidx}->eidxq_add($r->[0]); + } + } + last if $sync->{quit}; + } + last if $sync->{quit}; + } +} + +sub _reindex_check_stale ($$$) { + my ($self, $sync, $ibx) = @_; + my $min = 0; + my $pr = $sync->{-opt}->{-progress}; + my $fetching; + my $ekey = $ibx->eidx_key; + local $sync->{-regen_fmt} = + "$ekey check stale/missing %u/".$ibx->over->max."\n"; + ${$sync->{nr}} = 0; + do { + if (checkpoint_due($sync)) { + reindex_checkpoint($self, $sync); # release lock + } + # now, check if there's stale xrefs + my $iter = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1); +SELECT docid,xnum,oidbin FROM xref3 
WHERE ibx_id = ? AND docid > ? +ORDER BY docid,xnum ASC LIMIT 10000 + + $iter->execute($ibx->{-ibx_id}, $min); + $fetching = undef; + + while (my ($docid, $xnum, $oidbin) = $iter->fetchrow_array) { + return if $sync->{quit}; + ${$sync->{nr}} = $xnum; + + $fetching = $min = $docid; + my $smsg = $ibx->over->get_art($xnum); + my $oidhex = unpack('H*', $oidbin); + my $err; + if (!$smsg) { + $err = 'stale'; + } elsif ($smsg->{blob} ne $oidhex) { + $err = "mismatch (!= $smsg->{blob})"; + } else { + next; # likely, all good + } + # current_info already has eidx_key + warn "$xnum:$oidhex (#$docid): $err\n"; + my $del = $self->{oidx}->dbh->prepare_cached(<<''); +DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ? + + $del->bind_param(1, $ibx->{-ibx_id}); + $del->bind_param(2, $xnum); + $del->bind_param(3, $oidbin, SQL_BLOB); + $del->execute; + + # get_xref3 over-fetches, but this is a rare path: + my $xr3 = $self->{oidx}->get_xref3($docid); + my $idx = $self->idx_shard($docid); + if (scalar(@$xr3) == 0) { # all gone + $self->{oidx}->delete_by_num($docid); + $self->{oidx}->eidxq_del($docid); + $idx->ipc_do('xdb_remove', $docid); + } else { # enqueue for reindex of remaining messages + $idx->ipc_do('remove_eidx_info', + $docid, $ibx->eidx_key); + $self->{oidx}->eidxq_add($docid); # yes, add + } + } + } while (defined $fetching); +} + +sub _reindex_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + local $self->{current_info} = $ekey; + if (defined(my $err = _ibx_index_reject($ibx))) { + warn "W: cannot reindex $ekey ($err)\n"; + } else { + _reindex_check_unseen($self, $sync, $ibx); + _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + } + delete @$ibx{qw(over mm search git)}; # won't need these for a bit +} + +sub eidx_reindex { + my ($self, $sync) = @_; + + # acquire eidxq_lock early because full reindex takes forever + # and incremental -extindex processes can run during our checkpoints + if (!eidxq_lock_acquire($self)) { + 
warn "E: aborting --reindex\n"; + return; + } + for my $ibx (@{$self->{ibx_list}}) { + _reindex_inbox($self, $sync, $ibx); + last if $sync->{quit}; + } + $self->git->async_wait_all; # ensure eidxq gets filled completely + eidxq_process($self, $sync) unless $sync->{quit}; +} + +sub sync_inbox { + my ($self, $sync, $ibx) = @_; + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); +} + +sub eidx_sync { # main entry point + my ($self, $opt) = @_; + + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + local $self->{current_info} = ''; + local $SIG{__WARN__} = sub { + $warn_cb->($self->{current_info}, ': ', @_); + }; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + $self->{oidx}->rethread_prepare($opt); + my $sync = { + need_checkpoint => \(my $need_checkpoint = 0), + check_intvl => 10, + next_check => now() + 10, + -opt => $opt, + # DO NOT SET {reindex} here, it's incompatible with reused + # V2Writable code, reindex is totally different here + # compared to v1/v2 inboxes because we have multiple histories + self => $self, + -regen_fmt => "%u/?\n", + }; + local $SIG{USR1} = sub { $need_checkpoint = 1 }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + for my $ibx (@{$self->{ibx_list}}) { + $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); + } + if (delete($opt->{reindex})) { + local $sync->{checkpoint_unlocks} = 1; + eidx_reindex($self, $sync); + } + + # don't use $_ here, it'll get clobbered by reindex_checkpoint + if ($opt->{scan} // 1) { + for my $ibx (@{$self->{ibx_list}}) { + last if $sync->{quit}; + sync_inbox($self, $sync, $ibx); + } + } + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; + eidxq_process($self, $sync) unless $sync->{quit}; + + eidxq_release($self); + done($self); + $sync; # for eidx_watch +} + +sub update_last_commit { # overrides V2Writable + my ($self, $sync, $stk) = @_; 
+ my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $ibx = $sync->{ibx} or die 'BUG: {ibx} missing'; + my $ekey = $ibx->eidx_key; + my $uv = $ibx->uidvalidity; + my $epoch = $unit->{epoch}; + my $meta_key; + my $v = $ibx->version; + if ($v == 2) { + die 'No {epoch} for v2 unit' unless defined $epoch; + $meta_key = "lc-v2:$ekey//$uv;$epoch"; + } elsif ($v == 1) { + die 'Unexpected {epoch} for v1 unit' if defined $epoch; + $meta_key = "lc-v1:$ekey//$uv"; + } else { + die "Unsupported inbox version: $v"; + } + my $last = $self->{oidx}->eidx_meta($meta_key); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); + return if $n ne '' && $n == 0; + } + $self->{oidx}->eidx_meta($meta_key, $latest_cmt); +} + +sub _idx_init { # with_umask callback + my ($self, $opt) = @_; + PublicInbox::V2Writable::_idx_init($self, $opt); + $self->{midx} = PublicInbox::MiscIdx->new($self); +} + +sub idx_init { # similar to V2Writable + my ($self, $opt) = @_; + return if $self->{idx_shards}; + + $self->git->cleanup; + my $mode = 0644; + my $ALL = $self->git->{git_dir}; # ALL.git + my $old = -d $ALL; + if ($opt->{-private}) { # LeiStore + $mode = 0600; + if (!$old) { + umask 077; # don't bother restoring + PublicInbox::Import::init_bare($ALL); + $self->git->qx(qw(config core.sharedRepository 0600)); + } + } else { + PublicInbox::Import::init_bare($ALL) unless $old; + } + my $info_dir = "$ALL/objects/info"; + my $alt = "$info_dir/alternates"; + my (@old, @new, %seen); # seen: st_dev + st_ino + if (-e $alt) { + open(my $fh, '<', $alt) or die "open $alt: $!"; + $mode = (stat($fh))[2] & 07777; + while (my $line = <$fh>) { + chomp(my $d = $line); + + # expand relative path (/local/ stuff) + substr($d, 0, 3) eq '../' and + $d = "$ALL/objects/$d"; + if (my @st = stat($d)) { + next if 
$seen{"$st[0]\0$st[1]"}++; + } else { + warn "W: stat($d) failed (from $alt): $!\n"; + next if $opt->{-idx_gc}; + } + push @old, $line; + } + } + + # for LeiStore, and possibly some mirror-only state + if (opendir(my $dh, my $local = "$self->{topdir}/local")) { + # highest numbered epoch first + for my $n (sort { $b <=> $a } map { substr($_, 0, -4) + 0 } + grep(/\A[0-9]+\.git\z/, readdir($dh))) { + my $d = "$local/$n.git/objects"; # absolute path + if (my @st = stat($d)) { + next if $seen{"$st[0]\0$st[1]"}++; + # favor relative paths for rename-friendliness + push @new, "../../local/$n.git/objects\n"; + } else { + warn "W: stat($d) failed: $!\n"; + } + } + } + for my $ibx (@{$self->{ibx_list}}) { + my $line = $ibx->git->{git_dir} . "/objects\n"; + chomp(my $d = $line); + if (my @st = stat($d)) { + next if $seen{"$st[0]\0$st[1]"}++; + } else { + warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n"; + next if $opt->{-idx_gc}; + } + push @new, $line; + } + if (scalar @new) { + push @old, @new; + my $o = \@old; + PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o); + } + $self->parallel_init($self->{indexlevel}); + $self->with_umask(\&_idx_init, $self, $opt); + $self->{oidx}->begin_lazy; + $self->{oidx}->eidx_prep; + $self->git->batch_prepare; + $self->{midx}->begin_txn; +} + +sub _watch_commit { # PublicInbox::DS::add_timer callback + my ($self) = @_; + delete $self->{-commit_timer}; + eidxq_process($self, $self->{-watch_sync}); + eidxq_release($self); + delete local $self->{-watch_sync}->{-regen_fmt}; + reindex_checkpoint($self, $self->{-watch_sync}); + + # call event_step => done unless commit_timer is armed + PublicInbox::DS::requeue($self); +} + +sub on_inbox_unlock { # called by PublicInbox::InboxIdle + my ($self, $ibx) = @_; + my $opt = $self->{-watch_sync}->{-opt}; + my $pr = $opt->{-progress}; + my $ekey = $ibx->eidx_key; + local $0 = "sync $ekey"; + $pr->("indexing $ekey\n") if $pr; + $self->idx_init($opt); + sync_inbox($self, 
$self->{-watch_sync}, $ibx); + $self->{-commit_timer} //= PublicInbox::DS::add_timer( + $opt->{'commit-interval'} // 10, + \&_watch_commit, $self); +} + +sub eidx_reload { # -extindex --watch SIGHUP handler + my ($self, $idler) = @_; + if ($self->{cfg}) { + my $pr = $self->{-watch_sync}->{-opt}->{-progress}; + $pr->('reloading ...') if $pr; + delete $self->{-resync_queue}; + @{$self->{ibx_list}} = (); + %{$self->{ibx_map}} = (); + delete $self->{-watch_sync}->{id2pos}; + my $cfg = PublicInbox::Config->new; + attach_config($self, $cfg); + $idler->refresh($cfg); + $pr->(" done\n") if $pr; + } else { + warn "reload not supported without --all\n"; + } +} + +sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler + my ($self) = @_; + $self->{-resync_queue} //= [ @{$self->{ibx_list}} ]; + PublicInbox::DS::requeue($self); # trigger our ->event_step +} + +sub event_step { # PublicInbox::DS::requeue callback + my ($self) = @_; + if (my $resync_queue = $self->{-resync_queue}) { + if (my $ibx = shift(@$resync_queue)) { + on_inbox_unlock($self, $ibx); + PublicInbox::DS::requeue($self); + } else { + delete $self->{-resync_queue}; + _watch_commit($self); + } + } else { + done($self) unless $self->{-commit_timer}; + } +} + +sub eidx_watch { # public-inbox-extindex --watch main loop + my ($self, $opt) = @_; + local %SIG = %SIG; + for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) { + $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" }; + } + require PublicInbox::InboxIdle; + require PublicInbox::DS; + require PublicInbox::Syscall; + require PublicInbox::Sigfd; + my $idler = PublicInbox::InboxIdle->new($self->{cfg}); + if (!$self->{cfg}) { + $idler->watch_inbox($_) for @{$self->{ibx_list}}; + } + $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $pr = $opt->{-progress}; + $pr->("performing initial scan ...\n") if $pr; + my $sync = eidx_sync($self, $opt); # initial sync + return if $sync->{quit}; + my $oldset = PublicInbox::DS::block_signals(); + 
local $self->{current_info} = ''; + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { $cb->($self->{current_info}, ': ', @_) }; + my $sig = { + HUP => sub { eidx_reload($self, $idler) }, + USR1 => sub { eidx_resync_start($self) }, + TSTP => sub { kill('STOP', $$) }, + }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; + my $sigfd = PublicInbox::Sigfd->new($sig, + $PublicInbox::Syscall::SFD_NONBLOCK); + %SIG = (%SIG, %$sig) if !$sigfd; + local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock + if (!$sigfd) { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + PublicInbox::DS::sig_setmask($oldset); + PublicInbox::DS->SetLoopTimeout(1000); + } + PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + $pr->("initial scan complete, entering event loop\n") if $pr; + PublicInbox::DS->EventLoop; # calls InboxIdle->event_step + done($self); +} + +no warnings 'once'; +*done = \&PublicInbox::V2Writable::done; +*with_umask = \&PublicInbox::InboxWritable::with_umask; +*parallel_init = \&PublicInbox::V2Writable::parallel_init; +*nproc_shards = \&PublicInbox::V2Writable::nproc_shards; +*sync_prepare = \&PublicInbox::V2Writable::sync_prepare; +*index_todo = \&PublicInbox::V2Writable::index_todo; +*count_shards = \&PublicInbox::V2Writable::count_shards; +*atfork_child = \&PublicInbox::V2Writable::atfork_child; +*idx_shard = \&PublicInbox::V2Writable::idx_shard; +*reindex_checkpoint = \&PublicInbox::V2Writable::reindex_checkpoint; + +1; diff --git a/lib/PublicInbox/FakeInotify.pm b/lib/PublicInbox/FakeInotify.pm index 92758613..326b2391 100644 --- a/lib/PublicInbox/FakeInotify.pm +++ b/lib/PublicInbox/FakeInotify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # for systems lacking Linux::Inotify2 or IO::KQueue, just emulates diff --git a/lib/PublicInbox/Feed.pm 
b/lib/PublicInbox/Feed.pm index 805076f0..b2219dad 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2020 all contributors +# Copyright (C) 2013-2021 all contributors # License: AGPL-3.0+ # # Used for generating Atom feeds for web-accessible mailing list archives. @@ -24,7 +24,7 @@ sub generate { sub generate_thread_atom { my ($ctx) = @_; - my $msgs = $ctx->{msgs} = $ctx->{-inbox}->over->get_thread($ctx->{mid}); + my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid}); return _no_thread() unless @$msgs; PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); } @@ -34,7 +34,7 @@ sub generate_html_index { # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: my $qp = $ctx->{qp}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if ($qp && !$qp->{r} && $ibx->over) { return PublicInbox::View::index_topics($ctx); } @@ -79,8 +79,8 @@ sub _no_thread () { sub recent_msgs { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; - my $max = $ibx->{feedmax}; + my $ibx = $ctx->{ibx}; + my $max = $ibx->{feedmax} // 25; return PublicInbox::View::paginate_recent($ctx, $max) if $ibx->over; # only for rare v1 inboxes which aren't indexed at all diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm index d54570fd..f6355e1b 100644 --- a/lib/PublicInbox/Filter/Base.pm +++ b/lib/PublicInbox/Filter/Base.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # base class for creating per-list or per-project filters diff --git a/lib/PublicInbox/Filter/Gmane.pm b/lib/PublicInbox/Filter/Gmane.pm index c326faca..a18b77d2 100644 --- a/lib/PublicInbox/Filter/Gmane.pm +++ b/lib/PublicInbox/Filter/Gmane.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Filter for importing some archives from gmane diff --git 
a/lib/PublicInbox/Filter/Mirror.pm b/lib/PublicInbox/Filter/Mirror.pm index 9f6dd342..fe915fc3 100644 --- a/lib/PublicInbox/Filter/Mirror.pm +++ b/lib/PublicInbox/Filter/Mirror.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Dumb filter for blindly accepting everything diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index 06e4ea75..09aa6aa8 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 all contributors +# Copyright (C) 2017-2021 all contributors # License: AGPL-3.0+ # Filter for lists.ruby-lang.org trailers @@ -16,7 +16,7 @@ sub new { my ($class, %opts) = @_; my $altid = delete $opts{-altid}; my $self = $class->SUPER::new(%opts); - my $ibx = $self->{-inbox}; + my $ibx = $self->{ibx}; # altid = serial:ruby-core:file=msgmap.sqlite3 if (!$altid && $ibx && $ibx->{altid}) { $altid ||= $ibx->{altid}->[0]; diff --git a/lib/PublicInbox/Filter/SubjectTag.pm b/lib/PublicInbox/Filter/SubjectTag.pm index aca6688b..ecedf666 100644 --- a/lib/PublicInbox/Filter/SubjectTag.pm +++ b/lib/PublicInbox/Filter/SubjectTag.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 all contributors +# Copyright (C) 2017-2021 all contributors # License: AGPL-3.0+ # Filter for various [tags] in subjects diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm index 2c73738d..0b1f5dd3 100644 --- a/lib/PublicInbox/Filter/Vger.pm +++ b/lib/PublicInbox/Filter/Vger.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Filter for vger.kernel.org list trailer diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm new file mode 100644 index 00000000..01b83c96 --- /dev/null +++ b/lib/PublicInbox/Gcf2.pm @@ -0,0 +1,110 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# backend for a 
git-cat-file-workalike based on libgit2, +# other libgit2 stuff may go here, too. +package PublicInbox::Gcf2; +use strict; +use PublicInbox::Spawn qw(which popen_rd); +use Fcntl qw(LOCK_EX); +use IO::Handle; # autoflush +my (%CFG, $c_src, $lockfh); +BEGIN { + # PublicInbox::Spawn will set PERL_INLINE_DIRECTORY + # to ~/.cache/public-inbox/inline-c if it exists + my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // + die 'PERL_INLINE_DIRECTORY not defined'; + my $f = "$inline_dir/.public-inbox.lock"; + open $lockfh, '>', $f or die "failed to open $f: $!\n"; + my $pc = which($ENV{PKG_CONFIG} // 'pkg-config'); + my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!); + my $rdr = {}; + open $rdr->{2}, '>', '/dev/null' or die "open /dev/null: $!"; + for my $x (qw(libgit2)) { + my $l = popen_rd([$pc, '--libs', $x], undef, $rdr); + $l = do { local $/; <$l> }; + next if $?; + my $c = popen_rd([$pc, '--cflags', $x], undef, $rdr); + $c = do { local $/; <$c> }; + next if $?; + + # note: we name C source files .h to prevent + # ExtUtils::MakeMaker from automatically trying to + # build them. + my $f = "$dir/gcf2_$x.h"; + if (open(my $fh, '<', $f)) { + chomp($l, $c); + local $/; + defined($c_src = <$fh>) or die "read $f: $!\n"; + $CFG{LIBS} = $l; + $CFG{CCFLAGSEX} = $c; + last; + } else { + die "E: $f: $!\n"; + } + } + die "E: libgit2 not installed\n" unless $c_src; + + # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking + flock($lockfh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; +} + +# we use Capitalized and ALLCAPS for compatibility with old Inline::C +use Inline C => Config => %CFG, BOOT => 'git_libgit2_init();'; +use Inline C => $c_src; +undef $c_src; +undef %CFG; +undef $lockfh; + +sub add_alt ($$) { + my ($gcf2, $objdir) = @_; + + # libgit2 (tested 0.27.7+dfsg.1-0.2 and 0.28.3+dfsg.1-1~bpo10+1 + # in Debian) doesn't handle relative epochs properly when nested + # multiple levels. 
Add all the absolute paths to workaround it, + # since $EXTINDEX_DIR/ALL.git/objects/info/alternates uses absolute + # paths to reference $V2INBOX_DIR/all.git/objects and + # $V2INBOX_DIR/all.git/objects/info/alternates uses relative paths + # to refer to $V2INBOX_DIR/git/$EPOCH.git/objects + # + # See https://bugs.debian.org/975607 + if (open(my $fh, '<', "$objdir/info/alternates")) { + chomp(my @abs_alt = grep(m!^/!, <$fh>)); + $gcf2->add_alternate($_) for @abs_alt; + } + $gcf2->add_alternate($objdir); +} + +# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop +# (see lib/PublicInbox/Gcf2Client.pm) +sub loop () { + my $gcf2 = new(); + my %seen; + STDERR->autoflush(1); + STDOUT->autoflush(1); + + while () { + chomp; + my ($oid, $git_dir) = split(/ /, $_, 2); + $seen{$git_dir}++ or add_alt($gcf2, "$git_dir/objects"); + if (!$gcf2->cat_oid(1, $oid)) { + # retry once if missing. We only get unabbreviated OIDs + # from SQLite or Xapian DBs, here, so malicious clients + # can't trigger excessive retries: + warn "I: $$ $oid missing, retrying in $git_dir\n"; + + $gcf2 = new(); + %seen = ($git_dir => 1); + add_alt($gcf2, "$git_dir/objects"); + + if ($gcf2->cat_oid(1, $oid)) { + warn "I: $$ $oid found after retry\n"; + } else { + warn "W: $$ $oid missing after retry\n"; + print "$oid missing\n"; # mimic git-cat-file + } + } + } +} + +1; diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm new file mode 100644 index 00000000..397774f9 --- /dev/null +++ b/lib/PublicInbox/Gcf2Client.pm @@ -0,0 +1,85 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# connects public-inbox processes to PublicInbox::Gcf2::loop() +package PublicInbox::Gcf2Client; +use strict; +use parent qw(PublicInbox::DS); +use PublicInbox::Git; +use PublicInbox::Gcf2; # fails if Inline::C or libgit2-dev isn't available +use PublicInbox::Spawn qw(spawn); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +# fields: +# sock => 
socket to Gcf2::loop +# The rest of these fields are compatible with what PublicInbox::Git +# uses code-sharing +# pid => PID of Gcf2::loop process +# pid.owner => process which spawned {pid} +# in => same as {sock}, for compatibility with PublicInbox::Git +# inflight => array (see PublicInbox::Git) +# cat_rbuf => scalarref, may be non-existent or empty +sub new { + my ($rdr) = @_; + my $self = bless {}, __PACKAGE__; + # ensure the child process has the same @INC we do: + my $env = { PERL5LIB => join(':', @INC) }; + my ($s1, $s2); + socketpair($s1, $s2, AF_UNIX, SOCK_STREAM, 0) or die "socketpair $!"; + $rdr //= {}; + $rdr->{0} = $rdr->{1} = $s2; + my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop]]; + $self->{'pid.owner'} = $$; + $self->{pid} = spawn($cmd, $env, $rdr); + $s1->blocking(0); + $self->{inflight} = []; + $self->{in} = $s1; + $self->SUPER::new($s1, EPOLLIN|EPOLLET); +} + +sub fail { + my $self = shift; + $self->close; # PublicInbox::DS::close + PublicInbox::Git::fail($self, @_); +} + +sub gcf2_async ($$$;$) { + my ($self, $req, $cb, $arg) = @_; + my $inflight = $self->{inflight} or return $self->close; + + # {wbuf} is rare, I hope: + cat_async_step($self, $inflight) if $self->{wbuf}; + + $self->fail("gcf2c write: $!") if !$self->write($req) && !$self->{sock}; + push @$inflight, $req, $cb, $arg; +} + +# ensure PublicInbox::Git::cat_async_step never calls cat_async_retry +sub alternates_changed {} + +# DS->EventLoop will call this +sub event_step { + my ($self) = @_; + $self->flush_write; + $self->close if !$self->{in} || !$self->{sock}; # process died + my $inflight = $self->{inflight}; + if ($inflight && @$inflight) { + cat_async_step($self, $inflight); + return $self->close unless $self->{in}; # process died + + # ok, more to do, requeue for fairness + $self->requeue if @$inflight || exists($self->{cat_rbuf}); + } +} + +sub DESTROY { + my ($self) = @_; + delete $self->{sock}; # if outside EventLoop + PublicInbox::Git::DESTROY($self); +} 
+ +no warnings 'once'; + +*cat_async_step = \&PublicInbox::Git::cat_async_step; + +1; diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm index 988bc63f..0e781224 100644 --- a/lib/PublicInbox/GetlineBody.pm +++ b/lib/PublicInbox/GetlineBody.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Wrap a pipe or file for PSGI streaming response bodies and calls the diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index a7ba57f9..3d97300c 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: GPLv2 or later # # Used to read files from a git repository without excessive forking. @@ -12,15 +12,20 @@ use v5.10.1; use parent qw(Exporter); use POSIX (); use IO::Handle; # ->autoflush -use Errno qw(EINTR); +use Errno qw(EINTR EAGAIN); use File::Glob qw(bsd_glob GLOB_NOSORT); +use File::Spec (); use Time::HiRes qw(stat); use PublicInbox::Spawn qw(popen_rd); use PublicInbox::Tmpfile; +use IO::Poll qw(POLLIN); use Carp qw(croak); +use Digest::SHA (); +use PublicInbox::DS qw(dwaitpid); our @EXPORT_OK = qw(git_unquote git_quote); our $PIPE_BUFSIZ = 65536; # Linux default our $in_cleanup; +our $RDTIMEO = 60_000; # milliseconds use constant MAX_INFLIGHT => (($^O eq 'linux' ? 
4096 : POSIX::_POSIX_PIPE_BUF()) * 3) @@ -92,9 +97,9 @@ sub alternates_changed { sub last_check_err { my ($self) = @_; my $fh = $self->{err_c} or return; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); + sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); defined(sysread($fh, my $buf, -s $fh)) or - fail($self, "sysread failed: $!"); + $self->fail("sysread failed: $!"); $buf; } @@ -103,24 +108,25 @@ sub _bidi_pipe { if ($self->{$pid}) { if (defined $err) { # "err_c" my $fh = $self->{$err}; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); - truncate($fh, 0) or fail($self, "truncate failed: $!"); + sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); + truncate($fh, 0) or $self->fail("truncate failed: $!"); } return; } my ($out_r, $out_w); - pipe($out_r, $out_w) or fail($self, "pipe failed: $!"); + pipe($out_r, $out_w) or $self->fail("pipe failed: $!"); my @cmd = (qw(git), "--git-dir=$self->{git_dir}", qw(-c core.abbrev=40 cat-file), $batch); my $redir = { 0 => $out_r }; if ($err) { my $id = "git.$self->{git_dir}$batch.err"; - my $fh = tmpfile($id) or fail($self, "tmpfile($id): $!"); + my $fh = tmpfile($id) or $self->fail("tmpfile($id): $!"); $self->{$err} = $fh; $redir->{2} = $fh; } my ($in_r, $p) = popen_rd(\@cmd, undef, $redir); $self->{$pid} = $p; + $self->{"$pid.owner"} = $$; $out_w->autoflush(1); if ($^O eq 'linux') { # 1031: F_SETPIPE_SZ fcntl($out_w, 1031, 4096); @@ -130,6 +136,8 @@ sub _bidi_pipe { $self->{$in} = $in_r; } +sub poll_in ($) { IO::Poll::_poll($RDTIMEO, fileno($_[0]), my $ev = POLLIN) } + sub my_read ($$$) { my ($fh, $rbuf, $len) = @_; my $left = $len - length($$rbuf); @@ -138,9 +146,12 @@ sub my_read ($$$) { $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); if ($r) { $left -= $r; + } elsif (defined($r)) { # EOF + return 0; } else { - next if (!defined($r) && $! == EINTR); - return $r; + next if ($! == EAGAIN and poll_in($fh)); + next if $! 
== EINTR; # may be set by sysread or poll_in + return; # unrecoverable error } } \substr($$rbuf, 0, $len, ''); @@ -152,9 +163,15 @@ sub my_readline ($$) { if ((my $n = index($$rbuf, "\n")) >= 0) { return substr($$rbuf, 0, $n + 1, ''); } - my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); - next if $r || (!defined($r) && $! == EINTR); - return defined($r) ? '' : undef; # EOF or error + my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)) + and next; + + # return whatever's left on EOF + return substr($$rbuf, 0, length($$rbuf)+1, '') if defined($r); + + next if ($! == EAGAIN and poll_in($fh)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error } } @@ -172,7 +189,7 @@ sub cat_async_retry ($$$$$) { for (my $i = 0; $i < @$inflight; $i += 3) { $buf .= "$inflight->[$i]\n"; } - print { $self->{out} } $buf or fail($self, "write error: $!"); + print { $self->{out} } $buf or $self->fail("write error: $!"); unshift(@$inflight, \$req, $cb, $arg); # \$ref to indicate retried cat_async_step($self, $inflight); # take one step @@ -185,30 +202,34 @@ sub cat_async_step ($$) { my $rbuf = delete($self->{cat_rbuf}) // \(my $new = ''); my ($bref, $oid, $type, $size); my $head = my_readline($self->{in}, $rbuf); + # ->fail may be called via Gcf2Client.pm if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) { ($oid, $type, $size) = ($1, $2, $3 + 0); $bref = my_read($self->{in}, $rbuf, $size + 1) or - fail($self, defined($bref) ? 'read EOF' : "read: $!"); - chop($$bref) eq "\n" or fail($self, 'LF missing after blob'); - } elsif ($head =~ / missing$/) { + $self->fail(defined($bref) ? 
'read EOF' : "read: $!"); + chop($$bref) eq "\n" or $self->fail('LF missing after blob'); + } elsif ($head =~ s/ missing\n//s) { + $oid = $head; # ref($req) indicates it's already been retried - if (!ref($req) && !$in_cleanup && alternates_changed($self)) { + # -gcf2 retries internally, so it never hits this path: + if (!ref($req) && !$in_cleanup && $self->alternates_changed) { return cat_async_retry($self, $inflight, $req, $cb, $arg); } $type = 'missing'; - $oid = ref($req) ? $$req : $req; + $oid = ref($req) ? $$req : $req if $oid eq ''; } else { - fail($self, "Unexpected result from async git cat-file: $head"); + my $err = $! ? " ($!)" : ''; + $self->fail("bad result from async cat-file: $head$err"); } - eval { $cb->($bref, $oid, $type, $size, $arg) }; $self->{cat_rbuf} = $rbuf if $$rbuf ne ''; + eval { $cb->($bref, $oid, $type, $size, $arg) }; warn "E: $oid: $@\n" if $@; } sub cat_async_wait ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or return; + my $inflight = $self->{inflight} or return; while (scalar(@$inflight)) { cat_async_step($self, $inflight); } @@ -236,7 +257,7 @@ sub check_async_step ($$) { my ($self, $inflight_c) = @_; die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3; my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3); - my $rbuf = delete($self->{rbuf_c}) // \(my $new = ''); + my $rbuf = delete($self->{chk_rbuf}) // \(my $new = ''); chomp(my $line = my_readline($self->{in_c}, $rbuf)); my ($hex, $type, $size) = split(/ /, $line); @@ -246,16 +267,16 @@ sub check_async_step ($$) { # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { my $ret = my_read($self->{in_c}, $rbuf, $type + 1); - fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret; + $self->fail(defined($ret) ? 
'read EOF' : "read: $!") if !$ret; } + $self->{chk_rbuf} = $rbuf if $$rbuf ne ''; eval { $cb->($hex, $type, $size, $arg, $self) }; warn "E: check($req) $@\n" if $@; - $self->{rbuf_c} = $rbuf if $$rbuf ne ''; } sub check_async_wait ($) { my ($self) = @_; - my $inflight_c = delete $self->{inflight_c} or return; + my $inflight_c = $self->{inflight_c} or return; while (scalar(@$inflight_c)) { check_async_step($self, $inflight_c); } @@ -272,10 +293,10 @@ sub check_async_begin ($) { sub check_async ($$$$) { my ($self, $oid, $cb, $arg) = @_; my $inflight_c = $self->{inflight_c} // check_async_begin($self); - if (scalar(@$inflight_c) >= MAX_INFLIGHT) { + while (scalar(@$inflight_c) >= MAX_INFLIGHT) { check_async_step($self, $inflight_c); } - print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!"); + print { $self->{out_c} } $oid, "\n" or $self->fail("write error: $!"); push(@$inflight_c, $oid, $cb, $arg); } @@ -302,40 +323,70 @@ sub check { sub _destroy { my ($self, $rbuf, $in, $out, $pid, $err) = @_; - my $p = delete $self->{$pid} or return; delete @$self{($rbuf, $in, $out)}; delete $self->{$err} if $err; # `err_c' - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($p, undef, undef) }; - waitpid($p, 0) if $@; # wait synchronously if not in event loop + # GitAsyncCat::event_step may delete {pid} + my $p = delete $self->{$pid} or return; + dwaitpid($p) if $$ == $self->{"$pid.owner"}; } sub cat_async_abort ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or die 'BUG: not in async'; + if (my $inflight = $self->{inflight}) { + while (@$inflight) { + my ($req, $cb, $arg) = splice(@$inflight, 0, 3); + $req =~ s/ .*//; # drop git_dir for Gcf2Client + eval { $cb->(undef, $req, undef, undef, $arg) }; + warn "E: $req: $@ (in abort)\n" if $@; + } + delete $self->{cat_rbuf}; + delete $self->{inflight}; + } cleanup($self); } -sub fail { +sub fail { # may be augmented in subclasses my ($self, $msg) = @_; - $self->{inflight} ? 
cat_async_abort($self) : cleanup($self); - croak("git $self->{git_dir}: $msg"); + cat_async_abort($self); + croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); } +# $git->popen(qw(show f00)); # or +# $git->popen(qw(show f00), { GIT_CONFIG => ... }, { 2 => ... }); sub popen { - my ($self, @cmd) = @_; - @cmd = ('git', "--git-dir=$self->{git_dir}", @cmd); - popen_rd(\@cmd); + my ($self, $cmd) = splice(@_, 0, 2); + $cmd = [ 'git', "--git-dir=$self->{git_dir}", + ref($cmd) ? @$cmd : ($cmd, grep { defined && !ref } @_) ]; + popen_rd($cmd, grep { !defined || ref } @_); # env and opt } +# same args as popen above sub qx { - my ($self, @cmd) = @_; - my $fh = $self->popen(@cmd); - local $/ = "\n"; - return <$fh> if wantarray; - local $/; - <$fh> + my $self = shift; + my $fh = $self->popen(@_); + if (wantarray) { + local $/ = "\n"; + my @ret = <$fh>; + close $fh; # caller should check $? + @ret; + } else { + local $/; + my $ret = <$fh>; + close $fh; # caller should check $? + $ret; + } +} + +# check_async and cat_async may trigger the other, so ensure they're +# both completely done by using this: +sub async_wait_all ($) { + my ($self) = @_; + while (scalar(@{$self->{inflight_c} // []}) || + scalar(@{$self->{inflight} // []})) { + $self->check_async_wait; + $self->cat_async_wait; + } } # returns true if there are pending "git cat-file" processes @@ -343,13 +394,15 @@ sub cleanup { my ($self) = @_; local $in_cleanup = 1; delete $self->{async_cat}; - check_async_wait($self); - cat_async_wait($self); + async_wait_all($self); + delete $self->{inflight}; + delete $self->{inflight_c}; _destroy($self, qw(cat_rbuf in out pid)); _destroy($self, qw(chk_rbuf in_c out_c pid_c err_c)); !!($self->{pid} || $self->{pid_c}); } + # assuming a well-maintained repo, this should be a somewhat # accurate estimation of its size # TODO: show this in the WWW UI as a hint to potential cloners @@ -394,8 +447,8 @@ sub pub_urls { sub cat_async_begin { my ($self) = @_; - cleanup($self) if 
alternates_changed($self); - batch_prepare($self); + cleanup($self) if $self->alternates_changed; + $self->batch_prepare; die 'BUG: already in async' if $self->{inflight}; $self->{inflight} = []; } @@ -403,24 +456,21 @@ sub cat_async_begin { sub cat_async ($$$;$) { my ($self, $oid, $cb, $arg) = @_; my $inflight = $self->{inflight} // cat_async_begin($self); - if (scalar(@$inflight) >= MAX_INFLIGHT) { + while (scalar(@$inflight) >= MAX_INFLIGHT) { cat_async_step($self, $inflight); } - - print { $self->{out} } $oid, "\n" or fail($self, "write error: $!"); + print { $self->{out} } $oid, "\n" or $self->fail("write error: $!"); push(@$inflight, $oid, $cb, $arg); } -# this is safe to call inside $cb, but not guaranteed to enqueue -# returns true if successful, undef if not. sub async_prefetch { my ($self, $oid, $cb, $arg) = @_; - if (defined($self->{async_cat}) && (my $inflight = $self->{inflight})) { + if (my $inflight = $self->{inflight}) { # we could use MAX_INFLIGHT here w/o the halving, # but lets not allow one client to monopolize a git process if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) { print { $self->{out} } $oid, "\n" or - fail($self, "write error: $!"); + $self->fail("write error: $!"); return push(@$inflight, $oid, $cb, $arg); } } @@ -451,6 +501,56 @@ sub modified ($) { $modified || time; } +# for grokmirror, which doesn't read gitweb.description +# templates/hooks--update.sample and git-multimail in git.git +# only match "Unnamed repository", not the full contents of +# templates/this--description in git.git +sub manifest_entry { + my ($self, $epoch, $default_desc) = @_; + my $fh = $self->popen('show-ref'); + my $dig = Digest::SHA->new(1); + while (read($fh, my $buf, 65536)) { + $dig->add($buf); + } + close $fh or return; # empty, uninitialized git repo + undef $fh; # for open, below + my $git_dir = $self->{git_dir}; + my $ent = { + fingerprint => $dig->hexdigest, + reference => undef, + modified => modified($self), + }; + chomp(my $owner = 
$self->qx('config', 'gitweb.owner')); + utf8::decode($owner); + $ent->{owner} = $owner eq '' ? undef : $owner; + my $desc = ''; + if (open($fh, '<', "$git_dir/description")) { + local $/ = "\n"; + chomp($desc = <$fh>); + utf8::decode($desc); + } + $desc = 'Unnamed repository' if $desc eq ''; + if (defined $epoch && $desc =~ /\AUnnamed repository/) { + $desc = "$default_desc [epoch $epoch]"; + } + $ent->{description} = $desc; + if (open($fh, '<', "$git_dir/objects/info/alternates")) { + # n.b.: GitPython doesn't seem to handle comments or C-quoted + # strings like native git does; and we don't for now, either. + local $/ = "\n"; + chomp(my @alt = <$fh>); + + # grokmirror only supports 1 alternate for "reference", + if (scalar(@alt) == 1) { + my $objdir = "$git_dir/objects"; + my $ref = File::Spec->rel2abs($alt[0], $objdir); + $ref =~ s!/[^/]+/?\z!!; # basename + $ent->{reference} = $ref; + } + } + $ent; +} + 1; __END__ =pod diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index 5f785df7..7d1a13db 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -1,42 +1,88 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # # internal class used by PublicInbox::Git + PublicInbox::DS # This parses the output pipe of "git cat-file --batch" -# -# Note: this does NOT set the non-blocking flag, we expect `git cat-file' -# to be a local process, and git won't start writing a blob until it's -# fully read. So minimize context switching and read as much as possible -# and avoid holding a buffer in our heap any longer than it has to live. 
package PublicInbox::GitAsyncCat; use strict; use parent qw(PublicInbox::DS Exporter); +use POSIX qw(WNOHANG); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(git_async_cat); +our @EXPORT = qw(git_async_cat git_async_prefetch); +use PublicInbox::Git (); + +our $GCF2C; # singleton PublicInbox::Gcf2Client -sub _add { - my ($class, $git) = @_; - $git->batch_prepare; - my $self = bless { git => $git }, $class; - $self->SUPER::new($git->{in}, EPOLLIN|EPOLLET); - \undef; # this is a true ref() +sub close { + my ($self) = @_; + if (my $git = delete $self->{git}) { + $git->cat_async_abort; + } + $self->SUPER::close; # PublicInbox::DS::close } sub event_step { my ($self) = @_; - my $git = $self->{git}; + my $git = $self->{git} or return; return $self->close if ($git->{in} // 0) != ($self->{sock} // 1); my $inflight = $git->{inflight}; if ($inflight && @$inflight) { $git->cat_async_step($inflight); - $self->requeue if @$inflight || exists $git->{cat_rbuf}; + + # child death? + if (($git->{in} // 0) != ($self->{sock} // 1)) { + $self->close; + } elsif (@$inflight || exists $git->{cat_rbuf}) { + # ok, more to do, requeue for fairness + $self->requeue; + } + } elsif ((my $pid = waitpid($git->{pid}, WNOHANG)) > 0) { + # May happen if the child process is killed by a BOFH + # (or segfaults) + delete $git->{pid}; + warn "E: git $pid exited with \$?=$?\n"; + $self->close; } } sub git_async_cat ($$$$) { my ($git, $oid, $cb, $arg) = @_; - $git->cat_async($oid, $cb, $arg); - $git->{async_cat} //= _add(__PACKAGE__, $git); + if ($GCF2C //= eval { + require PublicInbox::Gcf2Client; + PublicInbox::Gcf2Client::new(); + } // 0) { # 0: do not retry if libgit2 or Inline::C are missing + $GCF2C->gcf2_async(\"$oid $git->{git_dir}\n", $cb, $arg); + \undef; + } else { # read-only end of git-cat-file pipe + $git->cat_async($oid, $cb, $arg); + $git->{async_cat} //= do { + my $self = bless { git => $git }, __PACKAGE__; + $git->{in}->blocking(0); + $self->SUPER::new($git->{in}, 
EPOLLIN|EPOLLET); + \undef; # this is a true ref() + }; + } +} + +# this is safe to call inside $cb, but not guaranteed to enqueue +# returns true if successful, undef if not. +sub git_async_prefetch { + my ($git, $oid, $cb, $arg) = @_; + if ($GCF2C) { + if (!$GCF2C->{wbuf}) { + $oid .= " $git->{git_dir}\n"; + return $GCF2C->gcf2_async(\$oid, $cb, $arg); # true + } + } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) { + # we could use MAX_INFLIGHT here w/o the halving, + # but lets not allow one client to monopolize a git process + if (@$inflight < int(PublicInbox::Git::MAX_INFLIGHT/2)) { + print { $git->{out} } $oid, "\n" or + $git->fail("write error: $!"); + return push(@$inflight, $oid, $cb, $arg); + } + } + undef; } 1; diff --git a/lib/PublicInbox/GitCredential.pm b/lib/PublicInbox/GitCredential.pm index c6da6a09..9e193029 100644 --- a/lib/PublicInbox/GitCredential.pm +++ b/lib/PublicInbox/GitCredential.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ package PublicInbox::GitCredential; use strict; diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index fd2e00dd..c179ffef 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # when no endpoints match, fallback to this and serve a static file diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index 20030433..48ed11a5 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # # In public-inbox <=1.5.0, public-inbox-httpd favored "getline" @@ -169,7 +169,7 @@ sub async_blob_cb { # git->cat_async callback if (!defined($oid)) { # it's possible to have TOCTOU if an admin runs # 
public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $self->{-inbox}->{inboxdir}\n"; + warn "E: $smsg->{blob} missing in $self->{ibx}->{inboxdir}\n"; return $http->next_step($self->can('async_next')); } $smsg->{blob} eq $oid or bail($self, "BUG: $smsg->{blob} != $oid"); @@ -180,7 +180,7 @@ sub async_blob_cb { # git->cat_async callback sub smsg_blob { my ($self, $smsg) = @_; - git_async_cat($self->{-inbox}->git, $smsg->{blob}, + git_async_cat($self->{ibx}->git, $smsg->{blob}, \&async_blob_cb, $self); } diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 88020ae8..d0708c5b 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # Generic PSGI server for convenience. It aims to provide diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm index a9f55ff6..b193c9ae 100644 --- a/lib/PublicInbox/HTTPD.pm +++ b/lib/PublicInbox/HTTPD.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # wraps a listen socket for HTTP and links it to the PSGI app in diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm index 87a6a5f9..bd1fd8fa 100644 --- a/lib/PublicInbox/HTTPD/Async.pm +++ b/lib/PublicInbox/HTTPD/Async.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # XXX This is a totally unstable API for public-inbox internal use only diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm index de285fc2..9016db3a 100644 --- a/lib/PublicInbox/HlMod.pm +++ b/lib/PublicInbox/HlMod.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # I have no idea how stable or safe this is for handling untrusted diff --git 
a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index fb21041a..d20f70ae 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # # represents a header value in various forms. Used for HTML generation diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index c9a024d6..226e98a2 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # # Each instance of this represents an IMAP client connected to @@ -627,7 +627,7 @@ sub fetch_blob_cb { # called by git->cat_async via git_async_cat } my $pre; if (!$self->{wbuf} && (my $nxt = $msgs->[0])) { - $pre = $ibx->git->async_prefetch($nxt->{blob}, + $pre = git_async_prefetch($ibx->git, $nxt->{blob}, \&fetch_blob_cb, $fetch_arg); } fetch_run_ops($self, $smsg, $bref, $ops, $partial); @@ -1110,7 +1110,7 @@ sub search_uid_range { # long_response 1; # more } -sub parse_query ($$) { +sub parse_imap_query ($$) { my ($self, $query) = @_; my $q = PublicInbox::IMAPsearchqp::parse($self, $query); if (ref($q)) { @@ -1122,37 +1122,10 @@ sub parse_query ($$) { $q; } -sub refill_xap ($$$$) { - my ($self, $uids, $range_info, $q) = @_; - my ($beg, $end) = @$range_info; - my $srch = $self->{ibx}->search; - my $opt = { mset => 2, limit => 1000 }; - my $mset = $srch->mset("$q uid:$beg..$end", $opt); - @$uids = @{$srch->mset_to_artnums($mset)}; - if (@$uids) { - $range_info->[0] = $uids->[-1] + 1; # update $beg - return; # possibly more - } - 0; # all done -} - -sub search_xap_range { # long_response - my ($self, $tag, $q, $range_info, $want_msn) = @_; - my $uids = []; - if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) { - $err ||= 'OK Search done'; - $self->write("\r\n$tag $err\r\n"); - return; - } - msn_convert($self, $uids) if $want_msn; - 
$self->msg_more(join(' ', '', @$uids)); - 1; # more -} - sub search_common { my ($self, $tag, $query, $want_msn) = @_; my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n"; - my $q = parse_query($self, $query); + my $q = parse_imap_query($self, $query); return "$tag $q\r\n" if !ref($q); my ($sql, $range_info) = delete @$q{qw(sql range_info)}; if (!scalar(keys %$q)) { # overview.sqlite3 @@ -1160,11 +1133,17 @@ sub search_common { long_response($self, \&search_uid_range, $tag, $sql, $range_info, $want_msn); } elsif ($q = $q->{xap}) { - $self->{ibx}->search or + my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; - $self->msg_more('* SEARCH'); - long_response($self, \&search_xap_range, - $tag, $q, $range_info, $want_msn); + my $opt = { + relevance => -1, + limit => UID_SLICE, + uid_range => $range_info + }; + my $mset = $srch->mset($q, $opt); + my $uids = $srch->mset_to_artnums($mset, $opt); + msn_convert($self, $uids) if scalar(@$uids) && $want_msn; + "* SEARCH @$uids\r\n$tag OK Search done\r\n"; } else { "$tag BAD Error\r\n"; } diff --git a/lib/PublicInbox/IMAPD.pm b/lib/PublicInbox/IMAPD.pm index 3c211ee1..7425409d 100644 --- a/lib/PublicInbox/IMAPD.pm +++ b/lib/PublicInbox/IMAPD.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # represents an IMAPD (currently a singleton), @@ -19,33 +19,34 @@ sub new { err => \*STDERR, out => \*STDOUT, # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } - # pi_config => PublicInbox::Config + # pi_cfg => PublicInbox::Config # idler => PublicInbox::InboxIdle }, $class; } -sub imapd_refresh_ibx { # pi_config->each_inbox cb +sub imapd_refresh_ibx { # pi_cfg->each_inbox cb my ($ibx, $imapd) = @_; my $ngname = $ibx->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - return; - } elsif ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! 
|| - $ngname =~ /\.[0-9]+\z/) { + + # We require lower-case since IMAP mailbox names are + # case-insensitive (but -nntpd matches INN in being + # case-sensitive + if ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! || + # don't confuse with 50K slices + $ngname =~ /\.[0-9]+\z/) { warn "mailbox name invalid: newsgroup=`$ngname'\n"; return; } $ibx->over or return; $ibx->{over} = undef; - my $mm = $ibx->mm or return; - $ibx->{mm} = undef; # RFC 3501 2.3.1.1 - "A good UIDVALIDITY value to use in # this case is a 32-bit representation of the creation # date/time of the mailbox" - defined($ibx->{uidvalidity} = $mm->created_at) or return; - PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max // 0); + eval { $ibx->uidvalidity }; + my $mm = delete($ibx->{mm}) or return; + defined($ibx->{uidvalidity}) or return; + PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max); # preload to avoid fragmentation: $ibx->description; @@ -59,7 +60,7 @@ sub imapd_refresh_ibx { # pi_config->each_inbox cb } sub imapd_refresh_finalize { - my ($imapd, $pi_config) = @_; + my ($imapd, $pi_cfg) = @_; my $mailboxes; if (my $next = delete $imapd->{imapd_next}) { $imapd->{mailboxes} = delete $next->{mailboxes}; @@ -77,40 +78,40 @@ sub imapd_refresh_finalize { qq[* LIST (\\Has${no}Children) "." 
$u\r\n] } keys %$mailboxes ]; - $imapd->{pi_config} = $pi_config; + $imapd->{pi_cfg} = $pi_cfg; if (my $idler = $imapd->{idler}) { - $idler->refresh($pi_config); + $idler->refresh($pi_cfg); } } -sub imapd_refresh_step { # pi_config->iterate_start cb - my ($pi_config, $section, $imapd) = @_; +sub imapd_refresh_step { # pi_cfg->iterate_start cb + my ($pi_cfg, $section, $imapd) = @_; if (defined($section)) { return if $section !~ m!\Apublicinbox\.([^/]+)\z!; - my $ibx = $pi_config->lookup_name($1) or return; + my $ibx = $pi_cfg->lookup_name($1) or return; imapd_refresh_ibx($ibx, $imapd->{imapd_next}); } else { # undef == "EOF" - imapd_refresh_finalize($imapd, $pi_config); + imapd_refresh_finalize($imapd, $pi_cfg); } } sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = PublicInbox::Config->new; + my $pi_cfg = PublicInbox::Config->new; if ($sig) { # SIGHUP is handled through the event loop $self->{imapd_next} = { dummies => {}, mailboxes => {} }; - my $iter = PublicInbox::ConfigIter->new($pi_config, + my $iter = PublicInbox::ConfigIter->new($pi_cfg, \&imapd_refresh_step, $self); $iter->event_step; } else { # initial start is synchronous $self->{dummies} = {}; - $pi_config->each_inbox(\&imapd_refresh_ibx, $self); - imapd_refresh_finalize($self, $pi_config); + $pi_cfg->each_inbox(\&imapd_refresh_ibx, $self); + imapd_refresh_finalize($self, $pi_cfg); } } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); } 1; diff --git a/lib/PublicInbox/IMAPTracker.pm b/lib/PublicInbox/IMAPTracker.pm index be9caf76..6d4fb227 100644 --- a/lib/PublicInbox/IMAPTracker.pm +++ b/lib/PublicInbox/IMAPTracker.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ package PublicInbox::IMAPTracker; use strict; diff --git a/lib/PublicInbox/IMAPdeflate.pm b/lib/PublicInbox/IMAPdeflate.pm index b98a069d..d5929ef2 
100644 --- a/lib/PublicInbox/IMAPdeflate.pm +++ b/lib/PublicInbox/IMAPdeflate.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # TODO: reduce duplication from PublicInbox::NNTPdeflate diff --git a/lib/PublicInbox/IMAPsearchqp.pm b/lib/PublicInbox/IMAPsearchqp.pm index 190fefb9..2fb92bb8 100644 --- a/lib/PublicInbox/IMAPsearchqp.pm +++ b/lib/PublicInbox/IMAPsearchqp.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # IMAP search query parser. cf RFC 3501 @@ -124,7 +124,7 @@ sub ON { my ($self, $item) = @_; my $ts = yyyymmdd($item); my $end = $ts + 86399; # no leap day - push @{$self->{xap}}, "ts:$ts..$end"; + push @{$self->{xap}}, "rt:$ts..$end"; my $sql = $self->{sql} or return 1; $$sql .= " AND ts >= $ts AND ts <= $end"; } @@ -132,7 +132,7 @@ sub ON { sub BEFORE { my ($self, $item) = @_; my $ts = yyyymmdd($item); - push @{$self->{xap}}, "ts:..$ts"; + push @{$self->{xap}}, "rt:..$ts"; my $sql = $self->{sql} or return 1; $$sql .= " AND ts <= $ts"; } @@ -140,7 +140,7 @@ sub BEFORE { sub SINCE { my ($self, $item) = @_; my $ts = yyyymmdd($item); - push @{$self->{xap}}, "ts:$ts.."; + push @{$self->{xap}}, "rt:$ts.."; my $sql = $self->{sql} or return 1; $$sql .= " AND ts >= $ts"; } diff --git a/lib/PublicInbox/IPC.pm b/lib/PublicInbox/IPC.pm new file mode 100644 index 00000000..c52441f7 --- /dev/null +++ b/lib/PublicInbox/IPC.pm @@ -0,0 +1,447 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# base class for remote IPC calls and workqueues, requires Storable or Sereal +package PublicInbox::IPC; +use strict; +use v5.10.1; +use Carp qw(confess croak); +use PublicInbox::DS qw(dwaitpid); +use PublicInbox::Spawn; +use POSIX qw(mkfifo WNOHANG); +use Socket qw(AF_UNIX MSG_EOR SOCK_STREAM); +use Errno qw(EMSGSIZE); +use File::Temp 0.19 (); # 0.19 for ->newdir +my $SEQPACKET = eval { Socket::SOCK_SEQPACKET() }; # 
portable enough? +use constant PIPE_BUF => $^O eq 'linux' ? 4096 : POSIX::_POSIX_PIPE_BUF(); +my $WQ_MAX_WORKERS = 4096; +my ($enc, $dec); +# ->imports at BEGIN turns sereal_*_with_object into custom ops on 5.14+ +# and eliminate method call overhead +BEGIN { + eval { + require Sereal::Encoder; + require Sereal::Decoder; + Sereal::Encoder->import('sereal_encode_with_object'); + Sereal::Decoder->import('sereal_decode_with_object'); + ($enc, $dec) = (Sereal::Encoder->new, Sereal::Decoder->new); + }; +}; + +if ($enc && $dec) { # should be custom ops + *freeze = sub ($) { sereal_encode_with_object $enc, $_[0] }; + *thaw = sub ($) { sereal_decode_with_object $dec, $_[0], my $ret }; +} else { + eval { # some distros have Storable as a separate package from Perl + require Storable; + Storable->import(qw(freeze thaw)); + $enc = 1; + } // warn("Storable (part of Perl) missing: $@\n"); +} + +my $recv_cmd = PublicInbox::Spawn->can('recv_cmd4'); +my $send_cmd = PublicInbox::Spawn->can('send_cmd4') // do { + require PublicInbox::CmdIPC4; + $recv_cmd //= PublicInbox::CmdIPC4->can('recv_cmd4'); + PublicInbox::CmdIPC4->can('send_cmd4'); +}; + +sub _get_rec ($) { + my ($r) = @_; + defined(my $len = <$r>) or return; + chop($len) eq "\n" or croak "no LF byte in $len"; + defined(my $n = read($r, my $buf, $len)) or croak "read error: $!"; + $n == $len or croak "short read: $n != $len"; + thaw($buf); +} + +sub _pack_rec ($) { + my ($ref) = @_; + my $buf = freeze($ref); + length($buf) . "\n" . $buf; +} + +sub _send_rec ($$) { + my ($w, $ref) = @_; + print $w _pack_rec($ref) or croak "print: $!"; +} + +sub ipc_return ($$$) { + my ($w, $ret, $exc) = @_; + _send_rec($w, $exc ? 
bless(\$exc, 'PublicInbox::IPC::Die') : $ret); +} + +sub ipc_worker_loop ($$$) { + my ($self, $r_req, $w_res) = @_; + my ($rec, $wantarray, $sub, @args); + local $/ = "\n"; + while ($rec = _get_rec($r_req)) { + ($wantarray, $sub, @args) = @$rec; + # no waiting if client doesn't care, + # this is the overwhelmingly likely case + if (!defined($wantarray)) { + eval { $self->$sub(@args) }; + warn "$$ die: $@ (from nowait $sub)\n" if $@; + } elsif ($wantarray) { + my @ret = eval { $self->$sub(@args) }; + ipc_return($w_res, \@ret, $@); + } else { # '' => wantscalar + my $ret = eval { $self->$sub(@args) }; + ipc_return($w_res, \$ret, $@); + } + } +} + +# starts a worker if Sereal or Storable is installed +sub ipc_worker_spawn { + my ($self, $ident, $oldset) = @_; + return unless $enc; # no Sereal or Storable + return if ($self->{-ipc_ppid} // -1) == $$; # idempotent + delete(@$self{qw(-ipc_req -ipc_res -ipc_ppid -ipc_pid)}); + pipe(my ($r_req, $w_req)) or die "pipe: $!"; + pipe(my ($r_res, $w_res)) or die "pipe: $!"; + my $sigset = $oldset // PublicInbox::DS::block_signals(); + $self->ipc_atfork_prepare; + my $seed = rand(0xffffffff); + my $pid = fork // die "fork: $!"; + if ($pid == 0) { + srand($seed); + eval { PublicInbox::DS->Reset }; + delete @$self{qw(-wq_s1 -wq_workers -wq_ppid)}; + $w_req = $r_res = undef; + $w_res->autoflush(1); + $SIG{$_} = 'IGNORE' for (qw(TERM INT QUIT)); + local $0 = $ident; + PublicInbox::DS::sig_setmask($sigset); + my $on_destroy = $self->ipc_atfork_child; + eval { ipc_worker_loop($self, $r_req, $w_res) }; + die "worker $ident PID:$$ died: $@\n" if $@; + exit; + } + PublicInbox::DS::sig_setmask($sigset) unless $oldset; + $r_req = $w_res = undef; + $w_req->autoflush(1); + $self->{-ipc_req} = $w_req; + $self->{-ipc_res} = $r_res; + $self->{-ipc_ppid} = $$; + $self->{-ipc_pid} = $pid; +} + +sub ipc_worker_reap { # dwaitpid callback + my ($self, $pid) = @_; + # SIGTERM (15) is our default exit signal + warn "PID:$pid died with \$?=$?\n" if $? 
&& ($? & 127) != 15; +} + +sub wq_wait_old { + my ($self) = @_; + my $pids = delete $self->{"-wq_old_pids.$$"} or return; + dwaitpid($_, \&ipc_worker_reap, $self) for @$pids; +} + +# for base class, override in sub classes +sub ipc_atfork_prepare {} + +sub ipc_atfork_child { + my ($self) = @_; + my $io = delete($self->{-ipc_atfork_child_close}) or return; + close($_) for @$io; + undef; +} + +# idempotent, can be called regardless of whether worker is active or not +sub ipc_worker_stop { + my ($self) = @_; + my ($pid, $ppid) = delete(@$self{qw(-ipc_pid -ipc_ppid)}); + my ($w_req, $r_res) = delete(@$self{qw(-ipc_req -ipc_res)}); + if (!$w_req && !$r_res) { + die "unexpected PID:$pid without IPC pipes" if $pid; + return; # idempotent + } + die 'no PID with IPC pipes' unless $pid; + $w_req = $r_res = undef; + + return if $$ != $ppid; + dwaitpid($pid, \&ipc_worker_reap, $self); +} + +# use this if we have multiple readers reading curl or "pigz -dc" +# and writing to the same store +sub ipc_lock_init { + my ($self, $f) = @_; + require PublicInbox::Lock; + $self->{-ipc_lock} //= bless { lock_path => $f }, 'PublicInbox::Lock' +} + +sub ipc_async_wait ($$) { + my ($self, $max) = @_; # max == -1 to wait for all + my $aif = $self->{-async_inflight} or return; + my $r_res = $self->{-ipc_res} or die 'BUG: no ipc_res'; + while (my ($sub, $bytes, $cb, $cb_arg) = splice(@$aif, 0, 4)) { + my $ret = _get_rec($r_res) // + die "no response on $sub (req.size=$bytes)"; + $self->{-async_inflight_bytes} -= $bytes; + + eval { $cb->($cb_arg, $ret) }; + warn "E: $sub callback error: $@\n" if $@; + return if --$max == 0; + } +} + +# call $self->$sub(@args), on a worker if ipc_worker_spawn was used +sub ipc_do { + my ($self, $sub, @args) = @_; + if (my $w_req = $self->{-ipc_req}) { # run in worker + my $ipc_lock = $self->{-ipc_lock}; + my $lock = $ipc_lock ? 
$ipc_lock->lock_for_scope : undef; + if (defined(wantarray)) { + my $r_res = $self->{-ipc_res} or die 'BUG: no ipc_res'; + ipc_async_wait($self, -1); + _send_rec($w_req, [ wantarray, $sub, @args ]); + my $ret = _get_rec($r_res) // die "no response on $sub"; + die $$ret if ref($ret) eq 'PublicInbox::IPC::Die'; + wantarray ? @$ret : $$ret; + } else { # likely, fire-and-forget into pipe + _send_rec($w_req, [ undef , $sub, @args ]); + } + } else { # run locally + $self->$sub(@args); + } +} + +sub ipc_async { + my ($self, $sub, $sub_args, $cb, $cb_arg) = @_; + if (my $w_req = $self->{-ipc_req}) { # run in worker + my $rec = _pack_rec([ 1, $sub, @$sub_args ]); + my $cur_bytes = \($self->{-async_inflight_bytes} //= 0); + while (($$cur_bytes + length($rec)) > PIPE_BUF) { + ipc_async_wait($self, 1); + } + my $ipc_lock = $self->{-ipc_lock}; + my $lock = $ipc_lock ? $ipc_lock->lock_for_scope : undef; + print $w_req $rec or croak "print: $!"; + $$cur_bytes += length($rec); + push @{$self->{-async_inflight}}, + $sub, length($rec), $cb, $cb_arg; + } else { + my $ret = [ eval { $self->$sub(@$sub_args) } ]; + if (my $exc = $@) { + $ret = ( bless(\$exc, 'PublicInbox::IPC::Die') ); + } + eval { $cb->($cb_arg, $ret) }; + warn "E: $sub callback error: $@\n" if $@; + } +} + +# needed when there's multiple IPC workers and the parent forking +# causes newer siblings to inherit older siblings sockets +sub ipc_sibling_atfork_child { + my ($self) = @_; + my ($pid, undef) = delete(@$self{qw(-ipc_pid -ipc_ppid)}); + delete(@$self{qw(-ipc_req -ipc_res)}); + $pid == $$ and die "BUG: $$ ipc_atfork_child called on itself"; +} + +sub _recv_and_run { + my ($self, $s2, $len, $full_stream) = @_; + my @fds = $recv_cmd->($s2, my $buf, $len); + my $n = length($buf // '') or return; + my $nfd = 0; + for my $fd (@fds) { + if (open(my $cmdfh, '+<&=', $fd)) { + $self->{$nfd++} = $cmdfh; + $cmdfh->autoflush(1); + } else { + die "$$ open(+<&=$fd) (FD:$nfd): $!"; + } + } + while ($full_stream && $n < $len) { + 
my $r = sysread($s2, $buf, $len - $n, $n) // croak "read: $!"; + croak "read EOF after $n/$len bytes" if $r == 0; + $n = length($buf); + } + # Sereal dies on truncated data, Storable returns undef + my $args = thaw($buf) // die "thaw error on buffer of size: $n"; + undef $buf; + my $sub = shift @$args; + eval { $self->$sub(@$args) }; + warn "$$ wq_worker: $@" if $@ && ref($@) ne 'PublicInbox::SIGPIPE'; + delete @$self{0..($nfd-1)}; + $n; +} + +sub wq_worker_loop ($) { + my ($self) = @_; + my $len = $self->{wq_req_len} // (4096 * 33); + my $s2 = $self->{-wq_s2} // die 'BUG: no -wq_s2'; + 1 while (_recv_and_run($self, $s2, $len)); +} + +sub do_sock_stream { # via wq_do, for big requests + my ($self, $len) = @_; + _recv_and_run($self, delete $self->{0}, $len, 1); +} + +sub wq_do { # always async + my ($self, $sub, $ios, @args) = @_; + if (my $s1 = $self->{-wq_s1}) { # run in worker + my $fds = [ map { fileno($_) } @$ios ]; + my $n = $send_cmd->($s1, $fds, freeze([$sub, @args]), MSG_EOR); + return if defined($n); + croak "sendmsg error: $!" if $! 
!= EMSGSIZE; + socketpair(my $r, my $w, AF_UNIX, SOCK_STREAM, 0) or + croak "socketpair: $!"; + my $buf = freeze([$sub, @args]); + $n = $send_cmd->($s1, [ fileno($r) ], + freeze(['do_sock_stream', length($buf)]), + MSG_EOR) // croak "sendmsg: $!"; + undef $r; + $n = $send_cmd->($w, $fds, $buf, 0) // croak "sendmsg: $!"; + while ($n < length($buf)) { + my $x = syswrite($w, $buf, length($buf) - $n, $n) // + croak "syswrite: $!"; + croak "syswrite wrote 0 bytes" if $x == 0; + $n += $x; + } + } else { + @$self{0..$#$ios} = @$ios; + eval { $self->$sub(@args) }; + warn "wq_do: $@" if $@ && ref($@) ne 'PublicInbox::SIGPIPE'; + delete @$self{0..$#$ios}; # don't close + } +} + +sub _wq_worker_start ($$) { + my ($self, $oldset) = @_; + my $seed = rand(0xffffffff); + my $pid = fork // die "fork: $!"; + if ($pid == 0) { + srand($seed); + eval { PublicInbox::DS->Reset }; + delete @$self{qw(-wq_s1 -wq_workers -wq_ppid)}; + $SIG{$_} = 'IGNORE' for (qw(PIPE TTOU TTIN)); + $SIG{$_} = 'DEFAULT' for (qw(TERM QUIT INT CHLD)); + local $0 = $self->{-wq_ident}; + PublicInbox::DS::sig_setmask($oldset); + # ensure we properly exit even if warn() dies: + my $end = PublicInbox::OnDestroy->new($$, sub { exit(!!$@) }); + my $on_destroy = $self->ipc_atfork_child; + eval { wq_worker_loop($self) }; + warn "worker $self->{-wq_ident} PID:$$ died: $@" if $@; + undef $on_destroy; + undef $end; # trigger exit + } else { + $self->{-wq_workers}->{$pid} = \undef; + } +} + +# starts workqueue workers if Sereal or Storable is installed +sub wq_workers_start { + my ($self, $ident, $nr_workers, $oldset) = @_; + ($enc && $send_cmd && $recv_cmd && defined($SEQPACKET)) or return; + return if $self->{-wq_s1}; # idempotent + $self->{-wq_s1} = $self->{-wq_s2} = undef; + socketpair($self->{-wq_s1}, $self->{-wq_s2}, AF_UNIX, $SEQPACKET, 0) or + die "socketpair: $!"; + $self->ipc_atfork_prepare; + $nr_workers //= 4; + $nr_workers = $WQ_MAX_WORKERS if $nr_workers > $WQ_MAX_WORKERS; + my $sigset = $oldset // 
PublicInbox::DS::block_signals(); + $self->{-wq_workers} = {}; + $self->{-wq_ident} = $ident; + _wq_worker_start($self, $sigset) for (1..$nr_workers); + PublicInbox::DS::sig_setmask($sigset) unless $oldset; + $self->{-wq_ppid} = $$; +} + +sub wq_worker_incr { # SIGTTIN handler + my ($self, $oldset) = @_; + $self->{-wq_s2} or return; + return if wq_workers($self) >= $WQ_MAX_WORKERS; + $self->ipc_atfork_prepare; + my $sigset = $oldset // PublicInbox::DS::block_signals(); + _wq_worker_start($self, $sigset); + PublicInbox::DS::sig_setmask($sigset) unless $oldset; +} + +sub wq_exit { # wakes up wq_worker_decr_wait + send($_[0]->{-wq_s2}, $$, MSG_EOR) // die "$$ send: $!"; + exit; +} + +sub wq_worker_decr { # SIGTTOU handler, kills first idle worker + my ($self) = @_; + return unless wq_workers($self); + my $s2 = $self->{-wq_s2} // die 'BUG: no wq_s2'; + $self->wq_do('wq_exit', [ $s2, $s2, $s2 ]); + # caller must call wq_worker_decr_wait in main loop +} + +sub wq_worker_decr_wait { + my ($self, $timeout) = @_; + return if $self->{-wq_ppid} != $$; # can't reap siblings or parents + my $s1 = $self->{-wq_s1} // croak 'BUG: no wq_s1'; + vec(my $rin = '', fileno($s1), 1) = 1; + select(my $rout = $rin, undef, undef, $timeout) or + croak 'timed out waiting for wq_exit'; + recv($s1, my $pid, 64, 0) // croak "recv: $!"; + my $workers = $self->{-wq_workers} // croak 'BUG: no wq_workers'; + delete $workers->{$pid} // croak "BUG: PID:$pid invalid"; + dwaitpid($pid, \&ipc_worker_reap, $self); +} + +# set or retrieve number of workers +sub wq_workers { + my ($self, $nr) = @_; + my $cur = $self->{-wq_workers} or return; + if (defined $nr) { + while (scalar(keys(%$cur)) > $nr) { + $self->wq_worker_decr; + $self->wq_worker_decr_wait; + } + $self->wq_worker_incr while scalar(keys(%$cur)) < $nr; + } + scalar(keys(%$cur)); +} + +sub wq_close { + my ($self, $nohang) = @_; + delete @$self{qw(-wq_s1 -wq_s2)} or return; + my $ppid = delete $self->{-wq_ppid} or return; + my $workers = delete 
$self->{-wq_workers} // die 'BUG: no wq_workers'; + return if $ppid != $$; # can't reap siblings or parents + my @pids = map { $_ + 0 } keys %$workers; + if ($nohang) { + push @{$self->{"-wq_old_pids.$$"}}, @pids; + } else { + dwaitpid($_, \&ipc_worker_reap, $self) for @pids; + } +} + +sub wq_kill_old { + my ($self) = @_; + my $pids = $self->{"-wq_old_pids.$$"} or return; + kill 'TERM', @$pids; +} + +sub wq_kill { + my ($self, $sig) = @_; + my $workers = $self->{-wq_workers} or return; + kill($sig // 'TERM', keys %$workers); +} + +sub WQ_MAX_WORKERS { $WQ_MAX_WORKERS } + +sub DESTROY { + my ($self) = @_; + my $ppid = $self->{-wq_ppid}; + wq_kill($self) if $ppid && $ppid == $$; + wq_close($self); + wq_wait_old($self); + ipc_worker_stop($self); +} + +# Sereal doesn't have dclone +sub deep_clone { thaw(freeze($_[-1])) } + +1; diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm index ce75b46a..d5123006 100644 --- a/lib/PublicInbox/IdxStack.pm +++ b/lib/PublicInbox/IdxStack.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # temporary stack for public-inbox-index @@ -6,19 +6,27 @@ package PublicInbox::IdxStack; use v5.10.1; use strict; use Fcntl qw(:seek); -use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*'; +use constant PACK_FMT => eval { pack('Q', 1) } ? 
'A1QQH*H*' : 'A1IIH*H*'; # start off in write-only mode sub new { open(my $io, '+>', undef) or die "open: $!"; + # latest_cmt is still useful when the newest revision is a `d'(elete), + # otherwise we favor $sync->{latest_cmt} for checkpoints and {quit} bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__ } # file_char = [d|m] sub push_rec { - my ($self, $file_char, $at, $ct, $blob_oid) = @_; - my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid); - $self->{rec_size} //= length($rec); + my ($self, $file_char, $at, $ct, $blob_oid, $cmt_oid) = @_; + my $rec = pack(PACK_FMT, $file_char, $at, $ct, $blob_oid, $cmt_oid); + $self->{unpack_fmt} //= do { + my $len = length($cmt_oid); + my $fmt = PACK_FMT; + $fmt =~ s/H\*/H$len/g; + $self->{rec_size} = length($rec); + $fmt; + }; print { $self->{wr} } $rec or die "print: $!"; $self->{tot_size} += length($rec); } @@ -46,7 +54,7 @@ sub pop_rec { my $r = read($io, my $buf, $sz); defined($r) or die "read: $!"; $r == $sz or die "read($r != $sz)"; - unpack(FMT, $buf); + unpack($self->{unpack_fmt}, $buf); } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2cb4896a..8a06a661 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # git fast-import-based ssoma-mda MDA replacement @@ -9,7 +9,7 @@ package PublicInbox::Import; use strict; use parent qw(PublicInbox::Lock); use v5.10.1; -use PublicInbox::Spawn qw(spawn popen_rd); +use PublicInbox::Spawn qw(run_die popen_rd); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; use PublicInbox::Smsg; @@ -19,13 +19,22 @@ use PublicInbox::MDA; use PublicInbox::Eml; use POSIX qw(strftime); +sub default_branch () { + state $default_branch = do { + delete local $ENV{GIT_CONFIG}; + my $r = popen_rd([qw(git config --global init.defaultBranch)]); + chomp(my $h = <$r> // ''); + $h eq '' ? 
'refs/heads/master' : $h; + } +} + sub new { # we can't change arg order, this is documented in POD # and external projects may rely on it: my ($class, $git, $name, $email, $ibx) = @_; - my $ref = 'refs/heads/master'; + my $ref; if ($ibx) { - $ref = $ibx->{ref_head} // 'refs/heads/master'; + $ref = $ibx->{ref_head}; $name //= $ibx->{name}; $email //= $ibx->{-primary_address}; $git //= $ibx->git; @@ -34,7 +43,7 @@ sub new { git => $git, ident => "$name <$email>", mark => 1, - ref => $ref, + ref => $ref // default_branch, ibx => $ibx, path_type => '2/38', # or 'v2' lock_path => "$git->{git_dir}/ssoma.lock", # v2 changes this @@ -46,9 +55,9 @@ sub new { sub gfi_start { my ($self) = @_; - return ($self->{in}, $self->{out}) if $self->{pid}; + return ($self->{in}, $self->{out}) if $self->{in}; - my (@ret, $out_r, $out_w); + my ($in_r, $out_r, $out_w); pipe($out_r, $out_w) or die "pipe failed: $!"; $self->lock_acquire; @@ -56,27 +65,26 @@ sub gfi_start { my ($git, $ref) = @$self{qw(git ref)}; local $/ = "\n"; chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?; if ($self->{path_type} ne '2/38' && $self->{tip}) { local $/ = "\0"; my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref); + die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; chomp @t; $self->{-tree} = { map { $_ => 1 } @t }; } - my @cmd = ('git', "--git-dir=$git->{git_dir}", - qw(fast-import --quiet --done --date-format=raw)); - my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); + $in_r = $self->{in} = $git->popen(qw(fast-import + --quiet --done --date-format=raw), + undef, { 0 => $out_r }); $out_w->autoflush(1); - $self->{in} = $in_r; $self->{out} = $out_w; - $self->{pid} = $pid; $self->{nchg} = 0; - @ret = ($in_r, $out_w); }; if ($@) { $self->lock_release; die $@; } - @ret; + ($in_r, $out_w); } sub wfail () { die "write to fast-import failed: $!" 
} @@ -153,14 +161,14 @@ sub check_remove_v1 { sub checkpoint { my ($self) = @_; - return unless $self->{pid}; + return unless $self->{in}; print { $self->{out} } "checkpoint\n" or wfail; undef; } sub progress { my ($self, $msg) = @_; - return unless $self->{pid}; + return unless $self->{in}; print { $self->{out} } "progress $msg\n" or wfail; readline($self->{in}) eq "progress $msg\n" or die "progress $msg not received\n"; @@ -209,7 +217,7 @@ sub barrier { # used for v2 sub get_mark { my ($self, $mark) = @_; - die "not active\n" unless $self->{pid}; + die "not active\n" unless $self->{in}; my ($r, $w) = $self->gfi_start; print $w "get-mark $mark\n" or wfail; defined(my $oid = <$r>) or die "get-mark failed, need git 2.6.0+\n"; @@ -404,8 +412,10 @@ sub add { # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{raw_bytes} = $n; - $smsg->{-raw_email} = \$raw_email; + $smsg->set_bytes($raw_email, $n); + if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore + return if $oidx->blob_exists($smsg->{blob}); + } } my $ref = $self->{ref}; my $commit = $self->{mark}++; @@ -429,14 +439,7 @@ sub add { $self->{tip} = ":$commit"; } -sub run_die ($;$$) { - my ($cmd, $env, $rdr) = @_; - my $pid = spawn($cmd, $env, $rdr); - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0 or die join(' ', @$cmd) . 
" failed: $?\n"; -} - -my @INIT_FILES = ('HEAD' => "ref: refs/heads/master\n", +my @INIT_FILES = ('HEAD' => undef, # filled in at runtime 'description' => <{git}->{git_dir} if ref($dir); require File::Path; File::Path::mkpath([ map { "$dir/$_" } qw(objects/info refs/heads) ]); - for (my $i = 0; $i < @INIT_FILES; $i++) { - my $f = $dir.'/'.$INIT_FILES[$i++]; + $INIT_FILES[1] //= 'ref: '.default_branch."\n"; + my @fn_contents = @INIT_FILES; + $fn_contents[1] = "ref: refs/heads/$head\n" if defined $head; + while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) { + my $f = $dir.'/'.$fn; next if -f $f; open my $fh, '>', $f or die "open $f: $!"; - print $fh $INIT_FILES[$i] or die "print $f: $!"; + print $fh $contents or die "print $f: $!"; close $fh or die "close $f: $!"; } } @@ -472,10 +478,7 @@ sub done { eval { my $r = delete $self->{in} or die 'BUG: missing {in} when done'; print $w "done\n" or wfail; - my $pid = delete $self->{pid} or - die 'BUG: missing {pid} when done'; - waitpid($pid, 0) == $pid or die 'fast-import did not finish'; - $? 
== 0 or die "fast-import failed: $?"; + close $r or die "fast-import failed: $?"; # ProcessPipe::CLOSE }; my $wait_err = $@; my $nchg = delete $self->{nchg}; diff --git a/lib/PublicInbox/In2Tie.pm b/lib/PublicInbox/In2Tie.pm index 7dee3627..ffe26a44 100644 --- a/lib/PublicInbox/In2Tie.pm +++ b/lib/PublicInbox/In2Tie.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # used to ensure PublicInbox::DS can call fileno() as a function diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index e9efd29d..bee44f8a 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -1,13 +1,13 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # Represents a public-inbox (which may have multiple mailing addresses) package PublicInbox::Inbox; use strict; -use warnings; use PublicInbox::Git; use PublicInbox::MID qw(mid2path); use PublicInbox::Eml; +use List::Util qw(max); # Long-running "git-cat-file --batch" processes won't notice # unlinked packs, so we need to restart those processes occasionally. 
@@ -74,18 +74,8 @@ sub _cleanup_later ($) { $CLEANUP->{"$self"} = $self; } -sub _set_uint ($$$) { - my ($opts, $field, $default) = @_; - my $val = $opts->{$field}; - if (defined $val) { - $val = $val->[-1] if ref($val) eq 'ARRAY'; - $val = undef if $val !~ /\A[0-9]+\z/; - } - $opts->{$field} = $val || $default; -} - sub _set_limiter ($$$) { - my ($self, $pi_config, $pfx) = @_; + my ($self, $pi_cfg, $pfx) = @_; my $lkey = "-${pfx}_limiter"; $self->{$lkey} ||= do { # full key is: publicinbox.$NAME.httpbackendmax @@ -96,7 +86,7 @@ sub _set_limiter ($$$) { require PublicInbox::Qspawn; $lim = PublicInbox::Qspawn::Limiter->new($val); } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { - $lim = $pi_config->limiter($val); + $lim = $pi_cfg->limiter($val); warn "$mkey limiter=$val not found\n" if !$lim; } else { warn "$mkey limiter=$val not understood\n"; @@ -110,14 +100,15 @@ sub new { my $v = $opts->{address} ||= [ 'public-inbox@example.com' ]; my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; - my $pi_config = delete $opts->{-pi_config}; - _set_limiter($opts, $pi_config, 'httpbackend'); - _set_uint($opts, 'feedmax', 25); - $opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'}; - my $dir = $opts->{inboxdir}; - if (defined $dir && -f "$dir/inbox.lock") { - $opts->{version} = 2; + my $pi_cfg = delete $opts->{-pi_cfg}; + _set_limiter($opts, $pi_cfg, 'httpbackend'); + my $fmax = $opts->{feedmax}; + if (defined($fmax) && $fmax =~ /\A[0-9]+\z/) { + $opts->{feedmax} += 0; + } else { + delete $opts->{feedmax}; } + $opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'}; # allow any combination of multi-line or comma-delimited hide entries my $hide = {}; @@ -130,16 +121,18 @@ sub new { bless $opts, $class; } -sub version { $_[0]->{version} // 1 } +sub version { + $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 
2 : 1 +} sub git_epoch { - my ($self, $epoch) = @_; - $self->version == 2 or return; + my ($self, $epoch) = @_; # v2-only, callers always supply $epoch $self->{"$epoch.git"} ||= do { my $git_dir = "$self->{inboxdir}/git/$epoch.git"; + return unless -d $git_dir; my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - # no cleanup needed, we never cat-file off this, only clone + # caller must manually cleanup when done $g; }; } @@ -160,19 +153,15 @@ sub max_git_epoch { my ($self) = @_; return if $self->version < 2; my $cur = $self->{-max_git_epoch}; - my $changed = git($self)->alternates_changed; - if (!defined($cur) || $changed) { + my $changed; + if (!defined($cur) || ($changed = git($self)->alternates_changed)) { git_cleanup($self) if $changed; my $gits = "$self->{inboxdir}/git"; if (opendir my $dh, $gits) { - my $max = -1; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! or next; - $max = $1 if $1 > $max; - } - $cur = $self->{-max_git_epoch} = $max if $max >= 0; - } else { - warn "opendir $gits failed: $!\n"; + my $max = max(map { + substr($_, 0, -4) + 0; # drop ".git" suffix + } grep(/\A[0-9]+\.git\z/, readdir($dh))) // return; + $cur = $self->{-max_git_epoch} = $max; } } $cur; @@ -191,50 +180,54 @@ sub mm { }; } -sub search ($;$$) { - my ($self, $over_only, $ctx) = @_; - my $srch = $self->{search} ||= eval { +sub search { + my ($self) = @_; + my $srch = $self->{search} //= eval { _cleanup_later($self); require PublicInbox::Search; PublicInbox::Search->new($self); }; - ($over_only || eval { $srch->xdb }) ? $srch : do { - $ctx and $ctx->{env}->{'psgi.errors'}->print(<{name}' search went away unexpectedly -EOF - undef; - }; + (eval { $srch->xdb }) ? 
$srch : undef; } +# isrch is preferred for read-only interfaces if available since it +# reduces kernel cache and FD overhead +sub isrch { $_[0]->{isrch} // search($_[0]) } + sub over { $_[0]->{over} //= eval { - my $srch = search($_[0], 1) or return; + my $srch = $_[0]->{search} //= eval { + _cleanup_later($_[0]); + require PublicInbox::Search; + PublicInbox::Search->new($_[0]); + }; my $over = PublicInbox::Over->new("$srch->{xpfx}/over.sqlite3"); $over->dbh; # may fail $over; }; } + sub try_cat { my ($path) = @_; - my $rv = ''; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; - } - $rv; + open(my $fh, '<', $path) or return ''; + local $/; + <$fh> // ''; +} + +sub cat_desc ($) { + my $desc = try_cat($_[0]); + local $/ = "\n"; + chomp $desc; + utf8::decode($desc); + $desc =~ s/\s+/ /smg; + $desc eq '' ? undef : $desc; } sub description { my ($self) = @_; - ($self->{description} //= do { - my $desc = try_cat("$self->{inboxdir}/description"); - local $/ = "\n"; - chomp $desc; - utf8::decode($desc); - $desc =~ s/\s+/ /smg; - $desc eq '' ? 
undef : $desc; - }) // '($INBOX_DIR/description missing)'; + ($self->{description} //= cat_desc("$self->{inboxdir}/description")) // + '($INBOX_DIR/description missing)'; } sub cloneurl { @@ -331,7 +324,7 @@ sub msg_by_smsg ($$) { return unless defined $smsg; defined(my $blob = $smsg->{blob}) or return; - git($self)->cat_file($blob); + $self->git->cat_file($blob); } sub smsg_eml { @@ -342,39 +335,35 @@ sub smsg_eml { $eml; } -sub mid2num($$) { - my ($self, $mid) = @_; - my $mm = mm($self) or return; - $mm->num_for($mid); -} - sub smsg_by_mid ($$) { my ($self, $mid) = @_; - my $over = over($self) or return; - # favor the Message-ID we used for the NNTP article number: - defined(my $num = mid2num($self, $mid)) or return; - my $smsg = $over->get_art($num) or return; - PublicInbox::Smsg::psgi_cull($smsg); + my $over = $self->over or return; + my $smsg; + if (my $mm = $self->mm) { + # favor the Message-ID we used for the NNTP article number: + defined(my $num = $mm->num_for($mid)) or return; + $smsg = $over->get_art($num); + } else { + my ($id, $prev); + $smsg = $over->next_by_mid($mid, \$id, \$prev); + } + $smsg ? PublicInbox::Smsg::psgi_cull($smsg) : undef; } sub msg_by_mid ($$) { my ($self, $mid) = @_; - - over($self) or - return msg_by_path($self, mid2path($mid)); - my $smsg = smsg_by_mid($self, $mid); - $smsg ? msg_by_smsg($self, $smsg) : undef; + $smsg ? 
msg_by_smsg($self, $smsg) : msg_by_path($self, mid2path($mid)); } sub recent { my ($self, $opts, $after, $before) = @_; - over($self)->recent($opts, $after, $before); + $self->over->recent($opts, $after, $before); } sub modified { my ($self) = @_; - if (my $over = over($self)) { + if (my $over = $self->over) { my $msgs = $over->recent({limit => 1}); if (my $smsg = $msgs->[0]) { return $smsg->{ts}; @@ -428,4 +417,8 @@ sub on_unlock { } } +sub uidvalidity { $_[0]->{uidvalidity} //= eval { $_[0]->mm->created_at } } + +sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } + 1; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 60948bea..4d74b354 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -1,14 +1,12 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # fields: -# pi_config: PublicInbox::Config ref # inot: Linux::Inotify2-like object # pathmap => { inboxdir => [ ibx, watch1, watch2, watch3... ] } mapping package PublicInbox::InboxIdle; use strict; use parent qw(PublicInbox::DS); -use Cwd qw(abs_path); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; @@ -23,11 +21,7 @@ require PublicInbox::In2Tie if $ino_cls; sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my ($ibx, $self) = @_; - my $dir = abs_path($ibx->{inboxdir}); - if (!defined($dir)) { - warn "W: $ibx->{inboxdir} not watched: $!\n"; - return; - } + my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; my $lock = "$dir/".($ibx->version >= 2 ? 
'inbox.lock' : 'ssoma.lock'); @@ -65,12 +59,15 @@ I: consider increasing /proc/sys/fs/inotify/max_user_watches } sub refresh { - my ($self, $pi_config) = @_; - $pi_config->each_inbox(\&in2_arm, $self); + my ($self, $pi_cfg) = @_; + $pi_cfg->each_inbox(\&in2_arm, $self); } +# internal API for ease-of-use +sub watch_inbox { in2_arm($_[1], $_[0]) }; + sub new { - my ($class, $pi_config) = @_; + my ($class, $pi_cfg) = @_; my $self = bless {}, $class; my $inot; if ($ino_cls) { @@ -84,7 +81,7 @@ sub new { $self->{inot} = $inot; $self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...] $self->{on_unlock} = {}; # lock path => ibx - refresh($self, $pi_config); + refresh($self, $pi_cfg) if $pi_cfg; PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } @@ -95,7 +92,8 @@ sub event_step { my @events = $self->{inot}->read; # Linux::Inotify2::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { - if (my $ibx = $on_unlock->{$ev->fullname}) { + my $fn = $ev->fullname // next; # cancelled + if (my $ibx = $on_unlock->{$fn}) { $ibx->on_unlock; } } diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 752f1997..982ad6e5 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Extends read-only Inbox for writing @@ -46,12 +46,13 @@ sub _init_v1 { require PublicInbox::Msgmap; my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create $sidx->begin_txn_lazy; + my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); if (defined $skip_artnum) { - my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); $mm->{dbh}->begin_work; $mm->skip_artnum($skip_artnum); $mm->{dbh}->commit; } + undef $mm; # ->created_at set $sidx->commit_txn_lazy; } else { open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or @@ -64,7 +65,6 @@ sub init_inbox { if ($self->version == 1) { my $dir = 
assert_usable_dir($self); PublicInbox::Import::init_bare($dir); - $self->umask_prepare; $self->with_umask(\&_init_v1, $self, $skip_artnum); } else { my $v2w = importer($self); @@ -102,7 +102,7 @@ sub filter { $im->done; } - my @args = (-inbox => $self); + my @args = (ibx => $self); # basic line splitting, only # Perhaps we can have proper quote splitting one day... ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; @@ -259,7 +259,7 @@ sub _umask_for { sub with_umask { my ($self, $cb, @arg) = @_; - my $old = umask $self->{umask}; + my $old = umask($self->{umask} //= umask_prepare($self)); my $rv = eval { $cb->(@arg) }; my $err = $@; umask $old; @@ -270,8 +270,7 @@ sub with_umask { sub umask_prepare { my ($self) = @_; my $perm = _git_config_perm($self); - my $umask = _umask_for($perm); - $self->{umask} = $umask; + _umask_for($perm); } sub cleanup ($) { @@ -287,15 +286,24 @@ sub warn_ignore { # PublicInbox::MsgTime || $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/ || $s =~ /^bad Date: .+? in / + # Encode::Unicode::UTF7 + || $s =~ /^Bad UTF7 data escape at / } # this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." sub warn_ignore_cb { - my $cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $cb = $SIG{__WARN__} // \&CORE::warn; sub { return if warn_ignore(@_); $cb->(@_); } } +# v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove +sub git_dir_latest { + my ($self, $max) = @_; + defined($$max = $self->max_git_epoch) ? + "$self->{inboxdir}/git/$$max.git" : undef; +} + 1; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm new file mode 100644 index 00000000..342d7913 --- /dev/null +++ b/lib/PublicInbox/Isearch.pm @@ -0,0 +1,127 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# Provides everything the PublicInbox::Search object does; +# but uses global ExtSearch (->ALL) with an eidx_key query to +# emulate per-Inbox search using ->ALL. 
+package PublicInbox::Isearch; +use strict; +use v5.10.1; +use PublicInbox::ExtSearch; +use PublicInbox::Search; + +sub new { + my (undef, $ibx, $es) = @_; + bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; +} + +sub _ibx_id ($) { + my ($self) = @_; + my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($self->{eidx_key}); + $sth->fetchrow_array // + die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; +} + + +sub mset { + my ($self, $str, $opt) = @_; + my %opt = $opt ? %$opt : (); + $opt{eidx_key} = $self->{eidx_key}; + if (my $uid_range = $opt{uid_range}) { + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->{over}->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= 0xffffffff; + $r[0] //= 0; + } + $opt{uid_range} = \@r; + } + $self->{es}->mset($str, \%opt); +} + +sub mset_to_artnums { + my ($self, $mset, $opt) = @_; + my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $qmarks = join(',', map { '?' } @$docids); + if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING + my $range = ''; + my @r; + if (my $r = $opt->{uid_range}) { + $range = 'AND xnum >= ? AND xnum <= ?'; + @r = @$r; + } + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r); +SELECT xnum FROM xref3 WHERE ibx_id = ? 
AND docid IN ($qmarks) $range +ORDER BY xnum ASC + + return [ map { $_->[0] } @$rows ]; + } + + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids); +SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) + + my $i = -1; + my %order = map { $_ => ++$i } @$docids; + my @xnums; + for my $row (@$rows) { # @row = ($docid, $xnum) + my $idx = delete($order{$row->[0]}) // next; + $xnums[$idx] = $row->[1]; + } + if (scalar keys %order) { + warn "W: $self->{es}->{topdir} #", + join(', ', sort { $a <=> $b } keys %order), + " not mapped to `$self->{eidx_key}'\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + @xnums = grep { defined } @xnums; + } + \@xnums; +} + +sub mset_to_smsg { + my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx + my $xnums = mset_to_artnums($self, $mset); + my $i = -1; + my %order = map { $_ => ++$i } @$xnums; + my $unordered = $ibx->over->get_all(@$xnums); + my @msgs; + for my $smsg (@$unordered) { + my $idx = delete($order{$smsg->{num}}) // do { + warn "W: $ibx->{inboxdir} #$smsg->{num}\n"; + next; + }; + $msgs[$idx] = $smsg; + } + if (scalar keys %order) { + warn "W: $ibx->{inboxdir} #", + join(', ', sort { $a <=> $b } keys %order), + " no longer valid\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + } + wantarray ? 
($mset->get_matches_estimated, \@msgs) : \@msgs; +} + +sub has_threadid { 1 } + +sub help { $_[0]->{es}->help } + +1; diff --git a/lib/PublicInbox/KQNotify.pm b/lib/PublicInbox/KQNotify.pm index c7740df2..cfea6b1b 100644 --- a/lib/PublicInbox/KQNotify.pm +++ b/lib/PublicInbox/KQNotify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # implements the small subset of Linux::Inotify2 functionality we use diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm new file mode 100644 index 00000000..378113e8 --- /dev/null +++ b/lib/PublicInbox/LEI.pm @@ -0,0 +1,1007 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# Backend for `lei' (local email interface). Unlike the C10K-oriented +# PublicInbox::Daemon, this is designed exclusively to handle trusted +# local clients with read/write access to the FS and use as many +# system resources as the local user has access to. +package PublicInbox::LEI; +use strict; +use v5.10.1; +use parent qw(PublicInbox::DS PublicInbox::LeiExternal + PublicInbox::LeiQuery); +use Getopt::Long (); +use Socket qw(AF_UNIX SOCK_SEQPACKET MSG_EOR pack_sockaddr_un); +use Errno qw(EAGAIN EINTR ECONNREFUSED ENOENT ECONNRESET); +use POSIX (); +use IO::Handle (); +use Fcntl qw(SEEK_SET); +use Sys::Syslog qw(syslog openlog); +use PublicInbox::Config; +use PublicInbox::Syscall qw(SFD_NONBLOCK EPOLLIN EPOLLET); +use PublicInbox::Sigfd; +use PublicInbox::DS qw(now dwaitpid); +use PublicInbox::Spawn qw(spawn popen_rd); +use PublicInbox::OnDestroy; +use Text::Wrap qw(wrap); +use File::Path qw(mkpath); +use File::Spec; +our $quit = \&CORE::exit; +our ($current_lei, $errors_log, $listener); +my ($recv_cmd, $send_cmd); +my $GLP = Getopt::Long::Parser->new; +$GLP->configure(qw(gnu_getopt no_ignore_case auto_abbrev)); +my $GLP_PASS = Getopt::Long::Parser->new; +$GLP_PASS->configure(qw(gnu_getopt no_ignore_case auto_abbrev pass_through)); + +our %PATH2CFG; # 
persistent for socket daemon + +# TBD: this is a documentation mechanism to show a subcommand +# (may) pass options through to another command: +sub pass_through { $GLP_PASS } + +my $OPT; +sub opt_dash ($$) { + my ($spec, $re_str) = @_; # 'limit|n=i', '([0-9]+)' + my ($key) = ($spec =~ m/\A([a-z]+)/g); + my $cb = sub { # Getopt::Long "<>" catch-all handler + my ($arg) = @_; + if ($arg =~ /\A-($re_str)\z/) { + $OPT->{$key} = $1; + } elsif ($arg eq '--') { # "--" arg separator, ignore first + push @{$OPT->{-argv}}, $arg if $OPT->{'--'}++; + # lone (single) dash is handled elsewhere + } elsif (substr($arg, 0, 1) eq '-') { + if ($OPT->{'--'}) { + push @{$OPT->{-argv}}, $arg; + } else { + die "bad argument: $arg\n"; + } + } else { + push @{$OPT->{-argv}}, $arg; + } + }; + ($spec, '<>' => $cb, $GLP_PASS) # for Getopt::Long +} + +sub _store_path ($) { + my ($env) = @_; + File::Spec->rel2abs(($env->{XDG_DATA_HOME} // + ($env->{HOME} // '/nonexistent').'/.local/share') + .'/lei/store', $env->{PWD}); +} + +sub _config_path ($) { + my ($env) = @_; + File::Spec->rel2abs(($env->{XDG_CONFIG_HOME} // + ($env->{HOME} // '/nonexistent').'/.config') + .'/lei/config', $env->{PWD}); +} + +# TODO: generate shell completion + help using %CMD and %OPTDESC +# command => [ positional_args, 1-line description, Getopt::Long option spec ] +our %CMD = ( # sorted in order of importance/use: +'q' => [ 'SEARCH_TERMS...', 'search for messages matching terms', qw( + save-as=s output|mfolder|o=s format|f=s dedupe|d=s thread|t augment|a + sort|s=s reverse|r offset=i remote! local! external! pretty mua-cmd=s + torsocks=s no-torsocks verbose|v since|after=s until|before=s), + PublicInbox::LeiQuery::curl_opt(), opt_dash('limit|n=i', '[0-9]+') ], + +'show' => [ 'MID|OID', 'show a given object (Message-ID or object ID)', + qw(type=s solve! 
format|f=s dedupe|d=s thread|t remote local!), + pass_through('git show') ], + +'add-external' => [ 'URL_OR_PATHNAME', + 'add/set priority of a publicinbox|extindex for extra matches', + qw(boost=i quiet|q) ], +'ls-external' => [ '[FILTER...]', 'list publicinbox|extindex locations', + qw(format|f=s z|0 local remote quiet|q) ], +'forget-external' => [ 'URL_OR_PATHNAME...|--prune', + 'exclude further results from a publicinbox|extindex', + qw(prune quiet|q) ], + +'ls-query' => [ '[FILTER...]', 'list saved search queries', + qw(name-only format|f=s z) ], +'rm-query' => [ 'QUERY_NAME', 'remove a saved search' ], +'mv-query' => [ qw(OLD_NAME NEW_NAME), 'rename a saved search' ], + +'plonk' => [ '--thread|--from=IDENT', + 'exclude mail matching From: or thread from non-Message-ID searches', + qw(stdin| thread|t from|f=s mid=s oid=s) ], +'mark' => [ 'MESSAGE_FLAGS...', + 'set/unset flags on message(s) from stdin', + qw(stdin| oid=s exact by-mid|mid:s) ], +'forget' => [ '[--stdin|--oid=OID|--by-mid=MID]', + "exclude message(s) on stdin from `q' search results", + qw(stdin| oid=s exact by-mid|mid:s quiet|q) ], + +'purge-mailsource' => [ 'URL_OR_PATHNAME|--all', + 'remove imported messages from IMAP, Maildirs, and MH', + qw(exact! all jobs:i indexed) ], + +# code repos are used for `show' to solve blobs from patch mails +'add-coderepo' => [ 'PATHNAME', 'add or set priority of a git code repo', + qw(boost=i) ], +'ls-coderepo' => [ '[FILTER_TERMS...]', + 'list known code repos', qw(format|f=s z) ], +'forget-coderepo' => [ 'PATHNAME', + 'stop using repo to solve blobs from patches', + qw(prune) ], + +'add-watch' => [ '[URL_OR_PATHNAME]', + 'watch for new messages and flag changes', + qw(import! flags! 
interval=s recursive|r exclude=s include=s) ], +'ls-watch' => [ '[FILTER...]', 'list active watches with numbers and status', + qw(format|f=s z) ], +'pause-watch' => [ '[WATCH_NUMBER_OR_FILTER]', qw(all local remote) ], +'resume-watch' => [ '[WATCH_NUMBER_OR_FILTER]', qw(all local remote) ], +'forget-watch' => [ '{WATCH_NUMBER|--prune}', 'stop and forget a watch', + qw(prune) ], + +'import' => [ 'URL_OR_PATHNAME|--stdin', + 'one-shot import/update from URL or filesystem', + qw(stdin| offset=i recursive|r exclude=s include=s !flags), + ], + +'config' => [ '[...]', sub { + 'git-config(1) wrapper for '._config_path($_[0]); + }, qw(config-file|system|global|file|f=s), # for conflict detection + pass_through('git config') ], +'init' => [ '[PATHNAME]', sub { + 'initialize storage, default: '._store_path($_[0]); + }, qw(quiet|q) ], +'daemon-kill' => [ '[-SIGNAL]', 'signal the lei-daemon', + opt_dash('signal|s=s', '[0-9]+|(?:[A-Z][A-Z0-9]+)') ], +'daemon-pid' => [ '', 'show the PID of the lei-daemon' ], +'help' => [ '[SUBCOMMAND]', 'show help' ], + +# XXX do we need this? +# 'git' => [ '[ANYTHING...]', 'git(1) wrapper', pass_through('git') ], + +'reorder-local-store-and-break-history' => [ '[REFNAME]', + 'rewrite git history in an attempt to improve compression', + 'gc!' ], + +# internal commands are prefixed with '_' +'_complete' => [ '[...]', 'internal shell completion helper', + pass_through('everything') ], +); # @CMD + +# switch descriptions, try to keep consistent across commands +# $spec: Getopt::Long option specification +# $spec => [@ALLOWED_VALUES (default is first), $description], +# $spec => $description +# "$SUB_COMMAND TAB $spec" => as above +my $stdin_formats = [ 'IN|auto|raw|mboxrd|mboxcl2|mboxcl|mboxo', + 'specify message input format' ]; +my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ]; + +my %OPTDESC = ( +'help|h' => 'show this built-in help', +'quiet|q' => 'be quiet', +'solve!' 
=> 'do not attempt to reconstruct blobs from emails', +'save-as=s' => ['NAME', 'save a search terms by given name'], + +'type=s' => [ 'any|mid|git', 'disambiguate type' ], + +'dedupe|d=s' => ['STRAT|content|oid|mid|none', + 'deduplication strategy'], +'show thread|t' => 'display entire thread a message belongs to', +'q thread|t' => + 'return all messages in the same thread as the actual match(es)', +'augment|a' => 'augment --output destination instead of clobbering', + +'output|o=s' => [ 'DEST', + "destination (e.g. `/path/to/Maildir', or `-' for stdout)" ], +'mua-cmd|mua=s' => [ 'COMMAND', + "MUA to run on --output Maildir or mbox (e.g. `mutt -f %f'" ], + +'show format|f=s' => [ 'OUT|plain|raw|html|mboxrd|mboxcl2|mboxcl', + 'message/object output format' ], +'mark format|f=s' => $stdin_formats, +'forget format|f=s' => $stdin_formats, +'q format|f=s' => [ 'OUT|maildir|mboxrd|mboxcl2|mboxcl|html|oid|json', + 'specify output format, default depends on --output'], +'ls-query format|f=s' => $ls_format, +'ls-external format|f=s' => $ls_format, + +'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ], +'offset=i' => ['OFF', 'search result offset (default: 0)'], + +'sort|s=s' => [ 'VAL|received,relevance,docid', + "order of results `--output'-dependent"], +'reverse|r' => [ 'reverse search results' ], # like sort(1) + +'boost=i' => 'increase/decrease priority of results (default: 0)', + +'local' => 'limit operations to the local filesystem', +'local!' => 'exclude results from the local filesystem', +'remote' => 'limit operations to those requiring network access', +'remote!' 
=> 'prevent operations requiring network access', + +'mid=s' => 'specify the Message-ID of a message', +'oid=s' => 'specify the git object ID of a message', + +'recursive|r' => 'scan directories/mailboxes/newsgroups recursively', +'exclude=s' => 'exclude mailboxes/newsgroups based on pattern', +'include=s' => 'include mailboxes/newsgroups based on pattern', + +'exact' => 'operate on exact header matches only', +'exact!' => 'rely on content match instead of exact header matches', + +'by-mid|mid:s' => [ 'MID', 'match only by Message-ID, ignoring contents' ], +'jobs:i' => 'set parallelism level', + +# xargs, env, use "-0", git(1) uses "-z". We support z|0 everywhere +'z|0' => 'use NUL \\0 instead of newline (CR) to delimit lines', + +'signal|s=s' => [ 'SIG', 'signal to send lei-daemon (default: TERM)' ], +); # %OPTDESC + +my %CONFIG_KEYS = ( + 'leistore.dir' => 'top-level storage location', +); + +# pronounced "exit": x_it(1 << 8) => exit(1); x_it(13) => SIGPIPE +sub x_it ($$) { + my ($self, $code) = @_; + # make sure client sees stdout before exit + $self->{1}->autoflush(1) if $self->{1}; + dump_and_clear_log(); + if (my $sock = $self->{sock}) { + send($sock, "x_it $code", MSG_EOR); + } elsif (!($code & 127)) { # oneshot, ignore signals + # don't want to end up using $? from child processes + for my $f (qw(lxs l2m)) { + my $wq = delete $self->{$f} or next; + $wq->DESTROY; + } + $quit->($code >> 8); + } +} + +sub puts ($;@) { print { shift->{1} } map { "$_\n" } @_ } + +sub out ($;@) { print { shift->{1} } @_ } + +sub err ($;@) { + my $self = shift; + my $err = $self->{2} // ($self->{pgr} // [])->[2] // *STDERR{IO}; + print $err @_, (substr($_[-1], -1, 1) eq "\n" ? 
() : "\n"); +} + +sub qerr ($;@) { $_[0]->{opt}->{quiet} or err(shift, @_) } + +sub fail ($$;$) { + my ($self, $buf, $exit_code) = @_; + err($self, $buf); + x_it($self, ($exit_code // 1) << 8); + undef; +} + +sub child_error { # passes non-fatal curl exit codes to user + my ($self, $child_error) = @_; # child_error is $? + if (my $sock = $self->{sock}) { # send to lei(1) client + send($sock, "child_error $child_error", MSG_EOR); + } else { # oneshot + $self->{child_error} = $child_error; + } + undef; +} + +sub atfork_prepare_wq { + my ($self, $wq) = @_; + my $tcafc = $wq->{-ipc_atfork_child_close} //= [ $listener // () ]; + if (my $sock = $self->{sock}) { + push @$tcafc, @$self{qw(0 1 2)}, $sock; + } + if (my $pgr = $self->{pgr}) { + push @$tcafc, @$pgr[1,2]; + } + if (my $old_1 = $self->{old_1}) { + push @$tcafc, $old_1; + } + for my $f (qw(lxs l2m)) { + my $ipc = $self->{$f} or next; + push @$tcafc, grep { defined } + @$ipc{qw(-wq_s1 -wq_s2 -ipc_req -ipc_res)}; + } +} + +# usage: my %sig = $lei->atfork_child_wq($wq); +# local @SIG{keys %sig} = values %sig; +sub atfork_child_wq { + my ($self, $wq) = @_; + my ($sock, $l2m_wq_s1); + (@$self{qw(0 1 2)}, $sock, $l2m_wq_s1) = delete(@$wq{0..4}); + $self->{sock} = $sock if -S $sock; + $self->{l2m}->{-wq_s1} = $l2m_wq_s1 if $l2m_wq_s1 && -S $l2m_wq_s1; + %PATH2CFG = (); + undef $errors_log; + $quit = \&CORE::exit; + (__WARN__ => sub { err($self, @_) }, + PIPE => sub { + $self->x_it(13); # SIGPIPE = 13 + # we need to close explicitly to avoid Perl warning on SIGPIPE + for my $i (1, 2) { + next unless $self->{$i} && (-p $self->{$i} || -S _); + close(delete $self->{$i}); + } + # trigger the LeiXSearch $done OpPipe: + syswrite($self->{0}, '!') if $self->{0} && -p $self->{0}; + $SIG{PIPE} = 'DEFAULT'; + die bless(\"$_[0]", 'PublicInbox::SIGPIPE'), + }); +} + +# usage: ($lei, @io) = $lei->atfork_parent_wq($wq); +sub atfork_parent_wq { + my ($self, $wq) = @_; + my $env = delete $self->{env}; # env is inherited at fork + my $ret 
= bless { %$self }, ref($self); + if (my $dedupe = delete $ret->{dedupe}) { + $ret->{dedupe} = $wq->deep_clone($dedupe); + } + $self->{env} = $env; + delete @$ret{qw(-lei_store cfg old_1 pgr lxs)}; # keep l2m + my @io = delete @$ret{0..2}; + $io[3] = delete($ret->{sock}) // $io[2]; + my $l2m = $ret->{l2m}; + if ($l2m && $l2m != $wq) { # $wq == lxs + $io[4] = $l2m->{-wq_s1} if $l2m->{-wq_s1}; + $l2m->wq_close(1); + } + ($ret, @io); +} + +sub _help ($;$) { + my ($self, $errmsg) = @_; + my $cmd = $self->{cmd} // 'COMMAND'; + my @info = @{$CMD{$cmd} // [ '...', '...' ]}; + my @top = ($cmd, shift(@info) // ()); + my $cmd_desc = shift(@info); + $cmd_desc = $cmd_desc->($self->{env}) if ref($cmd_desc) eq 'CODE'; + my @opt_desc; + my $lpad = 2; + for my $sw (grep { !ref } @info) { # ("prio=s", "z", $GLP_PASS) + my $desc = $OPTDESC{"$cmd\t$sw"} // $OPTDESC{$sw} // next; + my $arg_vals = ''; + ($arg_vals, $desc) = @$desc if ref($desc) eq 'ARRAY'; + + # lower-case is a keyword (e.g. `content', `oid'), + # ALL_CAPS is a string description (e.g. `PATH') + if ($desc !~ /default/ && $arg_vals =~ /\b([a-z]+)[,\|]/) { + $desc .= "\ndefault: `$1'"; + } + my (@vals, @s, @l); + my $x = $sw; + if ($x =~ s/!\z//) { # solve! => --no-solve + $x = "no-$x"; + } elsif ($x =~ s/:.+//) { # optional args: $x = "mid:s" + @vals = (' [', undef, ']'); + } elsif ($x =~ s/=.+//) { # required arg: $x = "type=s" + @vals = (' ', undef); + } # else: no args $x = 'thread|t' + for (split(/\|/, $x)) { # help|h + length($_) > 1 ? push(@l, "--$_") : push(@s, "-$_"); + } + if (!scalar(@vals)) { # no args 'thread|t' + } elsif ($arg_vals =~ s/\A([A-Z_]+)\b//) { # "NAME" + $vals[1] = $1; + } else { + $vals[1] = uc(substr($l[0], 2)); # "--type" => "TYPE" + } + if ($arg_vals =~ /([,\|])/) { + my $sep = $1; + my @allow = split(/\Q$sep\E/, $arg_vals); + my $must = $sep eq '|' ? 'Must' : 'Can'; + @allow = map { "`$_'" } @allow; + my $last = pop @allow; + $desc .= "\n$must be one of: " . + join(', ', @allow) . 
" or $last"; + } + my $lhs = join(', ', @s, @l) . join('', @vals); + if ($x =~ /\|\z/) { # "stdin|" or "clear|" + $lhs =~ s/\A--/- , --/; + } else { + $lhs =~ s/\A--/ --/; # pad if no short options + } + $lpad = length($lhs) if length($lhs) > $lpad; + push @opt_desc, $lhs, $desc; + } + my $msg = $errmsg ? "E: $errmsg\n" : ''; + $msg .= <{$errmsg ? 2 : 1} } $msg; + x_it($self, $errmsg ? 1 << 8 : 0); # stderr => failure + undef; +} + +sub optparse ($$$) { + my ($self, $cmd, $argv) = @_; + $self->{cmd} = $cmd; + $OPT = $self->{opt} = {}; + my $info = $CMD{$cmd} // [ '[...]' ]; + my ($proto, undef, @spec) = @$info; + my $glp = ref($spec[-1]) eq ref($GLP) ? pop(@spec) : $GLP; + push @spec, qw(help|h); + my $lone_dash; + if ($spec[0] =~ s/\|\z//s) { # "stdin|" or "clear|" allows "-" alias + $lone_dash = $spec[0]; + $OPT->{$spec[0]} = \(my $var); + push @spec, '' => \$var; + } + $glp->getoptionsfromarray($argv, $OPT, @spec) or + return _help($self, "bad arguments or options for $cmd"); + return _help($self) if $OPT->{help}; + + push @$argv, @{$OPT->{-argv}} if defined($OPT->{-argv}); + + # "-" aliases "stdin" or "clear" + $OPT->{$lone_dash} = ${$OPT->{$lone_dash}} if defined $lone_dash; + + my $i = 0; + my $POS_ARG = '[A-Z][A-Z0-9_]+'; + my ($err, $inf); + my @args = split(/ /, $proto); + for my $var (@args) { + if ($var =~ /\A$POS_ARG\.\.\.\z/o) { # >= 1 args; + $inf = defined($argv->[$i]) and last; + $var =~ s/\.\.\.\z//; + $err = "$var not supplied"; + } elsif ($var =~ /\A$POS_ARG\z/o) { # required arg at $i + $argv->[$i++] // ($err = "$var not supplied"); + } elsif ($var =~ /\.\.\.\]\z/) { # optional args start + $inf = 1; + last; + } elsif ($var =~ /\A\[-?$POS_ARG\]\z/) { # one optional arg + $i++; + } elsif ($var =~ /\A.+?\|/) { # required FOO|--stdin + my @or = split(/\|/, $var); + my $ok; + for my $o (@or) { + if ($o =~ /\A--([a-z0-9\-]+)/) { + $ok = defined($OPT->{$1}); + last; + } elsif (defined($argv->[$i])) { + $ok = 1; + $i++; + last; + } # else continue 
looping + } + last if $ok; + my $last = pop @or; + $err = join(', ', @or) . " or $last must be set"; + } else { + warn "BUG: can't parse `$var' in $proto"; + } + last if $err; + } + if (!$inf && scalar(@$argv) > scalar(@args)) { + $err //= 'too many arguments'; + } + $err ? fail($self, "usage: lei $cmd $proto\nE: $err") : 1; +} + +sub dispatch { + my ($self, $cmd, @argv) = @_; + local $current_lei = $self; # for __WARN__ + dump_and_clear_log("from previous run\n"); + return _help($self, 'no command given') unless defined($cmd); + my $func = "lei_$cmd"; + $func =~ tr/-/_/; + if (my $cb = __PACKAGE__->can($func)) { + optparse($self, $cmd, \@argv) or return; + $cb->($self, @argv); + } elsif (grep(/\A-/, $cmd, @argv)) { # --help or -h only + my $opt = {}; + $GLP->getoptionsfromarray([$cmd, @argv], $opt, qw(help|h)) or + return _help($self, 'bad arguments or options'); + _help($self); + } else { + fail($self, "`$cmd' is not an lei command"); + } +} + +sub _lei_cfg ($;$) { + my ($self, $creat) = @_; + my $f = _config_path($self->{env}); + my @st = stat($f); + my $cur_st = @st ? 
pack('dd', $st[10], $st[7]) : ''; # 10:ctime, 7:size + if (my $cfg = $PATH2CFG{$f}) { # reuse existing object in common case + return ($self->{cfg} = $cfg) if $cur_st eq $cfg->{-st}; + } + if (!@st) { + unless ($creat) { + delete $self->{cfg}; + return; + } + my (undef, $cfg_dir, undef) = File::Spec->splitpath($f); + -d $cfg_dir or mkpath($cfg_dir) or die "mkpath($cfg_dir): $!\n"; + open my $fh, '>>', $f or die "open($f): $!\n"; + @st = stat($fh) or die "fstat($f): $!\n"; + $cur_st = pack('dd', $st[10], $st[7]); + qerr($self, "I: $f created") if $self->{cmd} ne 'config'; + } + my $cfg = PublicInbox::Config::git_config_dump($f); + $cfg->{-st} = $cur_st; + $cfg->{'-f'} = $f; + $self->{cfg} = $PATH2CFG{$f} = $cfg; +} + +sub _lei_store ($;$) { + my ($self, $creat) = @_; + my $cfg = _lei_cfg($self, $creat); + $cfg->{-lei_store} //= do { + require PublicInbox::LeiStore; + my $dir = $cfg->{'leistore.dir'}; + $dir //= _store_path($self->{env}) if $creat; + return unless $dir; + PublicInbox::LeiStore->new($dir, { creat => $creat }); + }; +} + +sub lei_show { + my ($self, @argv) = @_; +} + +sub lei_mark { + my ($self, @argv) = @_; +} + +sub _config { + my ($self, @argv) = @_; + my $env = $self->{env}; + delete local $env->{GIT_CONFIG}; + delete local $ENV{GIT_CONFIG}; + my $cfg = _lei_cfg($self, 1); + my $cmd = [ qw(git config -f), $cfg->{'-f'}, @argv ]; + my %rdr = map { $_ => $self->{$_} } (0..2); + waitpid(spawn($cmd, $env, \%rdr), 0); +} + +sub lei_config { + my ($self, @argv) = @_; + $self->{opt}->{'config-file'} and return fail $self, + "config file switches not supported by `lei config'"; + _config(@_); + x_it($self, $?) 
if $?; +} + +sub lei_init { + my ($self, $dir) = @_; + my $cfg = _lei_cfg($self, 1); + my $cur = $cfg->{'leistore.dir'}; + my $env = $self->{env}; + $dir //= _store_path($env); + $dir = File::Spec->rel2abs($dir, $env->{PWD}); # PWD is symlink-aware + my @cur = stat($cur) if defined($cur); + $cur = File::Spec->canonpath($cur // $dir); + my @dir = stat($dir); + my $exists = "I: leistore.dir=$cur already initialized" if @dir; + if (@cur) { + if ($cur eq $dir) { + _lei_store($self, 1)->done; + return qerr($self, $exists); + } + + # some folks like symlinks and bind mounts :P + if (@dir && "$cur[0] $cur[1]" eq "$dir[0] $dir[1]") { + lei_config($self, 'leistore.dir', $dir); + _lei_store($self, 1)->done; + return qerr($self, "$exists (as $cur)"); + } + return fail($self, <<""); +E: leistore.dir=$cur already initialized and it is not $dir + + } + lei_config($self, 'leistore.dir', $dir); + _lei_store($self, 1)->done; + $exists //= "I: leistore.dir=$dir newly initialized"; + return qerr($self, $exists); +} + +sub lei_daemon_pid { puts shift, $$ } + +sub lei_daemon_kill { + my ($self) = @_; + my $sig = $self->{opt}->{signal} // 'TERM'; + kill($sig, $$) or fail($self, "kill($sig, $$): $!"); +} + +sub lei_help { _help($_[0]) } + +# Shell completion helper. Used by lei-completion.bash and hopefully +# other shells. Try to do as much here as possible to avoid redundancy +# and improve maintainability. +sub lei__complete { + my ($self, @argv) = @_; # argv = qw(lei and any other args...) + shift @argv; # ignore "lei", the entire command is sent + @argv or return puts $self, grep(!/^_/, keys %CMD), qw(--help -h); + my $cmd = shift @argv; + my $info = $CMD{$cmd} // do { # filter matching commands + @argv or puts $self, grep(/\A\Q$cmd\E/, keys %CMD); + return; + }; + my ($proto, undef, @spec) = @$info; + my $cur = pop @argv; + my $re = defined($cur) ? 
qr/\A\Q$cur\E/ : qr/./; + if (substr($cur // '-', 0, 1) eq '-') { # --switches + # gross special case since the only git-config options + # Consider moving to a table if we need more special cases + # we use Getopt::Long for are the ones we reject, so these + # are the ones we don't reject: + if ($cmd eq 'config') { + puts $self, grep(/$re/, keys %CONFIG_KEYS); + @spec = qw(add z|null get get-all unset unset-all + replace-all get-urlmatch + remove-section rename-section + name-only list|l edit|e + get-color-name get-colorbool); + # fall-through + } + # TODO: arg support + puts $self, grep(/$re/, map { # generate short/long names + my $eq = ''; + if (s/=.+\z//) { # required arg, e.g. output|o=i + $eq = '='; + } elsif (s/:.+\z//) { # optional arg, e.g. mid:s + } else { # negation: solve! => no-solve|solve + s/\A(.+)!\z/no-$1|$1/; + } + map { + length > 1 ? "--$_$eq" : "-$_" + } split(/\|/, $_, -1) # help|h + } grep { $OPTDESC{"$cmd\t$_"} || $OPTDESC{$_} } @spec); + } elsif ($cmd eq 'config' && !@argv && !$CONFIG_KEYS{$cur}) { + puts $self, grep(/$re/, keys %CONFIG_KEYS); + } + $cmd =~ tr/-/_/; + if (my $sub = $self->can("_complete_$cmd")) { + puts $self, $sub->($self, @argv, $cur); + } + # TODO: URLs, pathnames, OIDs, MIDs, etc... See optparse() for + # proto parsing. 
+} + +sub reap_exec { # dwaitpid callback + my ($self, $pid) = @_; + x_it($self, $?); +} + +sub lei_git { # support passing through random git commands + my ($self, @argv) = @_; + my %rdr = map { $_ => $self->{$_} } (0..2); + my $pid = spawn(['git', @argv], $self->{env}, \%rdr); + dwaitpid($pid, \&reap_exec, $self); +} + +sub exec_buf ($$) { + my ($argv, $env) = @_; + my $argc = scalar @$argv; + my $buf = 'exec '.join("\0", scalar(@$argv), @$argv); + while (my ($k, $v) = each %$env) { $buf .= "\0$k=$v" }; + $buf; +} + +sub start_mua { + my ($self) = @_; + my $mua = $self->{opt}->{'mua-cmd'} // return; + my $mfolder = $self->{ovv}->{dst}; + my (@cmd, $replaced); + if ($mua =~ /\A(?:mutt|mailx|mail|neomutt)\z/) { + @cmd = ($mua, '-f'); + # TODO: help wanted: other common FOSS MUAs + } else { + require Text::ParseWords; + @cmd = Text::ParseWords::shellwords($mua); # assign the outer @cmd; a fresh "my" here was discarded at block end + # mutt uses '%f' for open-hook with compressed mbox, we follow + @cmd = map { $_ eq '%f' ? ($replaced = $mfolder) : $_ } @cmd; + } + push @cmd, $mfolder unless defined($replaced); # append the folder only when no %f placeholder consumed it + if (my $sock = $self->{sock}) { # lei(1) client process runs it + send($sock, exec_buf(\@cmd, {}), MSG_EOR); + } else { # oneshot + $self->{"mua.pid.$self.$$"} = spawn(\@cmd); + } +} + +# caller needs to "-t $self->{1}" to check if tty +sub start_pager { + my ($self) = @_; + my $env = $self->{env}; + my $fh = popen_rd([qw(git var GIT_PAGER)], $env); + chomp(my $pager = <$fh> // ''); + close($fh) or warn "`git var PAGER' error: \$?=$?"; + return if $pager eq 'cat' || $pager eq ''; + # TODO TIOCGWINSZ + my $new_env = { LESS => 'FRX', LV => '-c', COLUMNS => 80 }; + $new_env->{MORE} = 'FRX' if $^O eq 'freebsd'; + pipe(my ($r, $wpager)) or return warn "pipe: $!"; + my $rdr = { 0 => $r, 1 => $self->{1}, 2 => $self->{2} }; + my $pgr = [ undef, @$rdr{1, 2}, $$ ]; + if (my $sock = $self->{sock}) { # lei(1) process runs it + delete @$new_env{keys %$env}; # only set iff unset + my $fds = [ map { fileno($_) } @$rdr{0..2} ]; +
$send_cmd->($sock, $fds, exec_buf([$pager], $new_env), MSG_EOR); + } else { + $pgr->[0] = spawn([$pager], $new_env, $rdr); + } + $self->{1} = $wpager; + $self->{2} = $wpager if -t $self->{2}; + $env->{GIT_PAGER_IN_USE} = 'true'; # we may spawn git + $self->{pgr} = $pgr; +} + +sub stop_pager { + my ($self) = @_; + my $pgr = delete($self->{pgr}) or return; + $self->{2} = $pgr->[2]; + # do not restore original stdout, just close it so we error out + close(delete($self->{1})) if $self->{1}; + my $pid = $pgr->[0]; + dwaitpid($pid, undef, $self->{sock}) if $pid && $pgr->[3] == $$; +} + +sub accept_dispatch { # Listener {post_accept} callback + my ($sock) = @_; # ignore other + $sock->autoflush(1); + my $self = bless { sock => $sock }, __PACKAGE__; + vec(my $rvec = '', fileno($sock), 1) = 1; + select($rvec, undef, undef, 1) or + return send($sock, 'timed out waiting to recv FDs', MSG_EOR); + my @fds = $recv_cmd->($sock, my $buf, 4096 * 33); # >MAX_ARG_STRLEN + if (scalar(@fds) == 4) { + for my $i (0..3) { + my $fd = shift(@fds); + open($self->{$i}, '+<&=', $fd) and next; + send($sock, "open(+<&=$fd) (FD=$i): $!", MSG_EOR); + } + } else { + return send($sock, "recv_cmd failed: $!", MSG_EOR); + } + $self->{2}->autoflush(1); # keep stdout buffered until x_it|DESTROY + # $ENV_STR = join('', map { "\0$_=$ENV{$_}" } keys %ENV); + # $buf = "$$\0$argc\0".join("\0", @ARGV).$ENV_STR."\0\0"; + substr($buf, -2, 2, '') eq "\0\0" or # s/\0\0\z// + return send($sock, 'request command truncated', MSG_EOR); + my ($argc, @argv) = split(/\0/, $buf, -1); + undef $buf; + my %env = map { split(/=/, $_, 2) } splice(@argv, $argc); + if (chdir(delete($self->{3}))) { + local %ENV = %env; + $self->{env} = \%env; + eval { dispatch($self, @argv) }; + send($sock, $@, MSG_EOR) if $@; + } else { + send($sock, "fchdir: $!", MSG_EOR); # implicit close + } +} + +sub dclose { + my ($self) = @_; + for my $f (qw(lxs l2m)) { + my $wq = delete $self->{$f} or next; + if ($wq->wq_kill) { + $wq->wq_close; # close the work queue that was just signaled, not the lei client object + } 
elsif ($wq->wq_kill_old) { + $wq->wq_wait_old; + } + } + close(delete $self->{1}) if $self->{1}; # may reap_compress + $self->close if $self->{sock}; # PublicInbox::DS::close +} + +# for long-running results +sub event_step { + my ($self) = @_; + local %ENV = %{$self->{env}}; + my $sock = $self->{sock}; + local $current_lei = $self; + eval { + while (my @fds = $recv_cmd->($sock, my $buf, 4096)) { + if (scalar(@fds) == 1 && !defined($fds[0])) { + return if $! == EAGAIN; + next if $! == EINTR; + last if $! == ECONNRESET; + die "recvmsg: $!"; + } + for my $fd (@fds) { + open my $rfh, '+<&=', $fd; + } + die "unrecognized client signal: $buf"; + } + dclose($self); + }; + if (my $err = $@) { + eval { $self->fail($err) }; + dclose($self); + } +} + +sub event_step_init { + my ($self) = @_; + if (my $sock = $self->{sock}) { # using DS->EventLoop + $sock->blocking(0); + $self->SUPER::new($sock, EPOLLIN|EPOLLET); + } +} + +sub noop {} + +our $oldset; sub oldset { $oldset } + +sub dump_and_clear_log { + if (defined($errors_log) && -s STDIN && seek(STDIN, 0, SEEK_SET)) { + my @pfx = @_; + unshift(@pfx, "$errors_log ") if @pfx; + warn @pfx, do { local $/; }; + truncate(STDIN, 0) or warn "ftruncate ($errors_log): $!"; + } +} + +# lei(1) calls this when it can't connect +sub lazy_start { + my ($path, $errno, $narg) = @_; + if ($errno == ECONNREFUSED) { + unlink($path) or die "unlink($path): $!"; + } elsif ($errno != ENOENT) { + $! 
= $errno; # allow interpolation to stringify in die + die "connect($path): $!"; + } + umask(077) // die("umask(077): $!"); + local $listener; + socket($listener, AF_UNIX, SOCK_SEQPACKET, 0) or die "socket: $!"; + bind($listener, pack_sockaddr_un($path)) or die "bind($path): $!"; + listen($listener, 1024) or die "listen: $!"; + my @st = stat($path) or die "stat($path): $!"; + my $dev_ino_expect = pack('dd', $st[0], $st[1]); # dev+ino + local $oldset = PublicInbox::DS::block_signals(); + if ($narg == 5) { + $send_cmd = PublicInbox::Spawn->can('send_cmd4'); + $recv_cmd = PublicInbox::Spawn->can('recv_cmd4') // do { + require PublicInbox::CmdIPC4; + $send_cmd = PublicInbox::CmdIPC4->can('send_cmd4'); + PublicInbox::CmdIPC4->can('recv_cmd4'); + }; + } + $recv_cmd or die <<""; +(Socket::MsgHdr || Inline::C) missing/unconfigured (narg=$narg); + + require PublicInbox::Listener; + require PublicInbox::EOFpipe; + (-p STDOUT) or die "E: stdout must be a pipe\n"; + local $errors_log; + ($errors_log) = ($path =~ m!\A(.+?/)[^/]+\z!); + $errors_log .= 'errors.log'; + open(STDIN, '+>>', $errors_log) or die "open($errors_log): $!"; + STDIN->autoflush(1); + dump_and_clear_log("from previous daemon process:\n"); + POSIX::setsid() > 0 or die "setsid: $!"; + my $pid = fork // die "fork: $!"; + return if $pid; + $0 = "lei-daemon $path"; + local %PATH2CFG; + $listener->blocking(0); + my $exit_code; + my $pil = PublicInbox::Listener->new($listener, \&accept_dispatch); + local $quit = do { + pipe(my ($eof_r, $eof_w)) or die "pipe: $!"; + PublicInbox::EOFpipe->new($eof_r, \&noop, undef); + sub { + $exit_code //= shift; + my $lis = $pil or exit($exit_code); + # closing eof_w triggers \&noop wakeup + $listener = $eof_w = $pil = $path = undef; + $lis->close; # DS::close + PublicInbox::DS->SetLoopTimeout(1000); + }; + }; + my $sig = { + CHLD => \&PublicInbox::DS::enqueue_reap, + QUIT => $quit, + INT => $quit, + TERM => $quit, + HUP => \&noop, + USR1 => \&noop, + USR2 => \&noop, + }; + my $sigfd 
= PublicInbox::Sigfd->new($sig, SFD_NONBLOCK); + local @SIG{keys %$sig} = values(%$sig) unless $sigfd; + undef $sig; + local $SIG{PIPE} = 'IGNORE'; + if ($sigfd) { # TODO: use inotify/kqueue to detect unlinked sockets + undef $sigfd; + PublicInbox::DS->SetLoopTimeout(5000); + } else { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + PublicInbox::DS::sig_setmask($oldset); + PublicInbox::DS->SetLoopTimeout(1000); + } + PublicInbox::DS->SetPostLoopCallback(sub { + my ($dmap, undef) = @_; + if (@st = defined($path) ? stat($path) : ()) { + if ($dev_ino_expect ne pack('dd', $st[0], $st[1])) { + warn "$path dev/ino changed, quitting\n"; + $path = undef; + } + } elsif (defined($path)) { + warn "stat($path): $!, quitting ...\n"; + undef $path; # don't unlink + $quit->(); + } + return 1 if defined($path); + my $now = now(); + my $n = 0; + for my $s (values %$dmap) { + $s->can('busy') or next; + if ($s->busy($now)) { + ++$n; + } else { + $s->close; + } + } + $n; # true: continue, false: stop + }); + + # STDIN was redirected to /dev/null above, closing STDERR and + # STDOUT will cause the calling `lei' client process to finish + # reading the <$daemon> pipe. + openlog($path, 'pid', 'user'); + local $SIG{__WARN__} = sub { + $current_lei ? 
err($current_lei, @_) : syslog('warning', "@_"); + }; + my $on_destroy = PublicInbox::OnDestroy->new($$, sub { + syslog('crit', "$@") if $@; + }); + open STDERR, '>&STDIN' or die "redirect stderr failed: $!"; + open STDOUT, '>&STDIN' or die "redirect stdout failed: $!"; + # $daemon pipe to `lei' closed, main loop begins: + PublicInbox::DS->EventLoop; + @$on_destroy = (); # cancel on_destroy if we get here + exit($exit_code // 0); +} + +# for users w/o Socket::Msghdr installed or Inline::C enabled +sub oneshot { + my ($main_pkg) = @_; + my $exit = $main_pkg->can('exit'); # caller may override exit() + local $quit = $exit if $exit; + local %PATH2CFG; + umask(077) // die("umask(077): $!"); + my $self = bless { + 0 => *STDIN{GLOB}, + 1 => *STDOUT{GLOB}, + 2 => *STDERR{GLOB}, + env => \%ENV + }, __PACKAGE__; + dispatch($self, @ARGV); + x_it($self, $self->{child_error}) if $self->{child_error}; +} + +# ensures stdout hits the FS before sock disconnects so a client +# can immediately reread it +sub DESTROY { + my ($self) = @_; + $self->{1}->autoflush(1) if $self->{1}; + stop_pager($self); + if (my $mua_pid = delete $self->{"mua.pid.$self.$$"}) { + waitpid($mua_pid, 0); + } +} + +1; diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm new file mode 100644 index 00000000..3f478aa4 --- /dev/null +++ b/lib/PublicInbox/LeiDedupe.pm @@ -0,0 +1,131 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ +package PublicInbox::LeiDedupe; +use strict; +use v5.10.1; +use PublicInbox::SharedKV; +use PublicInbox::ContentHash qw(content_hash); + +# n.b. 
mutt sets most of these headers not sure about Bytes +our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); + +# best-effort regeneration of OID when augmenting existing results +sub _regen_oid ($) { + my ($eml) = @_; + my @stash; # stash away headers we shouldn't have in git + for my $k (@OID_IGNORE) { + my @v = $eml->header_raw($k) or next; + push @stash, [ $k, \@v ]; + $eml->header_set($k); # restore below + } + my $dig = Digest::SHA->new(1); # XXX SHA256 later + my $buf = $eml->as_string; + $dig->add('blob '.length($buf)."\0"); + $dig->add($buf); + undef $buf; + + for my $kv (@stash) { # restore stashed headers + my ($k, $v) = @$kv; # $v is the array ref of raw values stashed above + $eml->header_set($k, @$v); # expand it so the original values (not the ref) are restored + } + $dig->digest; +} + +sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } + +sub smsg_hash ($) { + my ($smsg) = @_; + my $dig = Digest::SHA->new(256); + my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); + utf8::encode($x); + $dig->add($x); + $dig->digest; +} + +# the paranoid option +sub dedupe_oid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oid) = @_; + $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(_oidbin($smsg->{blob}), ''); + }); +} + +# dangerous if there's duplicate messages with different Message-IDs +sub dedupe_mid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oid) = @_; + # TODO: lei will support non-public messages w/o Message-ID + my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // + content_hash($eml); + $skv->set_maybe($mid, ''); + }, sub { + my ($smsg) = @_; + my $mid = $smsg->{mid}; + $mid = undef if $mid eq ''; + $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob}); + $skv->set_maybe($mid, ''); + }); +} + +# our default deduplication strategy (used by v2, also) +sub dedupe_content ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml) = @_; # oid = $_[1], ignored +
$skv->set_maybe(content_hash($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(smsg_hash($smsg), ''); + }); +} + +# no deduplication at all +sub true { 1 } +sub dedupe_none ($) { (\&true, \&true) } + +sub new { + my ($cls, $lei) = @_; + my $dd = $lei->{opt}->{dedupe} // 'content'; + my $dst = $lei->{ovv}->{dst}; + + # allow "none" to bypass Eml->new if writing to directory: + return if ($dd eq 'none' && substr($dst // '', -1) eq '/'); + my $m = "dedupe_$dd"; + $cls->can($m) or die "unsupported dedupe strategy: $dd\n"; + my $skv = $dd eq 'none' ? undef : PublicInbox::SharedKV->new; + + # [ $skv, $eml_cb, $smsg_cb, "dedupe_$dd" ] + bless [ $skv, undef, undef, $m ], $cls; +} + +# returns true on unseen messages according to the deduplication strategy, +# returns false if seen +sub is_dup { + my ($self, $eml, $oid) = @_; + !$self->[1]->($eml, $oid); +} + +sub is_smsg_dup { + my ($self, $smsg) = @_; + !$self->[2]->($smsg); +} + +sub prepare_dedupe { + my ($self) = @_; + my $skv = $self->[0]; + $self->[1] or @$self[1,2] = $self->can($self->[3])->($skv); + $skv ? 
$skv->dbh : undef; +} + +sub pause_dedupe { + my ($self) = @_; + my $skv = $self->[0]; + delete($skv->{dbh}) if $skv; +} + +1; diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm new file mode 100644 index 00000000..bf07c41c --- /dev/null +++ b/lib/PublicInbox/LeiExternal.pm @@ -0,0 +1,142 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# *-external commands of lei +package PublicInbox::LeiExternal; +use strict; +use v5.10.1; +use parent qw(Exporter); +our @EXPORT = qw(lei_ls_external lei_add_external lei_forget_external); +use PublicInbox::Config; + +sub _externals_each { + my ($self, $cb, @arg) = @_; + my $cfg = $self->_lei_cfg(0); + my %boost; + for my $sec (grep(/\Aexternal\./, @{$cfg->{-section_order}})) { + my $loc = substr($sec, length('external.')); + $boost{$loc} = $cfg->{"$sec.boost"}; + } + return \%boost if !wantarray && !$cb; + + # highest boost first, but stable for alphabetic tie break + use sort 'stable'; + my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost; + return @order if !$cb; + for my $loc (@order) { + $cb->(@arg, $loc, $boost{$loc}); + } + @order; # scalar or array +} + +sub lei_ls_external { + my ($self, @argv) = @_; + my $out = $self->{1}; + my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); + $self->_externals_each(sub { + my ($loc, $boost_val) = @_; + print $out $loc, $OFS, 'boost=', $boost_val, $ORS; + }); +} + +sub _canonicalize { + my ($location) = @_; + if ($location !~ m!\Ahttps?://!) { + PublicInbox::Config::rel2abs_collapsed($location); + } else { + require URI; + my $uri = URI->new($location)->canonical; + my $path = $uri->path . '/'; + $path =~ tr!/!/!s; # squeeze redundant '/' + $uri->path($path); + $uri->as_string; + } +} + +sub lei_add_external { + my ($self, $location) = @_; + my $cfg = $self->_lei_cfg(1); + my $new_boost = $self->{opt}->{boost} // 0; + $location = _canonicalize($location); + if ($location !~ m!\Ahttps?://! 
&& !-d $location) { + return $self->fail("$location not a directory"); + } + my $key = "external.$location.boost"; + my $cur_boost = $cfg->{$key}; + return if defined($cur_boost) && $cur_boost == $new_boost; # idempotent + $self->lei_config($key, $new_boost); + $self->_lei_store(1)->done; # just create the store +} + +sub lei_forget_external { + my ($self, @locations) = @_; + my $cfg = $self->_lei_cfg(1); + my $quiet = $self->{opt}->{quiet}; + my %seen; + for my $loc (@locations) { + my (@unset, @not_found); + for my $l ($loc, _canonicalize($loc)) { + next if $seen{$l}++; + my $key = "external.$l.boost"; + delete($cfg->{$key}); + $self->_config('--unset', $key); + if ($? == 0) { + push @unset, $l; + } elsif (($? >> 8) == 5) { + push @not_found, $l; + } else { + $self->err("# --unset $key error"); + return $self->x_it($?); + } + } + if (@unset) { + next if $quiet; + $self->err("# $_ gone") for @unset; + } elsif (@not_found) { + $self->err("# $_ not found") for @not_found; + } # else { already exited + } +} + +# shell completion helper called by lei__complete +sub _complete_forget_external { + my ($self, @argv) = @_; + my $cfg = $self->_lei_cfg(0); + my $cur = pop @argv; + # Workaround bash word-splitting URLs to ['https', ':', '//' ...] + # Maybe there's a better way to go about this in + # contrib/completion/lei-completion.bash + my $re = ''; + if (@argv) { + my @x = @argv; + if ($cur eq ':' && @x) { + push @x, $cur; + $cur = ''; + } + while (@x > 2 && $x[0] !~ /\Ahttps?\z/ && $x[1] ne ':') { + shift @x; + } + if (@x >= 2) { # qw(https : hostname : 443) or qw(http :) + $re = join('', @x); + } else { # just filter out the flags and hope for the best + $re = join('', grep(!/^-/, @argv)); + } + $re = quotemeta($re); + } + # FIXME: bash completion off "http:" or "https:" when the last + # character is a colon doesn't work properly even if we're + # returning "//$HTTP_HOST/$PATH_INFO/", not sure why, could + # be a bash issue. 
+ map { + my $x = substr($_, length('external.')); + # only return the part specified on the CLI + if ($x =~ /\A$re(\Q$cur\E.*)/) { + # don't duplicate if already 100% completed + $cur eq $1 ? () : $1; + } else { + (); + } + } grep(/\Aexternal\.$re\Q$cur/, @{$cfg->{-section_order}}); +} + +1; diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm new file mode 100644 index 00000000..928d66cb --- /dev/null +++ b/lib/PublicInbox/LeiOverview.pm @@ -0,0 +1,294 @@ +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ + +# per-mitem/smsg iterators for search results +# "ovv" => "Overview viewer" +package PublicInbox::LeiOverview; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock); +use POSIX qw(strftime); +use Fcntl qw(F_GETFL O_APPEND); +use File::Spec; +use File::Temp (); +use PublicInbox::MID qw($MID_EXTRACT); +use PublicInbox::Address qw(pairs); +use PublicInbox::Config; +use PublicInbox::Search qw(get_pct); +use PublicInbox::LeiDedupe; +use PublicInbox::LeiToMail; + +# cf. https://en.wikipedia.org/wiki/JSON_streaming +my $JSONL = 'ldjson|ndjson|jsonl'; # 3 names for the same thing + +sub _iso8601 ($) { strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($_[0])) } + +# we open this in the parent process before ->wq_do handoff +sub ovv_out_lk_init ($) { + my ($self) = @_; + $self->{tmp_lk_id} = "$self.$$"; + my $tmp = File::Temp->new("lei-ovv.dst.$$.lock-XXXXXX", + TMPDIR => 1, UNLINK => 0); + $self->{lock_path} = $tmp->filename; +} + +sub ovv_out_lk_cancel ($) { + my ($self) = @_; + ($self->{tmp_lk_id}//'') eq "$self.$$" and + unlink(delete($self->{lock_path})); +} + +sub detect_fmt ($$) { + my ($lei, $dst) = @_; + if ($dst =~ m!\A([:/]+://)!) { + $lei->fail("$1 support not implemented, yet\n"); + } elsif (!-e $dst || -d _) { + 'maildir'; # the default TODO: MH? 
+ } elsif (-f _ || -p _) { + $lei->fail("unable to determine mbox family of $dst\n"); + } else { + $lei->fail("unable to determine format of $dst\n"); + } +} + +sub new { + my ($class, $lei) = @_; + my $opt = $lei->{opt}; + my $dst = $opt->{output} // '-'; + $dst = '/dev/stdout' if $dst eq '-'; + + my $fmt = $opt->{'format'}; + $fmt = lc($fmt) if defined $fmt; + if ($dst =~ s/\A([a-z0-9]+)://is) { # e.g. Maildir:/home/user/Mail/ + my $ofmt = lc $1; + $fmt //= $ofmt; + return $lei->fail(<<"") if $fmt ne $ofmt; +--format=$fmt and --output=$ofmt conflict + + } + $fmt //= 'json' if $dst eq '/dev/stdout'; + $fmt //= detect_fmt($lei, $dst) or return; + + if (index($dst, '://') < 0) { # not a URL, so assume path + $dst = File::Spec->canonpath($dst); + } # else URL + + my $self = bless { fmt => $fmt, dst => $dst }, $class; + $lei->{ovv} = $self; + my $json; + if ($fmt =~ /\A($JSONL|(?:concat)?json)\z/) { + $json = $self->{json} = ref(PublicInbox::Config->json); + } + my ($isatty, $seekable); + if ($dst eq '/dev/stdout') { + $isatty = -t $lei->{1}; + $lei->start_pager if $isatty; + $opt->{pretty} //= $isatty; + if (!$isatty && -f _) { + my $fl = fcntl($lei->{1}, F_GETFL, 0) // + return $lei->fail("fcntl(stdout): $!"); + ovv_out_lk_init($self) unless ($fl & O_APPEND); + } else { + ovv_out_lk_init($self); + } + } + if (!$json) { + # default to the cheapest sort since MUA usually resorts + $lei->{opt}->{'sort'} //= 'docid' if $dst ne '/dev/stdout'; + $lei->{l2m} = eval { PublicInbox::LeiToMail->new($lei) }; + return $lei->fail($@) if $@; + } + $lei->{dedupe} //= PublicInbox::LeiDedupe->new($lei); + $self; +} + +# called once by parent +sub ovv_begin { + my ($self, $lei) = @_; + if ($self->{fmt} eq 'json') { + print { $lei->{1} } '['; + } # TODO HTML/Atom/... 
+} + +# called once by parent (via PublicInbox::EOFpipe) +sub ovv_end { + my ($self, $lei) = @_; + my $out = $lei->{1} or return; + if ($self->{fmt} eq 'json') { + # JSON doesn't allow trailing commas, and preventing + # trailing commas is a PITA when parallelizing outputs + print $out "null]\n"; + } elsif ($self->{fmt} eq 'concatjson') { + print $out "\n"; + } +} + +sub ovv_atfork_child { + my ($self) = @_; + # reopen dedupe here +} + +# prepares an smsg for JSON +sub _unbless_smsg { + my ($smsg, $mitem) = @_; + + delete @$smsg{qw(lines bytes num tid)}; + $smsg->{rt} = _iso8601(delete $smsg->{ts}); # JMAP receivedAt + $smsg->{dt} = _iso8601(delete $smsg->{ds}); # JMAP UTCDate + $smsg->{pct} = get_pct($mitem) if $mitem; + if (my $r = delete $smsg->{references}) { + $smsg->{refs} = [ map { "<$_>" } ($r =~ m/$MID_EXTRACT/go) ]; + } + if (my $m = delete($smsg->{mid})) { + $smsg->{'m'} = "<$m>"; + } + for my $f (qw(from to cc)) { + my $v = delete $smsg->{$f} or next; + $smsg->{substr($f, 0, 1)} = pairs($v); + } + $smsg->{'s'} = delete $smsg->{subject}; + # can we be bothered to parse From/To/Cc into arrays? 
+ scalar { %$smsg }; # unbless +} + +sub ovv_atexit_child { + my ($self, $lei) = @_; + if (my $l2m = delete $lei->{l2m}) { + # gracefully stop lei2mail processes after all + # ->write_mail work is complete + delete $l2m->{-wq_s1}; + if (my $rd = delete $l2m->{each_smsg_done}) { + read($rd, my $buf, 1); # wait for EOF + } + } + # order matters, git->{-tmp}->DESTROY must not fire until + # {each_smsg_done} hits EOF above + if (my $git = delete $self->{git}) { + $git->async_wait_all; + } + if (my $bref = delete $lei->{ovv_buf}) { + my $out = $lei->{1} or return; + my $lk = $self->lock_for_scope; + print $out $$bref; + } +} + +# JSON module ->pretty output wastes too much vertical white space, +# this (IMHO) provides better use of screen real-estate while not +# being excessively compact: +sub _json_pretty { + my ($json, $k, $v) = @_; + if (ref $v eq 'ARRAY') { + if (@$v) { + my $sep = ",\n" . (' ' x (length($k) + 7)); + if (ref($v->[0])) { # f/t/c + $v = '[' . join($sep, map { + my $pair = $json->encode($_); + $pair =~ s/(null|"),"/$1, "/g; + $pair; + } @$v) . ']'; + } else { # references + $v = '[' . join($sep, map { + substr($json->encode([$_]), 1, -1); + } @$v) . ']'; + } + } else { + $v = '[]'; + } + } + qq{ "$k": }.$v; +} + +sub ovv_each_smsg_cb { # runs in wq worker usually + my ($self, $lei, $ibxish) = @_; + my $json; + $lei->{1}->autoflush(1); + if (my $pkg = $self->{json}) { + $json = $pkg->new; + $json->utf8->canonical; + $json->ascii(1) if $lei->{opt}->{ascii}; + } + my $l2m = $lei->{l2m}; + if ($l2m && !$ibxish) { # remote https?:// mboxrd + delete $l2m->{-wq_s1}; + my $g2m = $l2m->can('git_to_mail'); + my $wcb = $l2m->write_cb($lei); + sub { + my ($smsg, undef, $eml) = @_; # no mitem in $_[1] + $wcb->(undef, $smsg, $eml); + }; + } elsif ($l2m && $l2m->{-wq_s1}) { + my ($lei_ipc, @io) = $lei->atfork_parent_wq($l2m); + # n.b. 
$io[0] = qry_status_wr, $io[1] = mbox|stdout, + # $io[4] becomes a notification pipe that triggers EOF + # in this wq worker when all outstanding ->write_mail + # calls are complete + die "BUG: \$io[4] $io[4] unexpected" if $io[4]; + pipe($l2m->{each_smsg_done}, $io[4]) or die "pipe: $!"; + fcntl($io[4], 1031, 4096) if $^O eq 'linux'; + delete @$lei_ipc{qw(l2m opt mset_opt cmd)}; + my $git = $ibxish->git; # (LeiXSearch|Inbox|ExtSearch)->git + $self->{git} = $git; + my $git_dir = $git->{git_dir}; + sub { + my ($smsg, $mitem) = @_; + $smsg->{pct} = get_pct($mitem) if $mitem; + $l2m->wq_do('write_mail', \@io, $git_dir, $smsg, + $lei_ipc); + } + } elsif ($l2m) { + my $wcb = $l2m->write_cb($lei); + my $git = $ibxish->git; # (LeiXSearch|Inbox|ExtSearch)->git + $self->{git} = $git; # for ovv_atexit_child + my $g2m = $l2m->can('git_to_mail'); + sub { + my ($smsg, $mitem) = @_; + $smsg->{pct} = get_pct($mitem) if $mitem; + $git->cat_async($smsg->{blob}, $g2m, [ $wcb, $smsg ]); + }; + } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) { + my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; + $lei->{ovv_buf} = \(my $buf = ''); + sub { # DIY prettiness :P + my ($smsg, $mitem) = @_; + $smsg = _unbless_smsg($smsg, $mitem); + $buf .= "{\n"; + $buf .= join(",\n", map { + my $v = $smsg->{$_}; + if (ref($v)) { + _json_pretty($json, $_, $v); + } else { + $v = $json->encode([$v]); + qq{ "$_": }.substr($v, 1, -1); + } + } sort keys %$smsg); + $buf .= $EOR; + if (length($buf) > 65536) { + my $lk = $self->lock_for_scope; + print { $lei->{1} } $buf; + $buf = ''; + } + } + } elsif ($json) { + my $ORS = $self->{fmt} eq 'json' ? ",\n" : "\n"; # JSONL + $lei->{ovv_buf} = \(my $buf = ''); + sub { + my ($smsg, $mitem) = @_; + $buf .= $json->encode(_unbless_smsg(@_)) . $ORS; + if (length($buf) > 65536) { + my $lk = $self->lock_for_scope; + print { $lei->{1} } $buf; + $buf = ''; + } + } + } elsif ($self->{fmt} eq 'oid') { + sub { + my ($smsg, $mitem) = @_; + } + } # else { ... 
+} + +no warnings 'once'; +*DESTROY = \&ovv_out_lk_cancel; + +1; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm new file mode 100644 index 00000000..953d1fc2 --- /dev/null +++ b/lib/PublicInbox/LeiQuery.pm @@ -0,0 +1,119 @@ +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ + +# handles lei commands +package PublicInbox::LeiQuery; +use strict; +use v5.10.1; +use PublicInbox::DS qw(dwaitpid); + +# the main "lei q SEARCH_TERMS" method +sub lei_q { + my ($self, @argv) = @_; + require PublicInbox::LeiXSearch; + require PublicInbox::LeiOverview; + PublicInbox::Config->json; # preload before forking + my $opt = $self->{opt}; + my $lxs = $self->{lxs} = PublicInbox::LeiXSearch->new; + # any number of LeiXSearch || LeiSearch || Inbox + if ($opt->{'local'} //= 1) { # --local is enabled by default + my $sto = $self->_lei_store(1); + $lxs->prepare_external($sto->search); + } + + # --external is enabled by default, but allow --no-external + if ($opt->{external} //= 1) { + my $cb = $lxs->can('prepare_external'); + my $ne = $self->_externals_each($cb, $lxs); + $opt->{remote} //= $ne == $lxs->remotes; + if ($opt->{'local'}) { + delete($lxs->{remotes}) if !$opt->{remote}; + } else { + delete($lxs->{locals}); + } + } + unless ($lxs->locals || $lxs->remotes) { + return $self->fail('no local or remote inboxes to search'); + } + my $xj = $lxs->concurrency($opt); + my $ovv = PublicInbox::LeiOverview->new($self) or return; + $self->atfork_prepare_wq($lxs); + $lxs->wq_workers_start('lei_xsearch', $xj, $self->oldset); + delete $lxs->{-ipc_atfork_child_close}; + if (my $l2m = $self->{l2m}) { + my $mj = 4; # TODO: configurable + $self->atfork_prepare_wq($l2m); + $l2m->wq_workers_start('lei2mail', $mj, $self->oldset); + delete $l2m->{-ipc_atfork_child_close}; + } + + # no forking workers after this + + my %mset_opt = map { $_ => $opt->{$_} } qw(thread limit offset); + $mset_opt{asc} = $opt->{'reverse'} ? 
1 : 0; + $mset_opt{qstr} = join(' ', map {; + # Consider spaces in argv to be for phrase search in Xapian. + # In other words, the users should need only care about + # normal shell quotes and not have to learn Xapian quoting. + /\s/ ? (s/\A(\w+:)// ? qq{$1"$_"} : qq{"$_"}) : $_ + } @argv); + if (defined(my $sort = $opt->{'sort'})) { + if ($sort eq 'relevance') { + $mset_opt{relevance} = 1; + } elsif ($sort eq 'docid') { + $mset_opt{relevance} = $mset_opt{asc} ? -1 : -2; + } elsif ($sort =~ /\Areceived(?:-?[aA]t)?\z/) { + # the default + } else { + die "unrecognized --sort=$sort\n"; + } + } + # descending docid order + $mset_opt{relevance} //= -2 if $opt->{thread}; + $self->{mset_opt} = \%mset_opt; + $ovv->ovv_begin($self); + $lxs->do_query($self); +} + +# Stuff we may pass through to curl (as of 7.64.0), see curl manpage for +# details, so most options which make sense for HTTP/HTTPS (including proxy +# support for Tor and other methods of getting past weird networks). +# Most of these are untested by us, some may not make sense for our use case +# and typos below are likely. +# n.b. some short options (-$NUMBER) are not supported since they conflict +# with other "lei q" switches. +# FIXME: Getopt::Long doesn't easily let us support support options with +# '.' in them (e.g. 
--http1.1) +sub curl_opt { qw( + abstract-unix-socket=s anyauth basic cacert=s capath=s + cert-status cert-type cert|E=s ciphers=s config|K=s@ + connect-timeout=s connect-to=s cookie-jar|c=s cookie|b=s crlfile=s + digest disable dns-interface=s dns-ipv4-addr=s dns-ipv6-addr=s + dns-servers=s doh-url=s egd-file=s engine=s false-start + happy-eyeballs-timeout-ms=s haproxy-protocol header|H=s@ + http2-prior-knowledge http2 insecure|k + interface=s ipv4 ipv6 junk-session-cookies + key-type=s key=s limit-rate=s local-port=s location-trusted location|L + max-redirs=i max-time=s negotiate netrc-file=s netrc-optional netrc + no-alpn no-buffer|N no-npn no-sessionid noproxy=s ntlm-wb ntlm + pass=s pinnedpubkey=s post301 post302 post303 preproxy=s + proxy-anyauth proxy-basic proxy-cacert=s proxy-capath=s + proxy-cert-type=s proxy-cert=s proxy-ciphers=s proxy-crlfile=s + proxy-digest proxy-header=s@ proxy-insecure + proxy-key-type=s proxy-key proxy-negotiate proxy-ntlm proxy-pass=s + proxy-pinnedpubkey=s proxy-service-name=s proxy-ssl-allow-beast + proxy-tls13-ciphers=s proxy-tlsauthtype=s proxy-tlspassword=s + proxy-tlsuser=s proxy-tlsv1 proxy-user|U=s proxy=s + proxytunnel=s pubkey=s random-file=s referer=s resolve=s + retry-connrefused retry-delay=s retry-max-time=s retry=i + sasl-ir service-name=s socks4=s socks4a=s socks5-basic + socks5-gssapi-service-name=s socks5-gssapi socks5-hostname=s socks5=s + speed-limit|Y speed-type|y ssl-allow-beast sslv2 sslv3 + suppress-connect-headers tcp-fastopen tls-max=s + tls13-ciphers=s tlsauthtype=s tlspassword=s tlsuser=s + tlsv1 trace-ascii=s trace-time trace=s + unix-socket=s user-agent|A=s user|u=s +) +} + +1; diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm new file mode 100644 index 00000000..440bacf5 --- /dev/null +++ b/lib/PublicInbox/LeiSearch.pm @@ -0,0 +1,27 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +package PublicInbox::LeiSearch; +use strict; +use v5.10.1; +use parent 
qw(PublicInbox::ExtSearch); +use PublicInbox::Search qw(xap_terms); + +# get combined docid from over.num: +# (not generic Xapian, only works with our sharding scheme) +sub num2docid ($$) { + my ($self, $num) = @_; + my $nshard = $self->{nshard}; + ($num - 1) * $nshard + $num % $nshard + 1; +} + +sub msg_keywords { + my ($self, $num) = @_; # num_or_mitem + my $xdb = $self->xdb; # set {nshard}; + my $docid = ref($num) ? $num->get_docid : num2docid($self, $num); + my $kw = xap_terms('K', $xdb, $docid); + warn "E: #$docid ($num): $@\n" if $@; + wantarray ? sort(keys(%$kw)) : $kw; +} + +1; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm new file mode 100644 index 00000000..a7d7d953 --- /dev/null +++ b/lib/PublicInbox/LeiStore.pm @@ -0,0 +1,240 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ +# +# Local storage (cache/memo) for lei(1), suitable for personal/private +# mail iff on encrypted device/FS. Based on v2, but only deduplicates +# based on git OID. 
+# +# for xref3, the following are constant: $eidx_key = '.', $xnum = -1 +package PublicInbox::LeiStore; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock PublicInbox::IPC); +use PublicInbox::ExtSearchIdx; +use PublicInbox::Import; +use PublicInbox::InboxWritable; +use PublicInbox::V2Writable; +use PublicInbox::ContentHash qw(content_hash content_digest); +use PublicInbox::MID qw(mids mids_in); +use PublicInbox::LeiSearch; +use List::Util qw(max); + +sub new { + my (undef, $dir, $opt) = @_; + my $eidx = PublicInbox::ExtSearchIdx->new($dir, $opt); + my $self = bless { priv_eidx => $eidx }, __PACKAGE__; + eidx_init($self)->done if $opt->{creat}; + $self; +} + +sub git { $_[0]->{priv_eidx}->git } # read-only + +sub packing_factor { $PublicInbox::V2Writable::PACKING_FACTOR } + +sub rotate_bytes { + $_[0]->{rotate_bytes} // ((1024 * 1024 * 1024) / $_[0]->packing_factor) +} + +sub git_pfx { "$_[0]->{priv_eidx}->{topdir}/local" }; + +sub git_epoch_max { + my ($self) = @_; + if (opendir(my $dh, $self->git_pfx)) { + max(map { + substr($_, 0, -4) + 0; # drop ".git" suffix + } grep(/\A[0-9]+\.git\z/, readdir($dh))) // 0; + } else { + $!{ENOENT} ? 0 : die("opendir ${\$self->git_pfx}: $!\n"); + } +} + +sub git_ident ($) { + my ($git) = @_; + chomp(my $i = $git->qx(qw(var GIT_COMMITTER_IDENT))); + warn "$git->{git_dir} GIT_COMMITTER_IDENT failed\n" if $?; + $i =~ /\A(.+) <([^>]+)> [0-9]+ [-\+]?[0-9]+$/ ? 
($1, $2) : + ('lei user', 'x@example.com') +} + +sub importer { + my ($self) = @_; + my $max; + my $im = $self->{im}; + if ($im) { + return $im if $im->{bytes_added} < $self->rotate_bytes; + + delete $self->{im}; + $im->done; + undef $im; + $self->checkpoint; + $max = $self->git_epoch_max + 1; + } + my $pfx = $self->git_pfx; + $max //= $self->git_epoch_max; + while (1) { + my $latest = "$pfx/$max.git"; + my $old = -e $latest; + PublicInbox::Import::init_bare($latest); + my $git = PublicInbox::Git->new($latest); + $git->qx(qw(config core.sharedRepository 0600)) if !$old; + my $packed_bytes = $git->packed_bytes; + my $unpacked_bytes = $packed_bytes / $self->packing_factor; + if ($unpacked_bytes >= $self->rotate_bytes) { + $max++; + next; + } + my ($n, $e) = git_ident($git); + $self->{im} = $im = PublicInbox::Import->new($git, $n, $e); + $im->{bytes_added} = int($packed_bytes / $self->packing_factor); + $im->{lock_path} = undef; + $im->{path_type} = 'v2'; + return $im; + } +} + +sub search { + PublicInbox::LeiSearch->new($_[0]->{priv_eidx}->{topdir}); +} + +sub eidx_init { + my ($self) = @_; + my $eidx = $self->{priv_eidx}; + $eidx->idx_init({-private => 1}); + $eidx; +} + +# when a message has no Message-IDs at all, this is needed for +# unsent Draft messages, at least +sub _fake_mid_for ($$) { + my ($eml, $dig) = @_; + my $mids = mids_in($eml, qw(X-Alt-Message-ID Resent-Message-ID)); + $eml->{-lei_fake_mid} = + $mids->[0] // PublicInbox::Import::digest2mid($dig, $eml); +} + +sub _docids_for ($$) { + my ($self, $eml) = @_; + my %docids; + my $dig = content_digest($eml); + my $chash = $dig->clone->digest; + my $eidx = eidx_init($self); + my $oidx = $eidx->{oidx}; + my $im = $self->{im}; + my $mids = mids($eml); + $mids->[0] //= _fake_mid_for($eml, $dig); + for my $mid (@$mids) { + my ($id, $prev); + while (my $cur = $oidx->next_by_mid($mid, \$id, \$prev)) { + my $oid = $cur->{blob}; + my $docid = $cur->{num}; + my $bref = $im ? 
$im->cat_blob($oid) : undef; + $bref //= $eidx->git->cat_file($oid) // do { + warn "W: $oid (#$docid) <$mid> not found\n"; + next; + }; + local $self->{current_info} = $oid; + my $x = PublicInbox::Eml->new($bref); + $docids{$docid} = $docid if content_hash($x) eq $chash; + } + } + sort { $a <=> $b } values %docids; +} + +sub set_eml_keywords { + my ($self, $eml, @kw) = @_; + my $eidx = eidx_init($self); + my @docids = _docids_for($self, $eml); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('set_keywords', $docid, @kw); + } + \@docids; +} + +sub add_eml_keywords { + my ($self, $eml, @kw) = @_; + my $eidx = eidx_init($self); + my @docids = _docids_for($self, $eml); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('add_keywords', $docid, @kw); + } + \@docids; +} + +sub remove_eml_keywords { + my ($self, $eml, @kw) = @_; + my $eidx = eidx_init($self); + my @docids = _docids_for($self, $eml); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('remove_keywords', $docid, @kw) + } + \@docids; +} + +# cf: https://doc.dovecot.org/configuration_manual/mail_location/mbox/ +my %status2kw = (F => 'flagged', A => 'answered', R => 'seen', T => 'draft'); +# O (old/non-recent), and D (deleted) aren't in JMAP, +# so probably won't be supported by us. +sub mbox_keywords { + my $eml = $_[-1]; + my $s = "@{[$eml->header_raw('X-Status'),$eml->header_raw('Status')]}"; + my %kw; + $s =~ s/([FART])/$kw{$status2kw{$1}} = 1/sge; + sort(keys %kw); +} + +# cf: https://cr.yp.to/proto/maildir.html +my %c2kw = ('D' => 'draft', F => 'flagged', R => 'answered', S => 'seen'); +sub maildir_keywords { + $_[-1] =~ /:2,([A-Z]+)\z/i ? 
+ sort(map { $c2kw{$_} // () } split(//, $1)) : (); +} + +sub add_eml { + my ($self, $eml, @kw) = @_; + my $eidx = eidx_init($self); + my $oidx = $eidx->{oidx}; + my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; + my $im = $self->importer; + $im->add($eml, undef, $smsg) or return; # duplicate returns undef + + local $self->{current_info} = $smsg->{blob}; + if (my @docids = _docids_for($self, $eml)) { + for my $docid (@docids) { + my $idx = $eidx->idx_shard($docid); + $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); + # add_eidx_info for List-Id + $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + $idx->ipc_do('add_keywords', $docid, @kw) if @kw; + } + \@docids; + } else { + $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); + $oidx->add_overview($eml, $smsg); + $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.'); + my $idx = $eidx->idx_shard($smsg->{num}); + $idx->index_eml($eml, $smsg); + $idx->ipc_do('add_keywords', $smsg->{num}, @kw) if @kw; + $smsg; + } +} + +sub set_eml { + my ($self, $eml, @kw) = @_; + add_eml($self, $eml, @kw) // set_eml_keywords($self, $eml, @kw); +} + +sub done { + my ($self) = @_; + my $err = ''; + if (my $im = delete($self->{im})) { + eval { $im->done }; + if ($@) { + $err .= "import done: $@\n"; + warn $err; + } + } + $self->{priv_eidx}->done; + die $err if $err; +} + +1; diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm new file mode 100644 index 00000000..08a1570d --- /dev/null +++ b/lib/PublicInbox/LeiToMail.pm @@ -0,0 +1,500 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir +package PublicInbox::LeiToMail; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::Eml; +use PublicInbox::Lock; +use PublicInbox::ProcessPipe; +use PublicInbox::Spawn qw(which spawn popen_rd); +use PublicInbox::LeiDedupe; +use PublicInbox::OnDestroy; +use Symbol qw(gensym); +use IO::Handle; # 
->autoflush +use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); +use Errno qw(EEXIST ESPIPE ENOENT); +use PublicInbox::Git; + +my %kw2char = ( # Maildir characters + draft => 'D', + flagged => 'F', + answered => 'R', + seen => 'S' +); + +my %kw2status = ( + flagged => [ 'X-Status' => 'F' ], + answered => [ 'X-Status' => 'A' ], + seen => [ 'Status' => 'R' ], + draft => [ 'X-Status' => 'T' ], +); + +sub _mbox_hdr_buf ($$$) { + my ($eml, $type, $smsg) = @_; + $eml->header_set($_) for (qw(Lines Bytes Content-Length)); + + # Messages are always 'O' (non-\Recent in IMAP), it saves + # MUAs the trouble of rewriting the mbox if no other + # changes are made + my %hdr = (Status => [ 'O' ]); # set Status, X-Status + for my $k (@{$smsg->{kw} // []}) { + if (my $ent = $kw2status{$k}) { + push @{$hdr{$ent->[0]}}, $ent->[1]; + } else { # X-Label? + warn "TODO: keyword `$k' not supported for mbox\n"; + } + } + while (my ($name, $chars) = each %hdr) { + $eml->header_set($name, join('', sort @$chars)); + } + my $buf = delete $eml->{hdr}; + + # fixup old bug from import (pre-a0c07cba0e5d8b6a) + $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $ident = $smsg->{blob} // 'lei'; + if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } + + substr($$buf, 0, 0, # prepend From line + "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); + $buf; +} + +sub atomic_append { # for on-disk destinations (O_APPEND, or O_EXCL) + my ($fh, $buf) = @_; + defined(my $w = syswrite($fh, $$buf)) or die "write: $!"; + $w == length($$buf) or die "short write: $w != ".length($$buf); +} + +sub _print_full { + my ($fh, $buf) = @_; + print $fh $$buf or die "print: $!"; +} + +sub eml2mboxrd ($;$) { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg); + if (my $bdy = delete $eml->{bdy}) { + $$bdy =~ s/^(>*From )/>$1/gm; + $$buf .= $eml->{crlf}; + substr($$bdy, 0, 0, $$buf); # prepend header + $buf = $bdy; + } + $$buf .= $eml->{crlf}; + $buf; +} + +sub eml2mboxo { + my ($eml, 
$smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg); + if (my $bdy = delete $eml->{bdy}) { + $$bdy =~ s/^From />From /gm; + $$buf .= $eml->{crlf}; + substr($$bdy, 0, 0, $$buf); # prepend header + $buf = $bdy; + } + $$buf .= $eml->{crlf}; + $buf; +} + +sub _mboxcl_common ($$$) { + my ($buf, $bdy, $crlf) = @_; + # add Lines: so mutt won't have to add it on MUA close + my $lines = $$bdy =~ tr!\n!\n!; + $$buf .= 'Content-Length: '.length($$bdy).$crlf. + 'Lines: '.$lines.$crlf.$crlf; + substr($$bdy, 0, 0, $$buf); # prepend header + $_[0] = $bdy; +} + +# mboxcl still escapes "From " lines +sub eml2mboxcl { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); + my $crlf = $eml->{crlf}; + if (my $bdy = delete $eml->{bdy}) { + $$bdy =~ s/^From />From /gm; + _mboxcl_common($buf, $bdy, $crlf); + } + $$buf .= $crlf; + $buf; +} + +# mboxcl2 has no "From " escaping +sub eml2mboxcl2 { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); + my $crlf = $eml->{crlf}; + if (my $bdy = delete $eml->{bdy}) { + _mboxcl_common($buf, $bdy, $crlf); + } + $$buf .= $crlf; + $buf; +} + +sub git_to_mail { # git->cat_async callback + my ($bref, $oid, $type, $size, $arg) = @_; + if ($type ne 'blob') { + if ($type eq 'missing') { + warn "missing $oid\n"; + } else { + warn "unexpected type=$type for $oid\n"; + } + } + my ($write_cb, $smsg) = @$arg; + if ($smsg->{blob} ne $oid) { + die "BUG: expected=$smsg->{blob} got=$oid"; + } + $write_cb->($bref, $smsg) if $size > 0; +} + +sub reap_compress { # dwaitpid callback + my ($lei, $pid) = @_; + my $cmd = delete $lei->{"pid.$pid"}; + return if $? == 0; + $lei->fail("@$cmd failed", $? 
>> 8); +} + +# all of these support -c for stdout and -d for decompression, +# mutt is commonly distributed with hooks for gz, bz2 and xz, at least +# { foo => '' } means "--foo" is passed to the command-line, +# otherwise { foo => '--bar' } passes "--bar" +our %zsfx2cmd = ( + gz => [ qw(GZIP pigz gzip), { rsyncable => '', threads => '-p' } ], + bz2 => [ 'bzip2', {} ], + xz => [ 'xz', { threads => '-T' } ], + # XXX does anybody care for these? I prefer zstd on entire FSes, + # so it's probably not necessary on a per-file basis + # zst => [ 'zstd', { -default => [ qw(-q) ], # it's noisy by default + # rsyncable => '', threads => '-T' } ], + # zz => [ 'pigz', { -default => [ '--zlib' ], + # rsyncable => '', threads => '-p' }], + # lzo => [ 'lzop', {} ], + # lzma => [ 'lzma', {} ], +); + +sub zsfx2cmd ($$$) { + my ($zsfx, $decompress, $lei) = @_; + my $x = $zsfx2cmd{$zsfx} // die "no support for suffix=.$zsfx"; + my @info = @$x; + my $cmd_opt = pop @info; + my @cmd = (undef, $decompress ? qw(-dc) : qw(-c)); + for my $exe (@info) { + # I think respecting client's ENV{GZIP} is OK, not sure + # about ENV overrides for other, less-common compressors + if ($exe eq uc($exe)) { + $exe = $lei->{env}->{$exe} or next; + } + $cmd[0] = which($exe) and last; + } + $cmd[0] // die join(' or ', @info)." missing for .$zsfx"; + # push @cmd, @{$cmd_opt->{-default}} if $cmd_opt->{-default}; + for my $bool (qw(rsyncable)) { + my $switch = $cmd_opt->{rsyncable} // next; + push @cmd, '--'.($switch || $bool); + } + for my $key (qw(threads)) { # support compression level? 
+ my $switch = $cmd_opt->{$key} // next; + my $val = $lei->{opt}->{$key} // next; + push @cmd, $switch, $val; + } + \@cmd; +} + +sub _post_augment_mbox { # open a compressor process + my ($self, $lei, $zpipe) = @_; + my $zsfx = $self->{zsfx} or return; + my $cmd = zsfx2cmd($zsfx, undef, $lei); + my ($r, $w) = splice(@$zpipe, 0, 2); + my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2} }; + my $pid = spawn($cmd, $lei->{env}, $rdr); + my $pp = gensym; + my $dup = bless { "pid.$pid" => $cmd }, ref($lei); + $dup->{$_} = $lei->{$_} for qw(2 sock); + tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $dup; + $lei->{1} = $pp; + die 'BUG: unexpected {ovv}->{lock_path}' if $lei->{ovv}->{lock_path}; + $lei->{ovv}->ovv_out_lk_init; +} + +sub decompress_src ($$$) { + my ($in, $zsfx, $lei) = @_; + my $cmd = zsfx2cmd($zsfx, 1, $lei); + popen_rd($cmd, $lei->{env}, { 0 => $in, 2 => $lei->{2} }); +} + +sub dup_src ($) { + my ($in) = @_; + open my $dup, '+>>&', $in or die "dup: $!"; + $dup; +} + +# --augment existing output destination, with deduplication +sub _augment { # MboxReader eml_cb + my ($eml, $lei) = @_; + # ignore return value, just populate the skv + $lei->{dedupe}->is_dup($eml); +} + +sub _mbox_write_cb ($$) { + my ($self, $lei) = @_; + my $ovv = $lei->{ovv}; + my $m = 'eml2'.$ovv->{fmt}; + my $eml2mbox = $self->can($m) or die "$self->$m missing"; + my $out = $lei->{1} // die "no stdout ($m, $ovv->{dst})"; # redirected earlier + $out->autoflush(1); + my $write = $ovv->{lock_path} ? 
\&_print_full : \&atomic_append; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + return unless $out; + $eml //= PublicInbox::Eml->new($buf); + if (!$dedupe->is_dup($eml, $smsg->{blob})) { + $buf = $eml2mbox->($eml, $smsg); + my $lk = $ovv->lock_for_scope; + eval { $write->($out, $buf) }; + if ($@) { + die $@ if ref($@) ne 'PublicInbox::SIGPIPE'; + undef $out + } + } + } +} + +sub _maildir_each_file ($$;@) { + my ($dir, $cb, @arg) = @_; + for my $d (qw(new/ cur/)) { + my $pfx = $dir.$d; + opendir my $dh, $pfx or next; + while (defined(my $fn = readdir($dh))) { + $cb->($pfx.$fn, @arg) if $fn =~ /:2,[A-Za-z]*\z/; + } + } +} + +sub _augment_file { # _maildir_each_file cb + my ($f, $lei) = @_; + my $eml = PublicInbox::InboxWritable::eml_from_path($f) or return; + _augment($eml, $lei); +} + +# _maildir_each_file callback, \&CORE::unlink doesn't work with it +sub _unlink { unlink($_[0]) } + +sub _rand () { + state $seq = 0; + sprintf('%x,%x,%x,%x', rand(0xffffffff), time, $$, ++$seq); +} + +sub _buf2maildir { + my ($dst, $buf, $smsg) = @_; + my $kw = $smsg->{kw} // []; + my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw)); + my $rand = ''; # chosen by die roll :P + my ($tmp, $fh, $final); + my $common = $smsg->{blob} // _rand; + if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" } + do { + $tmp = $dst.'tmp/'.$rand.$common; + } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) && + $! == EEXIST && ($rand = _rand.',')); + if (print $fh $$buf and close($fh)) { + # ignore new/ and write only to cur/, otherwise MUAs + # with R/W access to the Maildir will end up doing + # a mass rename which can take a while with thousands + # of messages. + $dst .= 'cur/'; + $rand = ''; + do { + $final = $dst.$rand.$common.':2,'.$sfx; + } while (!link($tmp, $final) && $! 
== EEXIST && + ($rand = _rand.',')); + unlink($tmp) or warn "W: failed to unlink $tmp: $!\n"; + } else { + my $err = $!; + unlink($tmp); + die "Error writing $smsg->{blob} to $dst: $err"; + } +} + +sub _maildir_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe; + my $dst = $lei->{ovv}->{dst}; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + $buf //= \($eml->as_string); + return _buf2maildir($dst, $buf, $smsg) if !$dedupe; + $eml //= PublicInbox::Eml->new($$buf); # copy buf + return if $dedupe->is_dup($eml, $smsg->{blob}); + undef $eml; + _buf2maildir($dst, $buf, $smsg); + } +} + +sub write_cb { # returns a callback for git_to_mail + my ($self, $lei) = @_; + # _mbox_write_cb or _maildir_write_cb + my $m = "_$self->{base_type}_write_cb"; + $self->$m($lei); +} + +sub new { + my ($cls, $lei) = @_; + my $fmt = $lei->{ovv}->{fmt}; + my $dst = $lei->{ovv}->{dst}; + my $self = bless {}, $cls; + if ($fmt eq 'maildir') { + $self->{base_type} = 'maildir'; + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + } elsif (substr($fmt, 0, 4) eq 'mbox') { + (-d $dst || (-e _ && !-w _)) and die + "$dst exists and is not a writable file\n"; + $self->can("eml2$fmt") or die "bad mbox --format=$fmt\n"; + $self->{base_type} = 'mbox'; + } else { + die "bad mail --format=$fmt\n"; + } + $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei); + $self; +} + +sub _pre_augment_maildir {} # noop + +sub _do_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + if ($lei->{opt}->{augment}) { + my $dedupe = $lei->{dedupe}; + if ($dedupe && $dedupe->prepare_dedupe) { + require PublicInbox::InboxWritable; # eml_from_path + _maildir_each_file($dst, \&_augment_file, $lei); + $dedupe->pause_dedupe; + } + } else { # clobber existing Maildir + _maildir_each_file($dst, \&_unlink); + } +} + +sub _post_augment_maildir { + my ($self, $lei) = @_; + my $dst = 
$lei->{ovv}->{dst}; + for my $x (qw(tmp new cur)) { + my $d = $dst.$x; + next if -d $d; + require File::Path; + File::Path::mkpath($d); + -d $d or die "$d is not a directory"; + } +} + +sub _pre_augment_mbox { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + if ($dst ne '/dev/stdout') { + my $mode = -p $dst ? '>' : '+>>'; + if (-f _ && !$lei->{opt}->{augment} and !unlink($dst)) { + $! == ENOENT or die "unlink($dst): $!"; + } + open my $out, $mode, $dst or die "open($dst): $!"; + $lei->{old_1} = $lei->{1}; + $lei->{1} = $out; + } + # Perl does SEEK_END even with O_APPEND :< + $self->{seekable} = seek($lei->{1}, 0, SEEK_SET); + if (!$self->{seekable} && $! != ESPIPE && $dst ne '/dev/stdout') { + die "seek($dst): $!\n"; + } + state $zsfx_allow = join('|', keys %zsfx2cmd); + ($self->{zsfx}) = ($dst =~ /\.($zsfx_allow)\z/) or return; + pipe(my ($r, $w)) or die "pipe: $!"; + [ $r, $w ]; +} + +sub _do_augment_mbox { + my ($self, $lei) = @_; + return if !$lei->{opt}->{augment}; + my $dedupe = $lei->{dedupe}; + my $dst = $lei->{ovv}->{dst}; + die "cannot augment $dst, not seekable\n" if !$self->{seekable}; + my $out = $lei->{1}; + if (-s $out && $dedupe && $dedupe->prepare_dedupe) { + my $zsfx = $self->{zsfx}; + my $rd = $zsfx ? 
decompress_src($out, $zsfx, $lei) : + dup_src($out); + my $fmt = $lei->{ovv}->{fmt}; + require PublicInbox::MboxReader; + PublicInbox::MboxReader->$fmt($rd, \&_augment, $lei); + } + # maybe some systems don't honor O_APPEND, Perl does this: + seek($out, 0, SEEK_END) or die "seek $dst: $!"; + $dedupe->pause_dedupe if $dedupe; +} + +sub pre_augment { # fast (1 disk seek), runs in main daemon + my ($self, $lei) = @_; + # _pre_augment_maildir, _pre_augment_mbox + my $m = "_pre_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub do_augment { # slow, runs in wq worker + my ($self, $lei) = @_; + # _do_augment_maildir, _do_augment_mbox + my $m = "_do_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon + my ($self, $lei, @args) = @_; + # _post_augment_maildir, _post_augment_mbox + my $m = "_post_augment_$self->{base_type}"; + $self->$m($lei, @args); +} + +sub write_mail { # via ->wq_do + my ($self, $git_dir, $smsg, $lei) = @_; + my $not_done = delete $self->{4}; # write end of {each_smsg_done} + my $wcb = $self->{wcb} //= do { # first message + my %sig = $lei->atfork_child_wq($self); + @SIG{keys %sig} = values %sig; # not local + $lei->{dedupe}->prepare_dedupe; + $self->write_cb($lei); + }; + my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir); + $git->cat_async($smsg->{blob}, \&git_to_mail, [$wcb, $smsg, $not_done]); +} + +sub ipc_atfork_prepare { + my ($self) = @_; + # FDs: (done_wr, stdout|mbox, stderr, 3: sock, 4: each_smsg_done_wr) + $self->SUPER::ipc_atfork_prepare; # PublicInbox::IPC +} + +# We rely on OnDestroy to run this before ->DESTROY, since ->DESTROY +# ordering is unstable at worker exit and may cause segfaults +sub reap_gits { + my ($self) = @_; + delete $self->{wcb}; + for my $git (delete @$self{grep(/\A$$\0/, keys %$self)}) { + $git->async_wait_all; + } +} + +sub DESTROY { delete $_[0]->{wcb} } + +sub ipc_atfork_child { # runs after IPC::wq_worker_loop + my 
($self) = @_; + $self->SUPER::ipc_atfork_child; + # reap_gits needs to run before $self->DESTROY, + # IPC.pm will ensure that. + PublicInbox::OnDestroy->new($$, \&reap_gits, $self); +} + +1; diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm new file mode 100644 index 00000000..fb608d00 --- /dev/null +++ b/lib/PublicInbox/LeiXSearch.pm @@ -0,0 +1,412 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# Combine any combination of PublicInbox::Search, +# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects +# into one Xapian DB +package PublicInbox::LeiXSearch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiSearch PublicInbox::IPC); +use PublicInbox::DS qw(dwaitpid); +use PublicInbox::OpPipe; +use PublicInbox::Import; +use File::Temp 0.19 (); # 0.19 for ->newdir +use File::Spec (); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::MID qw(mids); + +sub new { + my ($class) = @_; + PublicInbox::Search::load_xapian(); + bless { + qp_flags => $PublicInbox::Search::QP_FLAGS | + PublicInbox::Search::FLAG_PURE_NOT(), + }, $class +} + +sub attach_external { + my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox + my $desc = $ibxish->{inboxdir} // $ibxish->{topdir}; + my $srch = $ibxish->search or + return warn("$desc not indexed for Xapian\n"); + my @shards = $srch->xdb_shards_flat or + return warn("$desc has no Xapian shardsXapian\n"); + + if (delete $self->{xdb}) { # XXX: do we need this? 
+ # clobber existing {xdb} if amending + my $expect = delete $self->{nshard}; + my $shards = delete $self->{shards_flat}; + scalar(@$shards) == $expect or die + "BUG: {nshard}$expect != shards=".scalar(@$shards); + + my $prev = {}; + for my $old_ibxish (@{$self->{shard2ibx}}) { + next if $prev == $old_ibxish; + $prev = $old_ibxish; + my @shards = $old_ibxish->search->xdb_shards_flat; + push @{$self->{shards_flat}}, @shards; + } + my $nr = scalar(@{$self->{shards_flat}}); + $nr == $expect or die + "BUG: reloaded $nr shards, expected $expect" + } + push @{$self->{shards_flat}}, @shards; + push(@{$self->{shard2ibx}}, $ibxish) for (@shards); +} + +# returns a list of local inboxes (or count in scalar context) +sub locals { @{$_[0]->{locals} // []} } + +sub remotes { @{$_[0]->{remotes} // []} } + +# called by PublicInbox::Search::xdb +sub xdb_shards_flat { @{$_[0]->{shards_flat} // []} } + +# like over->get_art +sub smsg_for { + my ($self, $mitem) = @_; + # cf. https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID + my $nshard = $self->{nshard}; + my $docid = $mitem->get_docid; + my $shard = ($docid - 1) % $nshard; + my $num = int(($docid - 1) / $nshard) + 1; + my $ibx = $self->{shard2ibx}->[$shard]; + my $smsg = $ibx->over->get_art($num); + if (ref($ibx->can('msg_keywords'))) { + my $kw = xap_terms('K', $mitem->get_document); + $smsg->{kw} = [ sort keys %$kw ]; + } + $smsg->{docid} = $docid; + $smsg; +} + +sub recent { + my ($self, $qstr, $opt) = @_; + $opt //= {}; + $opt->{relevance} //= -2; + $self->mset($qstr //= 'bytes:1..', $opt); +} + +sub over {} + +sub _mset_more ($$) { + my ($mset, $mo) = @_; + my $size = $mset->size; + $size && (($mo->{offset} += $size) < ($mo->{limit} // 10000)); +} + +# $startq will EOF when query_prepare is done augmenting and allow +# query_mset and query_thread_mset to proceed. 
+sub wait_startq ($) { + my ($startq) = @_; + $_[0] = undef; + read($startq, my $query_prepare_done, 1); +} + +sub query_thread_mset { # for --thread + my ($self, $lei, $ibxish) = @_; + local $0 = "$0 query_thread_mset"; + my $startq = delete $self->{5}; + my %sig = $lei->atfork_child_wq($self); + local @SIG{keys %sig} = values %sig; + + my ($srch, $over) = ($ibxish->search, $ibxish->over); + unless ($srch && $over) { + my $desc = $ibxish->{inboxdir} // $ibxish->{topdir}; + warn "$desc not indexed by Xapian\n"; + return; + } + my $mo = { %{$lei->{mset_opt}} }; + my $mset; + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei, $ibxish); + do { + $mset = $srch->mset($mo->{qstr}, $mo); + my $ids = $srch->mset_to_artnums($mset, $mo); + my $ctx = { ids => $ids }; + my $i = 0; + my %n2item = map { ($ids->[$i++], $_) } $mset->items; + while ($over->expand_thread($ctx)) { + for my $n (@{$ctx->{xids}}) { + my $smsg = $over->get_art($n) or next; + wait_startq($startq) if $startq; + my $mitem = delete $n2item{$smsg->{num}}; + $each_smsg->($smsg, $mitem); + } + @{$ctx->{xids}} = (); + } + } while (_mset_more($mset, $mo)); + undef $each_smsg; # drops @io for l2m->{each_smsg_done} + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub query_mset { # non-parallel for non-"--thread" users + my ($self, $lei) = @_; + local $0 = "$0 query_mset"; + my $startq = delete $self->{5}; + my %sig = $lei->atfork_child_wq($self); + local @SIG{keys %sig} = values %sig; + my $mo = { %{$lei->{mset_opt}} }; + my $mset; + for my $loc (locals($self)) { + attach_external($self, $loc); + } + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei, $self); + do { + $mset = $self->mset($mo->{qstr}, $mo); + for my $mitem ($mset->items) { + my $smsg = smsg_for($self, $mitem) or next; + wait_startq($startq) if $startq; + $each_smsg->($smsg, $mitem); + } + } while (_mset_more($mset, $mo)); + undef $each_smsg; # drops @io for l2m->{each_smsg_done} + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub each_eml { # callback 
for MboxReader->mboxrd + my ($eml, $self, $lei, $each_smsg) = @_; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->populate($eml); + $smsg->parse_references($eml, mids($eml)); + $smsg->{$_} //= '' for qw(from to cc ds subject references mid); + delete @$smsg{qw(From Subject -ds -ts)}; + if (my $startq = delete($self->{5})) { wait_startq($startq) } + $each_smsg->($smsg, undef, $eml); +} + +sub query_remote_mboxrd { + my ($self, $lei, $uris) = @_; + local $0 = "$0 query_remote_mboxrd"; + my %sig = $lei->atfork_child_wq($self); # keep $self->{5} startq + local @SIG{keys %sig} = values %sig; + my ($opt, $env) = @$lei{qw(opt env)}; + my @qform = (q => $lei->{mset_opt}->{qstr}, x => 'm'); + push(@qform, t => 1) if $opt->{thread}; + my @cmd = (qw(curl -sSf -d), ''); + my $verbose = $opt->{verbose}; + push @cmd, '-v' if $verbose; + for my $o ($lei->curl_opt) { + $o =~ s/\|[a-z0-9]\b//i; # remove single char short option + if ($o =~ s/=[is]@\z//) { + my $ary = $opt->{$o} or next; + push @cmd, map { ("--$o", $_) } @$ary; + } elsif ($o =~ s/=[is]\z//) { + my $val = $opt->{$o} // next; + push @cmd, "--$o", $val; + } elsif ($opt->{$o}) { + push @cmd, "--$o"; + } + } + $opt->{torsocks} = 'false' if $opt->{'no-torsocks'}; + my $tor = $opt->{torsocks} //= 'auto'; + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + for my $uri (@$uris) { + $uri->query_form(@qform); + my $cmd = [ @cmd, $uri->as_string ]; + if ($tor eq 'auto' && substr($uri->host, -6) eq '.onion' && + (($env->{LD_PRELOAD}//'') !~ /torsocks/)) { + unshift @$cmd, 'torsocks'; + } elsif (PublicInbox::Config::git_bool($tor)) { + unshift @$cmd, 'torsocks'; + } + $lei->err("# @$cmd") if $verbose; + $? = 0; + my $fh = popen_rd($cmd, $env, { 2 => $lei->{2} }); + $fh = IO::Uncompress::Gunzip->new($fh); + eval { + PublicInbox::MboxReader->mboxrd($fh, \&each_eml, $self, + $lei, $each_smsg); + }; + return $lei->fail("E: @$cmd: $@") if $@; + if (($? 
>> 8) == 22) { # HTTP 404 from curl(1) + $uri->query_form(q => $lei->{mset_opt}->{qstr}); + $lei->err('# no results from '.$uri->as_string); + } elsif ($?) { + $uri->query_form(q => $lei->{mset_opt}->{qstr}); + $lei->err('E: '.$uri->as_string); + $lei->child_error($?); + } + } + undef $each_smsg; + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub git { + my ($self) = @_; + my (%seen, @dirs); + my $tmp = File::Temp->newdir('lei_xsrch_git-XXXXXXXX', TMPDIR => 1); + for my $ibx (@{$self->{shard2ibx} // []}) { + my $d = File::Spec->canonpath($ibx->git->{git_dir}); + $seen{$d} //= push @dirs, "$d/objects\n" + } + my $git_dir = $tmp->dirname; + PublicInbox::Import::init_bare($git_dir); + my $f = "$git_dir/objects/info/alternates"; + open my $alt, '>', $f or die "open($f): $!"; + print $alt @dirs or die "print $f: $!"; + close $alt or die "close $f: $!"; + my $git = PublicInbox::Git->new($git_dir); + $git->{-tmp} = $tmp; + $git; +} + +sub query_done { # EOF callback + my ($lei) = @_; + my $has_l2m = exists $lei->{l2m}; + for my $f (qw(lxs l2m)) { + my $wq = delete $lei->{$f} or next; + $wq->wq_wait_old; + } + $lei->{ovv}->ovv_end($lei); + if ($has_l2m) { # close() calls LeiToMail reap_compress + if (my $out = delete $lei->{old_1}) { + if (my $mbout = $lei->{1}) { + close($mbout) or return $lei->fail(<<""); +Error closing $lei->{ovv}->{dst}: $! + + } + $lei->{1} = $out; + } + $lei->start_mua; + } + $lei->dclose; +} + +sub do_post_augment { + my ($lei, $zpipe, $au_done) = @_; + my $l2m = $lei->{l2m} or die 'BUG: no {l2m}'; + eval { $l2m->post_augment($lei, $zpipe) }; + if (my $err = $@) { + if (my $lxs = delete $lei->{lxs}) { + $lxs->wq_kill; + $lxs->wq_close; + } + $lei->fail("$err"); + } + close $au_done; # triggers wait_startq +} + +my $MAX_PER_HOST = 4; +sub MAX_PER_HOST { $MAX_PER_HOST } + +sub concurrency { + my ($self, $opt) = @_; + my $nl = $opt->{thread} ? 
locals($self) : 1; + my $nr = remotes($self); + $nr = $MAX_PER_HOST if $nr > $MAX_PER_HOST; + $nl + $nr; +} + +sub start_query { # always runs in main (lei-daemon) process + my ($self, $io, $lei) = @_; + if ($lei->{opt}->{thread}) { + for my $ibxish (locals($self)) { + $self->wq_do('query_thread_mset', $io, $lei, $ibxish); + } + } elsif (locals($self)) { + $self->wq_do('query_mset', $io, $lei); + } + my $i = 0; + my $q = []; + for my $uri (remotes($self)) { + push @{$q->[$i++ % $MAX_PER_HOST]}, $uri; + } + for my $uris (@$q) { + $self->wq_do('query_remote_mboxrd', $io, $lei, $uris); + } + @$io = (); +} + +sub query_prepare { # called by wq_do + my ($self, $lei) = @_; + local $0 = "$0 query_prepare"; + my %sig = $lei->atfork_child_wq($self); + -p $lei->{0} or die "BUG: \$done pipe expected"; + local @SIG{keys %sig} = values %sig; + eval { $lei->{l2m}->do_augment($lei) }; + $lei->fail($@) if $@; + syswrite($lei->{0}, '.') == 1 or die "do_post_augment trigger: $!"; +} + +sub sigpipe_handler { # handles SIGPIPE from l2m/lxs workers + my ($lei) = @_; + my $lxs = delete $lei->{lxs}; + if ($lxs && $lxs->wq_kill_old) { + kill 'PIPE', $$; + $lxs->wq_wait_old; + } + close(delete $lei->{1}) if $lei->{1}; +} + +sub do_query { + my ($self, $lei_orig) = @_; + my ($lei, @io) = $lei_orig->atfork_parent_wq($self); + $io[0] = undef; + pipe(my $done, $io[0]) or die "pipe $!"; + $lei_orig->{1}->autoflush(1); + + $lei_orig->event_step_init; # wait for shutdowns + my $done_op = { + '' => [ \&query_done, $lei_orig ], + '!' 
=> [ \&sigpipe_handler, $lei_orig ] + }; + my $in_loop = exists $lei_orig->{sock}; + $done = PublicInbox::OpPipe->new($done, $done_op, $in_loop); + my $l2m = $lei->{l2m}; + if ($l2m) { + # may redirect $lei->{1} for mbox + my $zpipe = $l2m->pre_augment($lei_orig); + $io[1] = $lei_orig->{1}; + pipe(my ($startq, $au_done)) or die "pipe: $!"; + $done_op->{'.'} = [ \&do_post_augment, $lei_orig, + $zpipe, $au_done ]; + local $io[4] = *STDERR{GLOB}; # don't send l2m->{-wq_s1} + die "BUG: unexpected \$io[5]: $io[5]" if $io[5]; + $self->wq_do('query_prepare', \@io, $lei); + fcntl($startq, 1031, 4096) if $^O eq 'linux'; # F_SETPIPE_SZ + $io[5] = $startq; + $io[1] = $zpipe->[1] if $zpipe; + } + start_query($self, \@io, $lei); + $self->wq_close(1); + unless ($in_loop) { + # for the $lei->atfork_child_wq PIPE handler: + while ($done->{sock}) { $done->event_step } + } +} + +sub ipc_atfork_prepare { + my ($self) = @_; + if (exists $self->{remotes}) { + require PublicInbox::MboxReader; + require IO::Uncompress::Gunzip; + } + # FDS: (0: done_wr, 1: stdout|mbox, 2: stderr, + # 3: sock, 4: $l2m->{-wq_s1}, 5: $startq) + $self->SUPER::ipc_atfork_prepare; # PublicInbox::IPC +} + +sub prepare_external { + my ($self, $loc, $boost) = @_; # n.b. already ordered by boost + if (ref $loc) { # already a URI, or PublicInbox::Inbox-like object + return push(@{$self->{remotes}}, $loc) if $loc->can('scheme'); + } elsif ($loc =~ m!\Ahttps?://!) 
{ + require URI; + return push(@{$self->{remotes}}, URI->new($loc)); + } elsif (-f "$loc/ei.lock") { + require PublicInbox::ExtSearch; + $loc = PublicInbox::ExtSearch->new($loc); + } elsif (-f "$loc/inbox.lock" || -d "$loc/public-inbox") { + require PublicInbox::Inbox; # v2, v1 + $loc = bless { inboxdir => $loc }, 'PublicInbox::Inbox'; + } else { + warn "W: ignoring $loc, unable to determine type\n"; + return; + } + push @{$self->{locals}}, $loc; +} + + +1; diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index a02eafc4..2ac74e2a 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # two-step linkification. diff --git a/lib/PublicInbox/Listener.pm b/lib/PublicInbox/Listener.pm index 2e0fc248..c8315810 100644 --- a/lib/PublicInbox/Listener.pm +++ b/lib/PublicInbox/Listener.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # Used by -nntpd for listen sockets diff --git a/lib/PublicInbox/Lock.pm b/lib/PublicInbox/Lock.pm index b2c8227f..bb213de4 100644 --- a/lib/PublicInbox/Lock.pm +++ b/lib/PublicInbox/Lock.pm @@ -1,12 +1,14 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Base class for per-inbox locking package PublicInbox::Lock; use strict; -use warnings; +use v5.10.1; use Fcntl qw(:flock :DEFAULT); use Carp qw(croak); +use PublicInbox::OnDestroy; +use File::Temp (); # we only acquire the flock if creating or reindexing; # PublicInbox::Import already has the lock on its own. 
@@ -32,4 +34,17 @@ sub lock_release { close $lockfh or croak "close $lock_path failed: $!\n"; } +# caller must use return value +sub lock_for_scope { + my ($self, @single_pid) = @_; + lock_acquire($self) or return; # lock_path not set + PublicInbox::OnDestroy->new(@single_pid, \&lock_release, $self); +} + +sub new_tmp { + my ($cls, $ident) = @_; + my $tmp = File::Temp->new("$ident.lock-XXXXXX", TMPDIR => 1); + bless { lock_path => $tmp->filename, tmp => $tmp }, $cls; +} + 1; diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index fa4a2ad8..f82194a3 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2020 all contributors +# Copyright (C) 2013-2021 all contributors # License: AGPL-3.0+ # # For the -mda script (mail delivery agent) @@ -83,7 +83,7 @@ sub set_list_headers { } sub inboxes_for_list_id ($$) { - my ($klass, $config, $simple) = @_; + my ($klass, $pi_cfg, $simple) = @_; # newer Email::Simple allows header_raw, as does Email::MIME: my @list_ids = $simple->can('header_raw') ? @@ -92,7 +92,7 @@ sub inboxes_for_list_id ($$) { my @dests; for my $list_id (@list_ids) { $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next; - if (my $ibx = $config->lookup_list_id($1)) { + if (my $ibx = $pi_cfg->lookup_list_id($1)) { push @dests, $ibx; } } diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 5aeffb8c..35b517e0 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # Various Message-ID-related functions. 
@@ -7,7 +7,7 @@ use strict; use warnings; use base qw/Exporter/; our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_escape MID_ESC - mids references mids_for_index $MID_EXTRACT); + mids references mids_for_index mids_in $MID_EXTRACT); use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; require PublicInbox::Address; @@ -73,14 +73,17 @@ sub mids ($) { uniq_mids(extract_mids(@mids)); } +# for Resent-Message-ID and maybe others +sub mids_in ($@) { + my ($eml, @headers) = @_; + uniq_mids(extract_mids(map { ($eml->header_raw($_)) } @headers)); +} + # we allow searching on X-Alt-Message-ID since PublicInbox::NNTP uses them # to placate some clients, and we want to ensure NNTP-only clients can # import and index without relying on HTTP endpoints sub mids_for_index ($) { - my ($hdr) = @_; - my @mids = $hdr->header_raw('Message-ID'); - my @alts = $hdr->header_raw('X-Alt-Message-ID'); - uniq_mids(extract_mids(@mids, @alts)); + mids_in($_[0], qw(Message-ID X-Alt-Message-ID)); } # last References should be IRT, but some mail clients do things @@ -119,7 +122,7 @@ sub uniq_mids ($;$) { warn "Message-ID: <$mid> too long, truncating\n"; $mid = substr($mid, 0, MAX_MID_SIZE); } - push(@ret, $mid) unless $seen->{$mid}++; + $seen->{$mid} //= push(@ret, $mid); } \@ret; } diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index 74820fb5..31cf15dc 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # generates manifest.js.gz for grokmirror(1) @@ -6,21 +6,12 @@ package PublicInbox::ManifestJsGz; use strict; use v5.10.1; use parent qw(PublicInbox::WwwListing); -use Digest::SHA (); -use File::Spec (); use bytes (); # length -use PublicInbox::Inbox; -use PublicInbox::Git; +use PublicInbox::Config; use IO::Compress::Gzip qw(gzip); use HTTP::Date qw(time2str); -*try_cat = 
\&PublicInbox::Inbox::try_cat; -our $json; -for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { - eval "require $mod" or next; - # ->ascii encodes non-ASCII to "\uXXXX" - $json = $mod->new->ascii(1) and last; -} +my $json = PublicInbox::Config::json(); # called by WwwListing sub url_regexp { @@ -30,76 +21,29 @@ sub url_regexp { $ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain'); } -sub fingerprint ($) { - my ($git) = @_; - # TODO: convert to qspawn for fairness when there's - # thousands of repos - my ($fh, $pid) = $git->popen('show-ref'); - my $dig = Digest::SHA->new(1); - while (read($fh, my $buf, 65536)) { - $dig->add($buf); - } - close $fh; - waitpid($pid, 0); - return if $?; # empty, uninitialized git repo - $dig->hexdigest; +sub inject_entry ($$$;$) { + my ($ctx, $url_path, $ent, $git_dir) = @_; + $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path; + my $modified = $ent->{modified}; + $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0); + $ctx->{manifest}->{$url_path} = $ent; } sub manifest_add ($$;$$) { my ($ctx, $ibx, $epoch, $default_desc) = @_; my $url_path = "/$ibx->{name}"; - my $git_dir = $ibx->{inboxdir}; + my $git; if (defined $epoch) { - $git_dir .= "/git/$epoch.git"; $url_path .= "/git/$epoch.git"; + $git = $ibx->git_epoch($epoch) or return; + } else { + $git = $ibx->git; } - return unless -d $git_dir; - my $git = PublicInbox::Git->new($git_dir); - my $fingerprint = fingerprint($git) or return; # no empty repos - - chomp(my $owner = $git->qx('config', 'gitweb.owner')); - chomp(my $desc = try_cat("$git_dir/description")); - utf8::decode($owner); - utf8::decode($desc); - $owner = undef if $owner eq ''; - $desc = 'Unnamed repository' if $desc eq ''; - - # templates/hooks--update.sample and git-multimail in git.git - # only match "Unnamed repository", not the full contents of - # templates/this--description in git.git - if ($desc =~ /\AUnnamed repository/) { - $desc = "$default_desc [epoch $epoch]" if 
defined($epoch); - } - - my $reference; - chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); - if ($alt) { - # n.b.: GitPython doesn't seem to handle comments or C-quoted - # strings like native git does; and we don't for now, either. - my @alt = split(/\n+/, $alt); - - # grokmirror only supports 1 alternate for "reference", - if (scalar(@alt) == 1) { - my $objdir = "$git_dir/objects"; - $reference = File::Spec->rel2abs($alt[0], $objdir); - $reference =~ s!/[^/]+/?\z!!; # basename - } - } - $ctx->{-abs2urlpath}->{$git_dir} = $url_path; - my $modified = $git->modified; - if ($modified > ($ctx->{-mtime} // 0)) { - $ctx->{-mtime} = $modified; - } - $ctx->{manifest}->{$url_path} = { - owner => $owner, - reference => $reference, - description => $desc, - modified => $modified, - fingerprint => $fingerprint, - }; + my $ent = $git->manifest_entry($epoch, $default_desc) or return; + inject_entry($ctx, $url_path, $ent, $git->{git_dir}); } -sub ibx_entry { +sub slow_manifest_add ($$) { my ($ctx, $ibx) = @_; eval { if (defined(my $max = $ibx->max_git_epoch)) { @@ -111,6 +55,29 @@ sub ibx_entry { manifest_add($ctx, $ibx); } }; +} + +sub eidx_manifest_add ($$$) { + my ($ctx, $ALL, $ibx) = @_; + if (my $data = $ALL->misc->inbox_data($ibx)) { + $data = $json->decode($data); + delete $data->{''}; # private + while (my ($url_path, $ent) = each %$data) { + inject_entry($ctx, $url_path, $ent); + } + } else { + warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n"; + } +} + +sub ibx_entry { + my ($ctx, $ibx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL; + if ($ALL) { + eidx_manifest_add($ctx, $ALL, $ibx); + } else { + slow_manifest_add($ctx, $ibx); + } warn "E: $@" if $@; } @@ -134,7 +101,8 @@ sub psgi_triple { sub per_inbox { my ($ctx) = @_; - ibx_entry($ctx, $ctx->{-inbox}); + # only one inbox, slow is probably OK + slow_manifest_add($ctx, $ctx->{ibx}); psgi_triple($ctx); } diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 47025891..964147fa 
100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # Streaming interface for mboxrd HTTP responses @@ -17,10 +17,10 @@ use PublicInbox::Eml; sub getline { my ($ctx) = @_; # ctx my $smsg = $ctx->{smsg} or return; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $eml = $ibx->smsg_eml($smsg) or return; my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); + $ctx->zmore(msg_hdr($ctx, $eml)); if ($n) { $ctx->translate(msg_body($eml)); } else { # last message @@ -44,9 +44,9 @@ sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; my $smsg = delete $ctx->{smsg}; # next message - $ctx->{smsg} = $ctx->{-inbox}->over->next_by_mid(@{$ctx->{next_arg}}); + $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); + $ctx->zmore(msg_hdr($ctx, $eml)); $ctx->{http_out}->write($ctx->translate(msg_body($eml))); } @@ -56,7 +56,7 @@ sub res_hdr ($$) { $fn =~ s/^re:\s+//i; $fn = to_filename($fn) // 'no-subject'; my @hdr = ('Content-Type'); - if ($ctx->{-inbox}->{obfuscate}) { + if ($ctx->{ibx}->{obfuscate}) { # obfuscation is stupid, but maybe scrapers are, too... push @hdr, 'application/mbox'; $fn .= '.mbox'; @@ -71,17 +71,17 @@ sub res_hdr ($$) { # for rare cases where v1 inboxes aren't indexed w/ ->over at all sub no_over_raw ($) { my ($ctx) = @_; - my $mref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; + my $mref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; my $eml = PublicInbox::Eml->new($mref); [ 200, res_hdr($ctx, $eml->header_str('Subject')), - [ msg_hdr($ctx, $eml, $ctx->{mid}) . msg_body($eml) ] ] + [ msg_hdr($ctx, $eml) . 
msg_body($eml) ] ] } # /$INBOX/$MESSAGE_ID/raw sub emit_raw { my ($ctx) = @_; - $ctx->{base_url} = $ctx->{-inbox}->base_url($ctx->{env}); - my $over = $ctx->{-inbox}->over or return no_over_raw($ctx); + $ctx->{base_url} = $ctx->{ibx}->base_url($ctx->{env}); + my $over = $ctx->{ibx}->over or return no_over_raw($ctx); my ($id, $prev); my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; my $smsg = $ctx->{smsg} = $over->next_by_mid(@$mip) or return; @@ -90,8 +90,8 @@ sub emit_raw { $ctx->psgi_response(200, $res_hdr); } -sub msg_hdr ($$;$) { - my ($ctx, $eml, $mid) = @_; +sub msg_hdr ($$) { + my ($ctx, $eml) = @_; my $header_obj = $eml->header_obj; # drop potentially confusing headers, ssoma already should've dropped @@ -99,34 +99,11 @@ sub msg_hdr ($$;$) { foreach my $d (qw(Lines Bytes Content-Length Status)) { $header_obj->header_set($d); } - my $ibx = $ctx->{-inbox}; - my $base = $ctx->{base_url}; - $mid = $ctx->{mid} unless defined $mid; - $mid = mid_escape($mid); - my @append = ( - 'Archived-At', "<$base$mid/>", - 'List-Archive', "<$base>", - 'List-Post', "{-primary_address}>", - ); my $crlf = $header_obj->crlf; my $buf = $header_obj->as_string; # fixup old bug from import (pre-a0c07cba0e5d8b6a) $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf; - - for (my $i = 0; $i < @append; $i += 2) { - my $k = $append[$i]; - my $v = $append[$i + 1]; - my @v = $header_obj->header_raw($k); - foreach (@v) { - if ($v eq $_) { - $v = undef; - last; - } - } - $buf .= "$k: $v$crlf" if defined $v; - } - $buf .= $crlf; + "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . 
$crlf; } sub msg_body ($) { @@ -190,7 +167,7 @@ sub all_ids_cb { sub mbox_all_ids { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $prev = 0; my $mm = $ctx->{mm} = $ibx->mm; my $ids = $mm->ids_after(\$prev) or return @@ -203,27 +180,33 @@ sub mbox_all_ids { PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all'); } +sub gone ($$) { + my ($ctx, $what) = @_; + warn "W: `$ctx->{ibx}->{inboxdir}' $what went away unexpectedly\n"; + undef; +} + sub results_cb { my ($ctx) = @_; - my $over = $ctx->{-inbox}->over or return; + my $over = $ctx->{ibx}->over or return gone($ctx, 'over'); while (1) { while (defined(my $num = shift(@{$ctx->{ids}}))) { my $smsg = $over->get_art($num) or next; return $smsg; } # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; + my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } sub results_thread_cb { my ($ctx) = @_; - my $over = $ctx->{-inbox}->over or return; + my $over = $ctx->{ibx}->over or return gone($ctx, 'over'); while (1) { while (defined(my $num = shift(@{$ctx->{xids}}))) { my $smsg = $over->get_art($num) or next; @@ -234,11 +217,11 @@ sub results_thread_cb { next if $over->expand_thread($ctx); # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; + my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } @@ -247,19 +230,19 @@ sub mbox_all { my ($ctx, $q) = @_; my $q_string = $q->{'q'}; return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{-inbox}->search or + my $srch 
= $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); - my $over = $ctx->{-inbox}->over or + my $over = $ctx->{ibx}->over or return PublicInbox::WWW::need($ctx, 'Overview'); - my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid + my $qopts = $ctx->{qopts} = { relevance => -1 }; # ORDER BY docid ASC $qopts->{thread} = 1 if $q->{t}; my $mset = $srch->mset($q_string, $qopts); $qopts->{offset} = $mset->size or return [404, [qw(Content-Type text/plain)], ["No results found\n"]]; $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts); require PublicInbox::MboxGz; my $fn; if ($q->{t} && $srch->has_threadid) { diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm index 913be6e4..3ed33867 100644 --- a/lib/PublicInbox/MboxGz.pm +++ b/lib/PublicInbox/MboxGz.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ package PublicInbox::MboxGz; use strict; @@ -22,7 +22,7 @@ sub async_next ($) { sub mbox_gz { my ($self, $cb, $fn) = @_; $self->{cb} = $cb; - $self->{base_url} = $self->{-inbox}->base_url($self->{env}); + $self->{base_url} = $self->{ibx}->base_url($self->{env}); $self->{gz} = PublicInbox::GzipFilter::gzip_or_die(); $fn = to_filename($fn // '') // 'no-subject'; # http://www.iana.org/assignments/media-types/application/gzip @@ -37,8 +37,8 @@ sub getline { my ($self) = @_; my $cb = $self->{cb} or return; while (my $smsg = $cb->($self)) { - my $eml = $self->{-inbox}->smsg_eml($smsg) or next; - $self->zmore(msg_hdr($self, $eml, $smsg->{mid})); + my $eml = $self->{ibx}->smsg_eml($smsg) or next; + $self->zmore(msg_hdr($self, $eml)); return $self->translate(msg_body($eml)); } # signal that we're done and can return undef next call: diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm new file mode 100644 index 00000000..59ce4fb6 --- /dev/null +++ 
b/lib/PublicInbox/MboxReader.pm @@ -0,0 +1,124 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# reader for mbox variants we support +package PublicInbox::MboxReader; +use strict; +use v5.10.1; +use Data::Dumper; +$Data::Dumper::Useqq = 1; # should've been the default, for bad data + +my $from_strict = + qr/^From \S+ +\S+ \S+ +\S+ [^\n:]+:[^\n:]+:[^\n:]+ [^\n:]+\n/sm; + +sub _mbox_from { + my ($mbfh, $from_re, $eml_cb, @arg) = @_; + my $buf = ''; + my @raw; + while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { + if ($r == 0) { # close here to check for "curl --fail" + close($mbfh) or die "error closing mbox: \$?=$? $!"; + @raw = ($buf); + } else { + @raw = split(/$from_strict/mos, $buf, -1); + next if scalar(@raw) == 0; + $buf = pop(@raw); # last bit may be incomplete + } + @raw = grep /[^ \t\r\n]/s, @raw; # skip empty messages + while (defined(my $raw = shift @raw)) { + $raw =~ s/\r?\n\z//s; + $raw =~ s/$from_re/$1/gms; + my $eml = PublicInbox::Eml->new(\$raw); + $eml_cb->($eml, @arg); + } + return if $r == 0; # EOF + } + die "error reading mboxo/mboxrd handle: $!"; +} + +sub mboxrd { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_from($mbfh, qr/^>(>*From )/ms, $eml_cb, @arg); +} + +sub mboxo { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_from($mbfh, qr/^>(From )/ms, $eml_cb, @arg); +} + +sub _cl_body { + my ($mbfh, $bref, $cl) = @_; + my $body = substr($$bref, 0, $cl, ''); + my $need = $cl - length($body); + if ($need > 0) { + $mbfh or die "E: needed $need bytes after EOF"; + defined(my $r = read($mbfh, $body, $need, length($body))) or + die "E: read error: $!\n"; + $r == $need or die "E: read $r of $need bytes\n"; + } + \$body; +} + +sub _extract_hdr { + my ($ref) = @_; + if (index($$ref, "\r\n") < 0 && (my $pos = index($$ref, "\n\n")) >= 0) { + # likely on *nix + \substr($$ref, 0, $pos + 2, ''); # sv_chop on $$ref + } elsif ($$ref =~ /\r?\n\r?\n/s) { + \substr($$ref, 0, $+[0], ''); # sv_chop on $$ref + } else { + 
undef + } +} + +sub _mbox_cl ($$$;@) { + my ($mbfh, $uxs_from, $eml_cb, @arg) = @_; + my $buf = ''; + while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { + if ($r == 0) { # detect "curl --fail" + close($mbfh) or + die "error closing mboxcl/mboxcl2: \$?=$? $!"; + undef $mbfh; + } + while (my $hdr = _extract_hdr(\$buf)) { + $$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or + die "E: no 'From ' line in:\n", Dumper($hdr); + my $eml = PublicInbox::Eml->new($hdr); + my @cl = $eml->header_raw('Content-Length'); + my $n = scalar(@cl); + $n == 0 and die "E: Content-Length missing in:\n", + Dumper($eml->as_string); + $n == 1 or die "E: multiple ($n) Content-Length in:\n", + Dumper($eml->as_string); + $cl[0] =~ /\A[0-9]+\z/ or die + "E: Content-Length `$cl[0]' invalid\n", + Dumper($eml->as_string); + if (($eml->{bdy} = _cl_body($mbfh, \$buf, $cl[0]))) { + $uxs_from and + ${$eml->{bdy}} =~ s/^>From /From /sgm; + } + $eml_cb->($eml, @arg); + } + if ($r == 0) { + $buf =~ /[^ \r\n\t]/ and + warn "W: leftover at end of mboxcl/mboxcl2:\n", + Dumper(\$buf); + return; + } + } + die "error reading mboxcl/mboxcl2 handle: $!"; +} + +sub mboxcl { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_cl($mbfh, 1, $eml_cb, @arg); +} + +sub mboxcl2 { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_cl($mbfh, undef, $eml_cb, @arg); +} + +sub new { bless \(my $x), __PACKAGE__ } + +1; diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm new file mode 100644 index 00000000..ab5e029a --- /dev/null +++ b/lib/PublicInbox/MiscIdx.pm @@ -0,0 +1,151 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# like PublicInbox::SearchIdx, but for searching for non-mail messages. +# Things indexed include: +# * inboxes themselves +# * epoch information +# * (maybe) git code repository information +# Expect ~100K-1M documents with no parallelism opportunities, +# so no sharding, here. 
+# +# See MiscSearch for read-only counterpart +package PublicInbox::MiscIdx; +use strict; +use v5.10.1; +use PublicInbox::InboxWritable; +use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat +use PublicInbox::SearchIdx qw(index_text term_generator add_val); +use PublicInbox::Spawn qw(nodatacow_dir); +use Carp qw(croak); +use File::Path (); +use PublicInbox::MiscSearch; +use PublicInbox::Config; +my $json; + +sub new { + my ($class, $eidx) = @_; + PublicInbox::SearchIdx::load_xapian_writable(); + my $mi_dir = "$eidx->{xpfx}/misc"; + File::Path::mkpath($mi_dir); + nodatacow_dir($mi_dir); + my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; + $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $json //= PublicInbox::Config::json(); + bless { + mi_dir => $mi_dir, + flags => $flags, + indexlevel => 'full', # small DB, no point in medium? + }, $class; +} + +sub begin_txn { + my ($self) = @_; + croak 'BUG: already in txn' if $self->{xdb}; # XXX make lazy? + my $wdb = $PublicInbox::Search::X{WritableDatabase}; + my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) }; + croak "Failed opening $self->{mi_dir}: $@" if $@; + $self->{xdb} = $xdb; + $xdb->begin_transaction; +} + +sub commit_txn { + my ($self) = @_; + croak 'BUG: not in txn' unless $self->{xdb}; # XXX make lazy? 
+ delete($self->{xdb})->commit_transaction; +} + +sub remove_eidx_key { + my ($self, $eidx_key) = @_; + my $xdb = $self->{xdb}; + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my @docids; # only one, unless we had bugs + for (; $head != $tail; $head++) { + push @docids, $head->get_docid; + } + for my $docid (@docids) { + $xdb->delete_document($docid); + warn "I: remove inbox docid #$docid ($eidx_key)\n"; + } +} + +# adds or updates according to $eidx_key +sub index_ibx { + my ($self, $ibx) = @_; + my $eidx_key = $ibx->eidx_key; + my $xdb = $self->{xdb}; + # Q = uniQue in Xapian terminology + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my ($docid, @drop); + for (; $head != $tail; $head++) { + if (defined $docid) { + my $i = $head->get_docid; + push @drop, $i; + warn <get_docid; + } + } + $xdb->delete_document($_) for @drop; # just in case + + my $doc = $PublicInbox::Search::X{Document}->new; + term_generator($self)->set_document($doc); + + # allow sorting by modified and uidvalidity (created at) + add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); + + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + } + + # force reread from disk, {description} could be loaded from {misc} + delete $ibx->{description}; + my $desc = $ibx->description; + + # description = S/Subject (or title) + # address = A/Author + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); + my %map = ( + address => 'A', + listid => 'XLISTID', + infourl => 'XINFOURL', + url => 'XURL' + ); + while (my ($f, $pfx) = each %map) { + for my $v (@{$ibx->{$f} // []}) { + index_text($self, $v, 1, $pfx); + } + } + my $data = {}; 
+ if (defined(my $max = $ibx->max_git_epoch)) { # v2 + my $pfx = "/$ibx->{name}/git/"; + for my $epoch (0..$max) { + my $git = $ibx->git_epoch($epoch) or return; + if (my $ent = $git->manifest_entry($epoch, $desc)) { + $data->{"$pfx$epoch.git"} = $ent; + $ent->{git_dir} = $git->{git_dir}; + } + $git->cleanup; # ->modified starts cat-file --batch + } + } elsif (my $ent = $ibx->git->manifest_entry) { # v1 + $ent->{git_dir} = $ibx->{inboxdir}; + $data->{"/$ibx->{name}"} = $ent; + } + $doc->set_data($json->encode($data)); + if (defined $docid) { + $xdb->replace_document($docid, $doc); + } else { + $xdb->add_document($doc); + } +} + +1; diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm new file mode 100644 index 00000000..ead9a278 --- /dev/null +++ b/lib/PublicInbox/MiscSearch.pm @@ -0,0 +1,191 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# read-only counterpart to MiscIdx +package PublicInbox::MiscSearch; +use strict; +use v5.10.1; +use PublicInbox::Search qw(retry_reopen int_val); +my $json; + +# Xapian value columns: +our $MODIFIED = 0; +our $UIDVALIDITY = 1; # (created time) + +# avoid conflicting with message Search::prob_prefix for UI/UX reasons +my %PROB_PREFIX = ( + description => 'S', # $INBOX_DIR/description + address => 'A', + listid => 'XLISTID', + url => 'XURL', + infourl => 'XINFOURL', + name => 'XNAME', + '' => 'S A XLISTID XNAME XURL XINFOURL' +); + +sub new { + my ($class, $dir) = @_; + PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); + bless { + xdb => $PublicInbox::Search::X{Database}->new($dir) + }, $class; +} + +# read-only +sub mi_qp_new ($) { + my ($self) = @_; + my $xdb = $self->{xdb}; + my $qp = $PublicInbox::Search::X{QueryParser}->new; + $qp->set_default_op(PublicInbox::Search::OP_AND()); + $qp->set_database($xdb); + $qp->set_stemmer(PublicInbox::Search::stemmer($self)); + $qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME()); + my $cb = 
$qp->can('set_max_wildcard_expansion') // + $qp->can('set_max_expansion'); # Xapian 1.5.0+ + $cb->($qp, 100); + $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + while (my ($name, $prefix) = each %PROB_PREFIX) { + $qp->add_prefix($name, $_) for split(/ /, $prefix); + } + $qp->add_boolean_prefix('type', 'T'); + $qp; +} + +sub misc_enquire_once { # retry_reopen callback + my ($self, $qr, $opt) = @_; + my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $eq->set_query($qr); + my $desc = !$opt->{asc}; + my $rel = $opt->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + $eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + } elsif ($rel) { + $eq->set_sort_by_relevance_then_value($MODIFIED, $desc); + } else { + $eq->set_sort_by_value_then_relevance($MODIFIED, $desc); + } + $eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200); +} + +sub mset { + my ($self, $qs, $opt) = @_; + $opt ||= {}; + reopen($self); + my $qp = $self->{qp} //= mi_qp_new($self); + $qs = 'type:inbox' if $qs eq ''; + my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); + $opt->{relevance} = 1 unless exists $opt->{relevance}; + retry_reopen($self, \&misc_enquire_once, $qr, $opt); +} + +sub ibx_matches_once { # retry_reopen callback + my ($self, $qr, $by_newsgroup) = @_; + # double in case no newsgroups are configured: + my $limit = scalar(keys %$by_newsgroup) * 2; + my $opt = { limit => $limit, offset => 0, relevance => -1 }; + my $ret = {}; # newsgroup => $ibx of matches + while (1) { + my $mset = misc_enquire_once($self, $qr, $opt); + for my $mi ($mset->items) { + my $doc = $mi->get_document; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + if ($cur != $end) { + my $ng = $cur->get_termname; # eidx_key + $ng =~ s/\AQ// or warn "BUG: no `Q': $ng"; + if (my $ibx = $by_newsgroup->{$ng}) { + $ret->{$ng} = 
$ibx; + } + } else { + warn <get_docid} has no `Q' (eidx_key) term +EOF + } + } + my $nr = $mset->size; + return $ret if $nr < $limit; + $opt->{offset} += $nr; + } +} + +# returns a newsgroup => PublicInbox::Inbox mapping +sub newsgroup_matches { + my ($self, $qs, $pi_cfg) = @_; + my $qp = $self->{qp} //= mi_qp_new($self); + $qs .= ' type:inbox'; + my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); + retry_reopen($self, \&ibx_matches_once, $qr, $pi_cfg->{-by_newsgroup}); +} + +sub ibx_data_once { + my ($self, $ibx) = @_; + my $xdb = $self->{xdb}; + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); + if ($head != $tail) { + my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); + $doc->get_data; + } else { + undef; + } +} + +sub inbox_data { + my ($self, $ibx) = @_; + retry_reopen($self, \&ibx_data_once, $ibx); +} + +sub ibx_cache_load { + my ($doc, $cache) = @_; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + return if $cur == $end; + my $eidx_key = $cur->get_termname; + $eidx_key =~ s/\AQ// or return; # expired + my $ce = $cache->{$eidx_key} = {}; + $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY); + $ce->{-modified} = int_val($doc, $MODIFIED); + $ce->{description} = do { + # extract description from manifest.js.gz epoch description + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + $d; + } +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} + +# 
returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + +no warnings 'once'; +*reopen = \&PublicInbox::Search::reopen; + +1; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index bb1dfead..c503eb98 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # read-only utilities for Email::MIME diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 8596f01c..5ee087fd 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Various date/time-related functions diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index f15875e3..826c4b30 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # bidirectional Message-ID <-> Article Number mapping for the NNTP @@ -36,8 +36,7 @@ sub new_file { create_tables($dbh); $self->created_at(time) unless $self->created_at; - my $max = $self->max // 0; - $self->num_highwater($max); + $self->num_highwater(max($self)); $dbh->commit; } $self; @@ -144,7 +143,7 @@ sub max { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MAX(num) FROM msgmap', undef, 1); $sth->execute; - $sth->fetchrow_array; + $sth->fetchrow_array // 0; } sub minmax { @@ -153,7 +152,7 @@ sub minmax { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1); $sth->execute; - ($sth->fetchrow_array, max($_[0])); + ($sth->fetchrow_array // 0, max($_[0])); } sub mid_delete { diff 
--git a/lib/PublicInbox/NDC_PP.pm b/lib/PublicInbox/NDC_PP.pm index 10a7ee2a..57abccbe 100644 --- a/lib/PublicInbox/NDC_PP.pm +++ b/lib/PublicInbox/NDC_PP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Pure-perl class for Linux non-Inline::C users to disable COW for btrfs diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 2f821fa6..18822d3b 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -1,11 +1,11 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # Each instance of this represents a NNTP client socket # fields: # nntpd: PublicInbox::NNTPD ref # article: per-session current article number -# ng: PublicInbox::Inbox ref +# ibx: PublicInbox::Inbox ref # long_cb: long_response private data package PublicInbox::NNTP; use strict; @@ -17,6 +17,8 @@ use PublicInbox::DS qw(now); use Digest::SHA qw(sha1_hex); use Time::Local qw(timegm timelocal); use PublicInbox::GitAsyncCat; +use PublicInbox::Address; + use constant { LINE_MAX => 512, # RFC 977 section 2.3 r501 => '501 command syntax error', @@ -31,9 +33,9 @@ use Errno qw(EAGAIN); my $ONE_MSGID = qr/\A$MID_EXTRACT\z/; my @OVERVIEW = qw(Subject From Date Message-ID References); my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') . - "Xref:full\r\n"; + "Xref:full\r\n."; my $LIST_HEADERS = join("\r\n", @OVERVIEW, - qw(:bytes :lines Xref To Cc)) . "\r\n"; + qw(:bytes :lines Xref To Cc)) . "\r\n."; my $CAPABILITIES = <<""; 101 Capability list:\r VERSION 2\r @@ -92,8 +94,7 @@ sub process_line ($$) { err($self, 'error from: %s (%s)', $l, $err); $res = '503 program fault - command not performed'; } - return 0 unless defined $res; - res($self, $res); + defined($res) ? 
res($self, $res) : 0; } # The keyword argument is not used (rfc3977 5.2.2) @@ -109,9 +110,7 @@ sub cmd_capabilities ($;$) { sub cmd_mode ($$) { my ($self, $arg) = @_; - $arg = uc $arg; - return r501 unless $arg eq 'READER'; - '201 Posting prohibited'; + uc($arg) eq 'READER' ? '201 Posting prohibited' : r501; } sub cmd_slave ($) { '202 slave status noted' } @@ -120,46 +119,66 @@ sub cmd_xgtitle ($;$) { my ($self, $wildmat) = @_; more($self, '282 list of groups and descriptions follows'); list_newsgroups($self, $wildmat); - '.' } -sub list_overview_fmt ($) { - my ($self) = @_; - $self->msg_more($OVERVIEW_FMT); -} +sub list_overview_fmt ($) { $OVERVIEW_FMT } -sub list_headers ($;$) { - my ($self) = @_; - $self->msg_more($LIST_HEADERS); +sub list_headers ($;$) { $LIST_HEADERS } + +sub list_active_i { # "LIST ACTIVE" and also just "LIST" (no args) + my ($self, $groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $ibx; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + for my $ngname (@window) { + $ibx = $groups->{$ngname} and group_line($self, $ibx); + } + scalar(@$groupnames); # continue if there's more } -sub list_active ($;$) { +sub list_active ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - group_line($self, $ng); + long_response($self, \&list_active_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); +} + +sub list_active_times_i { + my ($self, $groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + for my $ngname (@window) { + my $ibx = $groups->{$ngname} or next; + my $c = eval { $ibx->uidvalidity } // time; + more($self, "$ngname $c <$ibx->{-primary_address}>"); } + scalar(@$groupnames); # continue if there's more } -sub list_active_times ($;$) { +sub list_active_times ($;$) { # called by cmd_list my ($self, 
$wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $c = eval { $ng->mm->created_at } || time; - more($self, "$ng->{newsgroup} $c $ng->{-primary_address}"); + long_response($self, \&list_active_times_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); +} + +sub list_newsgroups_i { + my ($self, $groupnames) = @_; + my @window = splice(@$groupnames, 0, 100) or return 0; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + my $ibx; + for my $ngname (@window) { + $ibx = $groups->{$ngname} and + more($self, "$ngname ".$ibx->description); } + scalar(@$groupnames); # continue if there's more } -sub list_newsgroups ($;$) { +sub list_newsgroups ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $d = $ng->description; - more($self, "$ng->{newsgroup} $d"); - } + long_response($self, \&list_newsgroups_i, [ + grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]); } # LIST SUBSCRIPTIONS, DISTRIB.PATS are not supported @@ -168,6 +187,7 @@ sub cmd_list ($;$$) { if (scalar @args) { my $arg = shift @args; $arg =~ tr/A-Z./a-z_/; + my $ret = $arg eq 'active'; $arg = "list_$arg"; $arg = $self->can($arg); return r501 unless $arg && args_ok($arg, scalar @args); @@ -175,24 +195,22 @@ sub cmd_list ($;$$) { $arg->($self, @args); } else { more($self, '215 list of newsgroups follows'); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - group_line($self, $ng); - } + long_response($self, \&list_active_i, [ # copy array + @{$self->{nntpd}->{groupnames}} ]); } - '.' 
} sub listgroup_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end, 'num'); + my $r = $self->{ibx}->mm->msg_range($beg, $end, 'num'); scalar(@$r) or return; - more($self, join("\r\n", map { $_->[0] } @$r)); + $self->msg_more(join('', map { "$_->[0]\r\n" } @$r)); 1; } sub listgroup_all_i { my ($self, $num) = @_; - my $ary = $self->{ng}->mm->ids_after($num); + my $ary = $self->{ibx}->mm->ids_after($num); scalar(@$ary) or return; more($self, join("\r\n", @$ary)); 1; @@ -205,7 +223,7 @@ sub cmd_listgroup ($;$$) { return $res if ($res !~ /\A211 /); more($self, $res); } - $self->{ng} or return '412 no newsgroup selected'; + $self->{ibx} or return '412 no newsgroup selected'; if (defined $range) { my $r = get_range($self, $range); return $r unless ref $r; @@ -242,9 +260,22 @@ sub parse_time ($$;$) { } sub group_line ($$) { - my ($self, $ng) = @_; - my ($min, $max) = $ng->mm->minmax; - more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max; + my ($self, $ibx) = @_; + my ($min, $max) = $ibx->mm->minmax; + more($self, "$ibx->{newsgroup} $max $min n"); +} + +sub newgroups_i { + my ($self, $ts, $i, $groupnames) = @_; + my $end = $$i + 100; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + while ($$i < $end) { + my $ngname = $groupnames->[$$i++] // return; + my $ibx = $groups->{$ngname} or next; # expired on reload + next unless (eval { $ibx->uidvalidity } // 0) > $ts; + group_line($self, $ibx); + } + 1; } sub cmd_newgroups ($$$;$$) { @@ -254,12 +285,8 @@ sub cmd_newgroups ($$$;$$) { # TODO dists more($self, '231 list of new newsgroups follows'); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - my $c = eval { $ng->mm->created_at } || 0; - next unless $c > $ts; - group_line($self, $ng); - } - '.' 
+ long_response($self, \&newgroups_i, $ts, \(my $i = 0), + $self->{nntpd}->{groupnames}); } sub wildmat2re (;$) { @@ -294,23 +321,27 @@ sub ngpat2re (;$) { } sub newnews_i { - my ($self, $overs, $ts, $prev) = @_; - my $over = $overs->[0]; - my $msgs = $over->query_ts($ts, $$prev); - if (scalar @$msgs) { - more($self, '<' . - join(">\r\n<", map { $_->{mid} } @$msgs ). - '>'); - $$prev = $msgs->[-1]->{num}; - } else { - shift @$overs; - if (@$overs) { # continue onto next newsgroup - $$prev = 0; - return 1; - } else { # break out of the long response. - return; + my ($self, $names, $ts, $prev) = @_; + my $ngname = $names->[0]; + if (my $ibx = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}->{$ngname}) { + if (my $over = $ibx->over) { + my $msgs = $over->query_ts($ts, $$prev); + if (scalar @$msgs) { + $self->msg_more(join('', map { + "<$_->{mid}>\r\n"; + } @$msgs)); + $$prev = $msgs->[-1]->{num}; + return 1; # continue on current group + } } } + shift @$names; + if (@$names) { # continue onto next newsgroup + $$prev = 0; + 1; + } else { # all done, break out of the long_response + undef; + } } sub cmd_newnews ($$$$;$$) { @@ -321,30 +352,22 @@ sub cmd_newnews ($$$$;$$) { my ($keep, $skip) = split('!', $newsgroups, 2); ngpat2re($keep); ngpat2re($skip); - my @overs; - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $keep or next; - $ng->{newsgroup} =~ $skip and next; - my $over = $ng->over or next; - push @overs, $over; - }; - return '.' unless @overs; - + my @names = grep(!/$skip/, grep(/$keep/, + @{$self->{nntpd}->{groupnames}})); + return '.' 
unless scalar(@names); my $prev = 0; - long_response($self, \&newnews_i, \@overs, $ts, \$prev); + long_response($self, \&newnews_i, \@names, $ts, \$prev); } sub cmd_group ($$) { my ($self, $group) = @_; - my $no_such = '411 no such news group'; my $nntpd = $self->{nntpd}; - my $ng = $nntpd->{groups}->{$group} or return $no_such; + my $ibx = $nntpd->{pi_cfg}->{-by_newsgroup}->{$group} or + return '411 no such news group'; $nntpd->idler_start; - $self->{ng} = $ng; - my ($min, $max) = $ng->mm->minmax; - $min ||= 0; - $max ||= 0; + $self->{ibx} = $ibx; + my ($min, $max) = $ibx->mm->minmax; $self->{article} = $min; my $est_size = $max - $min; "211 $est_size $min $max $group"; @@ -352,13 +375,13 @@ sub cmd_group ($$) { sub article_adj ($$) { my ($self, $off) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; + my $ibx = $self->{ibx} or return '412 no newsgroup selected'; my $n = $self->{article}; defined $n or return '420 no current article has been selected'; $n += $off; - my $mid = $ng->mm->mid_for($n); + my $mid = $ibx->mm->mid_for($n); unless ($mid) { $n = $off > 0 ? 'next' : 'previous'; return "421 no $n article in this group"; @@ -374,8 +397,8 @@ sub cmd_last ($) { article_adj($_[0], -1) } # the single-point-of-failure a single server provides. sub cmd_post ($) { my ($self) = @_; - my $ng = $self->{ng}; - $ng ? "440 mailto:$ng->{-primary_address} to post" + my $ibx = $self->{ibx}; + $ibx ? "440 mailto:$ibx->{-primary_address} to post" : '440 posting not allowed' } @@ -395,19 +418,41 @@ sub header_append ($$$) { $hdr->header_set($k, @v, $v); } -sub xref ($$$$) { - my ($self, $ng, $n, $mid) = @_; - my $ret = $self->{nntpd}->{servername} . 
" $ng->{newsgroup}:$n"; +sub xref_by_tc ($$$) { + my ($xref, $pi_cfg, $smsg) = @_; + my $by_addr = $pi_cfg->{-by_addr}; + my $mid = $smsg->{mid}; + for my $f (qw(to cc)) { + my @ibxs = map { + $by_addr->{lc($_)} // () + } (PublicInbox::Address::emails($smsg->{$f} // '')); + for my $ibx (@ibxs) { + my $ngname = $ibx->{newsgroup} // next; + next if defined $xref->{$ngname}; + $xref->{$ngname} = eval { $ibx->mm->num_for($mid) }; + } + } +} - # num_for is pretty cheap and sometimes we'll lookup the existence - # of an article without getting even the OVER info. In other words, - # I'm not sure if its worth optimizing by scanning To:/Cc: and - # PublicInbox::ExtMsg on the PSGI end is just as expensive - foreach my $other (@{$self->{nntpd}->{grouplist}}) { - next if $ng eq $other; - my $num = eval { $other->mm->num_for($mid) } or next; - $ret .= " $other->{newsgroup}:$num"; +sub xref ($$$) { + my ($self, $cur_ibx, $smsg) = @_; + my $nntpd = $self->{nntpd}; + my $cur_ng = $cur_ibx->{newsgroup}; + my $xref; + if (my $ALL = $nntpd->{pi_cfg}->ALL) { + $xref = $ALL->nntp_xref_for($cur_ibx, $smsg); + xref_by_tc($xref, $nntpd->{pi_cfg}, $smsg); + } else { # slow path + $xref = { $cur_ng => $smsg->{num} }; + my $mid = $smsg->{mid}; + for my $ibx (values %{$nntpd->{pi_cfg}->{-by_newsgroup}}) { + next if defined($xref->{$ibx->{newsgroup}}); + my $num = eval { $ibx->mm->num_for($mid) } // next; + $xref->{$ibx->{newsgroup}} = $num; + } } + my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng}); + $ret .= " $_:$xref->{$_}" for (sort keys %$xref); $ret; } @@ -430,7 +475,7 @@ sub set_nntp_headers ($$) { # clobber some existing headers my $ibx = $smsg->{-ibx}; - my $xref = xref($smsg->{nntp}, $ibx, $smsg->{num}, $mid); + my $xref = xref($smsg->{nntp}, $ibx, $smsg); $hdr->header_set('Xref', $xref); # RFC 5536 3.1.4 @@ -442,53 +487,34 @@ sub set_nntp_headers ($$) { # *something* here is required for leafnode, try to follow # RFC 5536 3.1.5... 
$hdr->header_set('Path', $server_name . '!not-for-mail'); - - header_append($hdr, 'List-Post', "{-primary_address}>"); - if (my $url = $ibx->base_url) { - $mid = mid_escape($mid); - header_append($hdr, 'Archived-At', "<$url$mid/>"); - header_append($hdr, 'List-Archive', "<$url>"); - } } sub art_lookup ($$$) { my ($self, $art, $code) = @_; - my $ng = $self->{ng}; - my ($n, $mid); + my ($ibx, $n); my $err; if (defined $art) { if ($art =~ /\A[0-9]+\z/) { $err = '423 no such article number in this group'; $n = int($art); - goto find_mid; + goto find_ibx; } elsif ($art =~ $ONE_MSGID) { - $mid = $1; - $err = r430; - $n = $ng->mm->num_for($mid) if $ng; - goto found if defined $n; - foreach my $g (values %{$self->{nntpd}->{groups}}) { - $n = $g->mm->num_for($mid); - if (defined $n) { - $ng = $g; - goto found; - } - } - return $err; + ($ibx, $n) = mid_lookup($self, $1); + goto found if $ibx; + return r430; } else { return r501; } } else { $err = '420 no current article has been selected'; - $n = $self->{article}; - defined $n or return $err; -find_mid: - $ng or return '412 no newsgroup has been selected'; - $mid = $ng->mm->mid_for($n); - defined $mid or return $err; + $n = $self->{article} // return $err; +find_ibx: + $ibx = $self->{ibx} or + return '412 no newsgroup has been selected'; } found: - my $smsg = $ng->over->get_art($n) or return $err; - $smsg->{-ibx} = $ng; + my $smsg = $ibx->over->get_art($n) or return $err; + $smsg->{-ibx} = $ibx; if ($code == 223) { # STAT set_art($self, $n); "223 $n <$smsg->{mid}> article retrieved - " . 
@@ -498,7 +524,7 @@ found: $smsg->{nntp_code} = $code; set_art($self, $art); # this dereferences to `undef' - ${git_async_cat($ng->git, $smsg->{blob}, \&blob_cb, $smsg)}; + ${git_async_cat($ibx->git, $smsg->{blob}, \&blob_cb, $smsg)}; } } @@ -598,10 +624,10 @@ sub cmd_help ($) { sub get_range ($$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no news group has been selected'; + my $ibx = $self->{ibx} or return '412 no news group has been selected'; defined $range or return '420 No article(s) selected'; my ($beg, $end); - my ($min, $max) = $ng->mm->minmax; + my ($min, $max) = $ibx->mm->minmax; if ($range =~ /\A([0-9]+)\z/) { $beg = $end = $1; } elsif ($range =~ /\A([0-9]+)-\z/) { @@ -671,9 +697,9 @@ sub long_response ($$;@) { sub hdr_msgid_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end); + my $r = $self->{ibx}->mm->msg_range($beg, $end); @$r or return; - more($self, join("\r\n", map { "$_->[0] <$_->[1]>" } @$r)); + $self->msg_more(join('', map { "$_->[0] <$_->[1]>\r\n" } @$r)); 1; } @@ -681,9 +707,9 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. my ($self, $xhdr, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, $range); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $range); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -695,28 +721,54 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. 
sub mid_lookup ($$) { my ($self, $mid) = @_; - my $self_ng = $self->{ng}; - if ($self_ng) { - my $n = $self_ng->mm->num_for($mid); - return ($self_ng, $n) if defined $n; + my $cur_ibx = $self->{ibx}; + if ($cur_ibx) { + my $n = $cur_ibx->mm->num_for($mid); + return ($cur_ibx, $n) if defined $n; } - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - next if defined $self_ng && $ng eq $self_ng; - my $n = $ng->mm->num_for($mid); - return ($ng, $n) if defined $n; + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + if (my @x = grep(/:$smsg->{blob}\z/, @$xr3)) { + my ($ngname, $xnum) = split(/:/, $x[0]); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + # fall through to trying all xref3s + } else { + warn < ($smsg->{blob}) in $ALL->{topdir}, -extindex bug? +EOF + } + # try all xref3s + for my $x (@$xr3) { + my ($ngname, $xnum) = split(/:/, $x); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + warn "W: `$ngname' does not exist for #$xnum\n"; + } + } + # no warning here, $mid is just invalid + } else { # slow path for non-ALL users + for my $ibx (values %{$pi_cfg->{-by_newsgroup}}) { + next if defined $cur_ibx && $ibx eq $cur_ibx; + my $n = $ibx->mm->num_for($mid); + return ($ibx, $n) if defined $n; + } } (undef, undef); } sub xref_range_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $r = $ng->mm->msg_range($beg, $end); - @$r or return; - more($self, join("\r\n", map { - my $num = $_->[0]; - "$num ".xref($self, $ng, $num, $_->[1]); - } @$r)); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over->query_xover($$beg, $end); + scalar(@$msgs) or return; + $$beg = $msgs->[-1]->{num} + 1; + $self->msg_more(join('', map { + "$_->{num} ".xref($self, $ibx, $_) . 
"\r\n"; + } @$msgs)); 1; } @@ -725,10 +777,11 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin if (defined $range && $range =~ $ONE_MSGID) { my $mid = $1; - my ($ng, $n) = mid_lookup($self, $mid); + my ($ibx, $n) = mid_lookup($self, $mid); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, - xref($self, $ng, $n, $mid)); + my $smsg = $ibx->over->get_art($n) or return; + hdr_mid_response($self, $xhdr, $ibx, $n, $range, + xref($self, $ibx, $smsg)); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -747,7 +800,7 @@ sub over_header_for { sub smsg_range_i { my ($self, $beg, $end, $field) = @_; - my $over = $self->{ng}->over; + my $over = $self->{ibx}->over; my $msgs = $over->query_xover($$beg, $end); scalar(@$msgs) or return; my $tmp = ''; @@ -770,10 +823,10 @@ sub smsg_range_i { sub hdr_smsg ($$$$) { my ($self, $xhdr, $field, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless defined $n; - my $v = over_header_for($ng->over, $n, $field); - hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); + my $v = over_header_for($ibx->over, $n, $field); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $v); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -813,26 +866,26 @@ sub cmd_xhdr ($$;$) { } sub hdr_mid_prefix ($$$$$) { - my ($self, $xhdr, $ng, $n, $mid) = @_; + my ($self, $xhdr, $ibx, $n, $mid) = @_; return $mid if $xhdr; # HDR for RFC 3977 users - if (my $self_ng = $self->{ng}) { - ($self_ng eq $ng) ? $n : '0'; + if (my $cur_ibx = $self->{ibx}) { + ($cur_ibx eq $ibx) ? $n : '0'; } else { '0'; } } sub hdr_mid_response ($$$$$$) { - my ($self, $xhdr, $ng, $n, $mid, $v) = @_; + my ($self, $xhdr, $ibx, $n, $mid, $v) = @_; my $res = ''; if ($xhdr) { $res .= r221 . "\r\n"; $res .= "$mid $v\r\n"; } else { $res .= r225 . 
"\r\n"; - my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid); + my $pfx = hdr_mid_prefix($self, $xhdr, $ibx, $n, $mid); $res .= "$pfx $v\r\n"; } res($self, $res .= '.'); @@ -841,14 +894,14 @@ sub hdr_mid_response ($$$$$$) { sub xrover_i { my ($self, $beg, $end) = @_; - my $h = over_header_for($self->{ng}->over, $$beg, 'references'); + my $h = over_header_for($self->{ibx}->over, $$beg, 'references'); more($self, "$$beg $h") if defined($h); $$beg++ < $end; } sub cmd_xrover ($;$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; + my $ibx = $self->{ibx} or return '412 no newsgroup selected'; (defined $range && $range =~ /[<>]/) and return '420 No article(s) selected'; # no message IDs @@ -859,11 +912,11 @@ sub cmd_xrover ($;$) { long_response($self, \&xrover_i, @$r); } -sub over_line ($$$$) { - my ($self, $ng, $num, $smsg) = @_; +sub over_line ($$$) { + my ($self, $ibx, $smsg) = @_; # n.b. field access and procedural calls can be # 10%-15% faster than OO method calls: - my $s = join("\t", $num, + my $s = join("\t", $smsg->{num}, $smsg->{subject}, $smsg->{from}, PublicInbox::Smsg::date($smsg), @@ -871,23 +924,28 @@ sub over_line ($$$$) { $smsg->{references}, $smsg->{bytes}, $smsg->{lines}, - "Xref: " . xref($self, $ng, $num, $smsg->{mid})); + "Xref: " . 
xref($self, $ibx, $smsg)); utf8::encode($s); - $s + $s .= "\r\n"; } sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); defined $n or return r430; - my $smsg = $ng->over->get_art($n) or return r430; + my $smsg = $ibx->over->get_art($n) or return r430; more($self, '224 Overview information follows (multi-line)'); # Only set article number column if it's the current group - my $self_ng = $self->{ng}; - $n = 0 if (!$self_ng || $self_ng ne $ng); - more($self, over_line($self, $ng, $n, $smsg)); + # (RFC 3977 8.3.2) + my $cur_ibx = $self->{ibx}; + if (!$cur_ibx || $cur_ibx ne $ibx) { + # set {-orig_num} for nntp_xref_for + $smsg->{-orig_num} = $smsg->{num}; + $smsg->{num} = 0; + } + $self->msg_more(over_line($self, $ibx, $smsg)); '.'; } else { cmd_xover($self, $range); @@ -896,13 +954,13 @@ sub cmd_over ($;$) { sub xover_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $msgs = $ng->over->query_xover($$beg, $end); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over->query_xover($$beg, $end); my $nr = scalar @$msgs or return; # OVERVIEW.FMT - more($self, join("\r\n", map { - over_line($self, $ng, $_->{num}, $_); + $self->msg_more(join('', map { + over_line($self, $ibx, $_); } @$msgs)); $$beg = $msgs->[-1]->{num} + 1; } @@ -949,12 +1007,28 @@ sub cmd_xpath ($$) { return r501 unless $mid =~ $ONE_MSGID; $mid = $1; my @paths; - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - my $n = $ng->mm->num_for($mid); - push @paths, "$ng->{newsgroup}/$n" if defined $n; + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + my $groups = $pi_cfg->{-by_newsgroup}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev, %seen); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + for my $x (@$xr3) { + my ($ngname, $n) = split(/:/, $x); + $x = "$ngname/$n"; + if ($groups->{$ngname} && !$seen{$x}++) { + push(@paths, 
$x); + } + } + } + } else { # slow path, no point in using long_response + for my $ibx (values %$groups) { + my $n = $ibx->mm->num_for($mid) // next; + push @paths, "$ibx->{newsgroup}/$n"; + } } return '430 no such article on server' unless @paths; - '223 '.join(' ', @paths); + '223 '.join(' ', sort(@paths)); } sub res ($$) { do_write($_[0], $_[1] . "\r\n") } diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 6b762d89..1e4ddd18 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # represents an NNTPD (currently a singleton), @@ -12,8 +12,8 @@ use PublicInbox::InboxIdle; sub new { my ($class) = @_; - my $pi_config = PublicInbox::Config->new; - my $name = $pi_config->{'publicinbox.nntpserver'}; + my $pi_cfg = PublicInbox::Config->new; + my $name = $pi_cfg->{'publicinbox.nntpserver'}; if (!defined($name) or $name eq '') { $name = hostname; } elsif (ref($name) eq 'ARRAY') { @@ -24,8 +24,7 @@ sub new { groups => {}, err => \*STDERR, out => \*STDOUT, - grouplist => [], - pi_config => $pi_config, + pi_cfg => $pi_cfg, servername => $name, greet => \"201 $name ready - post via email\r\n", # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } @@ -35,40 +34,33 @@ sub new { sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = $sig ? PublicInbox::Config->new : $self->{pi_config}; - my $new = {}; - my @list; - $pi_config->each_inbox(sub { - my ($ng) = @_; - my $ngname = $ng->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - # Newsgroup name needs to be compatible with RFC 3977 - # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. - # Leave out a few chars likely to cause problems or conflicts: - # '|', '<', '>', ';', '#', '$', '&', - } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) 
{ - warn "newsgroup name invalid: `$ngname'\n"; - } elsif ($ng->nntp_usable) { - # Only valid if msgmap and search works - $new->{$ngname} = $ng; - push @list, $ng; - + my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg}; + my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; + $pi_cfg->each_inbox(sub { + my ($ibx) = @_; + my $ngname = $ibx->{newsgroup} // return; + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { + # only valid if msgmap and over works # preload to avoid fragmentation: - $ng->description; - $ng->base_url; + $ibx->description; + $ibx->base_url; + } else { + delete $groups->{$ngname}; + delete $ibx->{newsgroup}; + # Note: don't be tempted to delete more for memory + # savings just yet: NNTP, IMAP, and WWW may all + # run in the same process someday. } }); - @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list; - $self->{grouplist} = \@list; - $self->{pi_config} = $pi_config; + $self->{groupnames} = [ sort(keys %$groups) ]; # this will destroy old groups that got deleted - %{$self->{groups}} = %$new; + $self->{pi_cfg} = $pi_cfg; } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); } 1; diff --git a/lib/PublicInbox/NNTPdeflate.pm b/lib/PublicInbox/NNTPdeflate.pm index 02af935f..06b4499c 100644 --- a/lib/PublicInbox/NNTPdeflate.pm +++ b/lib/PublicInbox/NNTPdeflate.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # RFC 8054 NNTP COMPRESS DEFLATE implementation diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index 6bed0103..d7dd637f 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: 
AGPL-3.0+ # # Plack app redirector for mapping /$NEWSGROUP requests to @@ -13,9 +13,8 @@ use PublicInbox::MID qw(mid_escape); use PublicInbox::Hval qw(prurl); sub new { - my ($class, $pi_config) = @_; - $pi_config ||= PublicInbox::Config->new; - bless { pi_config => $pi_config }, $class; + my ($class, $pi_cfg) = @_; + bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class; } sub redirect ($$) { @@ -47,8 +46,8 @@ sub call { # /inbox.foo.bar/123456 my (undef, @parts) = split(m!/!, $env->{PATH_INFO}); my ($ng, $article) = @parts; - my $pi_config = $self->{pi_config}; - if (my $ibx = $pi_config->lookup_newsgroup($ng)) { + my $pi_cfg = $self->{pi_cfg}; + if (my $ibx = $pi_cfg->lookup_newsgroup($ng)) { my $url = prurl($env, $ibx->{url}); my $code = 301; if (defined $article && $article =~ /\A[0-9]+\z/) { @@ -63,7 +62,6 @@ sub call { return redirect($code, $url); } - my $res; my @try = (join('/', @parts)); # trailing slash is in the rest of our WWW, so maybe some users @@ -72,13 +70,30 @@ sub call { pop @parts; push @try, join('/', @parts); } - - foreach my $mid (@try) { - my $arg = [ $mid ]; - $pi_config->each_inbox(\&try_inbox, $arg); - defined($res = $arg->[1]) and last; + my $ALL = $pi_cfg->ALL; + if (my $over = $ALL ? $ALL->over : undef) { + my $by_eidx_key = $pi_cfg->{-by_eidx_key}; + for my $mid (@try) { + my ($id, $prev); + while (my $x = $over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $over->get_xref3($x->{num}); + for (@$xr3) { + s/:[0-9]+:$x->{blob}\z// or next; + my $ibx = $by_eidx_key->{$_} // next; + my $url = $ibx->base_url or next; + $url .= mid_escape($mid) . 
'/'; + return redirect(302, $url); + } + } + } + } else { # slow path, scan every inbox + for my $mid (@try) { + my $arg = [ $mid ]; # [1] => result + $pi_cfg->each_inbox(\&try_inbox, $arg); + return $arg->[1] if $arg->[1]; + } } - $res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; + [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; } 1; diff --git a/lib/PublicInbox/OnDestroy.pm b/lib/PublicInbox/OnDestroy.pm new file mode 100644 index 00000000..0ae4c4c9 --- /dev/null +++ b/lib/PublicInbox/OnDestroy.pm @@ -0,0 +1,21 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +package PublicInbox::OnDestroy; + +sub new { + shift; # ($class, $cb, @args) + bless [ @_ ], __PACKAGE__; +} + +sub DESTROY { + my ($cb, @args) = @{$_[0]}; + if (!ref($cb)) { + my $pid = $cb; + return if $pid != $$; + $cb = shift @args; + } + $cb->(@args) if $cb; +} + +1; diff --git a/lib/PublicInbox/OpPipe.pm b/lib/PublicInbox/OpPipe.pm new file mode 100644 index 00000000..295a8aa5 --- /dev/null +++ b/lib/PublicInbox/OpPipe.pm @@ -0,0 +1,41 @@ +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ + +# bytecode dispatch pipe, reads a byte, runs a sub +# byte => [ sub, @operands ] +package PublicInbox::OpPipe; +use strict; +use v5.10.1; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN); + +sub new { + my ($cls, $rd, $op_map, $in_loop) = @_; + my $self = bless { sock => $rd, op_map => $op_map }, $cls; + # 1031: F_SETPIPE_SZ, 4096: page size + fcntl($rd, 1031, 4096) if $^O eq 'linux'; + if ($in_loop) { # iff using DS->EventLoop + $rd->blocking(0); + $self->SUPER::new($rd, EPOLLIN); + } + $self; +} + +sub event_step { + my ($self) = @_; + my $rd = $self->{sock}; + my $byte; + until (defined(sysread($rd, $byte, 1))) { + return if $!{EAGAIN}; + next if $!{EINTR}; + die "read \$rd: $!"; + } + my $op = $self->{op_map}->{$byte} or die "BUG: unknown byte `$byte'"; + if ($byte eq '') { # close on EOF + $rd->blocking ? 
delete($self->{sock}) : $self->close; + } + my ($sub, @args) = @$op; + $sub->(@args); +} + +1; diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 08112386..06ea439d 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -260,6 +260,27 @@ SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 $smsg ? load_from_row($smsg) : undef; } +sub get_xref3 { + my ($self, $num, $raw) = @_; + my $dbh = dbh($self); + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum,oidbin FROM xref3 WHERE docid = ? ORDER BY ibx_id,xnum ASC + + $sth->execute($num); + my $rows = $sth->fetchall_arrayref; + return $rows if $raw; + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? + + [ map { + my $r = $_; + $eidx_key_sth->execute($r->[0]); + my $eidx_key = $eidx_key_sth->fetchrow_array; + $eidx_key //= "missing://ibx_id=$r->[0]"; + "$eidx_key:$r->[1]:".unpack('H*', $r->[2]); + } @$rows ]; +} + sub next_by_mid { my ($self, $mid, $id, $prev) = @_; my $dbh = dbh($self); diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 840e2c2a..985c5473 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -79,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? 
LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -238,26 +243,6 @@ sub link_refs { $tid; } -sub parse_references ($$$) { - my ($smsg, $hdr, $mids) = @_; - my $refs = references($hdr); - push(@$refs, @$mids) if scalar(@$mids) > 1; - return $refs if scalar(@$refs) == 0; - - # prevent circular references here: - my %seen = ( $smsg->{mid} => 1 ); - my @keep; - foreach my $ref (@$refs) { - if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - next; - } - push(@keep, $ref) unless $seen{$ref}++; - } - $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; - \@keep; -} - # normalize subjects so they are suitable as pathnames for URLs # XXX: consider for removal sub subject_path ($) { @@ -267,21 +252,27 @@ sub subject_path ($) { lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; my $mids = mids_for_index($eml); - my $refs = parse_references($smsg, $eml, $mids); + my $refs = $smsg->parse_references($eml, $mids); + $mids->[0] //= $smsg->{mid} //= $eml->{-lei_fake_mid}; + $smsg->{mid} //= ''; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, $smsg, $mids, $refs, $xpath, $dd); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -385,13 +376,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, /* NNTP article number == IMAP UID */ + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, 
JMAP) */ sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ ds INTEGER, /* RFC-2822 sent Date: header, git author time */ - ddd VARBINARY, /* doc-data-deflated (->to_doc_data, ->load_from_data) */ - UNIQUE (num) + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); @@ -465,10 +455,14 @@ sub dbh_close { sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: PublicInbox::Over::dbh($self); @@ -518,4 +512,170 @@ EOM $pr->("I: rethread culled $total ghosts\n") if $pr && $total; } +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<''); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. 
+ 'xref3 (oidbin,xnum,ibx_id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL) + + 1; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# returns remaining reference count to $docid +sub remove_xref3 { + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; + begin_lazy($self); + my $oidbin = pack('H*', $oidhex); + my ($sth, $ibx_id); + if (defined $eidx_key) { + $ibx_id = ibx_id($self, $eidx_key); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? 
+ + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $oidbin, SQL_BLOB); + } else { + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + } + $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? + +} + +sub blob_exists { + my ($self, $oidhex) = @_; + my $sth = $self->dbh->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE oidbin = ? 
+ + $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); + $sth->execute; + $sth->fetchrow_array; +} + 1; diff --git a/lib/PublicInbox/ProcessPipe.pm b/lib/PublicInbox/ProcessPipe.pm index 2ce7eb8f..97e9c268 100644 --- a/lib/PublicInbox/ProcessPipe.pm +++ b/lib/PublicInbox/ProcessPipe.pm @@ -1,43 +1,70 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # a tied handle for auto reaping of children tied to a pipe, see perltie(1) package PublicInbox::ProcessPipe; use strict; -use warnings; +use v5.10.1; +use Carp qw(carp); sub TIEHANDLE { - my ($class, $pid, $fh) = @_; - bless { pid => $pid, fh => $fh }, $class; + my ($class, $pid, $fh, $cb, $arg) = @_; + bless { pid => $pid, fh => $fh, ppid => $$, cb => $cb, arg => $arg }, + $class; } +sub BINMODE { binmode(shift->{fh}) } # for IO::Uncompress::Gunzip + sub READ { read($_[0]->{fh}, $_[1], $_[2], $_[3] || 0) } sub READLINE { readline($_[0]->{fh}) } -sub CLOSE { - my $fh = delete($_[0]->{fh}); - my $ret = defined $fh ? close($fh) : ''; - my $pid = delete $_[0]->{pid}; - if (defined $pid) { - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($pid, undef, undef) }; +sub WRITE { + use bytes qw(length); + syswrite($_[0]->{fh}, $_[1], $_[2] // length($_[1]), $_[3] // 0); +} + +sub PRINT { + my $self = shift; + print { $self->{fh} } @_; +} + +sub FILENO { fileno($_[0]->{fh}) } - if ($@) { # ok, not in the event loop, work synchronously - waitpid($pid, 0); +sub _close ($;$) { + my ($self, $wait) = @_; + my $fh = delete $self->{fh}; + my $ret = defined($fh) ? 
close($fh) : ''; + my ($pid, $cb, $arg) = delete @$self{qw(pid cb arg)}; + return $ret unless defined($pid) && $self->{ppid} == $$; + if ($wait) { # caller cares about the exit status: + my $wp = waitpid($pid, 0); + if ($wp == $pid) { $ret = '' if $?; + if ($cb) { + eval { $cb->($arg, $pid) }; + carp "E: cb(arg, $pid): $@" if $@; + } + } else { + carp "waitpid($pid, 0) = $wp, \$!=$!, \$?=$?"; } + } else { # caller just undef-ed it, let event loop deal with it + require PublicInbox::DS; + PublicInbox::DS::dwaitpid($pid, $cb, $arg); } $ret; } -sub FILENO { fileno($_[0]->{fh}) } +# if caller uses close(), assume they want to check $? immediately so +# we'll waitpid() synchronously. n.b. wantarray doesn't seem to +# propagate `undef' down to tied methods, otherwise I'd rely on that. +sub CLOSE { _close($_[0], 1) } +# if relying on DESTROY, assume the caller doesn't care about $? and +# we can let the event loop call waitpid() whenever it gets SIGCHLD sub DESTROY { - CLOSE(@_); + _close($_[0]); undef; } -sub pid { $_[0]->{pid} } - 1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm index 88b6d390..7e50a59a 100644 --- a/lib/PublicInbox/Qspawn.pm +++ b/lib/PublicInbox/Qspawn.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Like most Perl modules in public-inbox, this is internal and @@ -12,12 +12,13 @@ # operate in. This can be useful to ensure smaller inboxes can # be cloned while cloning of large inboxes is maxed out. # -# This does not depend on PublicInbox::DS or any other external -# scheduling mechanism, you just need to call start() and finish() -# appropriately. However, public-inbox-httpd (which uses PublicInbox::DS) -# will be able to schedule this based on readability of stdout from -# the spawned process. See GitHTTPBackend.pm and SolverGit.pm for -# usage examples. It does not depend on any form of threading. 
+# This does not depend on the PublicInbox::DS->EventLoop or any +# other external scheduling mechanism, you just need to call +# start() and finish() appropriately. However, public-inbox-httpd +# (which uses PublicInbox::DS) will be able to schedule this +# based on readability of stdout from the spawned process. +# See GitHTTPBackend.pm and SolverGit.pm for usage examples. +# It does not depend on any form of threading. # # This is useful for scheduling CGI execution of both long-lived # git-http-backend(1) process (for "git clone") as well as short-lived @@ -56,9 +57,9 @@ sub _do_spawn { $self->{cmd} = $o{quiet} ? undef : $cmd; eval { # popen_rd may die on EMFILE, ENFILE - ($self->{rpipe}, $self->{pid}) = popen_rd($cmd, $cmd_env, \%o); + $self->{rpipe} = popen_rd($cmd, $cmd_env, \%o); - die "E: $!" unless defined($self->{pid}); + die "E: $!" unless defined($self->{rpipe}); $limiter->{running}++; $start_cb->($self); # EPOLL_CTL_ADD may ENOSPC/ENOMEM @@ -115,41 +116,14 @@ sub finalize ($$) { } } -# callback for dwaitpid -sub waitpid_err ($$) { - my ($self, $pid) = @_; - my $xpid = delete $self->{pid}; - my $err; - if (defined $pid) { - if ($pid > 0) { # success! - $err = child_err($?); - } elsif ($pid < 0) { # ??? does this happen in our case? 
- $err = "W: waitpid($xpid, 0) => $pid: $!"; - } # else should not be called with pid == 0 - } - finalize($self, $err); -} - -sub do_waitpid ($) { - my ($self) = @_; - my $pid = $self->{pid}; - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($pid, \&waitpid_err, $self) }; - # done if we're running in PublicInbox::DS::EventLoop - if ($@) { - # non public-inbox-{httpd,nntpd} callers may block: - my $ret = waitpid($pid, 0); - waitpid_err($self, $ret); - } -} +# callback for dwaitpid or ProcessPipe +sub waitpid_err { finalize($_[0], child_err($?)) } sub finish ($;$) { my ($self, $err) = @_; - if (delete $self->{rpipe}) { - do_waitpid($self); - } else { - finalize($self, $err); - } + my $tied_pp = delete($self->{rpipe}) or return finalize($self, $err); + my PublicInbox::ProcessPipe $pp = tied *$tied_pp; + @$pp{qw(cb arg)} = (\&waitpid_err, $self); # for ->DESTROY } sub start ($$$) { @@ -359,12 +333,12 @@ sub new { } sub setup_rlimit { - my ($self, $name, $config) = @_; + my ($self, $name, $cfg) = @_; foreach my $rlim (@PublicInbox::Spawn::RLIMITS) { my $k = lc($rlim); $k =~ tr/_//d; $k = "publicinboxlimiter.$name.$k"; - defined(my $v = $config->{$k}) or next; + defined(my $v = $cfg->{$k}) or next; my @rlimit = split(/\s*,\s*/, $v); if (scalar(@rlimit) == 1) { push @rlimit, $rlimit[0]; diff --git a/lib/PublicInbox/Reply.pm b/lib/PublicInbox/Reply.pm index 5058ff8c..8226fdc3 100644 --- a/lib/PublicInbox/Reply.pm +++ b/lib/PublicInbox/Reply.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # For reply instructions and address generation in WWW UI diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm index a2a54944..9acf86c0 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pm +++ b/lib/PublicInbox/SaPlugin/ListMirror.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: 
AGPL-3.0+ # SpamAssassin rules useful for running a mailing list mirror. We want to: diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pod b/lib/PublicInbox/SaPlugin/ListMirror.pod index 3c4ec8c1..6fdcf8c1 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pod +++ b/lib/PublicInbox/SaPlugin/ListMirror.pod @@ -105,7 +105,7 @@ and L =head1 COPYRIGHT -Copyright (C) 2016-2020 all contributors L +Copyright (C) 2016-2021 all contributors L License: AGPL-3.0+ L diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index fb35b747..7c6a16be 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # based on notmuch, but with no concept of folders, files or flags # @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(mdocid); +our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -54,11 +54,15 @@ use constant { use PublicInbox::Smsg; use PublicInbox::Over; -my $QP_FLAGS; -our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem); +our $QP_FLAGS; +our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); our $Xap; # 'Search::Xapian' or 'Xapian' -my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -my $ENQ_ASCENDING; +our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') + +# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, +# let's hope the ABI is stable +our $ENQ_DESCENDING = 0; +our $ENQ_ASCENDING = 1; sub load_xapian () { return 1 if defined $Xap; @@ -84,15 +88,8 @@ sub load_xapian () { 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); - # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, - # so lets hope this part of the ABI is 
stable because it's - # just an integer: - $ENQ_ASCENDING = $x eq 'Xapian' ? - 1 : Search::Xapian::ENQ_ASCENDING(); - - # for Smsg: - *PublicInbox::Smsg::sortable_unserialise = - $Xap.'::sortable_unserialise'; + *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -193,38 +190,29 @@ sub xdir ($;$) { } } -sub _xdb ($) { +# returns all shards as separate Xapian::Database objects w/o combining +sub xdb_shards_flat ($) { my ($self) = @_; - my $dir = xdir($self, 1); - my ($xdb, $slow_phrase); - my $qpf = \($self->{qp_flags} ||= $QP_FLAGS); - if ($self->{ibx_ver} >= 2) { - my @xdb; - opendir(my $dh, $dir) or return; # not initialized yet - + my $xpfx = $self->{xpfx}; + my (@xdb, $slow_phrase); + load_xapian(); + $self->{qp_flags} //= $QP_FLAGS; + if ($xpfx =~ m/xapian${\SCHEMA_VERSION}\z/) { + @xdb = ($X{Database}->new($xpfx)); + $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert"; + } else { + opendir(my $dh, $xpfx) or return (); # not initialized yet # We need numeric sorting so shard[0] is first for reading # Xapian metadata, if needed - my $last = max(grep(/\A[0-9]+\z/, readdir($dh))); - return if !defined($last); + my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); for (0..$last) { - my $shard_dir = "$dir/$_"; - if (-d $shard_dir && -r _) { - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } else { # gaps from missing epochs throw off mdocid() - warn "E: $shard_dir missing or unreadable\n"; - return; - } + my $shard_dir = "$self->{xpfx}/$_"; + push @xdb, $X{Database}->new($shard_dir); + $slow_phrase ||= -f "$shard_dir/iamchert"; } - $self->{nshard} = scalar(@xdb); - $xdb = shift @xdb; - $xdb->add_database($_) for @xdb; - } else { - $slow_phrase = -f "$dir/iamchert"; - $xdb = 
$X{Database}->new($dir); + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; } - $$qpf |= FLAG_PHRASE() unless $slow_phrase; - $xdb; + @xdb; } # v2 Xapian docids don't conflict, so they're identical to @@ -238,37 +226,29 @@ sub mdocid { sub mset_to_artnums { my ($self, $mset) = @_; - my $nshard = $self->{nshard} // 1; + my $nshard = $self->{nshard}; [ map { mdocid($nshard, $_) } $mset->items ]; } sub xdb ($) { my ($self) = @_; - $self->{xdb} ||= do { - load_xapian(); - _xdb($self); + $self->{xdb} //= do { + my @xdb = $self->xdb_shards_flat or return; + $self->{nshard} = scalar(@xdb); + my $xdb = shift @xdb; + $xdb->add_database($_) for @xdb; + $xdb; }; } -sub xpfx_init ($) { - my ($self) = @_; - if ($self->{ibx_ver} == 1) { - $self->{xpfx} .= '/public-inbox/xapian' . SCHEMA_VERSION; - } else { - $self->{xpfx} .= '/xap'.SCHEMA_VERSION; - } -} - sub new { my ($class, $ibx) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $self = bless { - xpfx => $ibx->{inboxdir}, # for xpfx_init + my $xap = $ibx->version > 1 ? 'xap' : 'public-inbox/xapian'; + bless { + xpfx => "$ibx->{inboxdir}/$xap" . 
SCHEMA_VERSION, altid => $ibx->{altid}, - ibx_ver => $ibx->version, }, $class; - xpfx_init($self); - $self; } sub reopen { @@ -285,20 +265,19 @@ sub mset { $opts ||= {}; my $qp = $self->{qp} //= qparse_new($self); my $query = $qp->parse_query($query_string, $self->{qp_flags}); - $opts->{relevance} = 1 unless exists $opts->{relevance}; _do_enquire($self, $query, $opts); } sub retry_reopen { - my ($self, $cb, $arg) = @_; + my ($self, $cb, @arg) = @_; for my $i (1..10) { if (wantarray) { my @ret; - eval { @ret = $cb->($arg) }; + eval { @ret = $cb->($self, @arg) }; return @ret unless $@; } else { my $ret; - eval { $ret = $cb->($arg) }; + eval { $ret = $cb->($self, @arg) }; return $ret unless $@; } # Exception: The revision being read has been discarded - @@ -318,7 +297,7 @@ sub retry_reopen { sub _do_enquire { my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); + retry_reopen($self, \&_enquire_once, $query, $opts); } # returns true if all docs have the THREADID value @@ -328,19 +307,32 @@ sub has_threadid ($) { } sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @{$_[0]}; + my ($self, $query, $opts) = @_; my $xdb = xdb($self); + if (defined(my $eidx_key = $opts->{eidx_key})) { + $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opts->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $query = $X{Query}->new(OP_FILTER(), $query, $range); + } my $enquire = $X{Enquire}->new($xdb); $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; - if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID + my $rel = $opts->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $enquire->set_weighting_scheme($X{BoolWeight}->new); $enquire->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enquire->set_sort_by_value_then_relevance(TS, 
$desc); + } elsif ($rel == -2) { $enquire->set_weighting_scheme($X{BoolWeight}->new); - } elsif ($opts->{relevance}) { + $enquire->set_docid_order($ENQ_DESCENDING); + } else { # rel > 0 $enquire->set_sort_by_relevance_then_value(TS, $desc); - } else { - $enquire->set_sort_by_value_then_relevance(TS, $desc); } # `mairix -t / --threads' or JMAP collapseThreads @@ -352,7 +344,7 @@ sub _enquire_once { # retry_reopen callback sub mset_to_smsg { my ($self, $ibx, $mset) = @_; - my $nshard = $self->{nshard} // 1; + my $nshard = $self->{nshard}; my $i = 0; my %order = map { mdocid($nshard, $_) => ++$i } $mset->items; my @msgs = sort { @@ -384,7 +376,7 @@ sub qparse_new ($) { # for IMAP, undocumented for WWW and may be split off go away $cb->($qp, $NVRP->new(BYTES, 'bytes:')); - $cb->($qp, $NVRP->new(TS, 'ts:')); + $cb->($qp, $NVRP->new(TS, 'rt:')); $cb->($qp, $NVRP->new(UID, 'uid:')); while (my ($name, $prefix) = each %bool_pfx_external) { @@ -426,4 +418,36 @@ sub help { \@ret; } +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + +sub get_pct ($) { # mset item + # Capped at "99%" since "100%" takes an extra column in the + # thread skeleton view. says the value isn't + # very meaningful, anyways. + my $n = $_[0]->get_percent; + $n > 99 ? 
99 : $n; +} + +sub xap_terms ($$;@) { + my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () + my %ret; + eval { + my $end = $xdb_or_doc->termlist_end(@docid); + my $cur = $xdb_or_doc->termlist_begin(@docid); + for (; $cur != $end; $cur++) { + $cur->skip_to($pfx); + last if $cur == $end; + my $tn = $cur->get_termname; + if (index($tn, $pfx) == 0) { + $ret{substr($tn, length($pfx))} = undef; + } + } + }; + \%ret; +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c36fc6c7..826302de 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -1,6 +1,6 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ -# based on notmuch, but with no concept of folders, files or flags +# based on notmuch, but with no concept of folders, files # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread @@ -15,15 +15,17 @@ use PublicInbox::InboxWritable; use PublicInbox::MID qw(mids_for_index mids); use PublicInbox::MsgIter; use PublicInbox::IdxStack; -use Carp qw(croak); +use Carp qw(croak carp); use POSIX qw(strftime); +use Time::Local qw(timegm); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size); +our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 
0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -31,11 +33,11 @@ use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; +our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; sub new { my ($class, $ibx, $creat, $shard) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $levels = qr/\A(?:full|medium|basic)\z/; my $inboxdir = $ibx->{inboxdir}; my $version = $ibx->version; my $indexlevel = 'full'; @@ -45,27 +47,23 @@ sub new { $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; } if ($ibx->{indexlevel}) { - if ($ibx->{indexlevel} =~ $levels) { + if ($ibx->{indexlevel} =~ $INDEXLEVELS) { $indexlevel = $ibx->{indexlevel}; } else { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } $ibx = PublicInbox::InboxWritable->new($ibx); - my $self = bless { - ibx => $ibx, - xpfx => $inboxdir, # for xpfx_init - -altid => $altid, - ibx_ver => $version, - indexlevel => $indexlevel, - }, $class; - $self->xpfx_init; + my $self = PublicInbox::Search->new($ibx); + bless $self, $class; + $self->{ibx} = $ibx; + $self->{-altid} = $altid; + $self->{indexlevel} = $indexlevel; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; if ($ibx->{-skip_docdata}) { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; @@ -107,8 +105,11 @@ sub load_xapian_writable () { $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); my $ver = (eval($xap.'::major_version()') << 16) | - (eval($xap.'::minor_version()') << 8); + (eval($xap.'::minor_version()') << 8) | + eval($xap.'::revision()'); $DB_NO_SYNC = 0x4 if $ver >= 0x10400; + # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks + $X->{CLOEXEC_UNSET} = 1 if $ver >= 0x010215 && $ver <= 0x010218; 1; } @@ -135,7 +136,7 @@ sub idx_acquire { } } return 
unless defined $flag; - $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync}; + $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -152,7 +153,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } @@ -323,6 +324,16 @@ sub index_xapian { # msg_iter callback } } +sub index_list_id ($$$) { + my ($self, $doc, $hdr) = @_; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } +} + sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; for my $mid (@$mids) { @@ -336,16 +347,12 @@ sub index_ids ($$$$) { } } $doc->add_boolean_term('Q' . $_) for @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = lc $1; - $doc->add_boolean_term('G' . 
$lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } + index_list_id($self, $doc, $hdr); } -sub add_xapian ($$$$) { +sub eml2doc ($$$;$) { my ($self, $eml, $smsg, $mids) = @_; + $mids //= mids_for_index($eml); my $doc = $X->{Document}->new; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); @@ -361,6 +368,9 @@ sub add_xapian ($$$$) { $tg->set_document($doc); index_headers($self, $smsg); + if (defined(my $eidx_key = $smsg->{eidx_key})) { + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + } msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); @@ -370,7 +380,7 @@ sub add_xapian ($$$$) { if (!$self->{-skip_docdata}) { # WWW doesn't need {to} or {cc}, only NNTP $smsg->{to} = $smsg->{cc} = ''; - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + $smsg->parse_references($eml, $mids); my $data = $smsg->to_doc_data; $doc->set_data($data); } @@ -385,12 +395,19 @@ sub add_xapian ($$$$) { } } } + $doc; +} + +sub add_xapian ($$$$) { + my ($self, $eml, $smsg, $mids) = @_; + begin_txn_lazy($self); + my $doc = eml2doc($self, $eml, $smsg, $mids); $self->{xdb}->replace_document($smsg->{num}, $doc); } sub _msgmap_init ($) { my ($self) = @_; - die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1; + die "BUG: _msgmap_init is only for v1\n" if $self->{ibx}->version != 1; $self->{mm} //= eval { require PublicInbox::Msgmap; my $rw = $self->{ibx}->{-no_fsync} ? 
2 : 1; @@ -401,6 +418,7 @@ sub _msgmap_init ($) { sub add_message { # mime = PublicInbox::Eml or Email::MIME object my ($self, $mime, $smsg, $sync) = @_; + begin_txn_lazy($self); my $mids = mids_for_index($mime); $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat $smsg->{mid} //= $mids->[0]; # v1 compatibility @@ -434,32 +452,116 @@ sub add_message { $smsg->{num}; } -sub xdb_remove { - my ($self, $oid, @removed) = @_; - my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = eval { $xdb->get_document($num) }; - unless ($doc) { - warn "E: $@\n" if $@; - warn "E: #$num $oid missing in Xapian\n"; - next; - } - my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->load_expand($doc); - my $blob = $smsg->{blob} // '(unset)'; - if ($blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; - } +sub _get_doc ($$) { + my ($self, $docid) = @_; + my $doc = eval { $self->{xdb}->get_document($docid) }; + $doc // do { + warn "E: $@\n" if $@; + warn "E: #$docid missing in Xapian\n"; + undef; + } +} + +sub add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + term_generator($self)->set_document($doc); + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + index_list_id($self, $doc, $eml); + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + eval { $doc->remove_term('O'.$eidx_key) }; + warn "W: ->remove_term O$eidx_key: $@\n" if $@; + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + eval { $doc->remove_term('G' . $lid) }; + warn "W: ->remove_term G$lid: $@\n" if $@; + + # nb: we don't remove the XL probabilistic terms + # since terms may overlap if cross-posted. 
+ # + # IOW, a message which has both + # and would have overlapping + # "XLexample" and "XLcom" as terms and which we + # wouldn't know if they're safe to remove if we just + # unindex while preserving + # . + # + # In any case, this entire sub is will likely never + # be needed and users using the "l:" prefix are probably + # rarer. + } + $self->{xdb}->replace_document($docid, $doc); +} + +sub set_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my %keep = map { $_ => 1 } @kw; + my %add = %keep; + my @rm; + my $end = $doc->termlist_end; + for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { + $cur->skip_to('K'); + last if $cur == $end; + my $kw = $cur->get_termname; + $kw =~ s/\AK//s or next; + $keep{$kw} ? delete($add{$kw}) : push(@rm, $kw); } + return unless (scalar(@rm) + scalar(keys %add)); + $doc->remove_term('K'.$_) for @rm; + $doc->add_boolean_term('K'.$_) for (keys %add); + $self->{xdb}->replace_document($docid, $doc); } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{oidx}; +sub add_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + $doc->add_boolean_term('K'.$_) for @kw; + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my $replace; + eval { + $doc->remove_term('K'.$_); + $replace = 1 + } for @kw; + $self->{xdb}->replace_document($docid, $doc) if $replace; +} + +sub smsg_from_doc ($) { + my ($doc) = @_; + my $data = $doc->get_data or return; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); + $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); + 
$smsg->load_from_data($data); + $smsg; +} + +sub xdb_remove { + my ($self, @docids) = @_; $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); + my $xdb = $self->{xdb} or return; + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in in Xapian? $@\n" if $@; + } } sub index_git_blob_id { @@ -484,8 +586,8 @@ sub unindex_eml { $tmp{$_}++ for @removed; } if (!$nr) { - $mids = join('> <', @$mids); - warn "W: <$mids> missing for removal from overview\n"; + my $m = join('> <', @$mids); + warn "W: <$m> missing for removal from overview\n"; } while (my ($num, $nr) = each %tmp) { warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; @@ -495,7 +597,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -515,45 +617,63 @@ sub index_mm { } } -# returns the number of bytes to add if given a non-CRLF arg -sub crlf_adjust ($) { - if (index($_[0], "\r\n") < 0) { - # common case is LF-only, every \n needs an \r; - # so favor a cheap tr// over an expensive m//g - $_[0] =~ tr/\n/\n/; - } else { # count number of '\n' w/o '\r', expensive: - scalar(my @n = ($_[0] =~ m/(?cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; - $size += crlf_adjust($$bref); - my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $smsg->set_bytes($$bref, $size); my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, 
$smsg, $sync); + ++$self->{nidx}; + my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + ${$sync->{latest_cmt}} = $cur_cmt; } sub unindex_both { # git->cat_async callback - my ($bref, $oid, $type, $size, $self) = @_; + my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); + my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); + # may be undef if leftover + if (defined(my $cur_cmt = $sync->{cur_cmt})) { + ${$sync->{latest_cmt}} = $cur_cmt; + } + ++$self->{nidx}; +} + +sub with_umask { + my $self = shift; + ($self->{ibx} // $self->{eidx})->with_umask(@_); } # called by public-inbox-index sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; - $self->{ibx}->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + $self->with_umask(\&_index_sync, $self, $opt); + if ($opt->{reindex} && !$opt->{quit}) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } @@ -569,46 +689,44 @@ sub check_size { # check_async cb for -index --max-size=... sub v1_checkpoint ($$;$) { my ($self, $sync, $stk) = @_; - $self->{ibx}->git->check_async_wait; - $self->{ibx}->git->cat_async_wait; + $self->{ibx}->git->async_wait_all; - # latest_cmt may be undef - my $newest = $stk ? $stk->{latest_cmt} : undef; - if ($newest) { + # $newest may be undef + my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + if (defined($newest)) { my $cur = $self->{mm}->last_commit || ''; if (need_update($self, $cur, $newest)) { $self->{mm}->last_commit($newest); } - } else { - ${$sync->{max}} = $self->{batch_bytes}; } + ${$sync->{max}} = $self->{batch_bytes}; $self->{mm}->{dbh}->commit; - if ($newest && need_xapian($self)) { - my $xdb = $self->{xdb}; + my $xdb = need_xapian($self) ? 
$self->{xdb} : undef; + if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } - + } + if ($stk) { # all done if $stk is passed # let SearchView know a full --reindex was done so it can # generate ->has_threadid-dependent links - if ($sync->{reindex} && !ref($sync->{reindex})) { + if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) { my $n = $xdb->get_metadata('has_threadid'); $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } + $self->{oidx}->rethread_done($sync->{-opt}); # all done } - - $self->{oidx}->rethread_done($sync->{-opt}) if $newest; # all done commit_txn_lazy($self); - $self->{ibx}->git->cleanup; + $sync->{ibx}->git->cleanup; my $nr = ${$sync->{nr}}; idx_release($self, $nr); # let another process do some work... if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -617,27 +735,32 @@ sub v1_checkpoint ($$;$) { # only for v1 sub process_stack { my ($self, $sync, $stk) = @_; - my $git = $self->{ibx}->git; + my $git = $sync->{ibx}->git; my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; $sync->{sidx} = $self; + $sync->{latest_cmt} = \(my $latest_cmt); $self->{mm}->{dbh}->begin_work; if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." 
leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $sync); } } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { + my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid }; + last if $sync->{quit}; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; + $arg->{autime} = $at; + $arg->{cotime} = $ct; if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { @@ -645,17 +768,17 @@ sub process_stack { } v1_checkpoint($self, $sync) if $max <= 0; } elsif ($f eq 'd') { - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? undef : $stk); } -sub log2stack ($$$$) { - my ($sync, $git, $range, $ibx) = @_; +sub log2stack ($$$) { + my ($sync, $git, $range) = @_; my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise) my ($add, $del); - if ($ibx->version == 1) { + if ($sync->{ibx}->version == 1) { my $path = $hex.'{2}/'.$hex.'{38}'; $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!; $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!; @@ -669,17 +792,18 @@ sub log2stack ($$$$) { my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H --no-notes --no-color --no-renames --no-abbrev), $range); - my ($at, $ct, $stk); + my ($at, $ct, $stk, $cmt); while (<$fh>) { + return if $sync->{quit}; if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); + ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3); + $stk //= PublicInbox::IdxStack->new($cmt); } elsif (/$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: - $stk->push_rec('d', $at, 
$ct, $oid); + $stk->push_rec('d', $at, $ct, $oid, $cmt); } } elsif (/$add/) { my $oid = $1; @@ -687,12 +811,10 @@ sub log2stack ($$$$) { my $oid_bin = pack('H*', $oid); my $nr = --$D->{$oid_bin}; delete($D->{$oid_bin}) if $nr <= 0; - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); + next if $nr >= 0; } + $stk->push_rec('m', $at, $ct, $oid, $cmt); } } close $fh or die "git log failed: \$?=$?"; @@ -700,9 +822,9 @@ sub log2stack ($$$$) { $stk->read_prepare; } -sub prepare_stack ($$$) { - my ($self, $sync, $range) = @_; - my $git = $self->{ibx}->git; +sub prepare_stack ($$) { + my ($sync, $range) = @_; + my $git = $sync->{ibx}->git; if (index($range, '..') < 0) { # don't show annoying git errors to users who run -index @@ -711,7 +833,7 @@ sub prepare_stack ($$$) { return PublicInbox::IdxStack->new->read_prepare if $?; } $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR - log2stack($sync, $git, $range, $self->{ibx}); + log2stack($sync, $git, $range); } # --is-ancestor requires git 1.8.0+ @@ -759,15 +881,30 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? 
$reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; my $tip = $opt->{ref} || 'HEAD'; - my $git = $self->{ibx}->git; + my $ibx = $self->{ibx}; + local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $git->batch_prepare; + $ibx->git->batch_prepare; my $pr = $opt->{-progress}; - my $sync = { reindex => $opt->{reindex}, -opt => $opt }; + my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); @@ -785,10 +922,10 @@ sub _index_sync { my $lx = reindex_from($sync->{reindex}, $last_commit); my $range = $lx eq '' ? $tip : "$lx..$tip"; $pr->("counting changes\n\t$range ... ") if $pr; - my $stk = prepare_stack($self, $sync, $range); + my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? 
$stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -808,7 +945,7 @@ sub _begin_txn { sub begin_txn_lazy { my ($self) = @_; - $self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn}; + $self->with_umask(\&_begin_txn, $self) if !$self->{txn}; } # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) @@ -836,6 +973,10 @@ sub set_metadata_once { sub _commit_txn { my ($self) = @_; + if (my $eidx = $self->{eidx}) { + $eidx->git->async_wait_all; + $eidx->{transact_bytes} = 0; + } if (my $xdb = $self->{xdb}) { set_metadata_once($self); $xdb->commit_transaction; @@ -846,15 +987,42 @@ sub _commit_txn { sub commit_txn_lazy { my ($self) = @_; delete($self->{txn}) and - $self->{ibx}->with_umask(\&_commit_txn, $self); + $self->with_umask(\&_commit_txn, $self); } -sub worker_done { - my ($self) = @_; - if (need_xapian($self)) { - die "$$ $0 xdb not released\n" if $self->{xdb}; +sub eidx_shard_new { + my ($class, $eidx, $shard) = @_; + my $self = bless { + eidx => $eidx, + xpfx => $eidx->{xpfx}, + indexlevel => $eidx->{indexlevel}, + -skip_docdata => 1, + shard => $shard, + creat => 1, + }, $class; + $self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium'; + $self; +} + +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? 
+ + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); } - die "$$ $0 still in transaction\n" if $self->{txn}; } 1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index f23d23d0..1598faeb 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Internal interface for a single Xapian shard in V2 inboxes. @@ -6,150 +6,82 @@ package PublicInbox::SearchIdxShard; use strict; use v5.10.1; -use parent qw(PublicInbox::SearchIdx); -use IO::Handle (); # autoflush -use PublicInbox::Eml; +use parent qw(PublicInbox::SearchIdx PublicInbox::IPC); +use PublicInbox::OnDestroy; sub new { - my ($class, $v2w, $shard) = @_; + my ($class, $v2w, $shard) = @_; # v2w may be ExtSearchIdx my $ibx = $v2w->{ibx}; - my $self = $class->SUPER::new($ibx, 1, $shard); + my $self = $ibx ? $class->SUPER::new($ibx, 1, $shard) + : $class->eidx_shard_new($v2w, $shard); # create the DB before forking: $self->idx_acquire; $self->set_metadata_once; $self->idx_release; - $self->spawn_worker($v2w, $shard) if $v2w->{parallel}; + if ($v2w->{parallel}) { + local $self->{-v2w_afc} = $v2w; + $self->ipc_worker_spawn("shard[$shard]"); + # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size for + # inputs speeds V2Writable batch imports across 8 cores by + # nearly 20%. 
Since any of our responses are small, make + # the response pipe as small as possible + if ($^O eq 'linux') { + fcntl($self->{-ipc_req}, 1031, 1048576); + fcntl($self->{-ipc_res}, 1031, 4096); + } + } $self; } -sub spawn_worker { - my ($self, $v2w, $shard) = @_; - my ($r, $w); - pipe($r, $w) or die "pipe failed: $!\n"; - $w->autoflush(1); - my $pid = fork; - defined $pid or die "fork failed: $!\n"; - if ($pid == 0) { - my $bnote = $v2w->atfork_child; - close $w or die "failed to close: $!"; - - # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size here - # speeds V2Writable batch imports across 8 cores by nearly 20% - fcntl($r, 1031, 1048576) if $^O eq 'linux'; - - eval { shard_worker_loop($self, $v2w, $r, $shard, $bnote) }; - die "worker $shard died: $@\n" if $@; - die "unexpected MM $self->{mm}" if $self->{mm}; - exit; +sub _worker_done { + my ($self) = @_; + if ($self->need_xapian) { + die "$$ $0 xdb not released\n" if $self->{xdb}; } - $self->{pid} = $pid; - $self->{w} = $w; - close $r or die "failed to close: $!"; + die "$$ $0 still in transaction\n" if $self->{txn}; } -# this reads all the writes to $self->{w} from the parent process -sub shard_worker_loop ($$$$$) { - my ($self, $v2w, $r, $shard, $bnote) = @_; - $0 = "pi-v2-shard[$shard]"; +sub ipc_atfork_child { # called automatically before ipc_worker_loop + my ($self) = @_; + my $v2w = delete $self->{-v2w_afc} or die 'BUG: {-v2w_afc} missing'; + $v2w->atfork_child; # calls ipc_sibling_atfork_child on our siblings + $v2w->{current_info} = "[$self->{shard}]"; # for $SIG{__WARN__} $self->begin_txn_lazy; - while (my $line = readline($r)) { - $v2w->{current_info} = "[$shard] $line"; - if ($line eq "commit\n") { - $self->commit_txn_lazy; - } elsif ($line eq "close\n") { - $self->idx_release; - } elsif ($line eq "barrier\n") { - $self->commit_txn_lazy; - # no need to lock < 512 bytes is atomic under POSIX - print $bnote "barrier $shard\n" or - die "write failed for barrier $!\n"; - } elsif ($line =~ /\AD 
([a-f0-9]{40,}) ([0-9]+)\n\z/s) { - $self->remove_by_oid($1, $2 + 0); - } else { - chomp $line; - # n.b. $mid may contain spaces(!) - my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid) - = split(/ /, $line, 8); - $self->begin_txn_lazy; - my $n = read($r, my $msg, $to_read) or die "read: $!\n"; - $n == $to_read or die "short read: $n != $to_read\n"; - my $mime = PublicInbox::Eml->new(\$msg); - my $smsg = bless { - bytes => $bytes, - num => $num + 0, - blob => $blob, - mid => $mid, - tid => $tid, - ds => $ds, - ts => $ts, - }, 'PublicInbox::Smsg'; - $self->add_message($mime, $smsg); - } - } - $self->worker_done; + # caller must capture this: + PublicInbox::OnDestroy->new($$, \&_worker_done, $self); } -sub index_raw { - my ($self, $msgref, $eml, $smsg) = @_; - if (my $w = $self->{w}) { - # mid must be last, it can contain spaces (but not LF) - print $w join(' ', @$smsg{qw(raw_bytes bytes - num blob ds ts tid mid)}), - "\n", $$msgref or die "failed to write shard $!\n"; - } else { - if ($eml) { - undef $$msgref; - } else { # --xapian-only + --sequential-shard: - $eml = PublicInbox::Eml->new($msgref); - } - $self->begin_txn_lazy; - $self->add_message($eml, $smsg); - } -} - -sub atfork_child { - close $_[0]->{w} or die "failed to close write pipe: $!\n"; +sub index_eml { + my ($self, $eml, $smsg, $eidx_key) = @_; + $smsg->{eidx_key} = $eidx_key if defined $eidx_key; + $self->ipc_do('add_xapian', $eml, $smsg); } -sub shard_barrier { - my ($self) = @_; - if (my $w = $self->{w}) { - print $w "barrier\n" or die "failed to print: $!"; - } else { - $self->commit_txn_lazy; - } +# wait for return to determine when ipc_do('commit_txn_lazy') is done +sub echo { + shift; + "@_"; } -sub shard_commit { +sub idx_close { my ($self) = @_; - if (my $w = $self->{w}) { - print $w "commit\n" or die "failed to write commit: $!"; - } else { - $self->commit_txn_lazy; - } + die "transaction in progress $self\n" if $self->{txn}; + $self->idx_release if $self->{xdb}; } sub shard_close { my 
($self) = @_; - if (my $w = delete $self->{w}) { - my $pid = delete $self->{pid} or die "no process to wait on\n"; - print $w "close\n" or die "failed to write to pid:$pid: $!\n"; - close $w or die "failed to close pipe for pid:$pid: $!\n"; - waitpid($pid, 0) == $pid or die "remote process did not finish"; - $? == 0 or die ref($self)." pid:$pid exited with: $?"; - } else { - die "transaction in progress $self\n" if $self->{txn}; - $self->idx_release if $self->{xdb}; - } + $self->ipc_do('idx_close'); + $self->ipc_worker_stop; } -sub shard_remove { - my ($self, $oid, $num) = @_; - if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child - print $w "D $oid $num\n" or die "failed to write remove $!"; - } else { # same process - $self->remove_by_oid($oid, $num); +sub shard_over_check { + my ($self, $over) = @_; + if ($self->{-ipc_req} && $over->{dbh}) { + # can't send DB handles over IPC + $over = ref($over)->new($over->{dbh}->sqlite_db_filename); } + $self->ipc_do('over_check', $over); } 1; diff --git a/lib/PublicInbox/SearchQuery.pm b/lib/PublicInbox/SearchQuery.pm index 6724ae39..0f360500 100644 --- a/lib/PublicInbox/SearchQuery.pm +++ b/lib/PublicInbox/SearchQuery.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # used by PublicInbox::SearchView diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 60f692b2..8fb3a030 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -42,7 +42,7 @@ sub thread { # We'll trust the client Date: header here instead of the Received: # time since this is for display (and not retrieval) _set_parent(\%id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $rootset = [ grep { !delete($_->{parent}) && $_->visible($ibx) } values %id_table ]; @@ -166,7 +166,7 @@ sub order_children { my %seen = ($cur => 1); # self-referential loop 
prevention my @q = ($cur); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; while (defined($cur = shift @q)) { my $c = $cur->{children}; # The hashref here... diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index c482f1c9..d50d3cf6 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # Displays search results for the web interface @@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::SearchThread; use PublicInbox::SearchQuery; -use PublicInbox::Search qw(mdocid); +use PublicInbox::Search qw(get_pct); my %rmap_inc; sub mbox_results { @@ -30,7 +30,7 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{-inbox}->search or + my $srch = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $x = $q->{x}; @@ -93,9 +93,9 @@ sub mset_summary { my $pad = length("$total"); my $pfx = ' ' x $pad; my $res = \($ctx->{-html_tip}); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - my @nums = @{$ibx->search->mset_to_artnums($mset)}; + my @nums = @{$ibx->isrch->mset_to_artnums($mset)}; my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)}; my ($min, $max); @@ -156,7 +156,7 @@ sub path2inc ($) { sub err_txt { my ($ctx, $err) = @_; - my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/'; + my $u = $ctx->{ibx}->base_url($ctx->{env}) . 
'_/text/help/'; $err =~ s/^\s*Exception:\s*//; # bad word to show users :P $err =~ s!(\S+)!path2inc($1)!sge; $err = ascii_html($err); @@ -201,7 +201,7 @@ sub search_nav_top { } my $A = $q->qs_html(x => 'A', r => undef); $rv .= qq{|Atom feed]}; - if ($ctx->{-inbox}->search->has_threadid) { + if ($ctx->{ibx}->isrch->has_threadid) { $rv .= qq{\n\t\t\tdownload mbox.gz: } . # we set name=z w/o using it since it seems required for # lynx (but works fine for w3m). @@ -276,24 +276,15 @@ sub sort_relevance { } @{$_[0]} ] } -sub get_pct ($) { - # Capped at "99%" since "100%" takes an extra column in the - # thread skeleton view. says the value isn't - # very meaningful, anyways. - my $n = $_[0]->get_percent; - $n > 99 ? 99 : $n; -} - sub mset_thread { my ($ctx, $mset, $q) = @_; - my $ibx = $ctx->{-inbox}; - my $nshard = $ibx->search->{nshard} // 1; - my %pct = map { mdocid($nshard, $_) => get_pct($_) } $mset->items; - my $msgs = $ibx->over->get_all(keys %pct); - $_->{pct} = $pct{$_->{num}} for @$msgs; + my $ibx = $ctx->{ibx}; + my @pct = map { get_pct($_) } $mset->items; + my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset); + my $i = 0; + $_->{pct} = $pct[$i++] for @$msgs; my $r = $q->{r}; if ($r) { # for descriptions in search_nav_bot - my @pct = values %pct; $q->{-min_pct} = min(@pct); $q->{-max_pct} = max(@pct); } @@ -354,7 +345,7 @@ sub ctx_prepare { sub adump { my ($cb, $mset, $q, $ctx) = @_; - $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset); + $ctx->{ids} = $ctx->{ibx}->isrch->mset_to_artnums($mset); $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i); } @@ -363,7 +354,7 @@ sub adump { sub adump_i { my ($ctx) = @_; while (my $num = shift @{$ctx->{ids}}) { - my $smsg = eval { $ctx->{-inbox}->over->get_art($num) } or next; + my $smsg = eval { $ctx->{ibx}->over->get_art($num) } or next; return $smsg; } } diff --git a/lib/PublicInbox/SharedKV.pm b/lib/PublicInbox/SharedKV.pm new file mode 
100644 index 00000000..072c94ca --- /dev/null +++ b/lib/PublicInbox/SharedKV.pm @@ -0,0 +1,154 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ + +# fork()-friendly key-value store. Will be used for making +# augmenting Maildirs and mboxes less expensive, maybe. +# We use flock(2) to avoid SQLite lock problems (busy timeouts, backoff) +package PublicInbox::SharedKV; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock); +use File::Temp qw(tempdir); +use DBI (); +use PublicInbox::Spawn; +use File::Path qw(rmtree); + +sub dbh { + my ($self, $lock) = @_; + $self->{dbh} //= do { + my $f = $self->{filename}; + $lock //= $self->lock_for_scope; + my $dbh = DBI->connect("dbi:SQLite:dbname=$f", '', '', { + AutoCommit => 1, + RaiseError => 1, + PrintError => 0, + sqlite_use_immediate_transaction => 1, + # no sqlite_unicode here, this is for binary data + }); + my $opt = $self->{opt} // {}; + $dbh->do('PRAGMA synchronous = OFF') if !$opt->{fsync}; + $dbh->do('PRAGMA cache_size = '.($opt->{cache_size} || 80000)); + $dbh->do('PRAGMA journal_mode = '. 
+ ($opt->{journal_mode} // 'WAL')); + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS kv ( + k VARBINARY PRIMARY KEY NOT NULL, + v VARBINARY NOT NULL, + UNIQUE (k) +) + + $dbh; + } +} + +sub new { + my ($cls, $dir, $base, $opt) = @_; + my $self = bless { opt => $opt }, $cls; + unless (defined $dir) { + $self->{tmpdir} = $dir = tempdir('skv-XXXXXX', TMPDIR => 1); + $self->{tmpid} = "$$.$self"; + } + -d $dir or mkdir($dir) or die "mkdir($dir): $!"; + $base //= ''; + my $f = $self->{filename} = "$dir/$base.sqlite3"; + $self->{lock_path} = $opt->{lock_path} // "$dir/$base.flock"; + unless (-f $f) { + open my $fh, '+>>', $f or die "failed to open $f: $!"; + PublicInbox::Spawn::nodatacow_fd(fileno($fh)); + } + $self; +} + +sub index_values { + my ($self) = @_; + my $lock = $self->lock_for_scope; + $self->dbh($lock)->do('CREATE INDEX IF NOT EXISTS idx_v ON kv (v)'); +} + +sub set_maybe { + my ($self, $key, $val, $lock) = @_; + $lock //= $self->lock_for_scope; + my $e = $self->{dbh}->prepare_cached(<<'')->execute($key, $val); +INSERT OR IGNORE INTO kv (k,v) VALUES (?, ?) + + $e == 0 ? undef : $e; +} + +# caller calls sth->fetchrow_array +sub each_kv_iter { + my ($self) = @_; + my $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT k,v FROM kv + + $sth->execute; + $sth +} + +sub delete_by_val { + my ($self, $val, $lock) = @_; + $lock //= $self->lock_for_scope; + $self->{dbh}->prepare_cached(<<'')->execute($val) + 0; +DELETE FROM kv WHERE v = ? + +} + +sub replace_values { + my ($self, $oldval, $newval, $lock) = @_; + $lock //= $self->lock_for_scope; + $self->{dbh}->prepare_cached(<<'')->execute($newval, $oldval) + 0; +UPDATE kv SET v = ? WHERE v = ? + +} + +sub set { + my ($self, $key, $val) = @_; + if (defined $val) { + my $e = $self->{dbh}->prepare_cached(<<'')->execute($key, $val); +INSERT OR REPLACE INTO kv (k,v) VALUES (?,?) + + $e == 0 ? undef : $e; + } else { + $self->{dbh}->prepare_cached(<<'')->execute($key); +DELETE FROM kv WHERE k = ? 
+ + } +} + +sub get { + my ($self, $key) = @_; + my $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT v FROM kv WHERE k = ? + + $sth->execute($key); + $sth->fetchrow_array; +} + +sub xchg { + my ($self, $key, $newval, $lock) = @_; + $lock //= $self->lock_for_scope; + my $oldval = get($self, $key); + if (defined $newval) { + set($self, $key, $newval); + } else { + $self->{dbh}->prepare_cached(<<'')->execute($key); +DELETE FROM kv WHERE k = ? + + } + $oldval; +} + +sub count { + my ($self) = @_; + my $sth = $self->{dbh}->prepare_cached(<<''); +SELECT COUNT(k) FROM kv + + $sth->execute; + $sth->fetchrow_array; +} + +sub DESTROY { + my ($self) = @_; + rmtree($self->{tmpdir}) if ($self->{tmpid} // '') eq "$$.$self"; +} + +1; diff --git a/lib/PublicInbox/Sigfd.pm b/lib/PublicInbox/Sigfd.pm index 5d61e630..a4d1b3bb 100644 --- a/lib/PublicInbox/Sigfd.pm +++ b/lib/PublicInbox/Sigfd.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # Wraps a signalfd (or similar) for PublicInbox::DS @@ -6,8 +6,8 @@ package PublicInbox::Sigfd; use strict; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET $SFD_NONBLOCK); -use POSIX qw(:signal_h); +use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET SFD_NONBLOCK); +use POSIX (); use IO::Handle (); # returns a coderef to unblock signals if neither signalfd or kqueue @@ -33,7 +33,7 @@ sub new { } else { return; # wake up every second to check for signals } - if ($flags & $SFD_NONBLOCK) { # it can go into the event loop + if ($flags & SFD_NONBLOCK) { # it can go into the event loop $self->SUPER::new($io, EPOLLIN | EPOLLET); } else { # master main loop $self->{sock} = $io; @@ -63,14 +63,4 @@ sub event_step { while (wait_once($_[0])) {} # non-blocking } -sub sig_setmask { sigprocmask(SIG_SETMASK, @_) or die "sigprocmask: $!" 
} - -sub block_signals () { - my $oldset = POSIX::SigSet->new; - my $newset = POSIX::SigSet->new; - $newset->fillset or die "fillset: $!"; - sig_setmask($newset, $oldset); - $oldset; -} - 1; diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 171e0a00..b4cc2ecb 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # A small/skeleton/slim representation of a message. @@ -12,16 +12,9 @@ use strict; use warnings; use base qw(Exporter); our @EXPORT_OK = qw(subject_normalized); -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -use Time::Local qw(timegm); - -sub get_val ($$) { - my ($doc, $col) = @_; - # sortable_unserialise is defined by PublicInbox::Search::load_xapian() - sortable_unserialise($doc->get_value($col)); -} sub to_doc_data { my ($self) = @_; @@ -61,17 +54,6 @@ sub load_from_data ($$) { ) = split(/\n/, $_[1]); } -sub load_expand { - my ($self, $doc) = @_; - my $data = $doc->get_data or return; - $self->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); - my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); - $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); - load_from_data($self, $data); - $self; -} - sub psgi_cull ($) { my ($self) = @_; @@ -87,7 +69,27 @@ sub psgi_cull ($) { $self; } -# for Import and v1 non-SQLite WWW code paths +sub parse_references ($$$) { + my ($smsg, $hdr, $mids) = @_; + my $refs = references($hdr); + push(@$refs, @$mids) if scalar(@$mids) > 1; + return $refs if scalar(@$refs) == 0; + + # prevent circular references here: + my %seen = ( $smsg->{mid} => 1 ); + my @keep; + foreach my $ref (@$refs) { + if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { + warn "References: <$ref> too long, 
ignoring\n"; + next; + } + $seen{$ref} //= push(@keep, $ref); + } + $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; + \@keep; +} + +# used for v2, Import and v1 non-SQLite WWW code paths sub populate { my ($self, $hdr, $sync) = @_; for my $f (qw(From To Cc Subject)) { @@ -118,9 +120,7 @@ sub populate { $self->{-ts} = [ my @ts = msg_timestamp($hdr, $sync->{cotime}) ]; $self->{ds} //= $ds[0]; # no zone $self->{ts} //= $ts[0]; - - # for v1 users w/o SQLite - $self->{mid} //= eval { mids($hdr)->[0] } // ''; + $self->{mid} //= mids($hdr)->[0]; } # no strftime, that is locale-dependent and not for RFC822 @@ -155,4 +155,17 @@ sub subject_normalized ($) { $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?{bytes} = $_[2] + crlf_adjust($_[1]) } + 1; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 83f7a4ee..1d70975e 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # "Solve" blobs which don't exist in git code repositories by @@ -216,7 +216,7 @@ sub filename_query ($) { sub find_smsgs ($$$) { my ($self, $ibx, $want) = @_; - my $srch = $ibx->search or return; + my $srch = $ibx->isrch or return; my $post = $want->{oid_b} or die 'BUG: no {oid_b}'; $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post"; diff --git a/lib/PublicInbox/Spamcheck.pm b/lib/PublicInbox/Spamcheck.pm index ffebb3cf..d8fa80c8 100644 --- a/lib/PublicInbox/Spamcheck.pm +++ b/lib/PublicInbox/Spamcheck.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # 
Spamchecking used by -watch and -mda tools @@ -7,8 +7,8 @@ use strict; use warnings; sub get { - my ($config, $key, $default) = @_; - my $spamcheck = $config->{$key}; + my ($cfg, $key, $default) = @_; + my $spamcheck = $cfg->{$key}; $spamcheck = $default unless $spamcheck; return if !$spamcheck || $spamcheck eq 'none'; diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm index 3ba2c3c9..d2b6429c 100644 --- a/lib/PublicInbox/Spamcheck/Spamc.pm +++ b/lib/PublicInbox/Spamcheck/Spamc.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Default spam filter class for wrapping spamc(1) diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm index cb16fcf6..376d2190 100644 --- a/lib/PublicInbox/Spawn.pm +++ b/lib/PublicInbox/Spawn.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # This allows vfork to be used for spawning subprocesses if @@ -19,7 +19,7 @@ use strict; use parent qw(Exporter); use Symbol qw(gensym); use PublicInbox::ProcessPipe; -our @EXPORT_OK = qw/which spawn popen_rd nodatacow_dir/; +our @EXPORT_OK = qw(which spawn popen_rd run_die nodatacow_dir); our @RLIMITS = qw(RLIMIT_CPU RLIMIT_CORE RLIMIT_DATA); my $vfork_spawn = <<'VFORK_SPAWN'; @@ -201,12 +201,103 @@ void nodatacow_dir(const char *dir) } SET_NODATACOW +# last choice for script/lei, 1st choice for lei internals +# compatible with PublicInbox::CmdIPC4 +my $fdpass = <<'FDPASS'; +#include +#include +#include + +#if defined(CMSG_SPACE) && defined(CMSG_LEN) +#define SEND_FD_CAPA 6 +#define SEND_FD_SPACE (SEND_FD_CAPA * sizeof(int)) +union my_cmsg { + struct cmsghdr hdr; + char pad[sizeof(struct cmsghdr) + 16 + SEND_FD_SPACE]; +}; + +SV *send_cmd4(PerlIO *s, SV *svfds, SV *data, int flags) +{ + struct msghdr msg = { 0 }; + union my_cmsg cmsg = { 0 }; + STRLEN dlen = 0; + struct iovec iov; + ssize_t sent; + AV 
*fds = (AV *)SvRV(svfds); + I32 i, nfds = av_len(fds) + 1; + int *fdp; + + if (SvOK(data)) { + iov.iov_base = SvPV(data, dlen); + iov.iov_len = dlen; + } + if (!dlen) { /* must be non-zero */ + iov.iov_base = &msg.msg_namelen; /* whatever */ + iov.iov_len = 1; + } + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + if (nfds) { + if (nfds > SEND_FD_CAPA) { + fprintf(stderr, "FIXME: bump SEND_FD_CAPA=%d\n", nfds); + nfds = SEND_FD_CAPA; + } + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(nfds * sizeof(int)); + cmsg.hdr.cmsg_level = SOL_SOCKET; + cmsg.hdr.cmsg_type = SCM_RIGHTS; + cmsg.hdr.cmsg_len = CMSG_LEN(nfds * sizeof(int)); + fdp = (int *)CMSG_DATA(&cmsg.hdr); + for (i = 0; i < nfds; i++) { + SV **fd = av_fetch(fds, i, 0); + *fdp++ = SvIV(*fd); + } + } + sent = sendmsg(PerlIO_fileno(s), &msg, flags); + return sent >= 0 ? newSViv(sent) : &PL_sv_undef; +} + +void recv_cmd4(PerlIO *s, SV *buf, STRLEN n) +{ + union my_cmsg cmsg = { 0 }; + struct msghdr msg = { 0 }; + struct iovec iov; + ssize_t i; + Inline_Stack_Vars; + Inline_Stack_Reset; + + if (!SvOK(buf)) + sv_setpvn(buf, "", 0); + iov.iov_base = SvGROW(buf, n + 1); + iov.iov_len = n; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(SEND_FD_SPACE); + + i = recvmsg(PerlIO_fileno(s), &msg, 0); + if (i < 0) + Inline_Stack_Push(&PL_sv_undef); + else + SvCUR_set(buf, i); + if (i > 0 && cmsg.hdr.cmsg_level == SOL_SOCKET && + cmsg.hdr.cmsg_type == SCM_RIGHTS) { + size_t len = cmsg.hdr.cmsg_len; + int *fdp = (int *)CMSG_DATA(&cmsg.hdr); + for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= len; i++) + Inline_Stack_Push(sv_2mortal(newSViv(*fdp++))); + } + Inline_Stack_Done; +} +#endif /* defined(CMSG_SPACE) && defined(CMSG_LEN) */ +FDPASS + my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} //= ( $ENV{XDG_CACHE_HOME} // ( ($ENV{HOME} // '/nonexistent').'/.cache' ) ).'/public-inbox/inline-c'; -$set_nodatacow = $vfork_spawn = undef unless -d $inline_dir && -w _; 
+$set_nodatacow = $vfork_spawn = $fdpass = undef unless -d $inline_dir && -w _; if (defined $vfork_spawn) { # Inline 0.64 or later has locking in multi-process env, # but we support 0.5 on Debian wheezy @@ -215,13 +306,14 @@ if (defined $vfork_spawn) { my $f = "$inline_dir/.public-inbox.lock"; open my $fh, '>', $f or die "failed to open $f: $!\n"; flock($fh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; - eval 'use Inline C => $vfork_spawn . $set_nodatacow'; + eval 'use Inline C => $vfork_spawn.$fdpass.$set_nodatacow'; + # . ', BUILD_NOISY => 1'; my $err = $@; my $ndc_err; if ($err && $set_nodatacow) { # missing Linux kernel headers $ndc_err = $err; undef $set_nodatacow; - eval 'use Inline C => $vfork_spawn'; + eval 'use Inline C => $vfork_spawn . $fdpass'; } flock($fh, LOCK_UN) or die "LOCK_UN failed on $f: $!\n"; die $err if $err; @@ -229,7 +321,7 @@ if (defined $vfork_spawn) { }; if ($@) { warn "Inline::C failed for vfork: $@\n"; - $set_nodatacow = $vfork_spawn = undef; + $set_nodatacow = $vfork_spawn = $fdpass = undef; } } @@ -243,8 +335,10 @@ unless ($set_nodatacow) { *nodatacow_fd = \&PublicInbox::NDC_PP::nodatacow_fd; *nodatacow_dir = \&PublicInbox::NDC_PP::nodatacow_dir; } + undef $set_nodatacow; undef $vfork_spawn; +undef $fdpass; sub which ($) { my ($file) = @_; @@ -295,15 +389,22 @@ sub spawn ($;$$) { } sub popen_rd { - my ($cmd, $env, $opts) = @_; + my ($cmd, $env, $opt) = @_; pipe(my ($r, $w)) or die "pipe: $!\n"; - $opts ||= {}; - $opts->{1} = fileno($w); - my $pid = spawn($cmd, $env, $opts); + $opt ||= {}; + $opt->{1} = fileno($w); + my $pid = spawn($cmd, $env, $opt); return ($r, $pid) if wantarray; my $ret = gensym; - tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r; + tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r, @$opt{qw(cb arg)}; $ret; } +sub run_die ($;$$) { + my ($cmd, $env, $rdr) = @_; + my $pid = spawn($cmd, $env, $rdr); + waitpid($pid, 0) == $pid or die "@$cmd did not finish"; + $? 
== 0 or die "@$cmd failed: \$?=$?\n"; +} + 1; diff --git a/lib/PublicInbox/SpawnPP.pm b/lib/PublicInbox/SpawnPP.pm index a72d5a2d..b0ad4da5 100644 --- a/lib/PublicInbox/SpawnPP.pm +++ b/lib/PublicInbox/SpawnPP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # Pure-Perl implementation of "spawn". This can't take advantage diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index e4f00a2a..5ff1d65f 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -5,7 +5,7 @@ # This license differs from the rest of public-inbox # # This module is Copyright (c) 2005 Six Apart, Ltd. -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # # All rights reserved. # @@ -22,7 +22,7 @@ our @EXPORT_OK = qw(epoll_ctl epoll_create epoll_wait EPOLLIN EPOLLOUT EPOLLET EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD EPOLLONESHOT EPOLLEXCLUSIVE - signalfd $SFD_NONBLOCK); + signalfd SFD_NONBLOCK); our %EXPORT_TAGS = (epoll => [qw(epoll_ctl epoll_create epoll_wait EPOLLIN EPOLLOUT EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD @@ -67,7 +67,7 @@ our ( ); my $SFD_CLOEXEC = 02000000; # Perl does not expose O_CLOEXEC -our $SFD_NONBLOCK = O_NONBLOCK; +sub SFD_NONBLOCK () { O_NONBLOCK } our $no_deprecated = 0; if ($^O eq "linux") { @@ -224,41 +224,49 @@ sub epoll_ctl_mod8 { # epoll_wait wrapper # ARGS: (epfd, maxevents, timeout (milliseconds), arrayref) # arrayref: values modified to be [$fd, $event] -our $epoll_wait_events; +our $epoll_wait_events = ''; our $epoll_wait_size = 0; sub epoll_wait_mod4 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 12 x $epoll_wait_size; - } - my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - for (0..$ct-1) { - @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 
12*$_, 8)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec); + for (0..$ct - 1) { + # 12-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 8 bytes: epoll_data_t union (first 4 bytes are the fd) + # So we skip the first 4 bytes and take the middle 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 12 * $_ + 4, 4)); + } } sub epoll_wait_mod8 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 16 x $epoll_wait_size; - } - my $ct; - if ($no_deprecated) { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef); - } else { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - } - for (0..$ct-1) { - # 16 byte epoll_event structs, with format: - # 4 byte mask [idx 1] - # 4 byte padding (we put it into idx 2, useless) - # 8 byte data (first 4 bytes are fd, into idx 0) - @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec, + $no_deprecated ? 
undef : ()); + for (0..$ct - 1) { + # 16-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 4 bytes padding (skipped, useless) + # 8 bytes epoll_data_t union (first 4 bytes are the fd) + # So skip the first 8 bytes, take 4, and ignore the last 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 16 * $_ + 8, 4)); + } } sub signalfd ($$$) { diff --git a/lib/PublicInbox/TLS.pm b/lib/PublicInbox/TLS.pm index 86e6331d..3fe16a62 100644 --- a/lib/PublicInbox/TLS.pm +++ b/lib/PublicInbox/TLS.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # IO::Socket::SSL support code diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index 299b9c6a..16ae2650 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # internal APIs used only for tests @@ -10,8 +10,14 @@ use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD :seek); use POSIX qw(dup2); use IO::Socket::INET; our @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods - run_script start_script key2sub xsys xqx eml_load tick + run_script start_script key2sub xsys xsys_e xqx eml_load tick have_xapian_compact); +BEGIN { + require Test::More; + *BAIL_OUT = \&Test::More::BAIL_OUT; + *plan = \&Test::More::plan; + *skip = \&Test::More::skip; +} sub eml_load ($) { my ($path, $cb) = @_; @@ -38,7 +44,7 @@ sub tcp_server () { Type => Socket::SOCK_STREAM(), Listen => 1024, Blocking => 0, - ) or Test::More::BAIL_OUT("failed to create TCP server: $!"); + ) or BAIL_OUT "failed to create TCP server: $!"; } sub tcp_connect { @@ -49,7 +55,7 @@ sub tcp_connect { Type => Socket::SOCK_STREAM(), PeerAddr => $addr, %opt, - ) or Test::More::BAIL_OUT("failed to connect to $addr: $!"); + ) or BAIL_OUT "failed to connect to $addr: $!"; $s->autoflush(1); $s; } @@ 
-64,8 +70,8 @@ sub require_git ($;$) { my $cur_int = ($cur_maj << 24) | ($cur_min << 16) | ($cur_sub // 0); if ($cur_int < $req_int) { return 0 if $maybe; - Test::More::plan(skip_all => - "git $req+ required, have $cur_maj.$cur_min.$cur_sub"); + plan skip_all => + "git $req+ required, have $cur_maj.$cur_min.$cur_sub"; } 1; } @@ -75,6 +81,10 @@ sub require_mods { my $maybe = pop @mods if $mods[-1] =~ /\A[0-9]+\z/; my @need; while (my $mod = shift(@mods)) { + if ($mod eq 'json') { + $mod = 'Cpanel::JSON::XS||JSON::MaybeXS||'. + 'JSON||JSON::PP' + } if ($mod eq 'Search::Xapian') { if (eval { require PublicInbox::Search } && PublicInbox::Search::load_xapian()) { @@ -109,8 +119,8 @@ sub require_mods { } return unless @need; my $m = join(', ', @need)." missing for $0"; - Test::More::skip($m, $maybe) if $maybe; - Test::More::plan(skip_all => $m) + skip($m, $maybe) if $maybe; + plan(skip_all => $m) } sub key2script ($) { @@ -131,9 +141,9 @@ sub _prepare_redirects ($) { for (my $fd = 0; $fd <= $#io_mode; $fd++) { my $fh = $fhref->[$fd] or next; my ($oldfh, $mode) = @{$io_mode[$fd]}; - open my $orig, $mode, $oldfh or die "$$oldfh $mode stash: $!"; + open my $orig, $mode, $oldfh or die "$oldfh $mode stash: $!"; $orig_io->[$fd] = $orig; - open $oldfh, $mode, $fh or die "$$oldfh $mode redirect: $!"; + open $oldfh, $mode, $fh or die "$oldfh $mode redirect: $!"; } $orig_io; } @@ -164,7 +174,7 @@ sub run_script_exit { die RUN_SCRIPT_EXIT; } -my %cached_scripts; +our %cached_scripts; sub key2sub ($) { my ($key) = @_; $cached_scripts{$key} //= do { @@ -257,6 +267,7 @@ sub run_script ($;$$) { my $orig_io = _prepare_redirects($fhref); _run_sub($sub, $key, \@argv); _undo_redirects($orig_io); + select STDOUT; } # slurp the redirects back into user-supplied strings @@ -318,6 +329,11 @@ sub xsys { $? >> 8 } +sub xsys_e { # like "/bin/sh -e" + xsys(@_) == 0 or + BAIL_OUT (ref $_[0] ? "@{$_[0]}" : "@_"). " failed \$?=$?" 
+} + # like `backtick` or qx{} op, but uses spawn() for env/rdr + vfork sub xqx { my ($cmd, $env, $rdr) = @_; diff --git a/lib/PublicInbox/Tmpfile.pm b/lib/PublicInbox/Tmpfile.pm index 25bb3a52..3040dd77 100644 --- a/lib/PublicInbox/Tmpfile.pm +++ b/lib/PublicInbox/Tmpfile.pm @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ package PublicInbox::Tmpfile; use strict; -use warnings; -use base qw(Exporter); +use v5.10.1; +use parent qw(Exporter); our @EXPORT = qw(tmpfile); use Fcntl qw(:DEFAULT); use Errno qw(EEXIST); @@ -13,6 +13,9 @@ use File::Spec; # unlinked filename which makes sense when viewed with lsof # (at least on Linux) # And if we ever stop caring to have debuggable filenames, O_TMPFILE :) +# +# This is also for Perl <5.32 which lacks: open(..., '+>>', undef) +# sub tmpfile ($;$$) { my ($id, $sock, $append) = @_; if (defined $sock) { diff --git a/lib/PublicInbox/URIimap.pm b/lib/PublicInbox/URIimap.pm index 56b6002a..ab0908b7 100644 --- a/lib/PublicInbox/URIimap.pm +++ b/lib/PublicInbox/URIimap.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # cf. 
RFC 5092, which the `URI' package doesn't support # diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm index 945e7ae7..621a7e0f 100644 --- a/lib/PublicInbox/Unsubscribe.pm +++ b/lib/PublicInbox/Unsubscribe.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors +# Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ # # Standalone PSGI app to handle HTTP(s) unsubscribe links generated @@ -12,7 +12,8 @@ use warnings; use Crypt::CBC; use Plack::Util; use MIME::Base64 qw(decode_base64url); -my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CODE_URL = qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git); my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); sub new { @@ -38,13 +39,15 @@ sub new { my $unsubscribe = $opt{unsubscribe} or die "`unsubscribe' callback not given\n"; + my $code_url = $opt{code_url} || \@CODE_URL; + $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY'; bless { - pi_config => $opt{pi_config}, # PublicInbox::Config + pi_cfg => $opt{pi_config}, # PublicInbox::Config owner_email => $opt{owner_email}, cipher => $cipher, unsubscribe => $unsubscribe, contact => qq($e), - code_url => $opt{code_url} || $CODE_URL, + code_url => $code_url, confirm => $opt{confirm}, }, $class; } @@ -138,7 +141,7 @@ sub r { "$title
".
 		join("\n", "$title\n", @body) . '

'. "
This page is available under AGPL-3.0+\n" .
-		"git clone $self->{code_url}\n" .
+		join('', map { "git clone $_\n" } @{$self->{code_url}}) .
 		qq(Email $self->{contact} if you have any questions).
 		'
' ] ]; @@ -149,9 +152,9 @@ sub archive_info { my $archive_url = $self->{archive_urls}->{$list_addr}; unless ($archive_url) { - if (my $config = $self->{pi_config}) { + if (my $cfg = $self->{pi_cfg}) { # PublicInbox::Config::lookup - my $ibx = $config->lookup($list_addr); + my $ibx = $cfg->lookup($list_addr); # PublicInbox::Inbox::base_url $archive_url = $ibx->base_url if $ibx; } diff --git a/lib/PublicInbox/UserContent.pm b/lib/PublicInbox/UserContent.pm index 789da2f1..b63d0617 100644 --- a/lib/PublicInbox/UserContent.pm +++ b/lib/PublicInbox/UserContent.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors +# Copyright (C) 2019-2021 all contributors # License: AGPL-3.0+ # Self-updating module containing a sample CSS for client-side diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b8abfa94..0104f87a 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # This interface wraps and mimics PublicInbox::Import @@ -16,14 +16,15 @@ use PublicInbox::ContentHash qw(content_hash content_digest); use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; -use PublicInbox::Spawn qw(spawn popen_rd); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size); +use PublicInbox::Spawn qw(spawn popen_rd run_die); +use PublicInbox::Search; +use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush use File::Temp (); my $OID = qr/[a-f0-9]{40,}/; # an estimate of the post-packed size to the raw uncompressed size -my $PACKING_FACTOR = 0.4; +our $PACKING_FACTOR = 0.4; # SATA storage lags behind what CPUs are capable of, so relying on # nproc(1) can be misleading and having extra Xapian shards is a @@ -65,11 +66,15 @@ sub nproc_shards ($) { sub count_shards ($) { my ($self) = @_; - # always load existing 
shards in case core count changes: - # Also, shard count may change while -watch is running - my $srch = $self->{ibx}->search or return 0; - delete $self->{ibx}->{search}; - $srch->{nshard} // 0 + if (my $ibx = $self->{ibx}) { + # always load existing shards in case core count changes: + # Also, shard count may change while -watch is running + my $srch = $ibx->search or return 0; + delete $ibx->{search}; + $srch->{nshard} // 0 + } else { # ExtSearchIdx + $self->{nshard} ||= scalar($self->xdb_shards_flat); + } } sub new { @@ -86,8 +91,6 @@ sub new { die "$dir does not exist\n"; } } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, @@ -117,12 +120,9 @@ sub init_inbox { } $self->idx_init; $self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum; - my $epoch_max = -1; - git_dir_latest($self, \$epoch_max); - if (defined $skip_epoch && $epoch_max == -1) { - $epoch_max = $skip_epoch; - } - $self->git_init($epoch_max >= 0 ? 
$epoch_max : 0); + my $max = $self->{ibx}->max_git_epoch; + $max = $skip_epoch if (defined($skip_epoch) && !defined($max)); + $self->git_init($max // 0); $self->done; } @@ -133,14 +133,20 @@ sub add { $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb); } +sub idx_shard ($$) { + my ($self, $num) = @_; + $self->{idx_shards}->[$num % scalar(@{$self->{idx_shards}})]; +} + # indexes a message, returns true if checkpointing is needed -sub do_idx ($$$$) { - my ($self, $msgref, $mime, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); - $self->{oidx}->add_overview($mime, $smsg); - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); - $idx->index_raw($msgref, $mime, $smsg); - my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; +sub do_idx ($$$) { + my ($self, $eml, $smsg) = @_; + $self->{oidx}->add_overview($eml, $smsg); + if ($self->{-need_xapian}) { + my $idx = idx_shard($self, $smsg->{num}); + $idx->index_eml($eml, $smsg); + } + my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= $self->{batch_bytes}; } @@ -168,8 +174,7 @@ sub _add { $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - my $msgref = delete $smsg->{-raw_email}; - if (do_idx($self, $msgref, $mime, $smsg)) { + if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } @@ -249,11 +254,6 @@ sub v2_num_for_harder { ($num, $mid0); } -sub idx_shard { - my ($self, $shard_i) = @_; - $self->{idx_shards}->[$shard_i]; -} - sub _idx_init { # with_umask callback my ($self, $opt) = @_; $self->lock_acquire unless $opt && $opt->{-skip_lock}; @@ -264,22 +264,33 @@ sub _idx_init { # with_umask callback $self->{shards} = $nshards if $nshards && $nshards != $self->{shards}; $self->{batch_bytes} = $opt->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; - $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; # need to create all shards before initializing msgmap FD # idx_shards must be visible to all forked processes my $max = $self->{shards} - 
1; my $idx = $self->{idx_shards} = []; push @$idx, PublicInbox::SearchIdxShard->new($self, $_) for (0..$max); + $self->{-need_xapian} = $idx->[0]->need_xapian; + + # SearchIdxShard may do their own flushing, so don't scale + # until after forking + $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; + + my $ibx = $self->{ibx} or return; # ExtIdxSearch # Now that all subprocesses are up, we can open the FDs # for SQLite: my $mm = $self->{mm} = PublicInbox::Msgmap->new_file( - "$self->{ibx}->{inboxdir}/msgmap.sqlite3", - $self->{ibx}->{-no_fsync} ? 2 : 1); + "$ibx->{inboxdir}/msgmap.sqlite3", + $ibx->{-no_fsync} ? 2 : 1); $mm->{dbh}->begin_work; } +sub parallel_init ($$) { + my ($self, $indexlevel) = @_; + $self->{parallel} = 0 if ($indexlevel // 'full') eq 'basic'; +} + # idempotent sub idx_init { my ($self, $opt) = @_; @@ -292,17 +303,7 @@ sub idx_init { delete @$ibx{qw(mm search)}; $ibx->git->cleanup; - $self->{parallel} = 0 if ($ibx->{indexlevel}//'') eq 'basic'; - if ($self->{parallel}) { - pipe(my ($r, $w)) or die "pipe failed: $!"; - # pipe for barrier notifications doesn't need to be big, - # 1031: F_SETPIPE_SZ - fcntl($w, 1031, 4096) if $^O eq 'linux'; - $self->{bnote} = [ $r, $w ]; - $w->autoflush(1); - } - - $ibx->umask_prepare; + parallel_init($self, $ibx->{indexlevel}); $ibx->with_umask(\&_idx_init, $self, $opt); } @@ -312,14 +313,10 @@ sub idx_init { sub _replace_oids ($$$) { my ($self, $mime, $replace_map) = @_; $self->done; - my $pfx = "$self->{ibx}->{inboxdir}/git"; + my $ibx = $self->{ibx}; + my $pfx = "$ibx->{inboxdir}/git"; my $rewrites = []; # epoch => commit - my $max = $self->{epoch_max}; - - unless (defined($max)) { - defined(my $latest = git_dir_latest($self, \$max)) or return; - $self->{epoch_max} = $max; - } + my $max = $self->{epoch_max} //= $ibx->max_git_epoch // return; foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; @@ -414,7 +411,7 @@ sub rewrite_internal ($$;$$$) { } else { # ->purge or ->remove 
$self->{mm}->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } } @@ -467,7 +464,7 @@ sub git_hash_raw ($$) { my ($self, $raw) = @_; # grab the expected OID we have to reindex: pipe(my($in, $w)) or die "pipe: $!"; - my $git_dir = $self->{ibx}->git->{git_dir}; + my $git_dir = $self->git->{git_dir}; my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; my $r = popen_rd($cmd, undef, { 0 => $in }); print $w $$raw or die "print \$w: $!"; @@ -531,23 +528,23 @@ W: $list } # make sure we really got the OID: - my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid); + my ($blob, $type, $bytes) = $self->git->check($expect_oid); $blob eq $expect_oid or die "BUG: $expect_oid not found after replace"; # don't leak FDs to Xapian: - $self->{ibx}->git->cleanup; + $self->git->cleanup; # reindex modified messages: for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; $new_smsg->populate($new_mime, $sync); - do_idx($self, \$raw, $new_mime, $new_smsg); + $new_smsg->set_bytes($raw, $bytes); + do_idx($self, $new_mime, $new_smsg); } $rewritten->{rewrites}; } @@ -558,7 +555,7 @@ sub last_epoch_commit ($$;$) { $self->{mm}->last_commit_xap($v, $i, $cmt); } -sub set_last_commits ($) { +sub set_last_commits ($) { # this is NOT for ExtSearchIdx my ($self) = @_; defined(my $epoch_max = $self->{epoch_max}) or return; my $last_commit = $self->{last_commit}; @@ -569,24 +566,6 @@ sub set_last_commits ($) { } } -sub barrier_init { - my ($self, $n) = @_; - $self->{bnote} or return; - --$n; - my $barrier = { map { $_ => 1 } (0..$n) }; -} - -sub barrier_wait { - my ($self, $barrier) = @_; - my $bnote = $self->{bnote} or return; - my $r = $bnote->[0]; - while (scalar keys %$barrier) { - defined(my $l = readline($r)) or die "EOF on barrier_wait: $!"; - $l =~ /\Abarrier 
(\d+)/ or die "bad line on barrier_wait: $l"; - delete $barrier->{$1} or die "bad shard[$1] on barrier wait"; - } -} - # public sub checkpoint ($;$) { my ($self, $wait) = @_; @@ -600,34 +579,54 @@ sub checkpoint ($;$) { } my $shards = $self->{idx_shards}; if ($shards) { - my $dbh = $self->{mm}->{dbh}; + my $mm = $self->{mm}; + my $dbh = $mm->{dbh} if $mm; # SQLite msgmap data is second in importance - $dbh->commit; + $dbh->commit if $dbh; # SQLite overview is third $self->{oidx}->commit_lazy; # Now deal with Xapian - if ($wait) { - my $barrier = $self->barrier_init(scalar @$shards); - # each shard needs to issue a barrier command - $_->shard_barrier for @$shards; + # start commit_txn_lazy asynchronously on all parallel shards + # (non-parallel waits here) + $_->ipc_do('commit_txn_lazy') for @$shards; + + # transactions started on parallel shards, + # wait for them by issuing an echo command (echo can only + # run after commit_txn_lazy is done) + if ($wait && $self->{parallel}) { + my $i = 0; + for my $shard (@$shards) { + my $echo = $shard->ipc_do('echo', $i); + $echo == $i or die <<""; +shard[$i] bad echo:$echo != $i waiting for txn commit + + ++$i; + } + } - # wait for each Xapian shard - $self->barrier_wait($barrier); - } else { - $_->shard_commit for @$shards; + my $midx = $self->{midx}; # misc index + if ($midx) { + $midx->commit_txn; + $PublicInbox::Search::X{CLOEXEC_UNSET} and + $self->git->cleanup; } # last_commit is special, don't commit these until - # remote shards are done: - $dbh->begin_work; + # Xapian shards are done: + $dbh->begin_work if $dbh; set_last_commits($self); - $dbh->commit; - - $dbh->begin_work; + if ($dbh) { + $dbh->commit; + $dbh->begin_work; + } + if ($midx) { + $self->git->batch_prepare; + $midx->begin_txn; + } } $self->{total_bytes} += $self->{transact_bytes}; $self->{transact_bytes} = 0; @@ -667,14 +666,26 @@ sub done { } eval { $self->{oidx}->dbh_close }; $err .= "over close: $@\n" if $@; - delete $self->{bnote}; + delete 
$self->{midx}; my $nbytes = $self->{total_bytes}; $self->{total_bytes} = 0; $self->lock_release(!!$nbytes) if $shards; - $self->{ibx}->git->cleanup; + $self->git->cleanup; die $err if $err; } +sub write_alternates ($$$) { + my ($info_dir, $mode, $out) = @_; + my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); + my $tmp = $fh->filename; + print $fh @$out or die "print $tmp: $!\n"; + chmod($mode, $fh) or die "fchmod $tmp: $!\n"; + close $fh or die "close $tmp $!\n"; + my $alt = "$info_dir/alternates"; + rename($tmp, $alt) or die "rename $tmp => $alt: $!\n"; + $fh->unlink_on_destroy(0); +} + sub fill_alternates ($$) { my ($self, $epoch) = @_; @@ -713,45 +724,20 @@ sub fill_alternates ($$) { } } return unless $new; - - my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); - my $tmp = $fh->filename; - print $fh join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n" - or die "print $tmp: $!\n"; - chmod($mode, $fh) or die "fchmod $tmp: $!\n"; - close $fh or die "close $tmp $!\n"; - rename($tmp, $alt) or die "rename $tmp => $alt: $!\n"; - $fh->unlink_on_destroy(0); + write_alternates($info_dir, $mode, + [join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n"]); } sub git_init { my ($self, $epoch) = @_; my $git_dir = "$self->{ibx}->{inboxdir}/git/$epoch.git"; PublicInbox::Import::init_bare($git_dir); - my @cmd = (qw/git config/, "--file=$git_dir/config", - 'include.path', '../../all.git/config'); - PublicInbox::Import::run_die(\@cmd); + run_die([qw(git config), "--file=$git_dir/config", + qw(include.path ../../all.git/config)]); fill_alternates($self, $epoch); $git_dir } -sub git_dir_latest { - my ($self, $max) = @_; - $$max = -1; - my $pfx = "$self->{ibx}->{inboxdir}/git"; - return unless -d $pfx; - my $latest; - opendir my $dh, $pfx or die "opendir $pfx: $!\n"; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! 
or next; - if ($1 > $$max) { - $$max = $1; - $latest = "$pfx/$git_dir"; - } - } - $latest; -} - sub importer { my ($self) = @_; my $im = $self->{im}; @@ -770,7 +756,7 @@ sub importer { } my $epoch = 0; my $max; - my $latest = git_dir_latest($self, \$max); + my $latest = $self->{ibx}->git_dir_latest(\$max); if (defined $latest) { my $git = PublicInbox::Git->new($latest); my $packed_bytes = $git->packed_bytes; @@ -847,43 +833,62 @@ sub content_exists ($$$) { sub atfork_child { my ($self) = @_; - if (my $shards = $self->{idx_shards}) { - $_->atfork_child foreach @$shards; + if (my $older_siblings = $self->{idx_shards}) { + $_->ipc_sibling_atfork_child for @$older_siblings; } if (my $im = $self->{im}) { $im->atfork_child; } - die "unexpected mm" if $self->{mm}; - close $self->{bnote}->[0] or die "close bnote[0]: $!\n"; - $self->{bnote}->[1]; + die "BUG: unexpected mm" if $self->{mm}; } sub reindex_checkpoint ($$) { my ($self, $sync) = @_; - $self->{ibx}->git->cleanup; # *async_wait + $self->git->async_wait_all; + $self->update_last_commit($sync); ${$sync->{need_checkpoint}} = 0; my $mm_tmp = $sync->{mm_tmp}; $mm_tmp->atfork_prepare if $mm_tmp; - $self->done; # release lock + die 'BUG: {im} during reindex' if $self->{im}; + if ($self->{ibx_map} && !$sync->{checkpoint_unlocks}) { + checkpoint($self, 1); # no need to release lock on pure index + } else { + $self->done; # release lock + } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } # allow -watch or -mda to write... 
$self->idx_init($sync->{-opt}); # reacquire lock + if (my $intvl = $sync->{check_intvl}) { # eidx + $sync->{next_check} = PublicInbox::DS::now() + $intvl; + } $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if $size == 0; # purged + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); my $chash = content_hash($eml); - my $self = $arg->{v2w}; if (scalar(@$mids) == 0) { warn "E: $oid has no Message-ID, skipping\n"; @@ -891,12 +896,16 @@ sub index_oid { # cat_async callback } # {unindexed} is unlikely - if ((my $unindexed = $arg->{unindexed}) && scalar(@$mids) == 1) { - $num = delete($unindexed->{$mids->[0]}); + if (my $unindexed = $arg->{unindexed}) { + my $oidbin = pack('H*', $oid); + my $u = $unindexed->{$oidbin}; + ($num, $mid0) = splice(@$u, 0, 2) if $u; if (defined $num) { - $mid0 = $mids->[0]; $self->{mm}->mid_set($num, $mid0); - delete($arg->{unindexed}) if !keys(%$unindexed); + if (scalar(@$u) == 0) { # done with current OID + delete $unindexed->{$oidbin}; + delete($arg->{unindexed}) if !keys(%$unindexed); + } } } if (!defined($num)) { # reuse if reindexing (or duplicates) @@ -941,45 +950,48 @@ sub index_oid { # cat_async callback } ++${$arg->{nr}}; my $smsg = bless { - raw_bytes => $size, num => $num, blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; $smsg->populate($eml, $arg); - if (do_idx($self, $bref, $eml, $smsg)) { + $smsg->set_bytes($$bref, $size); + if (do_idx($self, $eml, 
$smsg)) { ${$arg->{need_checkpoint}} = 1; } + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current -sub update_last_commit ($$$$) { - my ($self, $git, $i, $cmt) = @_; - my $last = last_epoch_commit($self, $i); - if (defined $last && is_ancestor($git, $last, $cmt)) { - my @cmd = (qw(rev-list --count), "$last..$cmt"); - chomp(my $n = $git->qx(@cmd)); +sub update_last_commit { + my ($self, $sync, $stk) = @_; + my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $last = last_epoch_commit($self, $unit->{epoch}); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); return if $n ne '' && $n == 0; } - last_epoch_commit($self, $i, $cmt); + last_epoch_commit($self, $unit->{epoch}, $latest_cmt); } -sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" } - -sub last_commits ($$) { - my ($self, $epoch_max) = @_; +sub last_commits { + my ($self, $sync) = @_; my $heads = []; - for (my $i = $epoch_max; $i >= 0; $i--) { + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { $heads->[$i] = last_epoch_commit($self, $i); } $heads; } # returns a revision range for git-log(1) -sub log_range ($$$$$) { - my ($self, $sync, $git, $i, $tip) = @_; +sub log_range ($$$) { + my ($sync, $unit, $tip) = @_; my $opt = $sync->{-opt}; my $pr = $opt->{-progress} if (($opt->{verbose} || 0) > 1); + my $i = $unit->{epoch}; my $cur = $sync->{ranges}->[$i] or do { $pr->("$i.git indexing all of $tip\n") if $pr; return $tip; # all of it @@ -993,7 +1005,8 @@ sub log_range ($$$$$) { my $range = "$cur..$tip"; $pr->("$i.git checking contiguity... 
") if $pr; - if (is_ancestor($git, $cur, $tip)) { # common case + my $git = $unit->{git}; + if (is_ancestor($sync->{self}->git, $cur, $tip)) { # common case $pr->("OK\n") if $pr; my $n = $git->qx(qw(rev-list --count), $range); chomp($n); @@ -1018,63 +1031,103 @@ Rewritten history? (in $git->{git_dir}) warn "discarding history at $cur\n"; } warn <<""; -reindexing $git->{git_dir} starting at -$range - - $sync->{unindex_range}->{$i} = "$base..$cur"; +reindexing $git->{git_dir} +starting at $range + + # $cur^0 may no longer exist if pruned by git + if ($git->qx(qw(rev-parse -q --verify), "$cur^0")) { + $unit->{unindex_range} = "$base..$cur"; + } elsif ($base && $git->qx(qw(rev-parse -q --verify), $base)) { + $unit->{unindex_range} = "$base.."; + } else { + warn "W: unable to unindex before $range\n"; + } } $range; } -sub sync_prepare ($$$) { - my ($self, $sync, $epoch_max) = @_; +# overridden by ExtSearchIdx +sub artnum_max { $_[0]->{mm}->num_highwater } + +sub sync_prepare ($$) { + my ($self, $sync) = @_; + $sync->{ranges} = sync_ranges($self, $sync); my $pr = $sync->{-opt}->{-progress}; my $regen_max = 0; - my $head = $self->{ibx}->{ref_head} || 'refs/heads/master'; - - # reindex stops at the current heads and we later rerun index_sync - # without {reindex} - my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex}; - - for (my $i = $epoch_max; $i >= 0; $i--) { - my $git_dir = git_dir_n($self, $i); + my $head = $sync->{ibx}->{ref_head} || 'HEAD'; + my $pfx; + if ($pr) { + ($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g); + $pfx //= $sync->{ibx}->{inboxdir}; + } + + my $reindex_heads; + if ($self->{ibx_map}) { + # ExtSearchIdx won't index messages unless they're in + # over.sqlite3 for a given inbox, so don't read beyond + # what's in the per-inbox index. 
+ $reindex_heads = []; + my $v = PublicInbox::Search::SCHEMA_VERSION; + my $mm = $sync->{ibx}->mm; + for my $i (0..$sync->{epoch_max}) { + $reindex_heads->[$i] = $mm->last_commit_xap($v, $i); + } + } elsif ($sync->{reindex}) { # V2 inbox + # reindex stops at the current heads and we later + # rerun index_sync without {reindex} + $reindex_heads = $self->last_commits($sync); + } + if ($sync->{max_size} = $sync->{-opt}->{max_size}) { + $sync->{index_oid} = $self->can('index_oid'); + } + my $git_pfx = "$sync->{ibx}->{inboxdir}/git"; + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { + my $git_dir = "$git_pfx/$i.git"; -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); + my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - - next if $?; # new repo - my $range = log_range($self, $sync, $git, $i, $tip) or next; + my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter - $pr->("$i.git counting $range ... ") if $pr; + $pr->("$pfx $i.git counting $range ... ") if $pr; # Don't bump num_highwater on --reindex by using {D}. # We intentionally do NOT use {D} in the non-reindex case # because we want NNTP article number gaps from unindexed # messages to show up in mirrors, too. $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR - my $stk = log2stack($sync, $git, $range, $self->{ibx}); + my $stk = log2stack($sync, $git, $range); + return 0 if $sync->{quit}; my $nr = $stk ? 
$stk->num_records : 0; $pr->("$nr\n") if $pr; - $sync->{stacks}->[$i] = $stk if $stk; + $unit->{stack} = $stk; # may be undef + unshift @{$sync->{todo}}, $unit; $regen_max += $nr; } + return 0 if $sync->{quit}; # XXX this should not happen unless somebody bypasses checks in # our code and blindly injects "d" file history into git repos if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); - my $arg = { v2w => $self }; - my $all = $self->{ibx}->git; + local $self->{current_info} = 'leftover '; + my $unindex_oid = $self->can('unindex_oid'); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $self->{current_info} = "leftover $oid"; - $all->cat_async($oid, \&unindex_oid, $arg); + my $req = { %$sync, oid => $oid }; + $self->git->cat_async($oid, $unindex_oid, $req); } - $all->cat_async_wait; + $self->git->cat_async_wait; } - if (!$regen_max && !keys(%{$self->{unindex_range}})) { + return 0 if $sync->{quit}; + if (!$regen_max) { $sync->{-regen_fmt} = "%u/?\n"; return 0; } @@ -1085,22 +1138,25 @@ sub sync_prepare ($$$) { $sync->{-regen_fmt} = "% ${pad}u/$regen_max\n"; $sync->{nr} = \(my $nr = 0); return -1 if $sync->{reindex}; - $regen_max + $self->{mm}->num_highwater() || 0; + $regen_max + $self->artnum_max || 0; } -sub unindex_oid_remote ($$$) { +sub unindex_oid_aux ($$$) { my ($self, $oid, $mid) = @_; my @removed = $self->{oidx}->remove_oid($oid, $mid); + return unless $self->{-need_xapian}; for my $num (@removed) { - my $idx = idx_shard($self, $num % $self->{shards}); - $idx->shard_remove($oid, $num); + idx_shard($self, $num)->ipc_do('xdb_remove', $num); } } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - my $self = $sync->{v2w}; - my $unindexed = $sync->{in_unindex} ? 
$sync->{unindexed} : undef; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1116,50 +1172,55 @@ sub unindex_oid ($$;$) { # git->cat_async callback warn "BUG: multiple articles linked to $oid\n", join(',',sort keys %gone), "\n"; } - foreach my $num (keys %gone) { + # reuse (num => mid) mapping in ascending numeric order + for my $num (sort { $a <=> $b } keys %gone) { + $num += 0; if ($unindexed) { my $mid0 = $mm->mid_for($num); - $unindexed->{$mid0} = $num; + my $oidbin = pack('H*', $oid); + push @{$unindexed->{$oidbin}}, $num, $mid0; } $mm->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } +sub git { $_[0]->{ibx}->git } + # this is rare, it only happens when we get discontiguous history in # a mirror because the source used -purge or -edit -sub unindex ($$$$) { - my ($self, $sync, $git, $unindex_range) = @_; - my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num +sub unindex_todo ($$$) { + my ($self, $sync, $unit) = @_; + my $unindex_range = delete($unit->{unindex_range}) // return; + my $unindexed = $sync->{unindexed} //= {}; # $oidbin => [$num, $mid0] my $before = scalar keys %$unindexed; # order does not matter, here: - my @cmd = qw(log --raw -r - --no-notes --no-color --no-abbrev --no-renames); - my $fh = $git->popen(@cmd, $unindex_range); - my $all = $self->{ibx}->git; + my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color + --no-abbrev --no-renames), $unindex_range); local $sync->{in_unindex} = 1; + my $unindex_oid = $self->can('unindex_oid'); while (<$fh>) { /\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next; - $all->cat_async($1, \&unindex_oid, 
$sync); + $self->git->cat_async($1, $unindex_oid, { %$sync, oid => $1 }); } close $fh or die "git log failed: \$?=$?"; - $all->cat_async_wait; + $self->git->cat_async_wait; return unless $sync->{-opt}->{prune}; my $after = scalar keys %$unindexed; return if $before == $after; # ensure any blob can not longer be accessed via dumb HTTP - PublicInbox::Import::run_die(['git', "--git-dir=$git->{git_dir}", + run_die(['git', "--git-dir=$unit->{git}->{git_dir}", qw(-c gc.reflogExpire=now gc --prune=all --quiet)]); } -sub sync_ranges ($$$) { - my ($self, $sync, $epoch_max) = @_; +sub sync_ranges ($$) { + my ($self, $sync) = @_; my $reindex = $sync->{reindex}; - - return last_commits($self, $epoch_max) unless $reindex; + return $self->last_commits($sync) unless $reindex; return [] if ref($reindex) ne 'HASH'; my $ranges = $reindex->{from}; # arrayref; @@ -1171,11 +1232,10 @@ sub sync_ranges ($$$) { sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; - my $self = $smsg->{v2w}; - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); - $smsg->{raw_bytes} = $size; - $idx->index_raw($bref, undef, $smsg); - $self->{transact_bytes} += $size; + my $self = $smsg->{self}; + my $idx = idx_shard($self, $smsg->{num}); + $idx->index_eml(PublicInbox::Eml->new($bref), $smsg); + $self->{transact_bytes} += $smsg->{bytes}; } sub index_xap_step ($$$;$) { @@ -1190,8 +1250,9 @@ sub index_xap_step ($$$;$) { "$beg..$end (% $step)\n"); } for (my $num = $beg; $num <= $end; $num += $step) { + last if $sync->{quit}; my $smsg = $ibx->over->get_art($num) or next; - $smsg->{v2w} = $self; + $smsg->{self} = $self; $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg); if ($self->{transact_bytes} >= $self->{batch_bytes}) { ${$sync->{nr}} = $num; @@ -1200,37 +1261,53 @@ sub index_xap_step ($$$;$) { } } -sub index_epoch ($$$) { - my ($self, $sync, $i) = @_; - - my $git_dir = git_dir_n($self, $i); - -d $git_dir or return; # missing epochs are fine - my $git = 
PublicInbox::Git->new($git_dir); - if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare - unindex($self, $sync, $git, $unindex_range); - } - defined(my $stk = $sync->{stacks}->[$i]) or return; - $sync->{stacks}->[$i] = undef; - my $all = $self->{ibx}->git; - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { - $self->{current_info} = "$i.git $oid"; +sub index_todo ($$$) { + my ($self, $sync, $unit) = @_; + return if $sync->{quit}; + unindex_todo($self, $sync, $unit); + my $stk = delete($unit->{stack}) or return; + my $all = $self->git; + my $index_oid = $self->can('index_oid'); + my $unindex_oid = $self->can('unindex_oid'); + my $pfx; + if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) { + $pfx = "$1 $2"; # v2 + } else { # v1 + ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g); + $pfx //= $unit->{git}->{git_dir}; + } + local $self->{current_info} = "$pfx "; + local $sync->{latest_cmt} = \(my $latest_cmt); + local $sync->{unit} = $unit; + while (my ($f, $at, $ct, $oid, $cmt) = $stk->pop_rec) { + if ($sync->{quit}) { + warn "waiting to quit...\n"; + $all->async_wait_all; + $self->update_last_commit($sync); + return; + } + my $req = { + %$sync, + autime => $at, + cotime => $ct, + oid => $oid, + cur_cmt => $cmt + }; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; if ($sync->{max_size}) { - $all->check_async($oid, \&check_size, $arg); + $all->check_async($oid, \&check_size, $req); } else { - $all->cat_async($oid, \&index_oid, $arg); + $all->cat_async($oid, $index_oid, $req); } } elsif ($f eq 'd') { - $all->cat_async($oid, \&unindex_oid, $sync); + $all->cat_async($oid, $unindex_oid, $req); } if (${$sync->{need_checkpoint}}) { reindex_checkpoint($self, $sync); } } - $all->check_async_wait; - $all->cat_async_wait; - update_last_commit($self, $git, $i, $stk->{latest_cmt}); + $all->async_wait_all; + $self->update_last_commit($sync, $stk); } sub xapian_only { @@ -1243,7 +1320,7 @@ sub xapian_only { $sync //= { 
need_checkpoint => \(my $bool = 0), -opt => $opt, - v2w => $self, + self => $self, nr => \(my $nr = 0), -regen_fmt => "%u/?\n", }; @@ -1251,6 +1328,7 @@ sub xapian_only { if ($seq || !$self->{parallel}) { my $shard_end = $self->{shards} - 1; for my $i (0..$shard_end) { + last if $sync->{quit}; index_xap_step($self, $sync, $art_beg + $i); if ($i != $shard_end) { reindex_checkpoint($self, $sync); @@ -1260,7 +1338,7 @@ sub xapian_only { index_xap_step($self, $sync, $art_beg, 1); } } - $self->{ibx}->git->cat_async_wait; + $self->git->cat_async_wait; $self->done; } @@ -1270,11 +1348,19 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = git_dir_latest($self, \$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } + my $pr = $opt->{-progress}; my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; @@ -1285,13 +1371,18 @@ sub index_sync { $self->{oidx}->rethread_prepare($opt); my $sync = { need_checkpoint => \(my $bool = 0), - unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, -opt => $opt, - v2w => $self, + self => $self, + ibx => $self->{ibx}, + epoch_max => $epoch_max, }; - $sync->{ranges} = sync_ranges($self, $sync, $epoch_max); - if (sync_prepare($self, $sync, $epoch_max)) { + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + + if (sync_prepare($self, $sync)) { # tmp_clone seems to 
fail if inside a transaction, so # we rollback here (because we opened {mm} for reading) # Note: we do NOT rely on DBI transactions for atomicity; @@ -1303,16 +1394,13 @@ sub index_sync { # xapian_only works incrementally w/o --reindex if ($seq && !$opt->{reindex}) { - $art_beg = $sync->{mm_tmp}->max; - $art_beg++ if defined($art_beg); + $art_beg = $sync->{mm_tmp}->max || -1; + $art_beg++; } } - if ($sync->{max_size} = $opt->{max_size}) { - $sync->{index_oid} = \&index_oid; - } # work forwards through history - index_epoch($self, $sync, $_) for (0..$epoch_max); - $self->{oidx}->rethread_done($opt); + index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []}; + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; $self->done; if (my $nr = $sync->{nr}) { @@ -1320,14 +1408,21 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } + my $quit_warn; # deal with Xapian shards sequentially if ($seq && delete($sync->{mm_tmp})) { - $self->{ibx}->{indexlevel} = $idxlevel; - xapian_only($self, $opt, $sync, $art_beg); + if ($sync->{quit}) { + $quit_warn = 1; + } else { + $self->{ibx}->{indexlevel} = $idxlevel; + xapian_only($self, $opt, $sync, $art_beg); + $quit_warn = 1 if $sync->{quit}; + } } # --reindex on the command-line - if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') { + if (!$sync->{quit} && $opt->{reindex} && + !ref($opt->{reindex}) && $idxlevel ne 'basic') { $self->lock_acquire; my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0); if (my $xdb = $s0->idx_acquire) { @@ -1339,12 +1434,16 @@ sub index_sync { } # reindex does not pick up new changes, so we rerun w/o it: - if ($opt->{reindex}) { + if ($opt->{reindex} && !$sync->{quit}) { my %again = %$opt; $sync = undef; delete @again{qw(rethread reindex -skip_lock)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } + warn < +# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # # Used for displaying the HTML web 
interface. @@ -48,7 +48,7 @@ sub msg_page_i { # /$INBOX/$MSGID/ for unindexed v1 inboxes sub no_over_html ($) { my ($ctx) = @_; - my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; # 404 + my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; # 404 my $eml = PublicInbox::Eml->new($bref); $ctx->{mhref} = ''; PublicInbox::WwwStream::init($ctx); @@ -64,7 +64,7 @@ sub no_over_html ($) { sub msg_page { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; my $over = $ctx->{over} = $ibx->over or return no_over_html($ctx); my ($id, $prev); @@ -88,7 +88,7 @@ sub msg_reply ($$) { 'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style'; my $info = ''; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if (my $url = $ibx->{infourl}) { $url = prurl($ctx->{env}, $url); $info = qq(\n List information: $url\n); @@ -421,7 +421,7 @@ sub stream_thread ($$) { sub thread_html { my ($ctx) = @_; my $mid = $ctx->{mid}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); return missing_thread($ctx) if $nr == 0; @@ -554,7 +554,7 @@ EOF sub add_text_body { # callback for each_part my ($p, $ctx) = @_; my $upfx = $ctx->{mhref}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; # $p - from each_part: [ Email::MIME-like, depth, $idx ] my ($part, $depth, $idx) = @$p; @@ -639,7 +639,7 @@ sub add_text_body { # callback for each_part sub _msg_page_prepare_obuf { my ($eml, $ctx) = @_; - my $over = $ctx->{-inbox}->over; + my $over = $ctx->{ibx}->over; my $obfs_ibx = $ctx->{-obfs_ibx}; my $rv = ''; my $mids = mids_for_index($eml); @@ -729,7 +729,7 @@ sub SKEL_EXPAND () { sub thread_skel ($$$) { my ($skel, $ctx, $hdr) = @_; my $mid = mids($hdr)->[0]; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); my $parent = in_reply_to($hdr); $$skel .= "\nThread 
overview: "; @@ -800,7 +800,7 @@ sub _parent_headers { # returns a string buffer sub html_footer { my ($ctx, $hdr) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $upfx = '../'; my $skel; my $rv = '
';
@@ -1072,7 +1072,7 @@ sub acc_topic { # walk_thread callback
 	my ($ctx, $level, $smsg) = @_;
 	my $mid = $smsg->{mid};
 	my $has_blob = $smsg->{blob} // do {
-		if (my $by_mid = $ctx->{-inbox}->smsg_by_mid($mid)) {
+		if (my $by_mid = $ctx->{ibx}->smsg_by_mid($mid)) {
 			%$smsg = (%$smsg, %$by_mid);
 			1;
 		}
@@ -1116,7 +1116,7 @@ sub dump_topics {
 	}
 
 	my @out;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
 
 	# sort by recency, this allows new posts to "bump" old topics...
@@ -1194,7 +1194,7 @@ sub paginate_recent ($$) {
 	$t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1);
 	$t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1);
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $msgs = $ibx->recent($opts, $after, $before);
 	my $nr = scalar @$msgs;
 	if ($nr < $lim && defined($after)) {
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 7ec57d8d..8fe7261f 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # used by PublicInbox::View
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index 87927d5e..702a075d 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # show any VCS object, similar to "git show"
@@ -197,7 +197,7 @@ sub show ($$;$) {
 
 	$ctx->{'log'} = tmpfile("solve.$oid_b");
 	$ctx->{fn} = $fn;
-	my $solver = PublicInbox::SolverGit->new($ctx->{-inbox},
+	my $solver = PublicInbox::SolverGit->new($ctx->{ibx},
 						\&solve_result, $ctx);
 	# PSGI server will call this immediately and give us a callback (-wcb)
 	sub {
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 37f55347..500021d4 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Main web interface for mailing list archives
@@ -32,9 +32,8 @@ our $ATTACH_RE = qr!([0-9][0-9\.]*)-($PublicInbox::Hval::FN)!;
 our $OID_RE = qr![a-f0-9]{7,}!;
 
 sub new {
-	my ($class, $pi_config) = @_;
-	$pi_config ||= PublicInbox::Config->new;
-	bless { pi_config => $pi_config }, $class;
+	my ($class, $pi_cfg) = @_;
+	bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class;
 }
 
 # backwards compatibility, do not use
@@ -169,14 +168,14 @@ sub preload {
 		eval "require PublicInbox::$_;";
 	}
 	if (ref($self)) {
-		my $pi_config = $self->{pi_config};
-		if (defined($pi_config->{'publicinbox.cgitrc'})) {
-			$pi_config->limiter('-cgit');
+		my $pi_cfg = $self->{pi_cfg};
+		if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
+			$pi_cfg->limiter('-cgit');
 		}
 		$self->cgit;
 		$self->stylesheets_prepare($_) for ('', '../', '../../');
 		$self->news_www;
-		$pi_config->each_inbox(\&preload_inbox);
+		$pi_cfg->each_inbox(\&preload_inbox);
 	}
 }
 
@@ -210,9 +209,10 @@ sub news_cgit_fallback ($) {
 # returns undef if valid, array ref response if invalid
 sub invalid_inbox ($$) {
 	my ($ctx, $inbox) = @_;
-	my $ibx = $ctx->{www}->{pi_config}->lookup_name($inbox);
+	my $ibx = $ctx->{www}->{pi_cfg}->lookup_name($inbox) //
+			$ctx->{www}->{pi_cfg}->lookup_ei($inbox);
 	if (defined $ibx) {
-		$ctx->{-inbox} = $ibx;
+		$ctx->{ibx} = $ibx;
 		return;
 	}
 
@@ -230,11 +230,11 @@ sub invalid_inbox_mid {
 	return $ret if $ret;
 
 	my $mid = $ctx->{mid} = uri_unescape($mid_ue);
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) {
 		my ($x2, $x38) = ($1, $2);
 		# this is horrifically wasteful for legacy URLs:
-		my $str = $ctx->{-inbox}->msg_by_path("$x2/$x38") or return;
+		my $str = $ctx->{ibx}->msg_by_path("$x2/$x38") or return;
 		my $s = PublicInbox::Eml->new($str);
 		$mid = PublicInbox::MID::mid_clean($s->header_raw('Message-ID'));
 		return r301($ctx, $inbox, mid_escape($mid));
@@ -285,7 +285,7 @@ sub get_mid_html {
 # /$INBOX/$MESSAGE_ID/t/
 sub get_thread {
 	my ($ctx, $flat) = @_;
-	$ctx->{-inbox}->over or return need($ctx, 'Overview');
+	$ctx->{ibx}->over or return need($ctx, 'Overview');
 	$ctx->{flat} = $flat;
 	require PublicInbox::View;
 	PublicInbox::View::thread_html($ctx);
@@ -338,7 +338,7 @@ EOF
 # especially on older systems.  Stick to zlib since that's what git uses.
 sub get_thread_mbox {
 	my ($ctx, $sfx) = @_;
-	my $over = $ctx->{-inbox}->over or return need($ctx, 'Overview');
+	my $over = $ctx->{ibx}->over or return need($ctx, 'Overview');
 	require PublicInbox::Mbox;
 	PublicInbox::Mbox::thread_mbox($ctx, $over, $sfx);
 }
@@ -347,7 +347,7 @@ sub get_thread_mbox {
 # /$INBOX/$MESSAGE_ID/t.atom		  -> thread as Atom feed
 sub get_thread_atom {
 	my ($ctx) = @_;
-	$ctx->{-inbox}->over or return need($ctx, 'Overview');
+	$ctx->{ibx}->over or return need($ctx, 'Overview');
 	require PublicInbox::Feed;
 	PublicInbox::Feed::generate_thread_atom($ctx);
 }
@@ -412,11 +412,11 @@ sub legacy_redirects {
 
 sub r301 {
 	my ($ctx, $inbox, $mid_ue, $suffix) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	unless ($ibx) {
 		my $r404 = invalid_inbox($ctx, $inbox);
 		return $r404 if $r404;
-		$ibx = $ctx->{-inbox};
+		$ibx = $ctx->{ibx};
 	}
 	my $url = $ibx->base_url($ctx->{env});
 	my $qs = $ctx->{env}->{QUERY_STRING};
@@ -453,7 +453,7 @@ sub msg_page {
 sub serve_git {
 	my ($ctx, $epoch, $path) = @_;
 	my $env = $ctx->{env};
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $git = defined $epoch ? $ibx->git_epoch($epoch) : $ibx->git;
 	$git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404();
 }
@@ -461,7 +461,7 @@ sub serve_git {
 sub mbox_results {
 	my ($ctx) = @_;
 	if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
-		$ctx->{-inbox}->search or return need($ctx, 'search');
+		$ctx->{ibx}->isrch or return need($ctx, 'search');
 		require PublicInbox::SearchView;
 		return PublicInbox::SearchView::mbox_results($ctx);
 	}
@@ -480,18 +480,18 @@ sub news_www {
 	my ($self) = @_;
 	$self->{news_www} ||= do {
 		require PublicInbox::NewsWWW;
-		PublicInbox::NewsWWW->new($self->{pi_config});
+		PublicInbox::NewsWWW->new($self->{pi_cfg});
 	}
 }
 
 sub cgit {
 	my ($self) = @_;
 	$self->{cgit} ||= do {
-		my $pi_config = $self->{pi_config};
+		my $pi_cfg = $self->{pi_cfg};
 
-		if (defined($pi_config->{'publicinbox.cgitrc'})) {
+		if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
 			require PublicInbox::Cgit;
-			PublicInbox::Cgit->new($pi_config);
+			PublicInbox::Cgit->new($pi_cfg);
 		} else {
 			require Plack::Util;
 			Plack::Util::inline_object(call => sub { r404() });
@@ -537,7 +537,7 @@ sub stylesheets_prepare ($$) {
 	} || sub { $_[0] };
 
 	my $css_map = {};
-	my $stylesheets = $self->{pi_config}->{css} || [];
+	my $stylesheets = $self->{pi_cfg}->{css} || [];
 	my $links = [];
 	my $inline_ok = 1;
 
@@ -641,7 +641,7 @@ sub get_css ($$$) {
 	my $css = $css_map->{$key};
 	if (!defined($css) && $key eq 'userContent') {
 		my $env = $ctx->{env};
-		$css = PublicInbox::UserContent::sample($ctx->{-inbox}, $env);
+		$css = PublicInbox::UserContent::sample($ctx->{ibx}, $env);
 	}
 	defined $css or return r404();
 	my $h = [ 'Content-Length', bytes::length($css),
@@ -653,7 +653,7 @@ sub get_css ($$$) {
 sub get_description {
 	my ($ctx, $inbox) = @_;
 	invalid_inbox($ctx, $inbox) || do {
-		my $d = $ctx->{-inbox}->description . "\n";
+		my $d = $ctx->{ibx}->description . "\n";
 		[ 200, [ 'Content-Length', bytes::length($d),
 			'Content-Type', 'text/plain' ], [ $d ] ];
 	};
diff --git a/lib/PublicInbox/WWW.pod b/lib/PublicInbox/WWW.pod
index 30fe602d..276dfc4c 100644
--- a/lib/PublicInbox/WWW.pod
+++ b/lib/PublicInbox/WWW.pod
@@ -47,7 +47,7 @@ and L
 
 =head1 COPYRIGHT
 
-Copyright (C) 2016-2020 all contributors L
+Copyright (C) 2016-2021 all contributors L
 
 License: AGPL-3.0+ L
 
diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index 8bbce929..1de5018d 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # ref: https://cr.yp.to/proto/maildir.html
@@ -41,7 +41,7 @@ sub compile_watchheaders ($) {
 }
 
 sub new {
-	my ($class, $config) = @_;
+	my ($class, $cfg) = @_;
 	my (%mdmap, $spamc);
 	my (%imap, %nntp); # url => [inbox objects] or 'watchspam'
 
@@ -50,7 +50,7 @@ sub new {
 	# indefinitely...
 	foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) {
 		my $k = "$pfx.watchspam";
-		defined(my $dirs = $config->{$k}) or next;
+		defined(my $dirs = $cfg->{$k}) or next;
 		$dirs = PublicInbox::Config::_array($dirs);
 		for my $dir (@$dirs) {
 			my $url;
@@ -69,10 +69,10 @@ sub new {
 
 	my $k = 'publicinboxwatch.spamcheck';
 	my $default = undef;
-	my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default);
+	my $spamcheck = PublicInbox::Spamcheck::get($cfg, $k, $default);
 	$spamcheck = _spamcheck_cb($spamcheck) if $spamcheck;
 
-	$config->each_inbox(sub {
+	$cfg->each_inbox(sub {
 		# need to make all inboxes writable for spam removal:
 		my $ibx = $_[0] = PublicInbox::InboxWritable->new($_[0]);
 
@@ -113,7 +113,7 @@ sub new {
 		spamcheck => $spamcheck,
 		mdmap => \%mdmap,
 		mdre => $mdre,
-		config => $config,
+		pi_cfg => $cfg,
 		imap => scalar keys %imap ? \%imap : undef,
 		nntp => scalar keys %nntp? \%nntp : undef,
 		importers => {},
@@ -175,7 +175,7 @@ sub _remove_spam {
 	$path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return;
 	my $eml = eml_from_path($path) or return;
 	local $SIG{__WARN__} = warn_ignore_cb();
-	$self->{config}->each_inbox(\&remove_eml_i, $self, $eml, $path);
+	$self->{pi_cfg}->each_inbox(\&remove_eml_i, $self, $eml, $path);
 }
 
 sub import_eml ($$$) {
@@ -217,7 +217,7 @@ sub _try_path {
 		warn "unmappable dir: $1\n";
 		return;
 	}
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
 		$warn_cb->($pfx, "path: $path\n", @_);
@@ -316,7 +316,7 @@ sub cfg_bool ($$$) {
 # flesh out common IMAP-specific data structures
 sub imap_common_init ($) {
 	my ($self) = @_;
-	my $cfg = $self->{config};
+	my $cfg = $self->{pi_cfg};
 	my $mic_args = {}; # scheme://authority => Mail:IMAPClient arg
 	for my $url (sort keys %{$self->{imap}}) {
 		my $uri = PublicInbox::URIimap->new($url);
@@ -418,7 +418,7 @@ sub imap_import_msg ($$$$$) {
 		if ($flags =~ /\\Seen\b/) {
 			local $SIG{__WARN__} = warn_ignore_cb();
 			my $eml = PublicInbox::Eml->new($raw);
-			$self->{config}->each_inbox(\&remove_eml_i,
+			$self->{pi_cfg}->each_inbox(\&remove_eml_i,
 						$self, $eml, "$url UID:$uid");
 		}
 	} else {
@@ -467,7 +467,7 @@ sub imap_fetch_all ($$$) {
 	my $key = $req;
 	$key =~ s/\.PEEK//;
 	my ($uids, $batch);
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
 		$batch //= '?';
@@ -583,13 +583,13 @@ sub watch_atfork_child ($) {
 	delete $self->{opendirs};
 	PublicInbox::DS->Reset;
 	%SIG = (%SIG, %{$self->{sig}}, CHLD => 'DEFAULT');
-	PublicInbox::Sigfd::sig_setmask($self->{oldset});
+	PublicInbox::DS::sig_setmask($self->{oldset});
 }
 
 sub watch_atfork_parent ($) {
 	my ($self) = @_;
 	_done_for_now($self);
-	PublicInbox::Sigfd::block_signals();
+	PublicInbox::DS::block_signals();
 }
 
 sub imap_idle_requeue ($) { # DS::add_timer callback
@@ -625,8 +625,11 @@ sub imap_idle_fork ($$) {
 	my ($self, $url_intvl) = @_;
 	my ($url, $intvl) = @$url_intvl;
 	pipe(my ($r, $w)) or die "pipe: $!";
+	my $seed = rand(0xffffffff);
 	defined(my $pid = fork) or die "fork: $!";
 	if ($pid == 0) {
+		srand($seed);
+		eval { Net::SSLeay::randomize() };
 		close $r;
 		watch_atfork_child($self);
 		watch_imap_idle_1($self, $url, $intvl);
@@ -648,7 +651,7 @@ sub event_step {
 				imap_idle_fork($self, $url_intvl);
 			}
 		};
-		PublicInbox::Sigfd::sig_setmask($oldset);
+		PublicInbox::DS::sig_setmask($oldset);
 		die $@ if $@;
 	}
 	fs_scan_step($self) if $self->{mdre};
@@ -704,8 +707,11 @@ sub poll_fetch_fork ($) { # DS::add_timer callback
 	return if $self->{quit};
 	pipe(my ($r, $w)) or die "pipe: $!";
 	my $oldset = watch_atfork_parent($self);
+	my $seed = rand(0xffffffff);
 	my $pid = fork;
 	if (defined($pid) && $pid == 0) {
+		srand($seed);
+		eval { Net::SSLeay::randomize() };
 		close $r;
 		watch_atfork_child($self);
 		if ($urls->[0] =~ m!\Aimaps?://!i) {
@@ -716,7 +722,7 @@ sub poll_fetch_fork ($) { # DS::add_timer callback
 		close $w;
 		_exit(0);
 	}
-	PublicInbox::Sigfd::sig_setmask($oldset);
+	PublicInbox::DS::sig_setmask($oldset);
 	die "fork: $!"  unless defined $pid;
 	$self->{poll_pids}->{$pid} = [ $intvl, $urls ];
 	PublicInbox::EOFpipe->new($r, \&reap, [$pid, \&poll_fetch_reap, $self]);
@@ -775,7 +781,7 @@ sub watch_imap_init ($$) {
 # flesh out common NNTP-specific data structures
 sub nntp_common_init ($) {
 	my ($self) = @_;
-	my $cfg = $self->{config};
+	my $cfg = $self->{pi_cfg};
 	my $nn_args = {}; # scheme://authority => Net::NNTP->new arg
 	for my $url (sort keys %{$self->{nntp}}) {
 		my $sec = uri_section(uri_new($url));
@@ -929,7 +935,7 @@ sub nntp_fetch_all ($$$) {
 	$beg = $l_art + 1;
 
 	warn "I: $url fetching ARTICLE $beg..$end\n";
-	my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
 	my ($err, $art);
 	local $SIG{__WARN__} = sub {
 		my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
@@ -966,7 +972,7 @@ sub nntp_fetch_all ($$$) {
 			}
 		} elsif ($inboxes eq 'watchspam') {
 			my $eml = PublicInbox::Eml->new(\$raw);
-			$self->{config}->each_inbox(\&remove_eml_i,
+			$self->{pi_cfg}->each_inbox(\&remove_eml_i,
 					$self, $eml, "$url ARTICLE $art");
 		} else {
 			die "BUG: destination unknown $inboxes";
diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm
index 2818400e..b90819a2 100644
--- a/lib/PublicInbox/WwwAltId.pm
+++ b/lib/PublicInbox/WwwAltId.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # dumps using the ".dump" command of sqlite3(1)
@@ -30,7 +30,7 @@ sub check_output {
 sub sqldump ($$) {
 	my ($ctx, $altid_pfx) = @_;
 	my $env = $ctx->{env};
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $altid_map = $ibx->altid_map;
 	my $fn = $altid_map->{$altid_pfx};
 	unless (defined $fn) {
diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm
index 388def12..361e61f6 100644
--- a/lib/PublicInbox/WwwAtomStream.pm
+++ b/lib/PublicInbox/WwwAtomStream.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Atom body stream for HTTP responses
@@ -15,7 +15,7 @@ use PublicInbox::MsgTime qw(msg_timestamp);
 
 sub new {
 	my ($class, $ctx, $cb) = @_;
-	$ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env});
+	$ctx->{feed_base_url} = $ctx->{ibx}->base_url($ctx->{env});
 	$ctx->{cb} = $cb || \&PublicInbox::GzipFilter::close;
 	$ctx->{emit_header} = 1;
 	bless $ctx, $class;
@@ -53,7 +53,7 @@ sub getline {
 	my ($self) = @_;
 	my $cb = $self->{cb} or return;
 	while (my $smsg = $cb->($self)) {
-		my $eml = $self->{-inbox}->smsg_eml($smsg) or next;
+		my $eml = $self->{ibx}->smsg_eml($smsg) or next;
 		return $self->translate(feed_entry($self, $smsg, $eml));
 	}
 	delete $self->{cb};
@@ -82,7 +82,7 @@ sub to_uuid ($) {
 
 sub atom_header {
 	my ($ctx, $title) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $base_url = $ctx->{feed_base_url};
 	my $search_q = $ctx->{search_query};
 	my $self_url = $base_url;
@@ -136,10 +136,10 @@ sub feed_entry {
 	$title = title_tag($title);
 
 	my $from = $eml->header('From') // $eml->header('Sender') //
-		$ctx->{-inbox}->{-primary_address};
+		$ctx->{ibx}->{-primary_address};
 	my ($email) = PublicInbox::Address::emails($from);
 	my $name = ascii_html(join(', ', PublicInbox::Address::names($from)));
-	$email = ascii_html($email // $ctx->{-inbox}->{-primary_address});
+	$email = ascii_html($email // $ctx->{ibx}->{-primary_address});
 
 	my $s = delete($ctx->{emit_header}) ? atom_header($ctx, $title) : '';
 	$s .= "$name$email" .
diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm
index 09c66d02..93c43af8 100644
--- a/lib/PublicInbox/WwwAttach.pm
+++ b/lib/PublicInbox/WwwAttach.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # For retrieving attachments from messages in the WWW interface
@@ -16,7 +16,7 @@ sub referer_match ($) {
 	return 1 if $referer eq ''; # no referer is always OK for wget/curl
 
 	# prevent deep-linking from other domains on some browsers (Firefox)
-	# n.b.: $ctx->{-inbox}->base_url($env) with INBOX_URL won't work
+	# n.b.: $ctx->{ibx}->base_url($env) with INBOX_URL won't work
 	# with dillo, we can only match "$url_scheme://$HTTP_HOST/" without
 	# path components
 	my $base_url = $env->{'psgi.url_scheme'} . '://' .
@@ -88,15 +88,15 @@ sub get_attach ($$$) {
 	$ctx->{idx} = $idx;
 	bless $ctx, __PACKAGE__;
 	my $eml;
-	if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) {
+	if ($ctx->{smsg} = $ctx->{ibx}->smsg_by_mid($ctx->{mid})) {
 		return sub { # public-inbox-httpd-only
 			$ctx->{wcb} = $_[0];
 			scan_attach($ctx);
 		} if $ctx->{env}->{'pi-httpd.async'};
 		# generic PSGI:
-		$eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg});
-	} elsif (!$ctx->{-inbox}->over) {
-		if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) {
+		$eml = $ctx->{ibx}->smsg_eml($ctx->{smsg});
+	} elsif (!$ctx->{ibx}->over) {
+		if (my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid})) {
 			$eml = PublicInbox::Eml->new($bref);
 		}
 	}
diff --git a/lib/PublicInbox/WwwHighlight.pm b/lib/PublicInbox/WwwHighlight.pm
index 170bfcaa..6fed2fed 100644
--- a/lib/PublicInbox/WwwHighlight.pm
+++ b/lib/PublicInbox/WwwHighlight.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # Standalone PSGI app to provide syntax highlighting as-a-service
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index bda2761c..d58618cc 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # Provide an HTTP-accessible listing of inboxes.
@@ -44,7 +44,7 @@ sub url_regexp {
 	my ($ctx, $key, $default) = @_;
 	$key //= 'publicInbox.wwwListing';
 	$default //= '404';
-	my $v = $ctx->{www}->{pi_config}->{lc $key} // $default;
+	my $v = $ctx->{www}->{pi_cfg}->{lc $key} // $default;
 again:
 	if ($v eq 'match=domain') {
 		my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME};
@@ -69,8 +69,11 @@ sub hide_key { 'www' }
 sub response {
 	my ($class, $ctx) = @_;
 	bless $ctx, $class;
+	if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) {
+		$ALL->misc->reopen;
+	}
 	my $re = $ctx->url_regexp or return $ctx->psgi_triple;
-	my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_config},
+	my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg},
 						\&list_match_i, $re, $ctx);
 	sub {
 		$ctx->{-wcb} = $_[0]; # HTTP server callback
diff --git a/lib/PublicInbox/WwwStatic.pm b/lib/PublicInbox/WwwStatic.pm
index 051d2e03..29e4819d 100644
--- a/lib/PublicInbox/WwwStatic.pm
+++ b/lib/PublicInbox/WwwStatic.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # This package can either be a PSGI response body for a static file
diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm
index 638f4e27..bcf2ecec 100644
--- a/lib/PublicInbox/WwwStream.pm
+++ b/lib/PublicInbox/WwwStream.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # HTML body stream for which yields getline+close methods for
@@ -12,11 +12,12 @@ our @EXPORT_OK = qw(html_oneshot);
 use bytes (); # length
 use PublicInbox::Hval qw(ascii_html prurl ts2str);
 our $TOR_URL = 'https://www.torproject.org/';
-our $CODE_URL = 'https://public-inbox.org/public-inbox.git';
+our $CODE_URL = [ qw(http://ou63pmih66umazou.onion/public-inbox.git
+	https://public-inbox.org/public-inbox.git) ];
 
 sub base_url ($) {
 	my $ctx = shift;
-	my $base_url = $ctx->{-inbox}->base_url($ctx->{env});
+	my $base_url = $ctx->{ibx}->base_url($ctx->{env});
 	chop $base_url; # no trailing slash for clone
 	$base_url;
 }
@@ -35,7 +36,7 @@ sub async_eml { # for async_blob_cb
 
 sub html_top ($) {
 	my ($ctx) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $desc = ascii_html($ibx->description);
 	my $title = delete($ctx->{-title_html}) // $desc;
 	my $upfx = $ctx->{-upfx} || '';
@@ -54,7 +55,7 @@ sub html_top ($) {
 			qq(color / ).
 			qq(mirror / ).
 			qq(Atom feed);
-	if ($ibx->search) {
+	if ($ibx->isrch) {
 		my $q_val = delete($ctx->{-q_value_html}) // '';
 		$q_val = qq(\nvalue="$q_val") if $q_val ne '';
 		# XXX gross, for SearchView.pm
@@ -78,22 +79,24 @@ sub html_top ($) {
 
 sub coderepos ($) {
 	my ($ctx) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $cr = $ctx->{ibx}->{coderepo} // return ();
+	my $cfg = $ctx->{www}->{pi_cfg};
+	my $upfx = ($ctx->{-upfx} // ''). '../';
 	my @ret;
-	if (defined(my $cr = $ibx->{coderepo})) {
-		my $cfg = $ctx->{www}->{pi_config};
-		my $env = $ctx->{env};
-		for my $cr_name (@$cr) {
-			my $urls = $cfg->{"coderepo.$cr_name.cgiturl"};
-			if ($urls) {
-				$ret[0] //= <{"coderepo.$cr_name.cgiturl"} // next;
+		$ret[0] //= <{env}, $u));
+			$ret[0] .= qq(\n\t$u);
 		}
 	}
-	@ret; # may be empty
+	@ret; # may be empty, this sub is called as an arg for join()
 }
 
 sub code_footer ($) {
@@ -109,7 +112,7 @@ sub _html_end {
 id=mirror>This inbox may be cloned and mirrored by anyone:
 EOF
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $desc = ascii_html($ibx->description);
 
 	my @urls;
@@ -143,10 +146,10 @@ EOF
 	}
 
 	$urls .= "\n" . join('', map { "\tgit clone --mirror $_\n" } @urls);
-	my $addrs = $ibx->{address};
-	$addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
-	my $v = defined $max ? '-V2' : '-V1';
-	$urls .= <{address}) {
+		$addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
+		my $v = defined $max ? '-V2' : '-V1';
+		$urls .= <{-upfx} // '').'_/text/config/raw';
 	$urls .= <{cb} or return;
 	while (defined(my $x = $cb->($ctx))) { # x = smsg or scalar non-ref
 		if (ref($x)) { # smsg
-			my $eml = $ctx->{-inbox}->smsg_eml($x) or next;
+			my $eml = $ctx->{ibx}->smsg_eml($x) or next;
 			$ctx->{smsg} = $x;
 			return $ctx->translate($cb->($ctx, $eml));
 		} else { # scalar
diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm
index 04c9b1c4..817d032c 100644
--- a/lib/PublicInbox/WwwText.pm
+++ b/lib/PublicInbox/WwwText.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # used for displaying help texts and other non-mail content
@@ -49,7 +49,7 @@ sub get_text {
 
 	# enforce trailing slash for "wget -r" compatibility
 	if (!$have_tslash && $code == 200) {
-		my $url = $ctx->{-inbox}->base_url($env);
+		my $url = $ctx->{ibx}->base_url($env);
 		$url .= "_/text/$key/";
 
 		return [ 302, [ 'Content-Type', 'text/plain',
@@ -100,7 +100,7 @@ sub _srch_prefix ($$) {
 
 sub _colors_help ($$) {
 	my ($ctx, $txt) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $env = $ctx->{env};
 	my $base_url = $ibx->base_url($env);
 	$$txt .= "color customization for $base_url\n";
@@ -135,7 +135,7 @@ sub URI_PATH () { '^A-Za-z0-9\-\._~/' }
 # n.b. this is a perfect candidate for memoization
 sub inbox_config ($$$) {
 	my ($ctx, $hdr, $txt) = @_;
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	push @$hdr, 'Content-Disposition', 'inline; filename=inbox.config';
 	my $name = dq_escape($ibx->{name});
 	my $inboxdir = '/path/to/top-level-inbox';
@@ -189,9 +189,9 @@ EOF
 ; line number ranges in `[PATCH]' emails link to /$INBOX_NAME/$OID/s/,
 ; an HTTP endpoint which reconstructs git blobs via git-apply(1).
 EOF
-		my $pi_config = $ctx->{www}->{pi_config};
+		my $pi_cfg = $ctx->{www}->{pi_cfg};
 		for my $cr_name (@$cr) {
-			my $urls = $pi_config->{"coderepo.$cr_name.cgiturl"};
+			my $urls = $pi_cfg->{"coderepo.$cr_name.cgiturl"};
 			my $path = "/path/to/$cr_name";
 			$cr_name = dq_escape($cr_name);
 
@@ -221,7 +221,7 @@ sub _default_text ($$$$) {
 	return inbox_config($ctx, $hdr, $txt) if $key eq 'config';
 	return if $key ne 'help'; # TODO more keys?
 
-	my $ibx = $ctx->{-inbox};
+	my $ibx = $ctx->{ibx};
 	my $base_url = $ibx->base_url($ctx->{env});
 	$$txt .= "public-inbox help for $base_url\n";
 	$$txt .= <search;
+	my $srch = $ibx->isrch;
 	if ($srch) {
 		$$txt .= <
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 package PublicInbox::Xapcmd;
 use strict;
@@ -89,8 +89,10 @@ sub commit_changes ($$$$) {
 
 sub cb_spawn {
 	my ($cb, $args, $opt) = @_; # $cb = cpdb() or compact()
-	defined(my $pid = fork) or die "fork: $!";
+	my $seed = rand(0xffffffff);
+	my $pid = fork // die "fork: $!";
 	return $pid if $pid > 0;
+	srand($seed);
 	$cb->($args, $opt);
 	POSIX::_exit(0);
 }
@@ -109,8 +111,7 @@ sub prepare_reindex ($$$) {
 			$opt->{reindex}->{from} = $lc;
 		}
 	} else { # v2
-		my $max;
-		$im->git_dir_latest(\$max) or return;
+		my $max = $ibx->max_git_epoch // return;
 		my $from = $opt->{reindex}->{from};
 		my $mm = $ibx->mm;
 		my $v = PublicInbox::Search::SCHEMA_VERSION();
@@ -271,7 +272,6 @@ sub run {
 
 	local %SIG = %SIG;
 	setup_signals();
-	$ibx->umask_prepare;
 	$ibx->with_umask(\&_run, $ibx, $cb, $opt);
 }
 
diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h
new file mode 100644
index 00000000..e1f0ef39
--- /dev/null
+++ b/lib/PublicInbox/gcf2_libgit2.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2020-2021 all contributors 
+ * License: AGPL-3.0+ 
+ *
+ * libgit2 for Inline::C
+ * Avoiding Git::Raw since it doesn't guarantee a stable API,
+ * while libgit2 itself seems reasonably stable.
+ */
+#include <git2.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <poll.h>
+
+static void croak_if_err(int rc, const char *msg)
+{
+	if (rc != GIT_OK) {
+		const git_error *e = giterr_last();
+
+		croak("%d %s (%s)", rc, msg, e ? e->message : "unknown");
+	}
+}
+
+SV *new()
+{
+	git_odb *odb;
+	SV *ref, *self;
+	int rc = git_odb_new(&odb);
+	croak_if_err(rc, "git_odb_new");
+
+	ref = newSViv((IV)odb);
+	self = newRV_noinc(ref);
+	sv_bless(self, gv_stashpv("PublicInbox::Gcf2", GV_ADD));
+	SvREADONLY_on(ref);
+
+	return self;
+}
+
+static git_odb *odb_ptr(SV *self)
+{
+	return (git_odb *)SvIV(SvRV(self));
+}
+
+void DESTROY(SV *self)
+{
+	git_odb_free(odb_ptr(self));
+}
+
+/* needs "$GIT_DIR/objects", not $GIT_DIR */
+void add_alternate(SV *self, const char *objects_path)
+{
+	int rc = git_odb_add_disk_alternate(odb_ptr(self), objects_path);
+	croak_if_err(rc, "git_odb_add_disk_alternate");
+}
+
+#define CAPA(v) (sizeof(v) / sizeof((v)[0]))
+
+/*
+ * returns true on success, false on failure
+ * this requires an unabbreviated git OID
+ */
+int cat_oid(SV *self, int fd, SV *oidsv)
+{
+	/*
+	 * adjust when libgit2 gets SHA-256 support, we return the
+	 * same header as git-cat-file --batch "$OID $TYPE $SIZE\n"
+	 */
+	char hdr[GIT_OID_HEXSZ + sizeof(" commit 18446744073709551615")];
+	struct iovec vec[3];
+	size_t nvec = CAPA(vec);
+	git_oid oid;
+	git_odb_object *object = NULL;
+	int rc, err = 0;
+	STRLEN oidlen;
+	char *oidptr = SvPV(oidsv, oidlen);
+
+	/* same trailer as git-cat-file --batch */
+	vec[2].iov_len = 1;
+	vec[2].iov_base = "\n";
+
+	rc = git_oid_fromstrn(&oid, oidptr, oidlen);
+	if (rc == GIT_OK)
+		rc = git_odb_read(&object, odb_ptr(self), &oid);
+	if (rc == GIT_OK) {
+		vec[0].iov_base = hdr;
+		vec[1].iov_base = (void *)git_odb_object_data(object);
+		vec[1].iov_len = git_odb_object_size(object);
+
+		git_oid_nfmt(hdr, GIT_OID_HEXSZ, git_odb_object_id(object));
+		vec[0].iov_len = GIT_OID_HEXSZ +
+				snprintf(hdr + GIT_OID_HEXSZ,
+					sizeof(hdr) - GIT_OID_HEXSZ,
+					" %s %zu\n",
+					git_object_type2string(
+						git_odb_object_type(object)),
+					vec[1].iov_len);
+	} else { /* caller retries */
+		nvec = 0;
+	}
+	while (nvec && !err) {
+		ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec);
+
+		if (w > 0) {
+			size_t done = 0;
+			size_t i;
+
+			for (i = CAPA(vec) - nvec; i < CAPA(vec); i++) {
+				if (w >= vec[i].iov_len) {
+					/* fully written vec */
+					w -= vec[i].iov_len;
+					done++;
+				} else { /* partially written vec */
+					char *p = vec[i].iov_base;
+					vec[i].iov_base = p + w;
+					vec[i].iov_len -= w;
+					break;
+				}
+			}
+			nvec -= done;
+		} else if (w < 0) {
+			err = errno;
+			switch (err) {
+			case EAGAIN: {
+				struct pollfd pfd;
+				pfd.events = POLLOUT;
+				pfd.fd = fd;
+				poll(&pfd, 1, -1);
+			}
+				/* fall-through */
+			case EINTR:
+				err = 0;
+			}
+		} else { /* w == 0 */
+			err = ENOSPC;
+		}
+	}
+	if (object)
+		git_odb_object_free(object);
+	if (err)
+		croak("writev error: %s", strerror(err));
+
+	return rc == GIT_OK;
+}
diff --git a/script/lei b/script/lei
new file mode 100755
index 00000000..006c1180
--- /dev/null
+++ b/script/lei
@@ -0,0 +1,114 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Socket qw(AF_UNIX SOCK_SEQPACKET MSG_EOR pack_sockaddr_un);
+use Errno qw(EINTR ECONNRESET);
+use PublicInbox::CmdIPC4;
+my $narg = 5;
+my ($sock, $pwd);
+my $recv_cmd = PublicInbox::CmdIPC4->can('recv_cmd4');
+my $send_cmd = PublicInbox::CmdIPC4->can('send_cmd4') // do {
+	require PublicInbox::Spawn; # takes ~50ms even if built *sigh*
+	$recv_cmd = PublicInbox::Spawn->can('recv_cmd4');
+	PublicInbox::Spawn->can('send_cmd4');
+};
+
+sub sigchld {
+	my ($sig) = @_;
+	my $flags = $sig ? POSIX::WNOHANG() : 0;
+	while (waitpid(-1, $flags) > 0) {}
+}
+
+sub exec_cmd {
+	my ($fds, $argc, @argv) = @_;
+	my @old = (*STDIN{IO}, *STDOUT{IO}, *STDERR{IO});
+	my @rdr;
+	for my $fd (@$fds) {
+		open(my $tmpfh, '+<&=', $fd) or die "open +<&=$fd: $!";
+		push @rdr, shift(@old), $tmpfh;
+	}
+	require POSIX; # WNOHANG
+	$SIG{CHLD} = \&sigchld;
+	my $pid = fork // die "fork: $!";
+	if ($pid == 0) {
+		my %env = map { split(/=/, $_, 2) } splice(@argv, $argc);
+		while (my ($old_io, $tmpfh) = splice(@rdr, 0, 2)) {
+			open $old_io, '+<&', $tmpfh or die "open +<&=: $!";
+		}
+		%ENV = (%ENV, %env);
+		exec(@argv);
+		die "exec: @argv: $!";
+	}
+}
+
+if ($send_cmd && eval {
+	my $path = do {
+		my $runtime_dir = ($ENV{XDG_RUNTIME_DIR} // '') . '/lei';
+		if ($runtime_dir eq '/lei') {
+			require File::Spec;
+			$runtime_dir = File::Spec->tmpdir."/lei-$<";
+		}
+		unless (-d $runtime_dir) {
+			require File::Path;
+			File::Path::mkpath($runtime_dir, 0, 0700);
+		}
+		"$runtime_dir/$narg.seq.sock";
+	};
+	my $addr = pack_sockaddr_un($path);
+	socket($sock, AF_UNIX, SOCK_SEQPACKET, 0) or die "socket: $!";
+	unless (connect($sock, $addr)) { # start the daemon if not started
+		local $ENV{PERL5LIB} = join(':', @INC);
+		open(my $daemon, '-|', $^X, qw[-MPublicInbox::LEI
+			-E PublicInbox::LEI::lazy_start(@ARGV)],
+			$path, $! + 0, $narg) or die "popen: $!";
+		while (<$daemon>) { warn $_ } # EOF when STDERR is redirected
+		close($daemon) or warn <<"";
+lei-daemon could not start, exited with \$?=$?
+
+		# try connecting again anyways, unlink+bind may be racy
+		connect($sock, $addr) or die <<"";
+connect($path): $! (after attempted daemon start)
+Falling back to (slow) one-shot mode
+
+	}
+	1;
+}) { # (Socket::MsgHdr|Inline::C), $sock, $pwd are all available:
+	open my $dh, '<', '.' or die "open(.) $!";
+	my $buf = join("\0", scalar(@ARGV), @ARGV);
+	while (my ($k, $v) = each %ENV) { $buf .= "\0$k=$v" }
+	$buf .= "\0\0";
+	$send_cmd->($sock, [ 0, 1, 2, fileno($dh) ], $buf, MSG_EOR);
+	my $x_it_code = 0;
+	while (1) {
+		my (@fds) = $recv_cmd->($sock, $buf, 4096 * 33);
+		if (scalar(@fds) == 1 && !defined($fds[0])) {
+			last if $! == ECONNRESET;
+			next if $! == EINTR;
+			die "recvmsg: $!";
+		}
+		last if $buf eq '';
+		if ($buf =~ /\Ax_it ([0-9]+)\z/) {
+			$x_it_code = $1 + 0;
+			last;
+		} elsif ($buf =~ /\Achild_error ([0-9]+)\z/) {
+			$x_it_code = $1 + 0;
+		} elsif ($buf =~ /\Aexec (.+)\z/) {
+			exec_cmd(\@fds, split(/\0/, $1));
+		} else {
+			sigchld();
+			die $buf;
+		}
+	}
+	sigchld();
+	if (my $sig = ($x_it_code & 127)) {
+		kill $sig, $$;
+		sleep;
+	}
+	exit($x_it_code >> 8);
+} else { # for systems lacking Socket::MsgHdr or Inline::C
+	warn $@ if $@;
+	require PublicInbox::LEI;
+	PublicInbox::LEI::oneshot(__PACKAGE__);
+}
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index dfebac1c..ab1d1e5e 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index b61c743f..3c627b79 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
@@ -47,34 +47,21 @@ die $help if (scalar(@ARGV) || $new_dir eq '' || $old_dir eq '');
 die "$new_dir exists\n" if -d $new_dir;
 die "$old_dir not a directory\n" unless -d $old_dir;
 
-require Cwd;
-Cwd->import('abs_path');
+require PublicInbox::Admin;
 require PublicInbox::Config;
 require PublicInbox::InboxWritable;
 
-my $abs = abs_path($old_dir);
-die "failed to resolve $old_dir: $!\n" if (!defined($abs));
-
 my $cfg = PublicInbox::Config->new;
-my $old;
-$cfg->each_inbox(sub {
-	$old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir;
-});
-if ($old) {
-	$old = PublicInbox::InboxWritable->new($old);
-} else {
+my @old = PublicInbox::Admin::resolve_inboxes([$old_dir], undef, $cfg);
+@old > 1 and die "BUG: resolved several inboxes from $old_dir:\n",
+		map { "\t$_->{inboxdir}\n" } @old;
+my $old = PublicInbox::InboxWritable->new($old[0]);
+if (delete $old->{-unconfigured}) {
 	warn "W: $old_dir not configured in " .
 		PublicInbox::Config::default_file() . "\n";
-	$old = PublicInbox::InboxWritable->new({
-		inboxdir => $old_dir,
-		name => 'ignored',
-		-primary_address => 'old@example.com',
-		address => [ 'old@example.com' ],
-	});
 }
 die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2;
 
-require File::Spec;
 require PublicInbox::Admin;
 my $detected = PublicInbox::Admin::detect_indexlevel($old);
 $old->{indexlevel} //= $detected;
@@ -88,12 +75,11 @@ if ($opt->{'index'}) {
 }
 local %ENV = (%$env, %ENV) if $env;
 my $new = { %$old };
-$new->{inboxdir} = File::Spec->canonpath($new_dir);
+$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir);
 $new->{version} = 2;
 $new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} });
 $new->{-no_fsync} = 1 if !$opt->{fsync};
 my $v2w;
-$old->umask_prepare;
 
 sub link_or_copy ($$) {
 	my ($src, $dst) = @_;
diff --git a/script/public-inbox-edit b/script/public-inbox-edit
index a70614fc..1c6c4e4a 100755
--- a/script/public-inbox-edit
+++ b/script/public-inbox-edit
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Used for editing messages in a public-inbox.
@@ -183,7 +183,8 @@ retry_edit:
 	# rename/relink $edit_fn
 	open my $new_fh, '<', $edit_fn or
 		die "can't read edited file ($edit_fn): $!\n";
-	my $new_raw = do { local $/; <$new_fh> };
+	defined(my $new_raw = do { local $/; <$new_fh> }) or die
+		"read $edit_fn: $!\n";
 
 	if (!$opt->{raw}) {
 		# get rid of the From we added
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
new file mode 100644
index 00000000..15ac20eb
--- /dev/null
+++ b/script/public-inbox-extindex
@@ -0,0 +1,81 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+# Basic tool to create a Xapian search index for a public-inbox.
+use strict;
+use v5.10.1;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = < -1, compact => 0, fsync => 1, scan => 1 };
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
+		fsync|sync!
+		indexlevel|index-level|L=s max_size|max-size=s
+		batch_size|batch-size=s
+		gc commit-interval=i watch scan!
+		all help|h))
+	or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+my $cfg = PublicInbox::Config->new;
+my $eidx_dir = shift(@ARGV);
+unless (defined $eidx_dir) {
+	if ($opt->{all} && $cfg->ALL) {
+		$eidx_dir = $cfg->ALL->{topdir};
+	} else {
+		die "E: $help";
+	}
+}
+my @ibxs;
+if ($opt->{gc}) {
+	die "E: inbox paths must not be specified with --gc\n" if @ARGV;
+	die "E: --all not compatible with --gc\n" if $opt->{all};
+	die "E: --watch is not compatible with --gc\n" if $opt->{watch};
+} else {
+	@ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+}
+PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env;
+require PublicInbox::ExtSearchIdx;
+my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
+if ($opt->{gc}) {
+	$eidx->attach_config($cfg);
+	$eidx->eidx_gc($opt);
+} else {
+	if ($opt->{all}) {
+		$eidx->attach_config($cfg);
+	} else {
+		$eidx->attach_inbox($_) for @ibxs;
+	}
+	if ($opt->{watch}) {
+		$cfg = undef; # save memory only after SIGHUP
+		$eidx->eidx_watch($opt);
+	} else {
+		$eidx->eidx_sync($opt);
+	}
+}
diff --git a/script/public-inbox-httpd b/script/public-inbox-httpd
index b8159f3a..b31b896d 100755
--- a/script/public-inbox-httpd
+++ b/script/public-inbox-httpd
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Standalone HTTP server for public-inbox.
@@ -13,6 +13,7 @@ BEGIN {
 	require PublicInbox::HTTP;
 	require PublicInbox::HTTPD;
 }
+
 my %httpds;
 my $app;
 my $refresh = sub {
diff --git a/script/public-inbox-imapd b/script/public-inbox-imapd
index 60f2e6d8..6b755938 100755
--- a/script/public-inbox-imapd
+++ b/script/public-inbox-imapd
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Standalone read-only IMAP server for public-inbox.
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 5dad6ecb..33169bd0 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2015-2020 all contributors 
+# Copyright (C) 2015-2021 all contributors 
 # License: AGPL-3.0+ 
 # Basic tool to create a Xapian search index for a public-inbox.
 # Usage with libeatmydata 
@@ -11,12 +11,13 @@ use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $help = < -1, compact => 0, max_size => undef, fsync => 1 };
+my $opt = {
+	quiet => -1, compact => 0, max_size => undef, fsync => 1,
+	'update-extindex' => [], # ":s@" optional arg sets '' if no arg given
+};
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
 		fsync|sync! xapian_only|xapian-only
 		indexlevel|index-level|L=s max_size|max-size=s
 		batch_size|batch-size=s
 		sequential_shard|seq-shard|sequential-shard
-		skip-docdata all help|h))
+		no-update-extindex update-extindex|E=s@
+		fast-noop|F skip-docdata all help|h))
 	or die $help;
 if ($opt->{help}) { print $help; exit 0 };
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
 if ($opt->{xapian_only} && !$opt->{reindex}) {
 	die "--xapian-only requires --reindex\n";
 }
+if ($opt->{reindex} && delete($opt->{'fast-noop'})) {
+	warn "--fast-noop ignored with --reindex\n";
+}
 
 # require lazily to speed up --help
 require PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-index');
 
 my $cfg = PublicInbox::Config->new; # Config is loaded by Admin
+$opt->{-use_cwd} = 1;
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR $help; exit 1 }
 
+my (@eidx, %eidx_seen);
+my $update_extindex = $opt->{'update-extindex'};
+if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) {
+	# extindex and normal inboxes may have different owners
+	push(@$update_extindex, 'all') if -w $ALL->{topdir};
+}
+@$update_extindex = () if $opt->{'no-update-extindex'};
+if (scalar @$update_extindex) {
+	PublicInbox::Admin::require_or_die('-search');
+	require PublicInbox::ExtSearchIdx;
+}
+for my $ei_name (@$update_extindex) {
+	my $es = $cfg->lookup_ei($ei_name);
+	my $topdir;
+	if (!$es && -d $ei_name) { # allow dirname or config section name
+		$topdir = $ei_name;
+	} elsif ($es) {
+		$topdir = $es->{topdir};
+	} else {
+		die "extindex `$ei_name' not configured or found\n";
+	}
+	my $o = { %$opt };
+	delete $o->{indexlevel} if ($o->{indexlevel}//'') eq 'basic';
+	$eidx_seen{$topdir} //=
+		push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $o));
+}
 my $mods = {};
+my @eidx_unconfigured;
 foreach my $ibx (@ibxs) {
 	# detect_indexlevel may also set $ibx->{-skip_docdata}
 	my $detected = PublicInbox::Admin::detect_indexlevel($ibx);
@@ -62,7 +98,14 @@ foreach my $ibx (@ibxs) {
 	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
 			'full' : $detected);
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
+	if (@eidx && $ibx->{-unconfigured}) {
+		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
+	}
 }
+warn <{compact} = 0 if !$mods->{'Search::Xapian'};
@@ -88,9 +131,21 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean
 EOL
 		$ibx_opt = { %$opt, sequential_shard => $v };
 	}
-	PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	last if $ibx_opt->{quit};
 	if (my $copt = $opt->{compact_opt}) {
 		local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard};
 		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
 	}
+	last if $ibx_opt->{quit};
+	next if $ibx->{-unconfigured} || !$nidx;
+	for my $eidx (@eidx) {
+		$eidx->attach_inbox($ibx);
+	}
+}
+my $pr = $opt->{-progress};
+for my $eidx (@eidx) {
+	$pr->("indexing $eidx->{topdir} ...\n") if $pr;
+	$eidx->eidx_sync($opt);
+	last if $opt->{quit};
 }
diff --git a/script/public-inbox-init b/script/public-inbox-init
index c775eb31..6a867a22 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
@@ -91,7 +91,8 @@ sysopen($lockfh, $lockfile, O_RDWR|O_CREAT|O_EXCL) or do {
 	warn "could not open config file: $lockfile: $!\n";
 	exit(255);
 };
-my $auto_unlink = UnlinkMe->new($lockfile);
+require PublicInbox::OnDestroy;
+my $auto_unlink = PublicInbox::OnDestroy->new($$, sub { unlink $lockfile });
 my ($perm, %seen);
 if (-e $pi_config) {
 	open(my $oh, '<', $pi_config) or die "unable to read $pi_config: $!\n";
@@ -100,11 +101,7 @@ if (-e $pi_config) {
 	defined $perm or die "(f)stat failed on $pi_config: $!\n";
 	chmod($perm & 07777, $fh) or
 		die "(f)chmod failed on future $pi_config: $!\n";
-	my $old;
-	{
-		local $/;
-		$old = <$oh>;
-	}
+	defined(my $old = do { local $/; <$oh> }) or die "read $pi_config: $!\n";
 	print $fh $old or die "failed to write: $!\n";
 	close $oh or die "failed to close $pi_config: $!\n";
 
@@ -138,10 +135,9 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n";
 my $pfx = "publicinbox.$name";
 my @x = (qw/git config/, "--file=$pi_config_tmp");
 
-require File::Spec;
-$inboxdir = File::Spec->canonpath($inboxdir);
+$inboxdir = PublicInbox::Config::rel2abs_collapsed($inboxdir);
+die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0;
 
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
 if (-f "$inboxdir/inbox.lock") {
 	if (!defined $version) {
 		$version = 2;
@@ -186,26 +182,24 @@ if ($skip_docdata) {
 	$ibx->{-skip_docdata} = $skip_docdata;
 }
 $ibx->init_inbox(0, $skip_epoch, $skip_artnum);
-require Cwd;
-my $tmp = Cwd::abs_path($inboxdir);
-defined($tmp) or die "failed to resolve $inboxdir: $!\n";
-$inboxdir = $tmp;
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
 
 # needed for git prior to v2.1.0
 umask(0077) if defined $perm;
 
+require PublicInbox::Spawn;
+PublicInbox::Spawn->import(qw(run_die));
+
 foreach my $addr (@address) {
 	next if $seen{lc($addr)};
-	PublicInbox::Import::run_die([@x, "--add", "$pfx.address", $addr]);
+	run_die([@x, "--add", "$pfx.address", $addr]);
 }
-PublicInbox::Import::run_die([@x, "$pfx.url", $http_url]);
-PublicInbox::Import::run_die([@x, "$pfx.inboxdir", $inboxdir]);
+run_die([@x, "$pfx.url", $http_url]);
+run_die([@x, "$pfx.inboxdir", $inboxdir]);
 
 if (defined($indexlevel)) {
-	PublicInbox::Import::run_die([@x, "$pfx.indexlevel", $indexlevel]);
+	run_die([@x, "$pfx.indexlevel", $indexlevel]);
 }
-PublicInbox::Import::run_die([@x, "$pfx.newsgroup", $ng]) if $ng ne '';
+run_die([@x, "$pfx.newsgroup", $ng]) if $ng ne '';
 
 # needed for git prior to v2.1.0
 if (defined $perm) {
@@ -215,18 +209,4 @@ if (defined $perm) {
 
 rename $pi_config_tmp, $pi_config or
 	die "failed to rename `$pi_config_tmp' to `$pi_config': $!\n";
-$auto_unlink->DESTROY;
-
-package UnlinkMe;
-use strict;
-
-sub new {
-	my ($klass, $file) = @_;
-	bless { file => $file }, $klass;
-}
-
-sub DESTROY {
-	my $f = delete($_[0]->{file});
-	unlink($f) if defined($f);
-}
-1;
+undef $auto_unlink; # trigger ->DESTROY
diff --git a/script/public-inbox-learn b/script/public-inbox-learn
index fb2d86ec..8b8e1b77 100755
--- a/script/public-inbox-learn
+++ b/script/public-inbox-learn
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Used for training spam (via SpamAssassin) and removing messages from a
@@ -36,11 +36,10 @@ if ($train !~ /\A(?:ham|spam|rm)\z/) {
 die "--all only works with `rm'\n" if $opt{all} && $train ne 'rm';
 
 my $spamc = PublicInbox::Spamcheck::Spamc->new;
-my $pi_config = PublicInbox::Config->new;
+my $pi_cfg = PublicInbox::Config->new;
 my $err;
 my $mime = PublicInbox::Eml->new(do{
-	local $/;
-	my $data = <STDIN>;
+	defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
 	$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 
 	if ($train ne 'rm') {
@@ -87,7 +86,7 @@ sub remove_or_add ($$$$) {
 
 # spam is removed from all known inboxes since it is often Bcc:-ed
 if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
-	$pi_config->each_inbox(sub {
+	$pi_cfg->each_inbox(sub {
 		my ($ibx) = @_;
 		$ibx = PublicInbox::InboxWritable->new($ibx);
 		my $im = $ibx->importer(0);
@@ -102,7 +101,7 @@ if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
 	for ($mime->header('Cc'), $mime->header('To')) {
 		foreach my $addr (PublicInbox::Address::emails($_)) {
 			$addr = lc($addr);
-			$dests{$addr} //= $pi_config->lookup($addr) // 0;
+			$dests{$addr} //= $pi_cfg->lookup($addr) // 0;
 		}
 	}
 
@@ -113,7 +112,7 @@ if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
 		next if $seen{"$ibx"}++;
 		remove_or_add($ibx, $train, $mime, $addr);
 	}
-	my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime);
+	my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_cfg, $mime);
 	for my $ibx (@$dests) {
 		next if $seen{"$ibx"}++;
 		remove_or_add($ibx, $train, $mime, $ibx->{-primary_address});
diff --git a/script/public-inbox-mda b/script/public-inbox-mda
index 3ed5abb6..7e2bee92 100755
--- a/script/public-inbox-mda
+++ b/script/public-inbox-mda
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2013-2020 all contributors 
+# Copyright (C) 2013-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Mail delivery agent for public-inbox, run from your MTA upon mail delivery
@@ -42,18 +42,18 @@ my $str = do { local $/; <STDIN> };
 $str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 $ems->prepare(\$str);
 my $eml = PublicInbox::Eml->new(\$str);
-my $config = PublicInbox::Config->new;
+my $cfg = PublicInbox::Config->new;
 my $key = 'publicinboxmda.spamcheck';
 my $default = 'PublicInbox::Spamcheck::Spamc';
-my $spamc = PublicInbox::Spamcheck::get($config, $key, $default);
+my $spamc = PublicInbox::Spamcheck::get($cfg, $key, $default);
 my $dests = [];
 my $recipient = $ENV{ORIGINAL_RECIPIENT};
 if (defined $recipient) {
-	my $ibx = $config->lookup($recipient); # first check
+	my $ibx = $cfg->lookup($recipient); # first check
 	push @$dests, $ibx if $ibx;
 }
 if (!scalar(@$dests)) {
-	$dests = PublicInbox::MDA->inboxes_for_list_id($config, $eml);
+	$dests = PublicInbox::MDA->inboxes_for_list_id($cfg, $eml);
 	if (!scalar(@$dests) && !defined($recipient)) {
 		die "ORIGINAL_RECIPIENT not defined in ENV\n";
 	}
diff --git a/script/public-inbox-nntpd b/script/public-inbox-nntpd
index f42db6fe..9fb0a8d9 100755
--- a/script/public-inbox-nntpd
+++ b/script/public-inbox-nntpd
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2015-2020 all contributors 
+# Copyright (C) 2015-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Standalone NNTP server for public-inbox.
diff --git a/script/public-inbox-purge b/script/public-inbox-purge
index 7bca11ea..59c03150 100755
--- a/script/public-inbox-purge
+++ b/script/public-inbox-purge
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Used for purging messages entirely from a public-inbox.  Currently
@@ -32,7 +32,7 @@ if ($opt->{help}) { print $help; exit 0 };
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt);
 PublicInbox::AdminEdit::check_editable(\@ibxs);
 
-my $data = do { local $/; <STDIN> };
+defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
 $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 my $n_purged = 0;
 
diff --git a/script/public-inbox-watch b/script/public-inbox-watch
index 55183ef2..86349d71 100755
--- a/script/public-inbox-watch
+++ b/script/public-inbox-watch
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 my $help = < \$do_scan, # undocumented, testing only
 	'help|h' => \(my $show_help)) or do { print STDERR $help; exit 1 };
 if ($show_help) { print $help; exit 0 };
-my $oldset = PublicInbox::Sigfd::block_signals();
+my $oldset = PublicInbox::DS::block_signals();
 STDOUT->autoflush(1);
 STDERR->autoflush(1);
 local $0 = $0; # local since this script may be eval-ed
@@ -57,10 +57,10 @@ if ($watch) {
 	# --no-scan is only intended for testing atm, undocumented.
 	PublicInbox::DS::requeue($scan) if $do_scan;
 
-	my $sigfd = PublicInbox::Sigfd->new($sig, $SFD_NONBLOCK);
-	local %SIG = (%SIG, %$sig) if !$sigfd;
+	my $sigfd = PublicInbox::Sigfd->new($sig, SFD_NONBLOCK);
+	local @SIG{keys %$sig} = values(%$sig) unless $sigfd;
 	if (!$sigfd) {
-		PublicInbox::Sigfd::sig_setmask($oldset);
+		PublicInbox::DS::sig_setmask($oldset);
 		PublicInbox::DS->SetLoopTimeout(1000);
 	}
 	$watch->watch($sig, $oldset) while ($watch);
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index 84620175..3c99fde8 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
diff --git a/script/public-inbox.cgi b/script/public-inbox.cgi
index 42ab17c9..3a430d5b 100755
--- a/script/public-inbox.cgi
+++ b/script/public-inbox.cgi
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ or later 
 #
 # Enables using PublicInbox::WWW as a CGI script
diff --git a/scripts/dc-dlvr b/scripts/dc-dlvr
index 90aab73b..935a8312 100755
--- a/scripts/dc-dlvr
+++ b/scripts/dc-dlvr
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Copyright (C) 2008-2020 all contributors 
+# Copyright (C) 2008-2021 all contributors 
 # License: GPL-3.0+ 
 # This is installed as /etc/dc-dcvr on my system
 # to use with postfix main.cf: mailbox_command = /etc/dc-dlvr "$EXTENSION"
diff --git a/scripts/dupe-finder b/scripts/dupe-finder
index deeb0d6f..d9744fcb 100644
--- a/scripts/dupe-finder
+++ b/scripts/dupe-finder
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # ad-hoc tool for finding duplicates, unstable!
diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool
index bdcc605c..d9a35dfd 100755
--- a/scripts/import_slrnspool
+++ b/scripts/import_slrnspool
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2015-2020 all contributors 
+# Copyright (C) 2015-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Incremental (or one-shot) importer of a slrnpull news spool
@@ -22,8 +22,8 @@ $SIG{TERM} = $sighandler;
 my $spool = shift @ARGV or die usage();
 my $recipient = $ENV{ORIGINAL_RECIPIENT};
 defined $recipient or die usage();
-my $config = PublicInbox::Config->new;
-my $ibx = $config->lookup($recipient);
+my $cfg = PublicInbox::Config->new;
+my $ibx = $cfg->lookup($recipient);
 my $git = $ibx->git;
 my $im;
 if ($ibx->version == 2) {
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index d1ce7231..c33e42e4 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/scripts/slrnspool2maildir b/scripts/slrnspool2maildir
index c36de0c9..8e2ba08a 100755
--- a/scripts/slrnspool2maildir
+++ b/scripts/slrnspool2maildir
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2013-2020 all contributors 
+# Copyright (C) 2013-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # One-off script to convert an slrnpull news spool to Maildir
diff --git a/scripts/ssoma-replay b/scripts/ssoma-replay
index 07121423..cfb0fbd9 100755
--- a/scripts/ssoma-replay
+++ b/scripts/ssoma-replay
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2015-2020 all contributors 
+# Copyright (C) 2015-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # A work-in-progress, but one day I hope this script is no longer
diff --git a/scripts/xhdr-num2mid b/scripts/xhdr-num2mid
index 19f5d0e0..3ca33f5d 100755
--- a/scripts/xhdr-num2mid
+++ b/scripts/xhdr-num2mid
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 # Useful for mapping article IDs from existing NNTP servers to MIDs
 use strict;
diff --git a/t/address.t b/t/address.t
index 6f4bff6c..6aa94628 100644
--- a/t/address.t
+++ b/t/address.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -7,26 +7,40 @@ use_ok 'PublicInbox::Address';
 
 sub test_pkg {
 	my ($pkg) = @_;
-	my $emails = \&{"${pkg}::emails"};
-	my $names = \&{"${pkg}::names"};
+	my $emails = $pkg->can('emails');
+	my $names = $pkg->can('names');
+	my $pairs = $pkg->can('pairs');
 
 	is_deeply([qw(e@example.com e@example.org)],
 		[$emails->('User <e@example.com>, e@example.org')],
 		'address extraction works as expected');
 
+	is_deeply($pairs->('User <e@example.com>, e@example.org'),
+			[[qw(User e@example.com)], [undef, 'e@example.org']],
+		"pair extraction works ($pkg)");
+
 	is_deeply(['user@example.com'],
 		[$emails->('<user@example.com (Comment)>')],
 		'comment after domain accepted before >');
+	is_deeply($pairs->('<user@example.com (Comment)>'),
+		[[qw(Comment user@example.com)]], "comment as name ($pkg)");
 
-	my @names = $names->(
-		'User <e@e>, e@e, "John A. Doe" <j@d>, <x@x>, <y@x> (xyz), '.
-		'U Ser <u@x> (do not use)');
+	my $s = 'User <e@e>, e@e, "John A. Doe" <j@d>, <x@x>, <y@x> (xyz), '.
+		'U Ser <u@x> (do not use)';
+	my @names = $names->($s);
 	is_deeply(\@names, ['User', 'e', 'John A. Doe', 'x', 'xyz', 'U Ser'],
 		'name extraction works as expected');
+	is_deeply($pairs->($s), [ [ 'User', 'e@e' ], [ undef, 'e@e' ],
+			[ 'John A. Doe', 'j@d' ], [ undef, 'x@x' ],
+			[ 'xyz', 'y@x' ], [ 'U Ser', 'u@x' ] ],
+		"pairs extraction works for $pkg");
 
 	@names = $names->('"user@example.com" <user@example.com>');
 	is_deeply(['user'], \@names,
 		'address-as-name extraction works as expected');
+	is_deeply($pairs->('"user@example.com" <user@example.com>'),
+		[ [ 'user@example.com', 'user@example.com' ] ],
+		"pairs for $pkg");
 
 	{
 		my $backwards = 'u@example.com (John Q. Public)';
@@ -34,10 +48,17 @@ sub test_pkg {
 		is_deeply(\@names, ['John Q. Public'], 'backwards name OK');
 		my @emails = $emails->($backwards);
 		is_deeply(\@emails, ['u@example.com'], 'backwards emails OK');
+
+		is_deeply($pairs->($backwards),
+			[ [ 'John Q. Public', 'u@example.com' ] ],
+			"backwards pairs $pkg");
 	}
 
-	@names = $names->('"Quote Unneeded" <user@example.com>');
+	$s = '"Quote Unneeded" <user@example.com>';
+	@names = $names->($s);
 	is_deeply(['Quote Unneeded'], \@names, 'extra quotes dropped');
+	is_deeply($pairs->($s), [ [ 'Quote Unneeded', 'user@example.com' ] ],
+		"extra quotes dropped in pairs $pkg");
 
 	my @emails = $emails->('Local User <user>');
 	is_deeply([], \@emails , 'no address for local address');
diff --git a/t/admin.t b/t/admin.t
index c25667b2..fbfcd6d3 100644
--- a/t/admin.t
+++ b/t/admin.t
@@ -1,28 +1,29 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
 use Test::More;
 use PublicInbox::TestCommon;
 use PublicInbox::Import;
-use_ok 'PublicInbox::Admin', qw(resolve_repo_dir);
+use_ok 'PublicInbox::Admin';
 my ($tmpdir, $for_destroy) = tmpdir();
 my $git_dir = "$tmpdir/v1";
 my $v2_dir = "$tmpdir/v2";
 my ($res, $err, $v);
 
 PublicInbox::Import::init_bare($git_dir);
+*resolve_inboxdir = \&PublicInbox::Admin::resolve_inboxdir;
 
 # v1
-is(resolve_repo_dir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
-is(resolve_repo_dir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
+is(resolve_inboxdir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
+is(resolve_inboxdir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
 
 ok(chdir($git_dir), 'chdir GIT_DIR works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
 
 ok(chdir("$git_dir/objects"), 'chdir GIT_DIR/objects works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
-$res = resolve_repo_dir(undef, \$v);
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
+$res = resolve_inboxdir(undef, \$v);
 is($v, 1, 'version 1 detected');
 is($res, $git_dir, 'detects directory along with version');
 
@@ -36,13 +37,13 @@ SKIP: {
 
 	ok(chdir($no_vcs_dir), 'chdir to a non-inbox');
 	open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
-	$res = eval { resolve_repo_dir() };
+	$res = eval { resolve_inboxdir() };
 	open STDERR, '>&', $olderr or die "restore stderr: $!";
 	is($res, undef, 'fails inside non-version-controlled dir');
 
 	ok(chdir($tmpdir), 'back to test-specific $tmpdir');
 	open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
-	$res = eval { resolve_repo_dir($no_vcs_dir) };
+	$res = eval { resolve_inboxdir($no_vcs_dir) };
 	$err = $@;
 	open STDERR, '>&', $olderr or die "restore stderr: $!";
 	is($res, undef, 'fails on non-version-controlled dir');
@@ -66,18 +67,25 @@ SKIP: {
 	PublicInbox::V2Writable->new($ibx, 1)->idx_init;
 
 	ok(-e "$v2_dir/inbox.lock", 'exists');
-	is(resolve_repo_dir($v2_dir), $v2_dir,
-		'resolve_repo_dir works on v2_dir');
-	ok(chdir($v2_dir), 'chdir v2_dir OK');
-	is(resolve_repo_dir(), $v2_dir, 'resolve_repo_dir works inside v2_dir');
-	$res = resolve_repo_dir(undef, \$v);
+	is(resolve_inboxdir($v2_dir), $v2_dir,
+		'resolve_inboxdir works on v2_dir');
+	chdir($v2_dir) or BAIL_OUT "chdir v2_dir: $!";
+	is(resolve_inboxdir(), $v2_dir, 'resolve_inboxdir works inside v2_dir');
+	$res = resolve_inboxdir(undef, \$v);
 	is($v, 2, 'version 2 detected');
 	is($res, $v2_dir, 'detects directory along with version');
 
 	# TODO: should work from inside Xapian dirs, and git dirs, here...
+	PublicInbox::Import::init_bare("$v2_dir/git/0.git");
+	my $objdir = "$v2_dir/git/0.git/objects";
+	is($v2_dir, resolve_inboxdir($objdir, \$v), 'at $objdir');
+	is($v, 2, 'version 2 detected at $objdir');
+	chdir($objdir) or BAIL_OUT "chdir objdir: $!";
+	is(resolve_inboxdir(undef, \$v), $v2_dir, 'inside $objdir');
+	is($v, 2, 'version 2 detected inside $objdir');
 }
 
-chdir '/';
+chdir '/' or BAIL_OUT "chdir: $!";
 
 my @pairs = (
 	'1g' => 1024 ** 3,
diff --git a/t/altid.t b/t/altid.t
index 816f5f5b..0e9da07e 100644
--- a/t/altid.t
+++ b/t/altid.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/altid_v2.t b/t/altid_v2.t
index f04b547b..c6295b2f 100644
--- a/t/altid_v2.t
+++ b/t/altid_v2.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/cgi.t b/t/cgi.t
index 96c627c3..3818b991 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 # FIXME: this test is too slow and most non-CGI-requirements
 # should be moved over to things which use test_psgi
diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl
index dc463ea8..eee8adc2 100644
--- a/t/check-www-inbox.perl
+++ b/t/check-www-inbox.perl
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 # Parallel WWW checker
 my $usage = "$0 [-j JOBS] [-s SLOW_THRESHOLD] URL_OF_INBOX\n";
diff --git a/t/cmd_ipc.t b/t/cmd_ipc.t
new file mode 100644
index 00000000..84f8fb4d
--- /dev/null
+++ b/t/cmd_ipc.t
@@ -0,0 +1,130 @@
+#!perl -w
+# Copyright (C) 2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use Socket qw(AF_UNIX SOCK_STREAM MSG_EOR);
+pipe(my ($r, $w)) or BAIL_OUT;
+my ($send, $recv);
+require_ok 'PublicInbox::Spawn';
+my $SOCK_SEQPACKET = eval { Socket::SOCK_SEQPACKET() } // undef;
+use Time::HiRes qw(alarm);
+
+my $do_test = sub { SKIP: {
+	my ($type, $flag, $desc) = @_;
+	defined $type or skip 'SOCK_SEQPACKET missing', 7;
+	my ($s1, $s2);
+	my $src = 'some payload' x 40;
+	socketpair($s1, $s2, AF_UNIX, $type, 0) or BAIL_OUT $!;
+	my $sfds = [ fileno($r), fileno($w), fileno($s1) ];
+	$send->($s1, $sfds, $src, $flag);
+	my (@fds) = $recv->($s2, my $buf, length($src) + 1);
+	is($buf, $src, 'got buffer payload '.$desc);
+	my ($r1, $w1, $s1a);
+	my $opens = sub {
+		ok(open($r1, '<&=', $fds[0]), 'opened received $r');
+		ok(open($w1, '>&=', $fds[1]), 'opened received $w');
+		ok(open($s1a, '+>&=', $fds[2]), 'opened received $s1');
+	};
+	$opens->();
+	my @exp = stat $r;
+	my @cur = stat $r1;
+	is("$exp[0]\0$exp[1]", "$cur[0]\0$cur[1]", '$r dev/ino matches');
+	@exp = stat $w;
+	@cur = stat $w1;
+	is("$exp[0]\0$exp[1]", "$cur[0]\0$cur[1]", '$w dev/ino matches');
+	@exp = stat $s1;
+	@cur = stat $s1a;
+	is("$exp[0]\0$exp[1]", "$cur[0]\0$cur[1]", '$s1 dev/ino matches');
+	if (defined($SOCK_SEQPACKET) && $type == $SOCK_SEQPACKET) {
+		$r1 = $w1 = $s1a = undef;
+		$src = (',' x 1023) . '-' .('.' x 1024);
+		$send->($s1, $sfds, $src, $flag);
+		(@fds) = $recv->($s2, $buf, 1024);
+		is($buf, (',' x 1023) . '-', 'silently truncated buf');
+		$opens->();
+		$r1 = $w1 = $s1a = undef;
+
+		$s2->blocking(0);
+		@fds = $recv->($s2, $buf, length($src) + 1);
+		ok($!{EAGAIN}, "EAGAIN set by ($desc)");
+		is_deeply(\@fds, [ undef ], "EAGAIN $desc");
+		$s2->blocking(1);
+
+		my $alrm = 0;
+		local $SIG{ALRM} = sub { $alrm++ };
+		alarm(0.001);
+		@fds = $recv->($s2, $buf, length($src) + 1);
+		ok($!{EINTR}, "EINTR set by ($desc)");
+		is_deeply(\@fds, [ undef ], "EINTR $desc");
+		is($alrm, 1, 'SIGALRM hit');
+
+		close $s1;
+		@fds = $recv->($s2, $buf, length($src) + 1);
+		is_deeply(\@fds, [], "no FDs on EOF $desc");
+		is($buf, '', "buffer cleared on EOF ($desc)");
+
+		socketpair($s1, $s2, AF_UNIX, $type, 0) or BAIL_OUT $!;
+		$s1->blocking(0);
+		my $nsent = 0;
+		while (defined(my $n = $send->($s1, $sfds, $src, $flag))) {
+			$nsent += $n;
+			fail "sent 0 bytes" if $n == 0;
+		}
+		ok($!{EAGAIN}, "hit EAGAIN on send $desc");
+		ok($nsent > 0, 'sent some bytes');
+
+		socketpair($s1, $s2, AF_UNIX, $type, 0) or BAIL_OUT $!;
+		is($send->($s1, [], $src, $flag), length($src), 'sent w/o FDs');
+		$buf = 'nope';
+		@fds = $recv->($s2, $buf, length($src));
+		is(scalar(@fds), 0, 'no FDs received');
+		is($buf, $src, 'recv w/o FDs');
+
+		my $nr = 2 * 1024 * 1024;
+		while (1) {
+			vec(my $vec = '', $nr * 8 - 1, 1) = 1;
+			my $n = $send->($s1, [], $vec, $flag);
+			if (defined($n)) {
+				$n == length($vec) or
+					fail "short send: $n != ".length($vec);
+				diag "sent $nr, retrying with more";
+				$nr += 2 * 1024 * 1024;
+			} else {
+				ok($!{EMSGSIZE}, 'got EMSGSIZE');
+				# diag "$nr bytes hits EMSGSIZE";
+				last;
+			}
+		}
+	}
+} };
+
+my $send_ic = PublicInbox::Spawn->can('send_cmd4');
+my $recv_ic = PublicInbox::Spawn->can('recv_cmd4');
+SKIP: {
+	($send_ic && $recv_ic) or skip 'Inline::C not installed/enabled', 12;
+	$send = $send_ic;
+	$recv = $recv_ic;
+	$do_test->(SOCK_STREAM, 0, 'Inline::C stream');
+	$do_test->($SOCK_SEQPACKET, MSG_EOR, 'Inline::C seqpacket');
+}
+
+SKIP: {
+	require_mods('Socket::MsgHdr', 13);
+	require_ok 'PublicInbox::CmdIPC4';
+	$send = PublicInbox::CmdIPC4->can('send_cmd4');
+	$recv = PublicInbox::CmdIPC4->can('recv_cmd4');
+	$do_test->(SOCK_STREAM, 0, 'MsgHdr stream');
+	$do_test->($SOCK_SEQPACKET, MSG_EOR, 'MsgHdr seqpacket');
+	SKIP: {
+		($send_ic && $recv_ic) or
+			skip 'Inline::C not installed/enabled', 12;
+		$recv = $recv_ic;
+		$do_test->(SOCK_STREAM, 0, 'Inline::C -> MsgHdr stream');
+		$do_test->($SOCK_SEQPACKET, 0, 'Inline::C -> MsgHdr seqpacket');
+	}
+}
+
+done_testing;
diff --git a/t/config.t b/t/config.t
index 204fc790..fe684106 100644
--- a/t/config.t
+++ b/t/config.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -41,7 +41,6 @@ my ($tmpdir, $for_destroy) = tmpdir();
 		'url' => [ 'http://example.com/meta' ],
 		-primary_address => 'meta@public-inbox.org',
 		'name' => 'meta',
-		feedmax => 25,
 		-httpbackend_limiter => undef,
 		nntpserver => undef,
 	}, "lookup matches expected output");
@@ -58,7 +57,6 @@ my ($tmpdir, $for_destroy) = tmpdir();
 		'inboxdir' => '/home/pi/test-main.git',
 		'domain' => 'public-inbox.org',
 		'name' => 'test',
-		feedmax => 25,
 		'url' => [ 'http://example.com/test' ],
 		-httpbackend_limiter => undef,
 		nntpserver => undef,
diff --git a/t/config_limiter.t b/t/config_limiter.t
index 0da8903d..8c83aca8 100644
--- a/t/config_limiter.t
+++ b/t/config_limiter.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/content_hash.t b/t/content_hash.t
index 646aab07..3f02b1b3 100644
--- a/t/content_hash.t
+++ b/t/content_hash.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/convert-compact.t b/t/convert-compact.t
index e479476d..cdb9e3f5 100644
--- a/t/convert-compact.t
+++ b/t/convert-compact.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -21,8 +21,8 @@ my $ibx = {
 
 PublicInbox::Import::init_bare($ibx->{inboxdir});
 ok(umask(077), 'set restrictive umask');
-ok(PublicInbox::Import::run_die([qw(git) , "--git-dir=$ibx->{inboxdir}",
-	qw(config core.sharedRepository 0644)]), 'set sharedRepository');
+xsys_e(qw(git) , "--git-dir=$ibx->{inboxdir}",
+	qw(config core.sharedRepository 0644));
 $ibx = PublicInbox::Inbox->new($ibx);
 my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
 my $mime = PublicInbox::Eml->new(<<'EOF');
diff --git a/t/data/message_embed.eml b/t/data/message_embed.eml
index a7aa88ac..95758084 100644
--- a/t/data/message_embed.eml
+++ b/t/data/message_embed.eml
@@ -63,7 +63,7 @@ index 00000000..166baf91
 --- /dev/null
 +++ b/lib/PublicInbox/MailHeader.pm
 @@ -0,0 +1,55 @@
-+# Copyright (C) 2020 all contributors 
++# Copyright (C) 2020-2021 all contributors 
 +# License: AGPL-3.0+ 
 +package PublicInbox::MailHeader;
 +use strict;
diff --git a/t/dir_idle.t b/t/dir_idle.t
index 587599e8..d62eb5a2 100644
--- a/t/dir_idle.t
+++ b/t/dir_idle.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use Test::More;
 use_ok 'PublicInbox::DirIdle';
diff --git a/t/ds-kqxs.t b/t/ds-kqxs.t
index 718567d6..43c71fed 100644
--- a/t/ds-kqxs.t
+++ b/t/ds-kqxs.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # Licensed the same as Danga::Socket (and Perl5)
 # License: GPL-1.0+ or Artistic-1.0-Perl
 #  
diff --git a/t/ds-leak.t b/t/ds-leak.t
index 72bf0379..57d9cd72 100644
--- a/t/ds-leak.t
+++ b/t/ds-leak.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # Licensed the same as Danga::Socket (and Perl5)
 # License: GPL-1.0+ or Artistic-1.0-Perl
 #  
diff --git a/t/ds-poll.t b/t/ds-poll.t
index 3771059b..d8861369 100644
--- a/t/ds-poll.t
+++ b/t/ds-poll.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # Licensed the same as Danga::Socket (and Perl5)
 # License: GPL-1.0+ or Artistic-1.0-Perl
 #  
@@ -16,35 +16,35 @@ pipe($r, $w) or die;
 pipe($x, $y) or die;
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($r), EPOLLIN), 0, 'add EPOLLIN');
 my $events = [];
-my $n = $p->epoll_wait(9, 0, $events);
+$p->epoll_wait(9, 0, $events);
 is_deeply($events, [], 'no events set');
-is($n, 0, 'nothing ready, yet');
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($w), EPOLLOUT|EPOLLONESHOT), 0,
 	'add EPOLLOUT|EPOLLONESHOT');
-$n = $p->epoll_wait(9, -1, $events);
-is($n, 1, 'got POLLOUT event');
-is($events->[0]->[0], fileno($w), '$w ready');
+$p->epoll_wait(9, -1, $events);
+is(scalar(@$events), 1, 'got POLLOUT event');
+is($events->[0], fileno($w), '$w ready');
 
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after oneshot');
+$p->epoll_wait(9, 0, $events);
+is(scalar(@$events), 0, 'nothing ready after oneshot');
 is_deeply($events, [], 'no events set after oneshot');
 
 syswrite($w, '1') == 1 or die;
 for my $t (0..1) {
-	$n = $p->epoll_wait(9, $t, $events);
-	is($events->[0]->[0], fileno($r), "level-trigger POLLIN ready #$t");
-	is($n, 1, "only event ready #$t");
+	$p->epoll_wait(9, $t, $events);
+	is($events->[0], fileno($r), "level-trigger POLLIN ready #$t");
+	is(scalar(@$events), 1, "only event ready #$t");
 }
 syswrite($y, '1') == 1 or die;
 is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($x), EPOLLIN|EPOLLONESHOT), 0,
 	'EPOLLIN|EPOLLONESHOT add');
-is($p->epoll_wait(9, -1, $events), 2, 'epoll_wait has 2 ready');
-my @fds = sort(map { $_->[0] } @$events);
+$p->epoll_wait(9, -1, $events);
+is(scalar @$events, 2, 'epoll_wait has 2 ready');
+my @fds = sort @$events;
 my @exp = sort((fileno($r), fileno($x)));
 is_deeply(\@fds, \@exp, 'got both ready FDs');
 
 is($p->epoll_ctl(EPOLL_CTL_DEL, fileno($r), 0), 0, 'EPOLL_CTL_DEL OK');
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after EPOLL_CTL_DEL');
+$p->epoll_wait(9, 0, $events);
+is(scalar @$events, 0, 'nothing ready after EPOLL_CTL_DEL');
 
 done_testing;
diff --git a/t/edit.t b/t/edit.t
index dbdda394..0d57e629 100644
--- a/t/edit.t
+++ b/t/edit.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 # edit frontend behavior test (t/replace.t for backend)
 use strict;
diff --git a/t/emergency.t b/t/emergency.t
index 74cc1d2e..60dba2ad 100644
--- a/t/emergency.t
+++ b/t/emergency.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/eml.t b/t/eml.t
index 4d1c1216..ebd45c13 100644
--- a/t/eml.t
+++ b/t/eml.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/eml_content_disposition.t b/t/eml_content_disposition.t
index 9bdacc05..099587f8 100644
--- a/t/eml_content_disposition.t
+++ b/t/eml_content_disposition.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
 # This library is free software; you can redistribute it and/or modify
 # it under the same terms as Perl itself.
diff --git a/t/eml_content_type.t b/t/eml_content_type.t
index 5acd51ad..ab8d4b2d 100644
--- a/t/eml_content_type.t
+++ b/t/eml_content_type.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
 # This library is free software; you can redistribute it and/or modify
 # it under the same terms as Perl itself.
diff --git a/t/epoll.t b/t/epoll.t
index b47650e3..a1e73e07 100644
--- a/t/epoll.t
+++ b/t/epoll.t
@@ -12,11 +12,11 @@ is(epoll_ctl($epfd, EPOLL_CTL_ADD, fileno($w), EPOLLOUT), 0,
     'epoll_ctl socket EPOLLOUT');
 
 my @events;
-is(epoll_wait($epfd, 100, 10000, \@events), 1, 'epoll_wait returns');
+epoll_wait($epfd, 100, 10000, \@events);
 is(scalar(@events), 1, 'got one event');
-is($events[0]->[0], fileno($w), 'got expected FD');
-is($events[0]->[1], EPOLLOUT, 'got expected event');
+is($events[0], fileno($w), 'got expected FD');
 close $w;
-is(epoll_wait($epfd, 100, 0, \@events), 0, 'epoll_wait timeout');
+epoll_wait($epfd, 100, 0, \@events);
+is(@events, 0, 'epoll_wait timeout');
 
 done_testing;
diff --git a/t/extsearch.t b/t/extsearch.t
new file mode 100644
index 00000000..2c3f7547
--- /dev/null
+++ b/t/extsearch.t
@@ -0,0 +1,369 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Config;
+use PublicInbox::Search;
+use PublicInbox::InboxWritable;
+use Fcntl qw(:seek);
+require_git(2.6);
+require_mods(qw(json DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::ExtSearch';
+use_ok 'PublicInbox::ExtSearchIdx';
+use_ok 'PublicInbox::OverIdx';
+my $sock = tcp_server();
+my $host_port = $sock->sockhost . ':' . $sock->sockport;
+my ($home, $for_destroy) = tmpdir();
+local $ENV{HOME} = $home;
+mkdir "$home/.public-inbox" or BAIL_OUT $!;
+my $cfg_path = "$home/.public-inbox/config";
+open my $fh, '>', $cfg_path or BAIL_OUT $!;
+print $fh < $v2addr };
+my $eml = eml_load('t/utf8.eml');
+
+$eml->header_set('List-Id', '');
+open($fh, '+>', undef) or BAIL_OUT $!;
+$fh->autoflush(1);
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
+	'http://example.com/v1test', $v1addr ]), 'v1test init');
+
+$eml->header_set('List-Id', '');
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+truncate($fh, 0) or BAIL_OUT $!;
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+$env = { ORIGINAL_RECIPIENT => $v1addr };
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?";
+
+ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
+{
+	my $es = PublicInbox::ExtSearch->new("$home/extindex");
+	ok($es->has_threadid, '->has_threadid');
+}
+
+{ # TODO: -extindex should write this to config
+	open $fh, '>>', $cfg_path or BAIL_OUT $!;
+	print $fh <<EOF or BAIL_OUT $!;
+; for ->ALL
+[extindex "all"]
+	topdir = $home/extindex
+EOF
+	close $fh or BAIL_OUT $!;
+
+	my $pi_cfg = PublicInbox::Config->new;
+	$pi_cfg->fill_all;
+	ok($pi_cfg->ALL, '->ALL');
+	my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
+	my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
+	is_deeply($ret, { 'v1.example' => 1, 'v2.example' => 1 },
+		'->nntp_xref_for');
+}
+
+SKIP: {
+	require_mods(qw(Net::NNTP), 1);
+	my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
+	my $cmd = [ '-nntpd', '-W0', "--stdout=$out", "--stderr=$err" ];
+	my $td = start_script($cmd, undef, { 3 => $sock });
+	my $n = Net::NNTP->new($host_port);
+	my @xp = $n->xpath('');
+	is_deeply(\@xp, [ qw(v1.example/1 v2.example/1) ]);
+	$n->group('v1.example');
+	my $res = $n->head(1);
+	@$res = grep(/^Xref: /, @$res);
+	like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works');
+}
+
+my $es = PublicInbox::ExtSearch->new("$home/extindex");
+{
+	my $smsg = $es->over->get_art(1);
+	ok($smsg, 'got first article');
+	is($es->over->get_art(2), undef, 'only one added');
+	my $xref3 = $es->over->get_xref3(1);
+	like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1');
+	like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2');
+	is(scalar(@$xref3), 2, 'only to entries');
+}
+
+if ('inbox edited') {
+	my ($in, $out, $err);
+	$in = $out = $err = '';
+	my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+	my $env = { MAIL_EDITOR => "$^X -i -p -e 's/test message/BEST MSG/'" };
+	my $cmd = [ qw(-edit -Ft/utf8.eml), "$home/v2test" ];
+	ok(run_script($cmd, $env, $opt), '-edit');
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+		'extindex again');
+	like($err, qr/discontiguous range/, 'warned about discontiguous range');
+	my $msg1 = $es->over->get_art(1) or BAIL_OUT 'msg1 missing';
+	my $msg2 = $es->over->get_art(2) or BAIL_OUT 'msg2 missing';
+	is($msg1->{mid}, $msg2->{mid}, 'edited message indexed');
+	isnt($msg1->{blob}, $msg2->{blob}, 'blobs differ');
+	my $eml2 = $es->smsg_eml($msg2);
+	like($eml2->body, qr/BEST MSG/, 'edited body in #2');
+	unlike($eml2->body, qr/test message/, 'old body discarded in #2');
+	my $eml1 = $es->smsg_eml($msg1);
+	like($eml1->body, qr/test message/, 'original body in #1');
+	my $x1 = $es->over->get_xref3(1);
+	my $x2 = $es->over->get_xref3(2);
+	is(scalar(@$x1), 1, 'original only has one xref3');
+	is(scalar(@$x2), 1, 'new message has one xref3');
+	isnt($x1->[0], $x2->[0], 'xref3 differs');
+
+	my $mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 1, 'new message found');
+	$mset = $es->mset('b:"test message"');
+	is($mset->size, 1, 'old message found');
+	delete @$es{qw(git over xdb)}; # fork preparation
+
+	my $pi_cfg = PublicInbox::Config->new;
+	$pi_cfg->fill_all;
+	is(scalar($pi_cfg->ALL->mset('s:Testing')->items), 2,
+		'2 results in ->ALL');
+	my $res = {};
+	my $nr = 0;
+	$pi_cfg->each_inbox(sub {
+		$nr++;
+		my ($ibx) = @_;
+		local $SIG{__WARN__} = sub {}; # FIXME support --reindex
+		my $mset = $ibx->isrch->mset('s:Testing');
+		$res->{$ibx->eidx_key} = $ibx->isrch->mset_to_smsg($ibx, $mset);
+	});
+	is($nr, 2, 'two inboxes');
+	my $exp = {};
+	for my $v (qw(v1 v2)) {
+		my $ibx = $pi_cfg->lookup_newsgroup("$v.example");
+		my $smsg = $ibx->over->get_art(1);
+		$smsg->psgi_cull;
+		$exp->{"$v.example"} = [ $smsg ];
+	}
+	is_deeply($res, $exp, 'isearch limited results');
+	$pi_cfg = $res = $exp = undef;
+
+	open my $rmfh, '+>', undef or BAIL_OUT $!;
+	$rmfh->autoflush(1);
+	print $rmfh $eml2->as_string or BAIL_OUT $!;
+	seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
+	$opt->{0} = $rmfh;
+	ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
+
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
+		'extindex after rm');
+	is($es->over->get_art(2), undef, 'doc #2 gone');
+	$mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 0, 'new message gone');
+}
+
+my $misc = $es->misc;
+my @it = $misc->mset('')->items;
+is(scalar(@it), 2, 'two inboxes');
+like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
+like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
+
+my $cfg = PublicInbox::Config->new;
+my $schema_version = PublicInbox::Search::SCHEMA_VERSION();
+my $f = "$home/extindex/ei$schema_version/over.sqlite3";
+my $oidx = PublicInbox::OverIdx->new($f);
+if ('inject w/o indexing') {
+	use PublicInbox::Import;
+	my $v1ibx = $cfg->lookup_name('v1test');
+	my $last_v1_commit = $v1ibx->mm->last_commit;
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $last_v2_commit = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $git0 = PublicInbox::Git->new("$v2ibx->{inboxdir}/git/0.git");
+	chomp(my $cmt = $git0->qx(qw(rev-parse HEAD^0)));
+	is($last_v2_commit, $cmt, 'v2 index up-to-date');
+
+	my $v2im = PublicInbox::Import->new($git0, undef, undef, $v2ibx);
+	$v2im->{lock_path} = undef;
+	$v2im->{path_type} = 'v2';
+	$v2im->add(eml_load('t/mda-mime.eml'));
+	$v2im->done;
+	chomp(my $tip = $git0->qx(qw(rev-parse HEAD^0)));
+	isnt($tip, $cmt, '0.git v2 updated');
+
+	# inject a message w/o updating index
+	rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or
+		BAIL_OUT $!;
+	open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!;
+	run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or
+		BAIL_OUT '-mda';
+	rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or
+		BAIL_OUT $!;
+
+	my ($in, $out, $err);
+	$in = $out = $err = '';
+	my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+	ok(run_script([qw(-extindex -v -v --all), "$home/extindex"],
+		undef, undef), 'extindex noop');
+	$es->{xdb}->reopen;
+	my $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 0, 'did not attempt to index unindexed v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 0, 'did not attempt to index unindexed v2 message');
+	ok(run_script([qw(-index --all)]), 'indexed v1 and v2 inboxes');
+
+	isnt($v1ibx->mm->last_commit, $last_v1_commit, '-index v1 worked');
+	isnt($v2ibx->mm->last_commit_xap($schema_version, 0),
+		$last_v2_commit, '-index v2 worked');
+	ok(run_script([qw(-extindex --all), "$home/extindex"]),
+		'extindex updates');
+
+	$es->{xdb}->reopen;
+	$mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 1, 'got v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 1, 'got v2 message');
+}
+
+if ('reindex catches missed messages') {
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+	my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $eml = eml_load('t/data/0001.patch');
+	$im->add($eml);
+	$im->done;
+	my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	isnt($cmt_a, $cmt_b, 'v2 0.git HEAD updated');
+	$oidx->dbh;
+	my $uv = $v2ibx->uidvalidity;
+	my $lc_key = "lc-v2:v2.example//$uv;0";
+	is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a,
+		'update lc-v2 meta, old is as expected');
+	my $max = $oidx->max;
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex), "$home/extindex", $v2ibx->{inboxdir}]),
+		'-extindex noop');
+	is($oidx->max, $max, '->max unchanged');
+	is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 unchanged');
+	$oidx->dbh_close;
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --reindex), "$home/extindex",
+			$v2ibx->{inboxdir}], undef, $opt),
+			'--reindex for unseen');
+	is($oidx->max, $max + 1, '->max bumped');
+	is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 stays unchanged');
+	my @err = split(/^/, $err);
+	is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+	like($err[0], qr/I: reindex_unseen/, 'got reindex_unseen message');
+	my $new = $oidx->get_art($max + 1);
+	is($new->{subject}, $eml->header('Subject'), 'new message added');
+
+	$es->{xdb}->reopen;
+	my $mset = $es->mset("mid:$new->{mid}");
+	is($mset->size, 1, 'previously unseen, now indexed in Xapian');
+
+	ok($im->remove($eml), 'remove new message from v2 inbox');
+	$im->done;
+	my $cmt_c = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	is($oidx->eidx_meta($lc_key, $cmt_c), $cmt_b,
+		'bump lc-v2 meta again to skip v2 remove');
+	$err = '';
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex --reindex), "$home/extindex",
+			$v2ibx->{inboxdir}], undef, $opt),
+			'--reindex for stale');
+	@err = split(/^/, $err);
+	is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+	like($err[0], qr/\(#$new->{num}\): stale/, 'got stale message warning');
+	is($oidx->get_art($new->{num}), undef,
+		'stale message gone from over');
+	is_deeply($oidx->get_xref3($new->{num}), [],
+		'stale message has no xref3');
+	$es->{xdb}->reopen;
+	$mset = $es->mset("mid:$new->{mid}");
+	is($mset->size, 0, 'stale mid gone Xapian');
+}
+
+if ('reindex catches content bifurcation') {
+	use PublicInbox::MID qw(mids);
+	my $v2ibx = $cfg->lookup_name('v2test');
+	my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+	my $eml = eml_load('t/data/message_embed.eml');
+	my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	$im->add($eml);
+	$im->done;
+	my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $uv = $v2ibx->uidvalidity;
+	my $lc_key = "lc-v2:v2.example//$uv;0";
+	$oidx->dbh;
+	is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a,
+		'update lc-v2 meta, old is as expected');
+	my $mid = mids($eml)->[0];
+	my $smsg = $v2ibx->over->next_by_mid($mid, \(my $id), \(my $prev));
+	my $oldmax = $oidx->max;
+	my $x3_orig = $oidx->get_xref3(3);
+	is(scalar(@$x3_orig), 1, '#3 has one xref');
+	$oidx->add_xref3(3, $smsg->{num}, $smsg->{blob}, 'v2.example');
+	my $x3 = $oidx->get_xref3(3);
+	is(scalar(@$x3), 2, 'injected xref3');
+	$oidx->commit_lazy;
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+		'extindex --all is noop');
+	is($err, '', 'no warnings in index');
+	$oidx->dbh;
+	is($oidx->max, $oldmax, 'oidx->max unchanged');
+	$oidx->dbh_close;
+	ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
+		undef, $opt), 'extindex --reindex');
+	$oidx->dbh;
+	ok($oidx->max > $oldmax, 'oidx->max bumped');
+	like($err, qr/split into 2 due to deduplication change/,
+		'bifurcation noted');
+	my $added = $oidx->get_art($oidx->max);
+	is($added->{blob}, $smsg->{blob}, 'new blob indexed');
+	is_deeply(["v2.example:$smsg->{num}:$smsg->{blob}"],
+		$oidx->get_xref3($added->{num}),
+		'xref3 corrected for bifurcated message');
+	is_deeply($oidx->get_xref3(3), $x3_orig, 'xref3 restored for #3');
+}
+
+if ('--reindex --rethread') {
+	my $before = $oidx->dbh->selectrow_array(<<'');
+SELECT MAX(tid) FROM over WHERE num > 0
+
+	my $opt = {};
+	ok(run_script([qw(-extindex --reindex --rethread --all),
+			"$home/extindex"], undef, $opt),
+			'--rethread');
+	my $after = $oidx->dbh->selectrow_array(<<'');
+SELECT MIN(tid) FROM over WHERE num > 0
+
+	# actual rethread logic is identical to v1/v2 and tested elsewhere
+	ok($after > $before, '--rethread updates MIN(tid)');
+}
+
+if ('remove v1test and test gc') {
+	xsys([qw(git config --unset publicinbox.v1test.inboxdir)],
+		{ GIT_CONFIG => $cfg_path });
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
+		'extindex --gc');
+	like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
+	is(scalar(grep(!/^I:/, split(/^/m, $err))), 0,
+		'no non-informational messages');
+	$misc->{xdb}->reopen;
+	@it = $misc->mset('')->items;
+	is(scalar(@it), 1, 'only one inbox left');
+}
+
+done_testing;
diff --git a/t/fake_inotify.t b/t/fake_inotify.t
index 11dac117..5c925ae6 100644
--- a/t/fake_inotify.t
+++ b/t/fake_inotify.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Ensure FakeInotify can pick up rename(2) and link(2) operations
diff --git a/t/feed.t b/t/feed.t
index 5ad90a07..cdbc88cd 100644
--- a/t/feed.t
+++ b/t/feed.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -75,7 +75,7 @@ EOF
 {
 	# check initial feed
 	{
-		my $feed = string_feed({ -inbox => $ibx });
+		my $feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 3 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($feed);
@@ -109,7 +109,7 @@ EOF
 
 	# check spam shows up
 	{
-		my $spammy_feed = string_feed({ -inbox => $ibx });
+		my $spammy_feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($spammy_feed);
@@ -127,7 +127,7 @@ EOF
 
 	# spam no longer shows up
 	{
-		my $feed = string_feed({ -inbox => $ibx });
+		my $feed = string_feed({ ibx => $ibx });
 		SKIP: {
 			skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
 			my $t = XML::TreePP->new->parse($feed);
diff --git a/t/filter_base.t b/t/filter_base.t
index 47d0220f..2646321a 100644
--- a/t/filter_base.t
+++ b/t/filter_base.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/filter_mirror.t b/t/filter_mirror.t
index 5bc7f3f4..678d9fb0 100644
--- a/t/filter_mirror.t
+++ b/t/filter_mirror.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/filter_rubylang.t b/t/filter_rubylang.t
index e6c53f98..81799451 100644
--- a/t/filter_rubylang.t
+++ b/t/filter_rubylang.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors 
+# Copyright (C) 2017-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -35,7 +35,7 @@ SKIP: {
 	];
 	my $ibx = PublicInbox::Inbox->new({ inboxdir => $git_dir,
 						altid => $altid });
-	$f = PublicInbox::Filter::RubyLang->new(-inbox => $ibx);
+	$f = PublicInbox::Filter::RubyLang->new(ibx => $ibx);
 	$msg = <<'EOF';
 X-Mail-Count: 12
 Message-ID: 
diff --git a/t/filter_subjecttag.t b/t/filter_subjecttag.t
index e2d91e74..f88fcad5 100644
--- a/t/filter_subjecttag.t
+++ b/t/filter_subjecttag.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors 
+# Copyright (C) 2017-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/filter_vger.t b/t/filter_vger.t
index ca5a6ca7..92d6a9f3 100644
--- a/t/filter_vger.t
+++ b/t/filter_vger.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/gcf2.t b/t/gcf2.t
new file mode 100644
index 00000000..fa907c8b
--- /dev/null
+++ b/t/gcf2.t
@@ -0,0 +1,162 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Fcntl qw(:seek);
+use IO::Handle ();
+use POSIX qw(_exit);
+use Cwd qw(abs_path);
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2';
+use PublicInbox::Import;
+my ($tmpdir, $for_destroy) = tmpdir();
+
+my $gcf2 = PublicInbox::Gcf2::new();
+is(ref($gcf2), 'PublicInbox::Gcf2', '::new works');
+my $COPYING = 'dba13ed2ddf783ee8118c6a581dbf75305f816a3'; # OID later expected in the "blob" header for COPYING
+open my $agpl, '<', 'COPYING' or BAIL_OUT "AGPL-3 missing: $!";
+$agpl = do { local $/; <$agpl> }; # slurp: $agpl now holds the file contents, not the handle
+
+PublicInbox::Import::init_bare($tmpdir);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; # key 0: presumably becomes stdin of the git child
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr);
+is($?, 0, 'fast-import succeeded');
+$gcf2->add_alternate("$tmpdir/objects"); # make the freshly imported objects visible to $gcf2
+
+{ # cat a tree object into a pipe and check header, length and trailing newline
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
+	$gcf2->cat_oid(fileno($w), $tree); # cat_oid takes a file descriptor number, not a handle
+	close $w; # drop our write end so the slurp below can reach EOF
+	is("$tree tree 30\n", <$r>, 'tree header ok');
+	$r = do { local $/; <$r> }; # remainder of the pipe; $r becomes a string
+	is(chop($r), "\n", 'got trailing newline');
+	is(length($r), 30, 'tree length matches');
+}
+
+chomp(my $objdir = xqx([qw(git rev-parse --git-path objects)])); # ask git for the objects directory
+if ($objdir =~ /\A--git-path\n/) { # git <2.5
+	chomp($objdir = xqx([qw(git rev-parse --git-dir)]));
+	$objdir .= '/objects';
+}
+if ($objdir && -d $objdir) {
+	$objdir = abs_path($objdir);
+	open my $alt, '>>', "$tmpdir/objects/info/alternates" or
+							BAIL_OUT $!;
+	print $alt $objdir, "\n" or BAIL_OUT $!; # append it as an alternate of the temporary repo
+	close $alt or BAIL_OUT $!;
+
+	# calling gcf2->add_alternate on an already-added path won't
+	# cause alternates to be reloaded, so we do
+	# $gcf2->add_alternate($objdir) later on instead of
+	# $gcf2->add_alternate("$tmpdir/objects");
+	# $objdir = "$tmpdir/objects";
+} else {
+	$objdir = undef # undef makes the SKIP block below skip its tests
+}
+
+my $nr = $ENV{TEST_LEAK_NR}; # opt-in: repetition count for the leak checks
+my $cat = $ENV{TEST_LEAK_CAT} // 10; # cat_oid rounds per repetition, default 10
+diag "checking for leaks... (TEST_LEAK_NR=$nr TEST_LEAK_CAT=$cat)" if $nr;
+
+SKIP: { # needs a real git worktree; 27 tests = 13 file-backed + 2 x 7 pipe tests
+	skip 'not in git worktree', 27 unless defined($objdir);
+	$gcf2->add_alternate($objdir);
+	eval { $gcf2->add_alternate($objdir) }; # adding the same path twice must not die
+	ok(!$@, 'no error adding alternate redundantly');
+	if ($nr) {
+		diag "adding alternate $nr times redundantly";
+		$gcf2->add_alternate($objdir) for (1..$nr);
+		diag 'done adding redundant alternates';
+	}
+
+	open my $fh, '+>', undef or BAIL_OUT "open: $!"; # anonymous read/write temp file
+	$fh->autoflush(1);
+
+	ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails');
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	is(do { local $/; <$fh> }, '', 'nothing written');
+
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails'); # well-formed but all-zero OID
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	is(do { local $/; <$fh> }, '', 'nothing written for z40');
+
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	my $ck_copying = sub { # 3 tests: rewind $fh, then check header, trailing "\n" and body
+		my ($desc) = @_;
+		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+		is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc");
+		my $buf = do { local $/; <$fh> };
+		is(chop($buf), "\n", 'got trailing \\n');
+		is($buf, $agpl, "AGPL matches ($desc)");
+	};
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal');
+	$ck_copying->('regular file');
+
+	$gcf2 = PublicInbox::Gcf2::new(); # fresh object that only knows the temporary repo
+	$gcf2->add_alternate("$tmpdir/objects");
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate');
+	$ck_copying->('alternates after reopen');
+
+	$^O eq 'linux' or skip('pipe tests are Linux-only', 14); # 2 iterations x 7 tests
+	for my $blk (1, 0) { # blocking write end first, then non-blocking
+		my ($r, $w);
+		pipe($r, $w) or BAIL_OUT $!;
+		fcntl($w, 1031, 4096) or # 1031 is F_SETPIPE_SZ on Linux; shrink the pipe to 4096 bytes
+			skip('Linux too old for F_SETPIPE_SZ', 14);
+		$w->blocking($blk);
+		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+		truncate($fh, 0) or BAIL_OUT "truncate: $!";
+		defined(my $pid = fork) or BAIL_OUT "fork: $!";
+		if ($pid == 0) { # child: drain the pipe into $fh for the parent to verify
+			close $w;
+			tick; # wait for parent to block on writev
+			my $buf = do { local $/; <$r> };
+			print $fh $buf or _exit(1);
+			_exit(0);
+		}
+		ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk");
+		close $w or BAIL_OUT "close: $!";
+		is(waitpid($pid, 0), $pid, 'child exited');
+		is($?, 0, 'no error in child');
+		$ck_copying->("pipe blocking($blk)");
+
+		pipe($r, $w) or BAIL_OUT $!;
+		fcntl($w, 1031, 4096) or BAIL_OUT $!;
+		$w->blocking($blk);
+		close $r; # no reader: the write below is expected to fail
+		local $SIG{PIPE} = 'IGNORE'; # get an error from the write instead of dying from SIGPIPE
+		eval { $gcf2->cat_oid(fileno($w), $COPYING) };
+		like($@, qr/writev error:/, 'got writev error');
+	}
+}
+
+if ($nr) { # leak check, only when TEST_LEAK_NR is set; runs no assertions
+	open my $null, '>', '/dev/null' or BAIL_OUT "open /dev/null: $!";
+	my $fd = fileno($null);
+	local $SIG{PIPE} = 'IGNORE'; # writes to the broken pipe must not kill the test
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT "pipe: $!";
+	close $r; # no reader is left, so $w is a deliberately broken pipe
+	my $broken = fileno($w);
+	for (1..$nr) {
+		my $obj = PublicInbox::Gcf2::new(); # a new object on every repetition
+		if (defined($objdir)) {
+			$obj->add_alternate($objdir);
+			for (1..$cat) {
+				$obj->cat_oid($fd, $COPYING); # success path
+				eval { $obj->cat_oid($broken, $COPYING) }; # write-error path
+				$obj->cat_oid($fd, '0'x40); # missing object
+				$obj->cat_oid($fd, 'invalid'); # malformed OID
+			}
+		}
+	}
+}
+done_testing;
diff --git a/t/gcf2_client.t b/t/gcf2_client.t
new file mode 100644
index 00000000..6d059cad
--- /dev/null
+++ b/t/gcf2_client.t
@@ -0,0 +1,90 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Cwd qw(getcwd);
+use PublicInbox::Import;
+use PublicInbox::DS;
+
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2Client';
+my ($tmpdir, $for_destroy) = tmpdir();
+my $git_a = "$tmpdir/a.git"; # receives the fast-import data below
+my $git_b = "$tmpdir/b.git"; # initialized bare but gets no objects of its own here
+PublicInbox::Import::init_bare($git_a);
+PublicInbox::Import::init_bare($git_b);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; # key 0: presumably becomes stdin of the git child
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr);
+is($?, 0, 'fast-import succeeded');
+
+my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
+my $called = 0; # incremented by each gcf2_async callback, checked at the end
+my $err_f = "$tmpdir/err"; # file handed to the client under key 2
+{
+	PublicInbox::DS->Reset;
+	open my $err, '>>', $err_f or BAIL_OUT $!;
+	my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err }); # key 2: presumably the client's stderr
+	$gcf2c->gcf2_async(\"$tree $git_a\n", sub { # request line is "<oid> <git dir>\n"
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'got expected OID');
+		is($size, 30, 'got expected length');
+		is($type, 'tree', 'got tree type');
+		is(length($$bref), 30, 'got a tree');
+		is($arg, 'hi', 'arg passed');
+		$called++;
+	}, 'hi');
+	$gcf2c->cat_async_step($gcf2c->{inflight}); # presumably processes the pending request so the callback runs
+
+	open $err, '<', $err_f or BAIL_OUT $!; # reopen the same file for reading
+	my $estr = do { local $/; <$err> };
+	is($estr, '', 'nothing in stderr');
+
+	my $trunc = substr($tree, 0, 39); # one hex digit short, so the lookup must miss
+	$gcf2c->gcf2_async(\"$trunc $git_a\n", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is(undef, $bref, 'missing bref is undef');
+		is($oid, $trunc, 'truncated OID printed');
+		is($type, 'missing', 'type is "missing"');
+		is($size, undef, 'size is undef');
+		is($arg, 'bye', 'arg passed when missing');
+		$called++;
+	}, 'bye');
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry');
+
+	# try failed alternates lookup
+	PublicInbox::DS->Reset;
+	open $err, '>', $err_f or BAIL_OUT $!; # '>' truncates the earlier warnings
+	$gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->gcf2_async(\"$tree $git_b\n", sub { # $git_b holds no objects yet
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is(undef, $bref, 'missing bref from alt is undef');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry before alt update');
+
+	# now try successful alternates lookup
+	open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!;
+	print $alt "$git_a/objects\n" or BAIL_OUT $!;
+	close $alt or BAIL_OUT $!;
+	my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]);
+	$gcf2c->gcf2_async(\"$tree $git_a\n", sub { # NOTE(review): queries $git_a, not $git_b whose alternates were just written; confirm intent
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'oid match on alternates retry');
+		is($$bref, $expect, 'tree content matched');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+}
+is($called, 4, 'gcf2_async callbacks hit'); # one per gcf2_async call above
+done_testing;
diff --git a/t/git-http-backend.psgi b/t/git-http-backend.psgi
index e34ebe40..a91e5de8 100644
--- a/t/git-http-backend.psgi
+++ b/t/git-http-backend.psgi
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/git.t b/t/git.t
index dfd7173a..377652ca 100644
--- a/t/git.t
+++ b/t/git.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors 
+# Copyright (C) 2015-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -76,12 +76,17 @@ if (1) {
 	is(length($$x), $size, 'read correct number of bytes');
 
 	my $ref = $gcf->qx(qw(cat-file blob), $buf);
+	is($?, 0, 'no error on scalar success');
 	my @ref = $gcf->qx(qw(cat-file blob), $buf);
+	is($?, 0, 'no error on wantarray success');
 	my $nl = scalar @ref;
 	ok($nl > 1, "qx returned array length of $nl");
+	is(join('', @ref), $ref, 'qx array and scalar context both work');
 
 	$gcf->qx(qw(repack -adq));
 	ok($gcf->packed_bytes > 0, 'packed size is positive');
+	$gcf->qx(qw(rev-parse --verify bogus));
+	isnt($?, 0, '$? set on failure'.$?);
 }
 
 SKIP: {
diff --git a/t/gzip_filter.t b/t/gzip_filter.t
index 400214e6..b349ae58 100644
--- a/t/gzip_filter.t
+++ b/t/gzip_filter.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/hl_mod.t b/t/hl_mod.t
index 95057354..f5bf433d 100644
--- a/t/hl_mod.t
+++ b/t/hl_mod.t
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/html_index.t b/t/html_index.t
index 80f81577..8e2a674f 100644
--- a/t/html_index.t
+++ b/t/html_index.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/httpd-corner.psgi b/t/httpd-corner.psgi
index cb41cfa0..5436a74d 100644
--- a/t/httpd-corner.psgi
+++ b/t/httpd-corner.psgi
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 # corner case tests for the generic PSGI server
 # Usage: plackup [OPTIONS] /path/to/this/file
diff --git a/t/httpd-corner.t b/t/httpd-corner.t
index 514672a1..c3f80530 100644
--- a/t/httpd-corner.t
+++ b/t/httpd-corner.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 # note: our HTTP server should be standalone and capable of running
 # generic PSGI/Plack apps.
diff --git a/t/httpd-https.t b/t/httpd-https.t
index fcfa12af..a2166ce6 100644
--- a/t/httpd-https.t
+++ b/t/httpd-https.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/httpd-unix.t b/t/httpd-unix.t
index 363f3648..2d3cecc1 100644
--- a/t/httpd-unix.t
+++ b/t/httpd-unix.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 # Tests for binding Unix domain sockets
 use strict;
diff --git a/t/httpd.t b/t/httpd.t
index 7404eb8b..2fc28355 100644
--- a/t/httpd.t
+++ b/t/httpd.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/hval.t b/t/hval.t
index e80a02ff..9d0dab7a 100644
--- a/t/hval.t
+++ b/t/hval.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors 
+# Copyright (C) 2017-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/idx_stack.t b/t/idx_stack.t
index 35aff37b..7af096a8 100644
--- a/t/idx_stack.t
+++ b/t/idx_stack.t
@@ -1,11 +1,13 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
 use_ok 'PublicInbox::IdxStack';
 my $oid_a = '03c21563cf15c241687966b5b2a3f37cdc193316';
 my $oid_b = '963caad026055ab9bcbe3ee9550247f9d8840feb';
+my $cmt_a = 'df8e4a0612545d53672036641e9f076efc94c2f6';
+my $cmt_b = '3ba7c9fa4a083c439e768882c571c2026a981ca5';
 
 my $stk = PublicInbox::IdxStack->new;
 is($stk->read_prepare, $stk, 'nothing');
@@ -13,19 +15,19 @@ is($stk->num_records, 0, 'no records');
 is($stk->pop_rec, undef, 'undef on empty');
 
 $stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
 is($stk->read_prepare, $stk, 'read_prepare');
 is($stk->num_records, 1, 'num_records');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop once');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop once');
 is($stk->pop_rec, undef, 'undef on empty');
 
 $stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
-$stk->push_rec('d', 1234, 5678, $oid_b);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
+$stk->push_rec('d', 1234, 5678, $oid_b, $cmt_b);
 is($stk->read_prepare, $stk, 'read_prepare');
 is($stk->num_records, 2, 'num_records');
-is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b], 'pop');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop-pop');
+is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b, $cmt_b], 'pop');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop-pop');
 is($stk->pop_rec, undef, 'empty');
 
 SKIP: {
@@ -37,11 +39,11 @@ SKIP: {
 	while (<$fh>) {
 		chomp;
 		my ($at, $ct, $H) = split(/\./);
-		$stk //= PublicInbox::IdxStack->new($H);
+		$stk //= PublicInbox::IdxStack->new;
 		# not bothering to parse blobs here, just using commit OID
 		# as a blob OID since they're the same size + format
-		$stk->push_rec('m', $at + 0, $ct + 0, $H);
-		push(@expect, [ 'm', $at, $ct, $H ]);
+		$stk->push_rec('m', $at + 0, $ct + 0, $H, $H);
+		push(@expect, [ 'm', $at, $ct, $H, $H ]);
 	}
 	$stk or skip('nothing from git log', 3);
 	is($stk->read_prepare, $stk, 'read_prepare');
diff --git a/t/imap.t b/t/imap.t
index 5a251c6b..0ec02818 100644
--- a/t/imap.t
+++ b/t/imap.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # unit tests (no network) for IMAP, see t/imapd.t for end-to-end tests
 use strict;
diff --git a/t/imap_searchqp.t b/t/imap_searchqp.t
index adf7b205..6b4121ea 100644
--- a/t/imap_searchqp.t
+++ b/t/imap_searchqp.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
@@ -76,17 +76,17 @@ is($q->{xap}, 'c:"b" d:..19931002', 'compound query w/ parens');
 	$q = $parse->($s = qq{BEFORE 2-Oct-1993});
 	is_deeply($q->{sql}, \" AND ts <= $t0", 'BEFORE SQL');
 	$q = $parse->("FROM z $s");
-	is($q->{xap}, qq{f:"z" ts:..$t0}, 'BEFORE Xapian');
+	is($q->{xap}, qq{f:"z" rt:..$t0}, 'BEFORE Xapian');
 
 	$q = $parse->($s = qq{SINCE 2-Oct-1993});
 	is_deeply($q->{sql}, \" AND ts >= $t0", 'SINCE SQL');
 	$q = $parse->("FROM z $s");
-	is($q->{xap}, qq{f:"z" ts:$t0..}, 'SINCE Xapian');
+	is($q->{xap}, qq{f:"z" rt:$t0..}, 'SINCE Xapian');
 
 	$q = $parse->($s = qq{ON 2-Oct-1993});
 	is_deeply($q->{sql}, \" AND ts >= $t0 AND ts <= $t1", 'ON SQL');
 	$q = $parse->("FROM z $s");
-	is($q->{xap}, qq{f:"z" ts:$t0..$t1}, 'ON Xapian');
+	is($q->{xap}, qq{f:"z" rt:$t0..$t1}, 'ON Xapian');
 }
 
 {
diff --git a/t/imap_tracker.t b/t/imap_tracker.t
index 01e1d0b1..be7c6e65 100644
--- a/t/imap_tracker.t
+++ b/t/imap_tracker.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use Test::More;
 use strict;
diff --git a/t/imapd-tls.t b/t/imapd-tls.t
index df4ef85c..e40ae1e8 100644
--- a/t/imapd-tls.t
+++ b/t/imapd-tls.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/imapd.t b/t/imapd.t
index a464ad86..1df9d26e 100644
--- a/t/imapd.t
+++ b/t/imapd.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # end-to-end IMAP tests, see unit tests in t/imap.t, too
 use strict;
@@ -251,8 +251,8 @@ ok($mic->logout, 'logout works');
 
 my $have_inotify = eval { require Linux::Inotify2; 1 };
 
-my $pi_config = PublicInbox::Config->new;
-$pi_config->each_inbox(sub {
+my $pi_cfg = PublicInbox::Config->new;
+$pi_cfg->each_inbox(sub {
 	my ($ibx) = @_;
 	my $env = { ORIGINAL_RECIPIENT => $ibx->{-primary_address} };
 	my $name = $ibx->{name};
@@ -371,11 +371,13 @@ is(scalar keys %$ret, 3, 'got all 3 messages');
 
 SKIP: {
 	# do any clients use non-UID IMAP SEARCH?
-	skip 'Xapian missing', 2 if $level eq 'basic';
+	skip 'Xapian missing', 3 if $level eq 'basic';
 	my $x = $mic->search('all');
 	is_deeply($x, [1, 2, 3], 'MSN SEARCH works before rm');
 	$x = $mic->search(qw(header subject embedded));
 	is_deeply($x, [2], 'MSN SEARCH on Subject works before rm');
+	$x = $mic->search('FROM scraper@example.com');
+	is_deeply($x, [], "MSN SEARCH miss won't trigger warnings");
 }
 
 {
diff --git a/t/import.t b/t/import.t
index 9a88416f..ae76858b 100644
--- a/t/import.t
+++ b/t/import.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -7,7 +7,6 @@ use PublicInbox::Eml;
 use PublicInbox::Smsg;
 use PublicInbox::Git;
 use PublicInbox::Import;
-use PublicInbox::Spawn qw(spawn);
 use Fcntl qw(:DEFAULT SEEK_SET);
 use PublicInbox::TestCommon;
 use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2
@@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
 
 if ($v2) {
 	like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
-	my $raw_email = $smsg->{-raw_email};
-	is($mime->as_string, $$raw_email, 'string matches');
-	is($smsg->{raw_bytes}, length($$raw_email), 'length matches');
 	my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
 	open my $in, '+<', undef or BAIL_OUT "open(+<): $!";
 	print $in $mime->as_string or die "write failed: $!";
 	$in->flush or die "flush failed: $!";
-	seek($in, 0, SEEK_SET);
-	open my $out, '+<', undef or BAIL_OUT "open(+<): $!";
-	my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out });
-	is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object');
+	seek($in, 0, SEEK_SET) or die "seek: $!";
+	chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in }));
 	is($?, 0, 'hash-object');
-	seek($out, 0, SEEK_SET);
-	chomp(my $hashed_obj = <$out>);
 	is($hashed_obj, $smsg->{blob}, "blob object_id matches exp");
 }
 
diff --git a/t/inbox.t b/t/inbox.t
index 08f1724f..bc8fae9a 100644
--- a/t/inbox.t
+++ b/t/inbox.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/inbox_idle.t b/t/inbox_idle.t
index e16ee11b..27facfe9 100644
--- a/t/inbox_idle.t
+++ b/t/inbox_idle.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use Test::More;
 use PublicInbox::TestCommon;
@@ -32,14 +32,14 @@ for my $V (1, 2) {
 		$sidx->set_metadata_once;
 		$sidx->idx_release; # allow watching on lockfile
 	}
-	my $pi_config = PublicInbox::Config->new(\<new(\<each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
-	my $ii = PublicInbox::InboxIdle->new($pi_config);
+	$pi_cfg->each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
+	my $ii = PublicInbox::InboxIdle->new($pi_cfg);
 	ok($ii, 'InboxIdle created');
 	SKIP: {
 		skip('inotify or kqueue missing', 1) unless $ii->{sock};
@@ -50,7 +50,7 @@ EOF
 	PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
 	$ii->event_step;
 	is(scalar @{$obj->{called}}, 1, 'called on unlock');
-	$pi_config->each_inbox(sub { shift->unsubscribe_unlock($ident) });
+	$pi_cfg->each_inbox(sub { shift->unsubscribe_unlock($ident) });
 	ok($im->add(eml_load('t/data/0001.patch')), "$V added #2");
 	$im->done;
 	PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
diff --git a/t/index-git-times.t b/t/index-git-times.t
index f9869cfa..3cfb99f4 100644
--- a/t/index-git-times.t
+++ b/t/index-git-times.t
@@ -1,11 +1,10 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
 use Test::More;
 use PublicInbox::TestCommon;
-use PublicInbox::Import;
 use PublicInbox::Config;
 use PublicInbox::Admin;
 use File::Path qw(remove_tree);
@@ -48,7 +47,7 @@ EOF
 	print $w $data or die;
 	close $w or die;
 	my $cmd = ['git', "--git-dir=$v1dir", 'fast-import', '--quiet'];
-	PublicInbox::Import::run_die($cmd, undef, { 0 => $r });
+	xsys_e($cmd, undef, { 0 => $r });
 }
 
 run_script(['-index', '--skip-docdata', $v1dir]) or die 'v1 index failed';
diff --git a/t/indexlevels-mirror-v1.t b/t/indexlevels-mirror-v1.t
index adcc93fd..a0cee72c 100644
--- a/t/indexlevels-mirror-v1.t
+++ b/t/indexlevels-mirror-v1.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 local $ENV{PI_TEST_VERSION} = 1;
 require './t/indexlevels-mirror.t';
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
index 656a9a34..53826aef 100644
--- a/t/indexlevels-mirror.t
+++ b/t/indexlevels-mirror.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -172,9 +172,7 @@ my $import_index_incremental = sub {
 $import_index_incremental->($PI_TEST_VERSION, 'basic', $mime);
 
 SKIP: {
-	require PublicInbox::Search;
-	PublicInbox::Search::load_xapian() or
-		skip('Xapian perl binding missing', 2);
+	require_mods(qw(Search::Xapian), 2);
 	foreach my $l (qw(medium full)) {
 		$import_index_incremental->($PI_TEST_VERSION, $l, $mime);
 	}
diff --git a/t/init.t b/t/init.t
index dba59231..b8dfea66 100644
--- a/t/init.t
+++ b/t/init.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors 
+# Copyright (C) 2014-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/ipc.t b/t/ipc.t
new file mode 100644
index 00000000..5801c760
--- /dev/null
+++ b/t/ipc.t
@@ -0,0 +1,210 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use Fcntl qw(SEEK_SET);
+use Digest::SHA qw(sha1_hex);
+require_mods(qw(Storable||Sereal));
+require_ok 'PublicInbox::IPC';
+state $once = eval <<''; # string-eval adds test-only methods to PublicInbox::IPC; the heredoc ends at the first empty line
+package PublicInbox::IPC;
+use strict;
+use Digest::SHA qw(sha1_hex);
+sub test_array { qw(test array) }
+sub test_scalar { 'scalar' }
+sub test_scalarref { \'scalarref' }
+sub test_undef { undef }
+sub test_die { shift; die @_; 'unreachable' }
+sub test_pid { $$ }
+sub test_write_each_fd {
+	my ($self, @args) = @_;
+	for my $fd (0..2) {
+		print { $self->{$fd} } "i=$fd $$ ", @args, "\n";
+		$self->{$fd}->flush;
+	}
+}
+sub test_sha {
+	my ($self, $buf) = @_;
+	print { $self->{1} } sha1_hex($buf), "\n";
+	$self->{1}->flush;
+}
+1;
+
+my $ipc = bless {}, 'PublicInbox::IPC';
+my @t = qw(array scalar scalarref undef); # suffixes of the test_* methods to exercise
+my $test = sub { # shared checks; $x ('local' or 'worker') only labels the test names
+	my $x = shift;
+	my @res;
+	for my $type (@t) {
+		my $m = "test_$type";
+		my @ret = $ipc->ipc_do($m); # list context
+		my @exp = $ipc->$m; # direct method call gives the expected value
+		is_deeply(\@ret, \@exp, "wantarray $m $x");
+
+		$ipc->ipc_do($m); # void context
+
+		$ipc->ipc_async($m, [], sub { push @res, \@_ }, \$m); # callback args are recorded in @res
+
+		my $ret = $ipc->ipc_do($m); # scalar context
+		my $exp = $ipc->$m;
+		is_deeply($ret, $exp, "!wantarray $m $x");
+
+		is_deeply(\@res, [ [ \$m, \@exp ] ], "async $m $x"); # callback got (\$m, arrayref of results)
+		@res = ();
+	}
+	$ipc->ipc_async_wait(-1); # presumably waits for every outstanding async call
+	is_deeply(\@res, [], 'no leftover results');
+	$ipc->ipc_async('test_die', ['die test'],
+			sub { push @res, \@_  }, 'die arg');
+	$ipc->ipc_async_wait(1);
+	is(scalar(@res), 1, 'only one result');
+	is(scalar(@{$res[0]}), 2, 'result has 2-element array');
+	is($res[0]->[0], 'die arg', 'got async die arg '.$x);
+	is(ref($res[0]->[1]), 'PublicInbox::IPC::Die', # the exception arrives as an object of this class
+		"exception type $x");
+	{
+		my $nr = PublicInbox::IPC::PIPE_BUF(); # number of async calls to queue before waiting
+		my $count = 0;
+		my $cb = sub { ++$count };
+		$ipc->ipc_async('test_undef', [], $cb) for (1..$nr);
+		$ipc->ipc_async_wait(-1);
+		is($count, $nr, "$x async runs w/o deadlock");
+	}
+
+	my $ret = eval { $ipc->test_die('phail') }; # direct call: reference error text
+	my $exp = $@;
+	$ret = eval { $ipc->ipc_do('test_die', 'phail') }; # same die, routed through ipc_do
+	my $err = $@;
+	my %lines;
+	for ($err, $exp) {
+		s/ line (\d+).*//s and $lines{$1}++; # cut the " line N..." suffix and tally N
+	}
+	is(scalar keys %lines, 1, 'line numbers match');
+	is((values %lines)[0], 2, '2 hits on same line number');
+	is($err, $exp, "$x die matches");
+	is($ret, undef, "$x die did not return");
+
+	eval { $ipc->test_die(['arrayref']) };
+	$exp = $@;
+	$ret = eval { $ipc->ipc_do('test_die', ['arrayref']) };
+	$err = $@;
+	is_deeply($err, $exp, 'die with unblessed ref');
+	is(ref($err), 'ARRAY', 'got an array ref');
+
+	$exp = bless ['blessed'], 'PublicInbox::WTF'; # class name exists only as a bless target here
+	$ret = eval { $ipc->ipc_do('test_die', $exp) };
+	$err = $@;
+	is_deeply($err, $exp, 'die with blessed ref');
+	is(ref($err), 'PublicInbox::WTF', 'got blessed ref');
+};
+$test->('local'); # first pass, before any worker is spawned
+
+{ # repeat the shared checks with a spawned worker process
+	my $pid = $ipc->ipc_worker_spawn('test worker');
+	defined($pid) or BAIL_OUT 'no spawn, no test'; # bail before $pid is used numerically
+	ok($pid > 0 && kill(0, $pid), 'worker spawned and running'); # signal 0 only probes for existence
+	is($ipc->ipc_do('test_pid'), $pid, 'worker pid returned'); # test_pid returns $$ of whoever runs it
+	$test->('worker');
+	{
+		my ($tmp, $for_destroy) = tmpdir();
+		$ipc->ipc_lock_init("$tmp/lock");
+		is($ipc->ipc_do('test_pid'), $pid, 'worker pid returned');
+	}
+	$ipc->ipc_worker_stop;
+	ok(!kill(0, $pid) && $!{ESRCH}, 'worker stopped'); # ESRCH: no such process
+}
+$ipc->ipc_worker_stop; # idempotent
+
+# work queues
+pipe(my ($ra, $wa)) or BAIL_OUT $!;
+pipe(my ($rb, $wb)) or BAIL_OUT $!;
+pipe(my ($rc, $wc)) or BAIL_OUT $!;
+open my $warn, '+>', undef or BAIL_OUT; # anonymous temp file collecting warnings
+$warn->autoflush(0);
+local $SIG{__WARN__} = sub { print $warn "PID:$$ ", @_ }; # tag each warning with the emitting PID
+my @ppids; # return values of wq_workers_start, one per loop pass
+open my $agpl, '<', 'COPYING' or BAIL_OUT "AGPL-3 missing: $!";
+my $big = do { local $/; <$agpl> } // BAIL_OUT "read: $!"; # file contents, repeated below as a large payload
+close $agpl or BAIL_OUT "close: $!";
+
+for my $t ('local', 'worker', 'worker again') { # workers are only started at the end of a pass
+	$ipc->wq_do('test_write_each_fd', [ $wa, $wb, $wc ], 'hello world'); # handles are used as $self->{0..2} by the job
+	my $i = 0;
+	for my $fh ($ra, $rb, $rc) {
+		my $buf = readline($fh);
+		is(chop($buf), "\n", "trailing CR ($t)");
+		like($buf, qr/\Ai=$i \d+ hello world\z/, "got expected ($t)");
+		$i++;
+	}
+	$ipc->wq_do('test_die', [ $wa, $wb, $wc ]); # result is not checked here; captured warnings are inspected later
+	$ipc->wq_do('test_sha', [ $wa, $wb ], 'hello world'); # test_sha prints to $self->{1}, i.e. $wb
+	is(readline($rb), sha1_hex('hello world')."\n", "SHA small ($t)");
+	{
+		my $bigger = $big x 10;
+		$ipc->wq_do('test_sha', [ $wa, $wb ], $bigger);
+		my $exp = sha1_hex($bigger)."\n";
+		undef $bigger; # release the large string before reading the reply
+		is(readline($rb), $exp, "SHA big ($t)");
+	}
+	my $ppid = $ipc->wq_workers_start('wq', 1);
+	push(@ppids, $ppid);
+}
+
+# wq_do works across fork (siblings can feed)
+SKIP: { # 10 tests: 1 before the fork, 9 in the parent branch
+	skip 'Socket::MsgHdr or Inline::C missing', 10 if !$ppids[0];
+	is_deeply(\@ppids, [$$, undef, undef], # only the first wq_workers_start call returned a pid
+		'parent pid returned in wq_workers_start');
+	my $pid = fork // BAIL_OUT $!;
+	if ($pid == 0) { # child: submit one job, then leave without running any tests
+		use POSIX qw(_exit);
+		$ipc->wq_do('test_write_each_fd', [ $wa, $wb, $wc ], $$);
+		_exit(0);
+	} else {
+		my $i = 0;
+		my ($wpid, @rest) = keys %{$ipc->{-wq_workers}}; # keys are used as worker pids below
+		is(scalar(@rest), 0, 'only one worker');
+		for my $fh ($ra, $rb, $rc) {
+			my $buf = readline($fh);
+			is(chop($buf), "\n", "trailing CR #$i");
+			like($buf, qr/^i=$i $wpid $pid\z/, # written by the worker, payload is the child's pid
+				'got expected from sibling');
+			$i++;
+		}
+		is(waitpid($pid, 0), $pid, 'waitpid complete');
+		is($?, 0, 'child wq producer exited');
+	}
+}
+
+$ipc->wq_close;
+SKIP: {
+	skip 'Socket::MsgHdr or Inline::C missing', 11 if !$ppids[0];
+	seek($warn, 0, SEEK_SET) or BAIL_OUT; # rewind the captured warnings
+	my @warn = <$warn>;
+	is(scalar(@warn), 3, 'warned 3 times');
+	like($warn[0], qr/ wq_do: /, '1st warned from wq_do');
+	like($warn[1], qr/ wq_worker: /, '2nd warned from wq_worker');
+	is($warn[2], $warn[1], 'worker did not die'); # identical lines, PID prefix included
+
+	$SIG{__WARN__} = 'DEFAULT'; # stop capturing warnings
+	is($ipc->wq_workers_start('wq', 1), $$, 'workers started again');
+	is($ipc->wq_workers, 1, '1 worker started');
+	SKIP: {
+		$ipc->WQ_MAX_WORKERS > 1 or
+			skip 'Inline::C or Socket::MsgHdr not available', 4;
+		$ipc->wq_worker_incr;
+		is($ipc->wq_workers, 2, 'worker count bumped');
+		$ipc->wq_worker_decr;
+		$ipc->wq_worker_decr_wait(10); # presumably a timeout in seconds; confirm in PublicInbox::IPC
+		is($ipc->wq_workers, 1, 'worker count lowered');
+		is($ipc->wq_workers(2), 2, 'worker count set'); # with an argument it sets the count
+		is($ipc->wq_workers, 2, 'worker count stayed set');
+	}
+	$ipc->wq_close;
+	is($ipc->wq_workers, undef, 'workers undef after close');
+}
+
+done_testing;
diff --git a/t/kqnotify.t b/t/kqnotify.t
index c3557d3e..902ce0f1 100644
--- a/t/kqnotify.t
+++ b/t/kqnotify.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Ensure KQNotify can pick up rename(2) and link(2) operations
diff --git a/t/lei-oneshot.t b/t/lei-oneshot.t
new file mode 100644
index 00000000..7688da5b
--- /dev/null
+++ b/t/lei-oneshot.t
@@ -0,0 +1,8 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use PublicInbox::TestCommon;
+local $ENV{TEST_LEI_ONESHOT} = '1';
+require './t/lei.t';
diff --git a/t/lei.t b/t/lei.t
new file mode 100644
index 00000000..3fd1d1fe
--- /dev/null
+++ b/t/lei.t
@@ -0,0 +1,376 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Config;
+use File::Path qw(rmtree);
+use Fcntl qw(SEEK_SET);
+use PublicInbox::Spawn qw(which);
+require_git 2.6;
+require_mods(qw(json DBD::SQLite Search::Xapian));
+my $opt = { 1 => \(my $out = ''), 2 => \(my $err = '') };
+my ($home, $for_destroy) = tmpdir();
+my $err_filter;
+my @onions = qw(http://hjrcffqmbrq6wope.onion/meta/
+	http://czquwvybam4bgbro.onion/meta/
+	http://ou63pmih66umazou.onion/meta/);
+my $lei = sub {
+	my ($cmd, $env, $xopt) = @_;
+	$out = $err = '';
+	if (!ref($cmd)) {
+		($env, $xopt) = grep { (!defined) || ref } @_;
+		$cmd = [ grep { defined && !ref } @_ ];
+	}
+	my $res = run_script(['lei', @$cmd], $env, $xopt // $opt);
+	$err_filter and
+		$err = join('', grep(!/$err_filter/, split(/^/m, $err)));
+	$res;
+};
+
+delete local $ENV{XDG_DATA_HOME};
+delete local $ENV{XDG_CONFIG_HOME};
+local $ENV{GIT_COMMITTER_EMAIL} = 'lei@example.com';
+local $ENV{GIT_COMMITTER_NAME} = 'lei user';
+local $ENV{XDG_RUNTIME_DIR} = "$home/xdg_run";
+local $ENV{HOME} = $home;
+local $ENV{FOO} = 'BAR';
+mkdir "$home/xdg_run", 0700 or BAIL_OUT "mkdir: $!";
+my $home_trash = [ "$home/.local", "$home/.config" ];
+my $cleanup = sub { rmtree([@$home_trash, @_]) };
+my $config_file = "$home/.config/lei/config";
+my $store_dir = "$home/.local/share/lei";
+
+my $test_help = sub {
+	ok(!$lei->(), 'no args fails');
+	is($? >> 8, 1, '$? is 1');
+	is($out, '', 'nothing in stdout');
+	like($err, qr/^usage:/sm, 'usage in stderr');
+
+	for my $arg (['-h'], ['--help'], ['help'], [qw(daemon-pid --help)]) {
+		ok($lei->($arg), "lei @$arg");
+		like($out, qr/^usage:/sm, "usage in stdout (@$arg)");
+		is($err, '', "nothing in stderr (@$arg)");
+	}
+
+	for my $arg ([''], ['--halp'], ['halp'], [qw(daemon-pid --halp)]) {
+		ok(!$lei->($arg), "lei @$arg");
+		is($? >> 8, 1, '$? set correctly');
+		isnt($err, '', 'something in stderr');
+		is($out, '', 'nothing in stdout');
+	}
+	ok($lei->(qw(init -h)), 'init -h');
+	like($out, qr! \Q$home\E/\.local/share/lei/store\b!,
+		'actual path shown in init -h');
+	ok($lei->(qw(init -h), { XDG_DATA_HOME => '/XDH' }),
+		'init with XDG_DATA_HOME');
+	like($out, qr! /XDH/lei/store\b!, 'XDG_DATA_HOME in init -h');
+	is($err, '', 'no errors from init -h');
+
+	ok($lei->(qw(config -h)), 'config-h');
+	like($out, qr! \Q$home\E/\.config/lei/config\b!,
+		'actual path shown in config -h');
+	ok($lei->(qw(config -h), { XDG_CONFIG_HOME => '/XDC' }),
+		'config with XDG_CONFIG_HOME');
+	like($out, qr! /XDC/lei/config\b!, 'XDG_CONFIG_HOME in config -h');
+	is($err, '', 'no errors from config -h');
+};
+
+my $ok_err_info = sub { # passes iff every line captured in $err starts with "I:"
+	my ($msg) = @_;
+	is(grep(!/^I:/, split(/^/, $err)), 0, $msg) or # grep used as a count here
+		diag "$msg: err=$err";
+};
+
+my $test_init = sub {
+	$cleanup->();
+	ok($lei->('init'), 'init w/o args');
+	$ok_err_info->('after init w/o args');
+	ok($lei->('init'), 'idempotent init w/o args');
+	$ok_err_info->('after idempotent init w/o args');
+
+	ok(!$lei->('init', "$home/x"), 'init conflict');
+	is(grep(/^E:/, split(/^/, $err)), 1, 'got error on conflict');
+	ok(!-e "$home/x", 'nothing created on conflict');
+	$cleanup->();
+
+	ok($lei->('init', "$home/x"), 'init conflict resolved');
+	$ok_err_info->('init w/ arg');
+	ok($lei->('init', "$home/x"), 'init idempotent w/ path');
+	$ok_err_info->('init idempotent w/ arg');
+	ok(-d "$home/x", 'created dir');
+	$cleanup->("$home/x");
+
+	ok(!$lei->('init', "$home/x", "$home/2"), 'too many args fails');
+	like($err, qr/too many/, 'noted excessive');
+	ok(!-e "$home/x", 'x not created on excessive');
+	for my $d (@$home_trash) {
+		my $base = (split(m!/!, $d))[-1];
+		ok(!-d $d, "$base not created");
+	}
+	is($out, '', 'nothing in stdout on init failure');
+};
+
+my $test_config = sub { # `lei config`: set and list work, -f FILE is rejected
+	$cleanup->();
+	ok($lei->(qw(config a.b c)), 'config set var');
+	is($out.$err, '', 'no output on var set');
+	ok($lei->(qw(config -l)), 'config -l');
+	is($err, '', 'no errors on listing');
+	is($out, "a.b=c\n", 'got expected output');
+	ok(!$lei->(qw(config -f), "$home/.config/f", qw(x.y z)),
+			'config set var with -f fails');
+	like($err, qr/not supported/, 'not supported noted');
+	ok(!-f "$home/.config/f", 'no file created'); # same path handed to -f above
+};
+
+my $setup_publicinboxes = sub {
+	state $done = '';
+	return if $done eq $home;
+	use PublicInbox::InboxWritable;
+	for my $V (1, 2) {
+		run_script([qw(-init), "-V$V", "t$V",
+				'--newsgroup', "t.$V",
+				"$home/t$V", "http://example.com/t$V",
+				"t$V\@example.com" ]) or BAIL_OUT "init v$V";
+	}
+	my $cfg = PublicInbox::Config->new;
+	my $seen = 0;
+	$cfg->each_inbox(sub {
+		my ($ibx) = @_;
+		my $im = PublicInbox::InboxWritable->new($ibx)->importer(0);
+		my $V = $ibx->version;
+		my @eml = glob('t/*.eml');
+		push(@eml, 't/data/0001.patch') if $V == 2;
+		for (@eml) {
+			next if $_ eq 't/psgi_v2-old.eml'; # dup mid
+			$im->add(eml_load($_)) or BAIL_OUT "v$V add $_";
+			$seen++;
+		}
+		$im->done;
+		if ($V == 1) {
+			run_script(['-index', $ibx->{inboxdir}]) or
+				BAIL_OUT 'index v1';
+		}
+	});
+	$done = $home;
+	$seen || BAIL_OUT 'no imports';
+};
+
+my $test_external_remote = sub { # query a single remote external
+	my ($url, $k) = @_; # $k: name of the env var which supplied $url
+SKIP: {
+	my $nr = 6; # number of ok/is/like checks after the skip guards
+	skip "$k unset", $nr if !$url;
+	which('curl') or skip 'no curl', $nr;
+	which('torsocks') or skip 'no torsocks', $nr if $url =~ m!\.onion/!;
+	$lei->('ls-external');
+	for my $e (split(/^/ms, $out)) { # forget every configured external first
+		$e =~ s/\s+boost.*//s;
+		$lei->('forget-external', '-q', $e) or
+			fail "error forgetting $e: $err"
+	}
+	$lei->('add-external', $url);
+	my $mid = '20140421094015.GA8962@dcvr.yhbt.net';
+	ok($lei->('q', "m:$mid"), "query $url");
+	is($err, '', "no errors on $url");
+	my $res = PublicInbox::Config->json->decode($out);
+	is($res->[0]->{'m'}, "<$mid>", "got expected mid from $url");
+	ok($lei->('q', "m:$mid", 'd:..20101002'), 'no results, no error');
+	like($err, qr/404/, 'noted 404');
+	is($out, "[null]\n", 'got null results');
+	$lei->('forget-external', $url);
+} # /SKIP
+}; # /sub
+
+my $test_external = sub {
+	$setup_publicinboxes->();
+	$cleanup->();
+	$lei->('ls-external');
+	is($out.$err, '', 'ls-external no output, yet');
+	ok(!-e $config_file && !-e $store_dir,
+		'nothing created by ls-external');
+
+	ok(!$lei->('add-external', "$home/nonexistent"),
+		"fails on non-existent dir");
+	$lei->('ls-external');
+	is($out.$err, '', 'ls-external still has no output');
+	my $cfg = PublicInbox::Config->new;
+	$cfg->each_inbox(sub {
+		my ($ibx) = @_;
+		ok($lei->(qw(add-external -q), $ibx->{inboxdir}),
+			'added external');
+		is($out.$err, '', 'no output');
+	});
+	ok(-s $config_file && -e $store_dir,
+		'add-external created config + store');
+	my $lcfg = PublicInbox::Config->new($config_file);
+	$cfg->each_inbox(sub {
+		my ($ibx) = @_;
+		is($lcfg->{"external.$ibx->{inboxdir}.boost"}, 0,
+			"configured boost on $ibx->{name}");
+	});
+	$lei->('ls-external');
+	like($out, qr/boost=0\n/s, 'ls-external has output');
+	ok($lei->(qw(add-external -q https://EXAMPLE.com/ibx)), 'add remote');
+	is($err, '', 'no warnings after add-external');
+	$lei->('ls-external');
+	like($out, qr!https://example\.com/ibx/!s, 'added canonical URL');
+	is($err, '', 'no warnings on ls-external');
+	ok($lei->(qw(forget-external -q https://EXAMPLE.com/ibx)),
+		'forget');
+	$lei->('ls-external');
+	unlike($out, qr!https://example\.com/ibx/!s, 'removed canonical URL');
+
+	ok(!$lei->(qw(q s:prefix -o /dev/null -f maildir)), 'bad maildir');
+	like($err, qr!/dev/null exists and is not a directory!,
+		'error shown');
+	is($? >> 8, 1, 'errored out with exit 1');
+
+	ok(!$lei->(qw(q s:prefix -f mboxcl2 -o), $home), 'bad mbox');
+	like($err, qr!\Q$home\E exists and is not a writable file!,
+		'error shown');
+	is($? >> 8, 1, 'errored out with exit 1');
+
+	ok(!$lei->(qw(q s:prefix -o /dev/stdout -f Mbox2)), 'bad format');
+	like($err, qr/bad mbox --format=mbox2/, 'error shown');
+	is($? >> 8, 1, 'errored out with exit 1');
+
+	# note, on a Bourne shell users should be able to use either:
+	#	s:"use boolean prefix"
+	#	"s:use boolean prefix"
+	# or use single quotes, it should not matter.  Users only need
+	# to know shell quoting rules, not Xapian quoting rules.
+	# No double-quoting should be imposed on users on the CLI
+	$lei->('q', 's:use boolean prefix');
+	like($out, qr/search: use boolean prefix/, 'phrase search got result');
+	require IO::Uncompress::Gunzip;
+	for my $sfx ('', '.gz') {
+		my $f = "$home/mbox$sfx";
+		$lei->('q', '-o', "mboxcl2:$f", 's:use boolean prefix');
+		my $cat = $sfx eq '' ? sub {
+			open my $mb, '<', $f or fail "no mbox: $!";
+			<$mb>
+		} : sub {
+			my $z = IO::Uncompress::Gunzip->new($f, MultiStream=>1);
+			<$z>;
+		};
+		my @s = grep(/^Subject:/, $cat->());
+		is(scalar(@s), 1, "1 result in mbox$sfx");
+		$lei->('q', '-a', '-o', "mboxcl2:$f", 's:see attachment');
+		is($err, '', 'no errors from augment');
+		@s = grep(/^Subject:/, my @wtf = $cat->());
+		is(scalar(@s), 2, "2 results in mbox$sfx");
+
+		$lei->('q', '-a', '-o', "mboxcl2:$f", 's:nonexistent');
+		is($err, '', "no errors on no results ($sfx)");
+
+		my @s2 = grep(/^Subject:/, $cat->());
+		is_deeply(\@s2, \@s,
+			"same 2 old results w/ --augment and bad search $sfx");
+
+		$lei->('q', '-o', "mboxcl2:$f", 's:nonexistent');
+		my @res = $cat->();
+		is_deeply(\@res, [], "clobber w/o --augment $sfx");
+	}
+	ok(!$lei->('q', '-o', "$home/mbox", 's:nope'),
+			'fails if mbox format unspecified');
+	ok(!$lei->(qw(q --no-local s:see)), '--no-local');
+	is($? >> 8, 1, 'proper exit code');
+	like($err, qr/no local or remote.+? to search/, 'no inbox');
+	my %e = (
+		TEST_LEI_EXTERNAL_HTTPS => 'https://public-inbox.org/meta/',
+		TEST_LEI_EXTERNAL_ONION => $onions[int(rand(scalar(@onions)))],
+	);
+	for my $k (keys %e) {
+		my $url = $ENV{$k} // '';
+		$url = $e{$k} if $url eq '1';
+		$test_external_remote->($url, $k);
+	}
+};
+
+my $test_lei_common = sub {
+	$test_help->();
+	$test_config->();
+	$test_init->();
+	$test_external->();
+};
+
+if ($ENV{TEST_LEI_ONESHOT}) {
+	require_ok 'PublicInbox::LEI';
+	# force sun_path[108] overflow, ($lei->() filters out this path)
+	my $xrd = "$home/1shot-test".('.sun_path' x 108);
+	local $ENV{XDG_RUNTIME_DIR} = $xrd;
+	$err_filter = qr!\Q$xrd!;
+	$test_lei_common->();
+} else {
+SKIP: { # real socket
+	eval { require Socket::MsgHdr; 1 } // do {
+		require PublicInbox::Spawn;
+		PublicInbox::Spawn->can('send_cmd4');
+	} // skip 'Socket::MsgHdr or Inline::C missing or unconfigured', 115;
+	local $ENV{XDG_RUNTIME_DIR} = "$home/xdg_run";
+	my $sock = "$ENV{XDG_RUNTIME_DIR}/lei/5.seq.sock";
+	my $err_log = "$ENV{XDG_RUNTIME_DIR}/lei/errors.log";
+
+	ok($lei->('daemon-pid'), 'daemon-pid');
+	is($err, '', 'no error from daemon-pid');
+	like($out, qr/\A[0-9]+\n\z/s, 'pid returned') or BAIL_OUT;
+	chomp(my $pid = $out);
+	ok(kill(0, $pid), 'pid is valid');
+	ok(-S $sock, 'sock created');
+
+	$test_lei_common->();
+	is(-s $err_log, 0, 'nothing in errors.log');
+	open my $efh, '>>', $err_log or BAIL_OUT $!;
+	print $efh "phail\n" or BAIL_OUT $!;
+	close $efh or BAIL_OUT $!;
+
+	ok($lei->('daemon-pid'), 'daemon-pid');
+	chomp(my $pid_again = $out);
+	is($pid, $pid_again, 'daemon-pid idempotent');
+	like($err, qr/phail/, 'got mock "phail" error previous run');
+
+	ok($lei->(qw(daemon-kill)), 'daemon-kill');
+	is($out, '', 'no output from daemon-kill');
+	is($err, '', 'no error from daemon-kill');
+	for (0..100) {
+		kill(0, $pid) or last;
+		tick();
+	}
+	ok(-S $sock, 'sock still exists');
+	ok(!kill(0, $pid), 'pid gone after stop');
+
+	ok($lei->(qw(daemon-pid)), 'daemon-pid');
+	chomp(my $new_pid = $out);
+	ok(kill(0, $new_pid), 'new pid is running');
+	ok(-S $sock, 'sock still exists');
+
+	for my $sig (qw(-0 -CHLD)) {
+		ok($lei->('daemon-kill', $sig), "handles $sig");
+	}
+	is($out.$err, '', 'no output on innocuous signals');
+	ok($lei->('daemon-pid'), 'daemon-pid');
+	chomp $out;
+	is($out, $new_pid, 'PID unchanged after -0/-CHLD');
+
+	if ('socket inaccessible') {
+		chmod 0000, $sock or BAIL_OUT "chmod 0000: $!";
+		ok($lei->('help'), 'connect fail, one-shot fallback works');
+		like($err, qr/\bconnect\(/, 'connect error noted');
+		like($out, qr/^usage: /, 'help output works');
+		chmod 0700, $sock or BAIL_OUT "chmod 0700: $!";
+	}
+	unlink $sock or BAIL_OUT "unlink($sock) $!";
+	for (0..100) {
+		kill('CHLD', $new_pid) or last;
+		tick();
+	}
+	ok(!kill(0, $new_pid), 'daemon exits after unlink');
+	# success over socket, can't test without
+}; # SKIP
+} # else
+
+done_testing;
diff --git a/t/lei_dedupe.t b/t/lei_dedupe.t
new file mode 100644
index 00000000..bcb06a0a
--- /dev/null
+++ b/t/lei_dedupe.t
@@ -0,0 +1,86 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use PublicInbox::Smsg;
+require_mods(qw(DBD::SQLite));
+use_ok 'PublicInbox::LeiDedupe';
+my $eml = eml_load('t/plack-qp.eml');
+my $mid = $eml->header_raw('Message-ID');
+my $different = eml_load('t/msg_iter-order.eml');
+$different->header_set('Message-ID', $mid);
+my $smsg = bless { ds => time }, 'PublicInbox::Smsg';
+$smsg->populate($eml);
+$smsg->{$_} //= '' for (qw(to cc references)) ;
+
+my $check_storable = sub {
+	my ($x) = @_;
+	SKIP: {
+		require_mods('Storable', 1);
+		my $dup = Storable::thaw(Storable::freeze($x));
+		is_deeply($dup, $x, "$x->[3] round-trips through storable");
+	}
+};
+
+my $lei = { opt => { dedupe => 'none' } };
+my $dd = PublicInbox::LeiDedupe->new($lei);
+$check_storable->($dd);
+$dd->prepare_dedupe;
+ok(!$dd->is_dup($eml), '1st is_dup w/o dedupe');
+ok(!$dd->is_dup($eml), '2nd is_dup w/o dedupe');
+ok(!$dd->is_dup($different), 'different is_dup w/o dedupe');
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 1');
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 2');
+
+for my $strat (undef, 'content') {
+	$lei->{opt}->{dedupe} = $strat;
+	$dd = PublicInbox::LeiDedupe->new($lei);
+	$check_storable->($dd);
+	$dd->prepare_dedupe;
+	my $desc = $strat // 'default';
+	ok(!$dd->is_dup($eml), "1st is_dup with $desc dedupe");
+	ok($dd->is_dup($eml), "2nd seen with $desc dedupe");
+	ok(!$dd->is_dup($different), "different is_dup with $desc dedupe");
+	ok(!$dd->is_smsg_dup($smsg), "is_smsg_dup pass w/ $desc dedupe");
+	ok($dd->is_smsg_dup($smsg), "is_smsg_dup reject w/ $desc dedupe");
+}
+$lei->{opt}->{dedupe} = 'bogus';
+eval { PublicInbox::LeiDedupe->new($lei) };
+like($@, qr/unsupported.*bogus/, 'died on bogus strategy');
+
+$lei->{opt}->{dedupe} = 'mid';
+$dd = PublicInbox::LeiDedupe->new($lei);
+$check_storable->($dd);
+$dd->prepare_dedupe;
+ok(!$dd->is_dup($eml), '1st is_dup with mid dedupe');
+ok($dd->is_dup($eml), '2nd seen with mid dedupe');
+ok($dd->is_dup($different), 'different seen with mid dedupe');
+ok(!$dd->is_smsg_dup($smsg), 'smsg mid dedupe pass');
+ok($dd->is_smsg_dup($smsg), 'smsg mid dedupe reject');
+
+$lei->{opt}->{dedupe} = 'oid'; # exercise the 'oid' strategy from here on
+$dd = PublicInbox::LeiDedupe->new($lei);
+$check_storable->($dd);
+$dd->prepare_dedupe;
+
+# --augment won't have OIDs:
+ok(!$dd->is_dup($eml), '1st is_dup with oid dedupe (augment)');
+ok($dd->is_dup($eml), '2nd seen with oid dedupe (augment)');
+ok(!$dd->is_dup($different), 'different is_dup with oid dedupe (augment)');
+$different->header_set('Status', 'RO');
+ok($dd->is_dup($different), 'different seen with oid dedupe Status removed');
+
+ok(!$dd->is_dup($eml, '01d'), '1st is_dup with oid dedupe');
+ok($dd->is_dup($different, '01d'), 'different content ignored if oid matches');
+ok($dd->is_dup($eml, '01D'), 'case insensitive oid comparison :P');
+ok(!$dd->is_dup($eml, '01dbad'), 'different oid is not a dup despite shared prefix');
+
+$smsg->{blob} = 'dead';
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe pass');
+ok($dd->is_smsg_dup($smsg), 'smsg dedupe reject');
+
+done_testing;
diff --git a/t/lei_external.t b/t/lei_external.t
new file mode 100644
index 00000000..1f0048a1
--- /dev/null
+++ b/t/lei_external.t
@@ -0,0 +1,18 @@
+#!perl -w
+use strict;
+use v5.10.1;
+use Test::More;
+my $cls = 'PublicInbox::LeiExternal';
+require_ok $cls;
+my $canon = $cls->can('_canonicalize');
+my $exp = 'https://example.com/my-inbox/';
+is($canon->('https://example.com/my-inbox'), $exp, 'trailing slash added');
+is($canon->('https://example.com/my-inbox//'), $exp, 'trailing slash removed');
+is($canon->('https://example.com//my-inbox/'), $exp, 'leading slash removed');
+is($canon->('https://EXAMPLE.com/my-inbox/'), $exp, 'lowercased');
+is($canon->('/this/path/is/nonexistent/'), '/this/path/is/nonexistent',
+	'non-existent pathname canonicalized');
+is($canon->('/this//path/'), '/this/path', 'extra slashes gone');
+is($canon->('/ALL/CAPS'), '/ALL/CAPS', 'caps preserved');
+
+done_testing;
diff --git a/t/lei_overview.t b/t/lei_overview.t
new file mode 100644
index 00000000..896cc01a
--- /dev/null
+++ b/t/lei_overview.t
@@ -0,0 +1,33 @@
+#!perl -w
+# Copyright (C) 2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use POSIX qw(_exit);
+require_ok 'PublicInbox::LeiOverview';
+
+my $ovv = bless {}, 'PublicInbox::LeiOverview';
+$ovv->ovv_out_lk_init;
+my $lock_path = $ovv->{lock_path};
+ok(-f $lock_path, 'lock init');
+undef $ovv;
+ok(!-f $lock_path, 'lock DESTROY');
+
+$ovv = bless {}, 'PublicInbox::LeiOverview';
+$ovv->ovv_out_lk_init;
+$lock_path = $ovv->{lock_path};
+ok(-f $lock_path, 'lock init #2');
+my $pid = fork // BAIL_OUT "fork $!";
+if ($pid == 0) {
+	undef $ovv;
+	_exit(0);
+}
+is(waitpid($pid, 0), $pid, 'child exited');
+is($?, 0, 'no error in child process');
+ok(-f $lock_path, 'lock was not destroyed by child');
+undef $ovv;
+ok(!-f $lock_path, 'lock DESTROY #2');
+
+done_testing;
diff --git a/t/lei_store.t b/t/lei_store.t
new file mode 100644
index 00000000..c9360f8f
--- /dev/null
+++ b/t/lei_store.t
@@ -0,0 +1,131 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+require_mods(qw(DBD::SQLite Search::Xapian));
+require_git 2.6;
+require_ok 'PublicInbox::LeiStore';
+require_ok 'PublicInbox::ExtSearch';
+my ($home, $for_destroy) = tmpdir();
+my $opt = { 1 => \(my $out = ''), 2 => \(my $err = '') };
+my $store_dir = "$home/lst";
+local $ENV{GIT_COMMITTER_EMAIL} = 'lei@example.com';
+local $ENV{GIT_COMMITTER_NAME} = 'lei user';
+my $lst = PublicInbox::LeiStore->new($store_dir, { creat => 1 });
+ok($lst, '->new');
+my $smsg = $lst->add_eml(eml_load('t/data/0001.patch'));
+like($smsg->{blob}, qr/\A[0-9a-f]+\z/, 'add returned OID');
+my $eml = eml_load('t/data/0001.patch');
+is($lst->add_eml($eml), undef, 'idempotent');
+$lst->done;
+is_deeply([$lst->mbox_keywords($eml)], [], 'no keywords');
+$eml->header_set('Status', 'RO');
+is_deeply([$lst->mbox_keywords($eml)], ['seen'], 'seen extracted');
+$eml->header_set('X-Status', 'A');
+is_deeply([$lst->mbox_keywords($eml)], [qw(answered seen)],
+	'seen+answered extracted');
+$eml->header_set($_) for qw(Status X-Status);
+
+is_deeply([$lst->maildir_keywords('/foo:2,')], [], 'Maildir no keywords');
+is_deeply([$lst->maildir_keywords('/foo:2,S')], ['seen'], 'Maildir seen');
+is_deeply([$lst->maildir_keywords('/foo:2,RS')], ['answered', 'seen'],
+	'Maildir answered + seen');
+is_deeply([$lst->maildir_keywords('/foo:2,RSZ')], ['answered', 'seen'],
+	'Maildir answered + seen w/o Z');
+{
+	my $es = $lst->search;
+	my $msgs = $es->over->query_xover(0, 1000);
+	is(scalar(@$msgs), 1, 'one message');
+	is($msgs->[0]->{blob}, $smsg->{blob}, 'blob matches');
+	my $mset = $es->mset("mid:$msgs->[0]->{mid}");
+	is($mset->size, 1, 'search works');
+	is_deeply($es->mset_to_artnums($mset), [ $msgs->[0]->{num} ],
+		'mset_to_artnums');
+	my @kw = $es->msg_keywords(($mset->items)[0]);
+	is_deeply(\@kw, [], 'no flags');
+}
+
+for my $parallel (0, 1) { # same keyword checks with priv_eidx parallel off, then on
+	$lst->{priv_eidx}->{parallel} = $parallel;
+	my $docids = $lst->set_eml_keywords($eml, qw(seen draft));
+	is(scalar @$docids, 1, 'set keywords on one doc');
+	$lst->done;
+	my @kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, [qw(draft seen)], 'kw matches');
+
+	$docids = $lst->add_eml_keywords($eml, qw(seen draft));
+	$lst->done;
+	is(scalar @$docids, 1, 'idempotently added keywords to doc');
+	@kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, [qw(draft seen)], 'kw matches after noop');
+
+	$docids = $lst->remove_eml_keywords($eml, qw(seen draft));
+	is(scalar @$docids, 1, 'removed from one doc');
+	$lst->done;
+	@kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, [], 'kw matches after remove');
+
+	$docids = $lst->remove_eml_keywords($eml, qw(answered));
+	is(scalar @$docids, 1, 'removed from one doc (idempotently)');
+	$lst->done;
+	@kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, [], 'kw matches after remove (idempotent)');
+
+	$docids = $lst->add_eml_keywords($eml, qw(answered));
+	is(scalar @$docids, 1, 'added to empty doc');
+	$lst->done;
+	@kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, ['answered'], 'kw matches after add');
+
+	$docids = $lst->set_eml_keywords($eml);
+	is(scalar @$docids, 1, 'set to clobber');
+	$lst->done;
+	@kw = $lst->search->msg_keywords($docids->[0]);
+	is_deeply(\@kw, [], 'set clobbers all');
+
+	my $set = eml_load('t/plack-qp.eml');
+	$set->header_set('Message-ID', "<set\@$parallel>"); # must be new on every pass
+	my $ret = $lst->set_eml($set, 'seen');
+	is(ref $ret, 'PublicInbox::Smsg', 'initial returns smsg');
+	my $ids = $lst->set_eml($set, qw(seen));
+	is_deeply($ids, [ $ret->{num} ], 'set_eml idempotent');
+	$ids = $lst->set_eml($set, qw(seen answered));
+	is_deeply($ids, [ $ret->{num} ], 'set_eml to change kw');
+	$lst->done;
+	@kw = $lst->search->msg_keywords($ids->[0]);
+	is_deeply(\@kw, [qw(answered seen)], 'set changed kw');
+}
+
+SKIP: { # repeat set_eml through an IPC worker, then again after stopping it
+	require_mods(qw(Storable), 1);
+	ok($lst->can('ipc_do'), 'ipc_do works if we have Storable');
+	$eml->header_set('Message-ID', '<ipc-test@example>'); # any new, non-empty ID
+	my $pid = $lst->ipc_worker_spawn('lei-store');
+	ok($pid > 0, 'got a worker');
+	my $smsg = $lst->ipc_do('set_eml', $eml, qw(seen));
+	is(ref($smsg), 'PublicInbox::Smsg', 'set_eml works over ipc');
+	my $ids = $lst->ipc_do('set_eml', $eml, qw(seen));
+	is_deeply($ids, [ $smsg->{num} ], 'docid returned');
+
+	$eml->header_set('Message-ID'); # no value: message now lacks a Message-ID
+	my $no_mid = $lst->ipc_do('set_eml', $eml, qw(seen));
+	my $wait = $lst->ipc_do('done');
+	my @kw = $lst->search->msg_keywords($no_mid->{num});
+	is_deeply(\@kw, [qw(seen)], 'ipc set changed kw');
+
+	is(ref($no_mid), 'PublicInbox::Smsg', 'no mid works ipc');
+	$ids = $lst->ipc_do('set_eml', $eml, qw(seen));
+	is_deeply($ids, [ $no_mid->{num} ], 'docid returned w/o mid w/ ipc');
+	$lst->ipc_do('done');
+	$lst->ipc_worker_stop;
+	$ids = $lst->ipc_do('set_eml', $eml, qw(seen answered));
+	is_deeply($ids, [ $no_mid->{num} ], 'docid returned w/o mid w/o ipc');
+	$wait = $lst->ipc_do('done');
+	@kw = $lst->search->msg_keywords($no_mid->{num});
+	is_deeply(\@kw, [qw(answered seen)], 'set changed kw w/o ipc');
+}
+
+done_testing;
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
new file mode 100644
index 00000000..47c0e3d4
--- /dev/null
+++ b/t/lei_to_mail.t
@@ -0,0 +1,267 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use Fcntl qw(SEEK_SET);
+use PublicInbox::Spawn qw(popen_rd which);
+use List::Util qw(shuffle);
+require_mods(qw(DBD::SQLite));
+require PublicInbox::MboxReader;
+require PublicInbox::LeiOverview;
+use_ok 'PublicInbox::LeiToMail';
+my $from = "Content-Length: 10\nSubject: x\n\nFrom hell\n";
+my $noeol = "Subject: x\n\nFrom hell";
+my $crlf = $noeol;
+$crlf =~ s/\n/\r\n/g;
+my $kw = [qw(seen answered flagged)];
+my $smsg = { kw => $kw, blob => '0'x40 };
+my @MBOX = qw(mboxcl2 mboxrd mboxcl mboxo);
+for my $mbox (@MBOX) {
+	my $m = "eml2$mbox";
+	my $cb = PublicInbox::LeiToMail->can($m);
+	my $s = $cb->(PublicInbox::Eml->new($from), $smsg);
+	is(substr($$s, -1, 1), "\n", "trailing LF in normal $mbox");
+	my $eml = PublicInbox::Eml->new($s);
+	is($eml->header('Status'), 'OR', "Status: set by $m");
+	is($eml->header('X-Status'), 'AF', "X-Status: set by $m");
+	if ($mbox eq 'mboxcl2') {
+		like($eml->body_raw, qr/^From /, "From not escaped $m");
+	} else {
+		like($eml->body_raw, qr/^>From /, "From escaped once by $m");
+	}
+	my @cl = $eml->header('Content-Length');
+	if ($mbox =~ /mboxcl/) {
+		is(scalar(@cl), 1, "$m only has one Content-Length header");
+		is($cl[0] + length("\n"),
+			length($eml->body_raw), "$m Content-Length matches");
+	} else {
+		is(scalar(@cl), 0, "$m clobbered Content-Length");
+	}
+	$s = $cb->(PublicInbox::Eml->new($noeol), $smsg);
+	is(substr($$s, -1, 1), "\n",
+		"trailing LF added by $m when original lacks EOL");
+	$eml = PublicInbox::Eml->new($s);
+	if ($mbox eq 'mboxcl2') {
+		is($eml->body_raw, "From hell\n", "From not escaped by $m");
+	} else {
+		is($eml->body_raw, ">From hell\n", "From escaped once by $m");
+	}
+	$s = $cb->(PublicInbox::Eml->new($crlf), $smsg);
+	is(substr($$s, -2, 2), "\r\n",
+		"trailing CRLF added $m by original lacks EOL");
+	$eml = PublicInbox::Eml->new($s);
+	if ($mbox eq 'mboxcl2') {
+		is($eml->body_raw, "From hell\r\n", "From not escaped by $m");
+	} else {
+		is($eml->body_raw, ">From hell\r\n", "From escaped once by $m");
+	}
+	if ($mbox =~ /mboxcl/) {
+		is($eml->header('Content-Length') + length("\r\n"),
+			length($eml->body_raw), "$m Content-Length matches");
+	} elsif ($mbox eq 'mboxrd') {
+		$s = $cb->($eml, $smsg);
+		$eml = PublicInbox::Eml->new($s);
+		is($eml->body_raw,
+			">>From hell\r\n\r\n", "From escaped again by $m");
+	}
+}
+
+my ($tmpdir, $for_destroy) = tmpdir();
+local $ENV{TMPDIR} = $tmpdir;
+open my $err, '>>', "$tmpdir/lei.err" or BAIL_OUT $!;
+my $lei = { 2 => $err };
+my $buf = <<'EOM';
+From: x@example.com
+Subject: x
+
+blah
+EOM
+my $fn = "$tmpdir/x.mbox";
+my ($mbox) = shuffle(@MBOX); # pick one, shouldn't matter
+my $wcb_get = sub {
+	my ($fmt, $dst) = @_;
+	delete $lei->{dedupe};
+	$lei->{ovv} = bless {
+		fmt => $fmt,
+		dst => $dst
+	}, 'PublicInbox::LeiOverview';
+	my $l2m = PublicInbox::LeiToMail->new($lei);
+	SKIP: {
+		require_mods('Storable', 1);
+		my $dup = Storable::thaw(Storable::freeze($l2m));
+		is_deeply($dup, $l2m, "$fmt round-trips through storable");
+	}
+	my $zpipe = $l2m->pre_augment($lei);
+	$l2m->do_augment($lei);
+	$l2m->post_augment($lei, $zpipe);
+	my $cb = $l2m->write_cb($lei);
+	delete $lei->{1};
+	$cb;
+};
+
+my $deadbeef = { blob => 'deadbeef', kw => [ qw(seen) ] };
+my $orig = do {
+	my $wcb = $wcb_get->($mbox, $fn);
+	is(ref $wcb, 'CODE', 'write_cb returned callback');
+	ok(-f $fn && !-s _, 'empty file created');
+	$wcb->(\(my $dup = $buf), $deadbeef);
+	undef $wcb;
+	open my $fh, '<', $fn or BAIL_OUT $!;
+	my $raw = do { local $/; <$fh> };
+	like($raw, qr/^blah\n/sm, 'wrote content');
+	unlink $fn or BAIL_OUT $!;
+
+	local $lei->{opt} = { jobs => 2 };
+	$wcb = $wcb_get->($mbox, $fn);
+	ok(-f $fn && !-s _, 'truncated mbox destination');
+	$wcb->(\($dup = $buf), $deadbeef);
+	undef $wcb;
+	open $fh, '<', $fn or BAIL_OUT $!;
+	is(do { local $/; <$fh> }, $raw, 'jobs > 1');
+	$raw;
+};
+for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
+	my $zsfx2cmd = PublicInbox::LeiToMail->can('zsfx2cmd');
+	SKIP: {
+		my $cmd = eval { $zsfx2cmd->($zsfx, 0, $lei) };
+		skip $@, 3 if $@;
+		my $dc_cmd = eval { $zsfx2cmd->($zsfx, 1, $lei) };
+		ok($dc_cmd, "decompressor for .$zsfx");
+		my $f = "$fn.$zsfx";
+		my $wcb = $wcb_get->($mbox, $f);
+		$wcb->(\(my $dup = $buf), $deadbeef);
+		undef $wcb;
+		my $uncompressed = xqx([@$dc_cmd, $f]);
+		is($uncompressed, $orig, "$zsfx works unlocked");
+
+		local $lei->{opt} = { jobs => 2 }; # for atomic writes
+		unlink $f or BAIL_OUT "unlink $!";
+		$wcb = $wcb_get->($mbox, $f);
+		$wcb->(\($dup = $buf), $deadbeef);
+		undef $wcb;
+		is(xqx([@$dc_cmd, $f]), $orig, "$zsfx matches with lock");
+
+		local $lei->{opt} = { augment => 1 };
+		$wcb = $wcb_get->($mbox, $f);
+		$wcb->(\($dup = $buf . "\nx\n"), $deadbeef);
+		undef $wcb; # commit
+
+		my $cat = popen_rd([@$dc_cmd, $f]);
+		my @raw;
+		PublicInbox::MboxReader->$mbox($cat,
+			sub { push @raw, shift->as_string });
+		like($raw[1], qr/\nblah\n\nx\n\z/s, "augmented $zsfx");
+		like($raw[0], qr/\nblah\n\z/s, "original preserved $zsfx");
+
+		local $lei->{opt} = { augment => 1, jobs => 2 };
+		$wcb = $wcb_get->($mbox, $f);
+		$wcb->(\($dup = $buf . "\ny\n"), $deadbeef);
+		undef $wcb; # commit
+
+		my @raw3;
+		$cat = popen_rd([@$dc_cmd, $f]);
+		PublicInbox::MboxReader->$mbox($cat,
+			sub { push @raw3, shift->as_string });
+		my $y = pop @raw3;
+		is_deeply(\@raw3, \@raw, 'previous messages preserved');
+		like($y, qr/\nblah\n\ny\n\z/s, "augmented $zsfx (atomic)");
+	}
+}
+
+my $as_orig = sub { # serialize a message for comparison against $buf
+	my ($eml) = @_;
+	$eml->header_set('Status'); # no value given; presumably drops Status (cf. Status/X-Status reset in lei_store.t)
+	$eml->as_string;
+};
+
+unlink $fn or BAIL_OUT $!;
+if ('default deduplication uses content_hash') {
+	my $wcb = $wcb_get->('mboxo', $fn);
+	$deadbeef->{kw} = [];
+	$wcb->(\(my $x = $buf), $deadbeef) for (1..2);
+	undef $wcb; # undef to commit changes
+	my $cmp = '';
+	open my $fh, '<', $fn or BAIL_OUT $!;
+	PublicInbox::MboxReader->mboxo($fh, sub { $cmp .= $as_orig->(@_) });
+	is($cmp, $buf, 'only one message written');
+
+	local $lei->{opt} = { augment => 1 };
+	$wcb = $wcb_get->('mboxo', $fn);
+	$wcb->(\($x = $buf . "\nx\n"), $deadbeef) for (1..2);
+	undef $wcb; # undef to commit changes
+	open $fh, '<', $fn or BAIL_OUT $!;
+	my @x;
+	PublicInbox::MboxReader->mboxo($fh, sub { push @x, $as_orig->(@_) });
+	is(scalar(@x), 2, 'augmented mboxo');
+	is($x[0], $cmp, 'original message preserved');
+	is($x[1], $buf . "\nx\n", 'new message appended');
+}
+
+{ # stdout support
+	open my $tmp, '+>', undef or BAIL_OUT $!;
+	local $lei->{1} = $tmp;
+	my $wcb = $wcb_get->('mboxrd', '/dev/stdout');
+	$wcb->(\(my $x = $buf), $deadbeef);
+	undef $wcb; # commit
+	seek($tmp, 0, SEEK_SET) or BAIL_OUT $!;
+	my $cmp = '';
+	PublicInbox::MboxReader->mboxrd($tmp, sub { $cmp .= $as_orig->(@_) });
+	is($cmp, $buf, 'message written to stdout');
+}
+
+SKIP: { # FIFO support
+	use POSIX qw(mkfifo);
+	my $fn = "$tmpdir/fifo";
+	mkfifo($fn, 0600) or skip("mkfifo not supported: $!", 1);
+	my $cat = popen_rd([which('cat'), $fn]);
+	my $wcb = $wcb_get->('mboxo', $fn);
+	$wcb->(\(my $x = $buf), $deadbeef);
+	undef $wcb; # commit
+	my $cmp = '';
+	PublicInbox::MboxReader->mboxo($cat, sub { $cmp .= $as_orig->(@_) });
+	is($cmp, $buf, 'message written to FIFO');
+}
+
+{ # Maildir support: plain write, clobbering rewrite, then --augment with dedupe
+	my $md = "$tmpdir/maildir/";
+	my $wcb = $wcb_get->('maildir', $md);
+	is(ref($wcb), 'CODE', 'got Maildir callback');
+	my $b4dc0ffee = { blob => 'badc0ffee', kw => [] };
+	$wcb->(\(my $x = $buf), $b4dc0ffee);
+
+	my @f;
+	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
+	open my $fh, '<', $f[0] or BAIL_OUT $!; # 3-arg open: path is never parsed for a mode
+	is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
+
+	$wcb = $wcb_get->('maildir', $md);
+	my $deadcafe = { blob => 'deadcafe', kw => [] };
+	$wcb->(\($x = $buf."\nx\n"), $deadcafe);
+
+	my @x = ();
+	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @x, shift });
+	is(scalar(@x), 1, 'wrote one new file');
+	ok(!-f $f[0], 'old file clobbered');
+	open $fh, '<', $x[0] or BAIL_OUT $!;
+	is(do { local $/; <$fh> }, $buf."\nx\n", 'wrote new file to Maildir');
+
+	local $lei->{opt}->{augment} = 1;
+	$wcb = $wcb_get->('maildir', $md);
+	$wcb->(\($x = $buf."\ny\n"), $deadcafe);
+	$wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
+	@f = ();
+	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
+	is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');
+	my @new = grep(!/\A\Q$x[0]\E\z/, @f);
+	is(scalar @new, 1, '1 new file written (b4dc0ffee skipped)');
+	open $fh, '<', $x[0] or BAIL_OUT $!;
+	is(do { local $/; <$fh> }, $buf."\nx\n", 'old file untouched');
+	open $fh, '<', $new[0] or BAIL_OUT $!;
+	is(do { local $/; <$fh> }, $buf."\ny\n", 'new file written');
+}
+
+done_testing;
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
new file mode 100644
index 00000000..f745ea3e
--- /dev/null
+++ b/t/lei_xsearch.t
@@ -0,0 +1,81 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use List::Util qw(shuffle max);
+use PublicInbox::TestCommon;
+use PublicInbox::ExtSearchIdx;
+use PublicInbox::Eml;
+use PublicInbox::InboxWritable;
+require_mods(qw(DBD::SQLite Search::Xapian));
+require_git 2.6;
+require_ok 'PublicInbox::LeiXSearch';
+my ($home, $for_destroy) = tmpdir();
+my @ibx;
+for my $V (1..2) {
+	for my $i (3..6) {
+		my $ibx = PublicInbox::InboxWritable->new({
+			inboxdir => "$home/v$V-$i",
+			name => "test-v$V-$i",
+			version => $V,
+			indexlevel => 'medium',
+			-primary_address => "v$V-$i\@example.com",
+		}, { nproc => int(rand(8)) + 1 });
+		push @ibx, $ibx;
+		my $im = $ibx->importer(0);
+		for my $j (0..9) {
+			my $eml = PublicInbox::Eml->new(<<EOF);
+From: x\@example.com
+To: $ibx->{-primary_address}
+Date: Fri, 02 Oct 1993 0$V:0$i:0$j +0000
+Subject: v${V}i${i}j$j
+Message-ID: <v${V}i${i}j$j\@example>
+
+${V}er ${i}on j$j
+EOF
+			$im->add($eml);
+		}
+		$im->done;
+	}
+}
+my $first = shift @ibx; is($first->{name}, 'test-v1-3', 'first plucked');
+my $last = pop @ibx; is($last->{name}, 'test-v2-6', 'last plucked');
+my $eidx = PublicInbox::ExtSearchIdx->new("$home/eidx");
+$eidx->attach_inbox($first);
+$eidx->attach_inbox($last);
+$eidx->eidx_sync({fsync => 0});
+my $es = PublicInbox::ExtSearch->new("$home/eidx");
+my $lxs = PublicInbox::LeiXSearch->new;
+for my $ibxish (shuffle($es, @ibx)) {
+	$lxs->prepare_external($ibxish);
+}
+for my $loc ($lxs->locals) {
+	$lxs->attach_external($loc);
+}
+my $nr = $lxs->xdb->get_doccount;
+my $mset = $lxs->mset('d:19931002..19931003', { limit => $nr });
+is($mset->size, $nr, 'got all messages');
+my @msgs;
+for my $mi ($mset->items) {
+	if (my $smsg = $lxs->smsg_for($mi)) {
+		push @msgs, $smsg;
+	} else {
+		diag "E: ${\$mi->get_docid} missing";
+	}
+}
+is(scalar(@msgs), $nr, 'smsgs retrieved for all');
+
+$mset = $lxs->recent(undef, { limit => 1 });
+is($mset->size, 1, 'one result');
+my $max = max(map { $_->{docid} } @msgs);
+is($lxs->smsg_for(($mset->items)[0])->{docid}, $max,
+	'got highest docid');
+
+my @ibxish = $lxs->locals;
+is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back');
+is($lxs->search, $lxs, '->search works');
+is($lxs->over, undef, '->over fails');
+
+done_testing;
diff --git a/t/linkify.t b/t/linkify.t
index 34840410..e42e1efe 100644
--- a/t/linkify.t
+++ b/t/linkify.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/mbox_reader.t b/t/mbox_reader.t
new file mode 100644
index 00000000..30a5e6e3
--- /dev/null
+++ b/t/mbox_reader.t
@@ -0,0 +1,94 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use List::Util qw(shuffle);
+use PublicInbox::Eml;
+use Fcntl qw(SEEK_SET);
+require_ok 'PublicInbox::MboxReader';
+require_ok 'PublicInbox::LeiToMail';
+my %raw = (
+	hdr_only => "From: header-only\@example.com\n\n",
+	small_from => "From: small-from\@example.com\n\nFrom hell\n",
+	small => "From: small\@example.com\n\nfrom hell\n",
+	big_hdr_only => "From: big-header\@example.com\n" .
+		(('A: '.('a' x 72)."\n") x 1000)."\n",
+	big_body => "From: big-body\@example.com\n\n".
+		(('b: '.('b' x 72)."\n") x 1000) .
+		"From hell\n",
+	big_all => "From: big-all\@example.com\n".
+		(("A: ".('a' x 72)."\n") x 1000). "\n" .
+		(("b: ".('b' x 72)."\n") x 1000) .
+		"From hell\n",
+);
+
+if ($ENV{TEST_EXTRA}) {
+	for my $fn (glob('t/*.eml'), glob('t/*/*.{patch,eml}')) {
+		$raw{$fn} = eml_load($fn)->as_string;
+	}
+}
+
+my $reader = PublicInbox::MboxReader->new;
+my $check_fmt = sub {
+	my $fmt = shift;
+	my @order = shuffle(keys %raw);
+	my $eml2mbox = PublicInbox::LeiToMail->can("eml2$fmt");
+	open my $fh, '+>', undef or BAIL_OUT "open: $!";
+	for my $k (@order) {
+		my $eml = PublicInbox::Eml->new($raw{$k});
+		my $buf = $eml2mbox->($eml);
+		print $fh $$buf or BAIL_OUT "print $!";
+	}
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	$reader->$fmt($fh, sub {
+		my ($eml) = @_;
+		$eml->header_set('Status');
+		$eml->header_set('Lines');
+		my $cur = shift @order;
+		my @cl = $eml->header_raw('Content-Length');
+		if ($fmt =~ /\Amboxcl/) {
+			is(scalar(@cl), 1, "Content-Length set $fmt $cur");
+			my $raw = $eml->body_raw;
+			my $adj = 0;
+			if ($fmt eq 'mboxcl') {
+				my @from = ($raw =~ /^(From )/smg);
+				$adj = scalar(@from);
+			}
+			is(length($raw), $cl[0] - $adj,
+				"Content-Length is correct $fmt $cur");
+			# clobber for ->as_string comparison below
+			$eml->header_set('Content-Length');
+		} else {
+			is(scalar(@cl), 0, "Content-Length unset $fmt $cur");
+		}
+		my $orig = PublicInbox::Eml->new($raw{$cur});
+		is($eml->as_string, $orig->as_string,
+			"read back original $fmt $cur");
+	});
+};
+my @mbox = qw(mboxrd mboxo mboxcl mboxcl2);
+for my $fmt (@mbox) { $check_fmt->($fmt) }
+s/\n/\r\n/sg for (values %raw);
+for my $fmt (@mbox) { $check_fmt->($fmt) }
+
+SKIP: {
+	use PublicInbox::Spawn qw(popen_rd);
+	use Time::HiRes qw(alarm);
+	my $fh = popen_rd([ $^X, '-E', <<'' ]);
+say "From x@y Fri Oct  2 00:00:00 1993";
+print "a: b\n\n", "x" x 70000, "\n\n";
+say "From x@y Fri Oct  2 00:00:00 2010";
+print "Final: bit\n\n", "Incomplete\n\n";
+exit 1
+
+	my @x;
+	eval { $reader->mboxrd($fh, sub { push @x, shift->as_string }) };
+	like($@, qr/error closing mbox/, 'detects error reading from pipe');
+	is(scalar(@x), 1, 'only saw one message');
+	is(scalar(grep(/Final/, @x)), 0, 'no incomplete bit');
+}
+
+done_testing;
diff --git a/t/mda.t b/t/mda.t
index c5b35eec..d20cdb92 100644
--- a/t/mda.t
+++ b/t/mda.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/mda_filter_rubylang.t b/t/mda_filter_rubylang.t
index 754d52f7..d05eec25 100644
--- a/t/mda_filter_rubylang.t
+++ b/t/mda_filter_rubylang.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -44,8 +44,8 @@ something
 EOF
 		ok(run_script(['-mda'], $env, $opt), 'message delivered');
 	}
-	my $config = PublicInbox::Config->new;
-	my $ibx = $config->lookup_name($v);
+	my $cfg = PublicInbox::Config->new;
+	my $ibx = $cfg->lookup_name($v);
 
 	# make sure all serials are searchable:
 	for my $i (1..2) {
diff --git a/t/mid.t b/t/mid.t
index 3b8f4108..e2d8dcbf 100644
--- a/t/mid.t
+++ b/t/mid.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use Test::More;
diff --git a/t/mime.t b/t/mime.t
index 46c1d8d7..471f0efa 100644
--- a/t/mime.t
+++ b/t/mime.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org>
 # This library is free software; you can redistribute it and/or modify
 # it under the same terms as Perl itself.
 # Artistic or GPL-1+ <https://www.gnu.org/licenses/gpl-1.0.txt>
diff --git a/t/miscsearch.t b/t/miscsearch.t
new file mode 100644
index 00000000..0424328d
--- /dev/null
+++ b/t/miscsearch.t
@@ -0,0 +1,57 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::InboxWritable;
+require_mods(qw(Search::Xapian DBD::SQLite));
+use_ok 'PublicInbox::MiscSearch';
+use_ok 'PublicInbox::MiscIdx';
+
+my ($tmp, $for_destroy) = tmpdir();
+my $eidx = { xpfx => "$tmp/eidx", -no_fsync => 1 }; # mock ExtSearchIdx
+{
+	mkdir "$tmp/v1" or BAIL_OUT "mkdir $!";
+	open my $fh, '>', "$tmp/v1/description" or BAIL_OUT "open: $!";
+	print $fh "Everything sucks this year\n" or BAIL_OUT "print $!";
+	close $fh or BAIL_OUT "close $!";
+}
+{
+	my $v1 = PublicInbox::InboxWritable->new({
+		inboxdir => "$tmp/v1",
+		name => 'hope',
+		address => [ 'nope@example.com' ],
+		indexlevel => 'basic',
+		version => 1,
+	});
+	$v1->init_inbox;
+	my $mi = PublicInbox::MiscIdx->new($eidx);
+	$mi->begin_txn;
+	$mi->index_ibx($v1);
+	$mi->commit_txn;
+}
+
+my $ms = PublicInbox::MiscSearch->new("$tmp/eidx/misc");
+my $mset = $ms->mset('"everything sucks today"');
+is(scalar($mset->items), 0, 'no match on description phrase');
+
+$mset = $ms->mset('"everything sucks this year"');
+is(scalar($mset->items), 1, 'match phrase on description');
+
+$mset = $ms->mset('everything sucks');
+is(scalar($mset->items), 1, 'match words in description');
+
+$mset = $ms->mset('nope@example.com');
+is(scalar($mset->items), 1, 'match full address');
+
+$mset = $ms->mset('nope');
+is(scalar($mset->items), 1, 'match partial address');
+
+$mset = $ms->mset('hope');
+is(scalar($mset->items), 1, 'match name');
+my $mi = ($mset->items)[0];
+my $doc = $mi->get_document;
+is($doc->get_data, '{}', 'stored empty data');
+
+done_testing;
diff --git a/t/msg_iter.t b/t/msg_iter.t
index 4ee3a201..e46d515c 100644
--- a/t/msg_iter.t
+++ b/t/msg_iter.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/msgmap.t b/t/msgmap.t
index 437e106e..2d462dfb 100644
--- a/t/msgmap.t
+++ b/t/msgmap.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -12,7 +12,7 @@ my $d = PublicInbox::Msgmap->new($tmpdir, 1);
 my %mid2num;
 my %num2mid;
 my @mids = qw(a@b c@d e@f g@h aa@bb aa@cc);
-is_deeply([$d->minmax], [undef,undef], "empty min max on new DB");
+is_deeply([$d->minmax], [0,0], "zero min max on new DB");
 
 foreach my $mid (@mids) {
 	my $n = $d->mid_insert($mid);
diff --git a/t/msgtime.t b/t/msgtime.t
index 89fd9e37..00d57999 100644
--- a/t/msgtime.t
+++ b/t/msgtime.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/multi-mid.t b/t/multi-mid.t
index 41d556b9..e9c3dd8c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use Test::More;
diff --git a/t/nntp.t b/t/nntp.t
index 9a482acb..5bad9dfe 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -8,6 +8,7 @@ use PublicInbox::Eml;
 require_mods(qw(DBD::SQLite Data::Dumper));
 use_ok 'PublicInbox::NNTP';
 use_ok 'PublicInbox::Inbox';
+use PublicInbox::Config;
 
 {
 	sub quote_str {
@@ -98,44 +99,38 @@ use_ok 'PublicInbox::Inbox';
 
 { # test setting NNTP headers in HEAD and ARTICLE requests
 	my $u = 'https://example.com/a/';
-	my $ng = PublicInbox::Inbox->new({ name => 'test',
+	my $ibx = PublicInbox::Inbox->new({ name => 'test',
 					inboxdir => 'test.git',
 					address => 'a@example.com',
 					-primary_address => 'a@example.com',
 					newsgroup => 'test',
 					domain => 'example.com',
 					url => [ '//example.com/a' ]});
-	is($ng->base_url, $u, 'URL expanded');
+	is($ibx->base_url, $u, 'URL expanded');
 	my $mid = 'a@b';
 	my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n");
 	my $hdr = $mime->header_obj;
 	my $mock_self = {
-		nntpd => { grouplist => [], servername => 'example.com' },
-		ng => $ng,
+		nntpd => {
+			servername => 'example.com',
+			pi_cfg => bless {}, 'PublicInbox::Config',
+		},
+		ibx => $ibx,
 	};
-	my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };
+	my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ibx };
 	PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
-	is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
-		'Archived-At: set');
-	is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ],
-		'List-Archive: set');
-	is_deeply([ $mime->header('List-Post') ], [ '<mailto:a@example.com>' ],
-		'List-Post: set');
 	is_deeply([ $mime->header('Newsgroups') ], [ 'test' ],
 		'Newsgroups: set');
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:1' ],
 		'Xref: set');
 
-	$ng->{-base_url} = 'http://mirror.example.com/m/';
+	$ibx->{-base_url} = 'http://mirror.example.com/m/';
 	$smsg->{num} = 2;
 	PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
-	is_deeply([ $mime->header('Archived-At') ],
-		[ "<${u}a\@b/>", '<http://mirror.example.com/m/a@b/>' ],
-		'Archived-At: appended');
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ],
 		'Old Xref: clobbered');
 }
diff --git a/t/nntpd-tls.t b/t/nntpd-tls.t
index 23baf4e4..1194be6f 100644
--- a/t/nntpd-tls.t
+++ b/t/nntpd-tls.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/nntpd-v2.t b/t/nntpd-v2.t
index 1dd992a0..0433a57a 100644
--- a/t/nntpd-v2.t
+++ b/t/nntpd-v2.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 local $ENV{PI_TEST_VERSION} = 2;
 require './t/nntpd.t';
diff --git a/t/nntpd.t b/t/nntpd.t
index c7a7ee6b..63287d43 100644
--- a/t/nntpd.t
+++ b/t/nntpd.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/nodatacow.t b/t/nodatacow.t
index e5b742a2..72860d43 100644
--- a/t/nodatacow.t
+++ b/t/nodatacow.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use Test::More;
diff --git a/t/nulsubject.t b/t/nulsubject.t
index ccb60d52..7f5dd378 100644
--- a/t/nulsubject.t
+++ b/t/nulsubject.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/on_destroy.t b/t/on_destroy.t
new file mode 100644
index 00000000..0de67d0b
--- /dev/null
+++ b/t/on_destroy.t
@@ -0,0 +1,34 @@
+#!perl -w
+use strict;
+use v5.10.1;
+use Test::More;
+require_ok 'PublicInbox::OnDestroy';
+my @x;
+my $od = PublicInbox::OnDestroy->new(sub { push @x, 'hi' });
+is_deeply(\@x, [], 'not called, yet');
+undef $od;
+is_deeply(\@x, [ 'hi' ], 'no args works');
+$od = PublicInbox::OnDestroy->new(sub { $x[0] = $_[0] }, 'bye');
+is_deeply(\@x, [ 'hi' ], 'nothing changed while alive');
+undef $od;
+is_deeply(\@x, [ 'bye' ], 'arg passed');
+$od = PublicInbox::OnDestroy->new(sub { @x = @_ }, qw(x y));
+undef $od;
+is_deeply(\@x, [ 'x', 'y' ], '2 args passed');
+
+open my $tmp, '+>>', undef or BAIL_OUT $!;
+$tmp->autoflush(1);
+$od = PublicInbox::OnDestroy->new(1, sub { print $tmp "$$ DESTROY\n" });
+undef $od;
+is(-s $tmp, 0, '$tmp is empty on pid mismatch');
+$od = PublicInbox::OnDestroy->new($$, sub { $tmp = $$ });
+undef $od;
+is($tmp, $$, '$tmp set to $$ by callback');
+
+if (my $nr = $ENV{TEST_LEAK_NR}) {
+	for (0..$nr) {
+		$od = PublicInbox::OnDestroy->new(sub { @x = @_ }, qw(x y));
+	}
+}
+
+done_testing;
diff --git a/t/over.t b/t/over.t
index 4c8f8098..a92d2f77 100644
--- a/t/over.t
+++ b/t/over.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -74,4 +74,28 @@ SKIP: {
 		'WAL journal_mode not clobbered if manually set');
 }
 
+# ext index additions
+$over->eidx_prep;
+{
+	my @arg = qw(1349 2019 adeadba7cafe example.key);
+	ok($over->add_xref3(@arg), 'first add');
+	ok($over->add_xref3(@arg), 'add idempotent');
+	my $xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe' ], 'xref3 works');
+
+	@arg = qw(1349 2018 deadbeefcafe example.kee);
+	ok($over->add_xref3(@arg), 'add another xref3');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe',
+			'example.kee:2018:deadbeefcafe' ],
+			'xref3 works forw two');
+
+	@arg = qw(1349 adeadba7cafe example.key);
+	is($over->remove_xref3(@arg), 1, 'remove first');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
+		'confirm removal successful');
+	$over->rollback_lazy;
+}
+
 done_testing();
diff --git a/t/plack.t b/t/plack.t
index 1fedf426..8d8aa100 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -21,8 +21,8 @@ ok(-f $psgi, "psgi example file found");
 my $pfx = 'http://example.com/test';
 ok(run_script(['-init', 'test', $inboxdir, "$pfx/", $addr]),
 	'initialized repo');
-PublicInbox::Import::run_die([qw(git config -f), $pi_config,
-	'publicinbox.test.newsgroup', 'inbox.test']);
+xsys_e(qw(git config -f), $pi_config,
+	qw(publicinbox.test.newsgroup inbox.test));
 open my $fh, '>', "$inboxdir/description" or die "open: $!\n";
 print $fh "test for public-inbox\n";
 close $fh or die "close: $!\n";
diff --git a/t/precheck.t b/t/precheck.t
index 11193e38..360dc74f 100644
--- a/t/precheck.t
+++ b/t/precheck.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_attach.t b/t/psgi_attach.t
index 14d20adb..65d704bf 100644
--- a/t/psgi_attach.t
+++ b/t/psgi_attach.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_bad_mids.t b/t/psgi_bad_mids.t
index 70393573..f23680f8 100644
--- a/t/psgi_bad_mids.t
+++ b/t/psgi_bad_mids.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_mount.t b/t/psgi_mount.t
index b4de8274..5836e9ce 100644
--- a/t/psgi_mount.t
+++ b/t/psgi_mount.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -17,7 +17,7 @@ use_ok 'PublicInbox::WWW';
 use PublicInbox::Import;
 use PublicInbox::Git;
 use PublicInbox::Config;
-my $config = PublicInbox::Config->new(\<new(\<done;
 }
 
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
 my $app = builder(sub {
 	enable('Head');
 	mount('/a' => builder(sub { sub { $www->call(@_) } }));
@@ -67,11 +67,9 @@ test_psgi($app, sub {
 
 	$res = $cb->(GET('/a/test/blah%40example.com/raw'));
 	is($res->code, 200, 'OK with URLMap mount');
-	like($res->content, qr!^List-Archive: !m,
-		'List-Archive set in /raw mboxrd');
 	like($res->content,
-		qr!^Archived-At: !m,
-		'Archived-At set in /raw mboxrd');
+		qr/^Message-Id: <blah\@example\.com>\n/sm,
+		'headers appear in /raw');
 
 	# redirects
 	$res = $cb->(GET('/a/test/m/blah%40example.com.html'));
@@ -85,7 +83,7 @@ test_psgi($app, sub {
 
 SKIP: {
 	require_mods(qw(DBD::SQLite Search::Xapian IO::Uncompress::Gunzip), 3);
-	my $ibx = $config->lookup_name('test');
+	my $ibx = $cfg->lookup_name('test');
 	require_ok 'PublicInbox::SearchIdx';
 	PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
 	test_psgi($app, sub {
@@ -94,12 +92,8 @@ SKIP: {
 		my $gz = $res->content;
 		my $raw;
 		IO::Uncompress::Gunzip::gunzip(\$gz => \$raw);
-		like($raw, qr!^List-Archive: !m,
-			'List-Archive set in /t.mbox.gz mboxrd');
-		like($raw,
-			qr!^Archived-At:\x20
-				!mx,
-			'Archived-At set in /t.mbox.gz mboxrd');
+		like($raw, qr!^Message-Id:\x20<blah\@example\.com>\n!sm,
+			'headers appear in /t.mbox.gz mboxrd');
 	});
 }
 
diff --git a/t/psgi_multipart_not.t b/t/psgi_multipart_not.t
index 9b7fb4d0..8edbe088 100644
--- a/t/psgi_multipart_not.t
+++ b/t/psgi_multipart_not.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_scan_all.t b/t/psgi_scan_all.t
index c8cb2409..80b855e1 100644
--- a/t/psgi_scan_all.t
+++ b/t/psgi_scan_all.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_search.t b/t/psgi_search.t
index c1677eb3..8ba431bc 100644
--- a/t/psgi_search.t
+++ b/t/psgi_search.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -67,11 +67,11 @@ $im->done;
 PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
 
 my $cfgpfx = "publicinbox.test";
-my $config = PublicInbox::Config->new(\<new(\<new($config);
+my $www = PublicInbox::WWW->new($cfg);
 test_psgi(sub { $www->call(@_) }, sub {
 	my ($cb) = @_;
 	my $res;
@@ -144,7 +144,7 @@ test_psgi(sub { $www->call(@_) }, sub {
 		$xdb->set_metadata('has_threadid', '0');
 		$sidx->idx_release;
 	}
-	$config->each_inbox(sub { delete $_[0]->{search} });
+	$cfg->each_inbox(sub { delete $_[0]->{search} });
 	$res = $cb->(GET('/test/?q=s:test'));
 	is($res->code, 200, 'successful search w/o has_threadid');
 	unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
diff --git a/t/psgi_text.t b/t/psgi_text.t
index 9867feaa..e4613945 100644
--- a/t/psgi_text.t
+++ b/t/psgi_text.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
index c13f5e71..7ab60adc 100644
--- a/t/psgi_v2.t
+++ b/t/psgi_v2.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -87,12 +87,11 @@ like($$msg, qr/\AFrom oldbug/s,
 	'"From_" line stored to test old bug workaround');
 
 my $cfgpfx = "publicinbox.v2test";
-my $cfg = <<EOF;
+my $cfg = PublicInbox::Config->new(\<<EOF);
 $cfgpfx.address=$ibx->{-primary_address}
 $cfgpfx.inboxdir=$inboxdir
 EOF
-my $config = PublicInbox::Config->new(\$cfg);
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
 my ($res, $raw, @from_);
 my $client0 = sub {
 	my ($cb) = @_;
@@ -154,7 +153,7 @@ my $client1 = sub {
 	like($raw, qr/^hello ghosts$/m, 'got third message');
 	@from_ = ($raw =~ m/^From /mg);
 	is(scalar(@from_), 3, 'three From_ lines');
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 
 	SKIP: {
 		eval { require IO::Uncompress::Gunzip };
@@ -244,7 +243,7 @@ $run_httpd->($client1, 38);
 	$im->done;
 	my @h = $mime->header('Message-ID');
 	is_deeply($exp, \@h, 'reused existing Message-ID');
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 }
 
 my $client2 = sub {
@@ -283,7 +282,7 @@ $run_httpd->($client2, 8);
 		ok($im->add($mime), "added attachment $body");
 	}
 	$im->done;
-	$config->each_inbox(sub { $_[0]->search->reopen });
+	$cfg->each_inbox(sub { $_[0]->search->reopen });
 }
 
 my $client3 = sub {
diff --git a/t/purge.t b/t/purge.t
index 2ca9edca..f4281c13 100644
--- a/t/purge.t
+++ b/t/purge.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/qspawn.t b/t/qspawn.t
index e37a05fd..4b9dc8a5 100644
--- a/t/qspawn.t
+++ b/t/qspawn.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use Test::More;
diff --git a/t/replace.t b/t/replace.t
index fd8ce2c6..51bdb964 100644
--- a/t/replace.t
+++ b/t/replace.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -186,8 +186,7 @@ test_replace(2, 'basic', $opt = { %$opt, post => \&pad_msgs });
 test_replace(2, 'basic', $opt = { %$opt, rotate_bytes => 1 });
 
 SKIP: {
-	require PublicInbox::Search;
-	PublicInbox::Search::load_xapian() or skip 'Search::Xapian missing', 8;
+	require_mods(qw(Search::Xapian), 8);
 	for my $l (qw(medium)) {
 		test_replace(2, $l, {});
 		$opt = { pre => \&pad_msgs };
diff --git a/t/reply.t b/t/reply.t
index 53162df5..0b8e1f38 100644
--- a/t/reply.t
+++ b/t/reply.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/run.perl b/t/run.perl
index 1c7bcfc3..b7cb988b 100755
--- a/t/run.perl
+++ b/t/run.perl
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 #
 # Parallel test runner which preloads code and reuses worker processes
@@ -71,7 +71,8 @@ sub test_status () {
 		my $skip = '';
 		if (open my $fh, '<', $log) {
 			my @not_ok = grep(!/^(?:ok |[ \t]*#)/ms, <$fh>);
-			pop @not_ok if $not_ok[-1] =~ /^[0-9]+\.\.[0-9]+$/;
+			my $last = $not_ok[-1] // '';
+			pop @not_ok if $last =~ /^[0-9]+\.\.[0-9]+$/;
 			my $pfx = "# $log: ";
 			print $OLDERR map { $pfx.$_ } @not_ok;
 			seek($fh, 0, SEEK_SET) or die "seek: $!";
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
index bd663519..fc1b666a 100644
--- a/t/search-thr-index.t
+++ b/t/search-thr-index.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/search.t b/t/search.t
index 8df8a202..b2958c00 100644
--- a/t/search.t
+++ b/t/search.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -60,7 +60,7 @@ sub oct_is ($$$) {
 }
 
 {
-	my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust;
+	my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust;
 	is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed');
 	is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR');
 	is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');
@@ -332,13 +332,13 @@ $ibx->with_umask(sub {
 		like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears');
 		my $doc = $m->get_document;
 		my $col = PublicInbox::Search::BYTES();
-		my $bytes = PublicInbox::Smsg::get_val($doc, $col);
+		my $bytes = PublicInbox::Search::int_val($doc, $col);
 		like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit');
 		ok($bytes > 0, '$bytes is > 0');
 		is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over');
 
 		$col = PublicInbox::Search::UID();
-		my $uid = PublicInbox::Smsg::get_val($doc, $col);
+		my $uid = PublicInbox::Search::int_val($doc, $col);
 		is($uid, $smsg->{num}, 'UID column matches {num}');
 		is($uid, $m->get_docid, 'UID column matches docid');
 	}
@@ -535,5 +535,3 @@ $ibx->with_umask(sub {
 });
 
 done_testing();
-
-1;
diff --git a/t/shared_kv.t b/t/shared_kv.t
new file mode 100644
index 00000000..6f6374f2
--- /dev/null
+++ b/t/shared_kv.t
@@ -0,0 +1,54 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use_ok 'PublicInbox::SharedKV';
+my ($tmpdir, $for_destroy) = tmpdir();
+local $ENV{TMPDIR} = $tmpdir;
+my $skv = PublicInbox::SharedKV->new;
+my $skv_tmpdir = $skv->{tmpdir};
+ok(-d $skv_tmpdir, 'created a temporary dir');
+$skv->dbh;
+my $dead = "\xde\xad";
+my $beef = "\xbe\xef";
+my $cafe = "\xca\xfe";
+ok($skv->set($dead, $beef), 'set');
+is($skv->get($dead), $beef, 'get');
+ok($skv->set($dead, $beef), 'set idempotent');
+ok(!$skv->set_maybe($dead, $cafe), 'set_maybe ignores');
+ok($skv->set_maybe($cafe, $dead), 'set_maybe sets');
+is($skv->xchg($dead, $cafe), $beef, 'xchg');
+is($skv->get($dead), $cafe, 'get after xchg');
+is($skv->xchg($dead, undef), $cafe, 'xchg to undef');
+is($skv->get($dead), undef, 'get after xchg to undef');
+is($skv->get($cafe), $dead, 'get after set_maybe');
+ok($skv->index_values, 'index_values works');
+is($skv->replace_values($dead, $cafe), 1, 'replaced one by value');
+is($skv->get($cafe), $cafe, 'value updated');
+is($skv->replace_values($dead, $cafe), 0, 'replaced none by value');
+is($skv->xchg($dead, $cafe), undef, 'xchg from undef');
+is($skv->count, 2, 'count works');
+
+my %seen;
+my $sth = $skv->each_kv_iter;
+while (my ($k, $v) = $sth->fetchrow_array) {
+	$seen{$k} = $v;
+}
+is($seen{$dead}, $cafe, '$dead has expected value');
+is($seen{$cafe}, $cafe, '$cafe has expected value');
+is(scalar keys %seen, 2, 'iterated through all');
+
+is($skv->replace_values($cafe, $dead), 2, 'replaced 2 by value');
+is($skv->delete_by_val('bogus'), 0, 'delete_by_val misses');
+is($skv->delete_by_val($dead), 2, 'delete_by_val hits');
+is($skv->delete_by_val($dead), 0, 'delete_by_val misses again');
+
+undef $skv;
+ok(!-d $skv_tmpdir, 'temporary dir gone');
+$skv = PublicInbox::SharedKV->new("$tmpdir/dir", 'base');
+ok(-e "$tmpdir/dir/base.sqlite3", 'file created');
+
+done_testing;
diff --git a/t/sigfd.t b/t/sigfd.t
index 8daf3137..3a383d79 100644
--- a/t/sigfd.t
+++ b/t/sigfd.t
@@ -1,10 +1,10 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 use strict;
 use Test::More;
 use IO::Handle;
 use POSIX qw(:signal_h);
 use Errno qw(ENOSYS);
-use PublicInbox::Syscall qw($SFD_NONBLOCK);
+use PublicInbox::Syscall qw(SFD_NONBLOCK);
 require_ok 'PublicInbox::Sigfd';
 
 SKIP: {
@@ -42,8 +42,8 @@ SKIP: {
 		}
 		$sigfd = undef;
 
-		my $nbsig = PublicInbox::Sigfd->new($sig, $SFD_NONBLOCK);
-		ok($nbsig, 'Sigfd->new $SFD_NONBLOCK works');
+		my $nbsig = PublicInbox::Sigfd->new($sig, SFD_NONBLOCK);
+		ok($nbsig, 'Sigfd->new SFD_NONBLOCK works');
 		is($nbsig->wait_once, undef, 'nonblocking ->wait_once');
 		ok($! == Errno::EAGAIN, 'got EAGAIN');
 		kill('HUP', $$) or die "kill $!";
diff --git a/t/solver_git.t b/t/solver_git.t
index 6b0ed8d2..d03a6f38 100644
--- a/t/solver_git.t
+++ b/t/solver_git.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/spamcheck_spamc.t b/t/spamcheck_spamc.t
index 2d9da631..ab46d62b 100644
--- a/t/spamcheck_spamc.t
+++ b/t/spamcheck_spamc.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
diff --git a/t/spawn.t b/t/spawn.t
index a0019202..0eed79bb 100644
--- a/t/spawn.t
+++ b/t/spawn.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use warnings;
@@ -32,7 +32,7 @@ elsif ($pid > 0) {
 	select(undef, undef, undef, 0.01) while 1;
 }
 EOF
-	my $oldset = PublicInbox::Sigfd::block_signals();
+	my $oldset = PublicInbox::DS::block_signals();
 	my $rd = popen_rd([$^X, '-e', $script]);
 	diag 'waiting for child to reap grandchild...';
 	chomp(my $line = readline($rd));
@@ -41,7 +41,7 @@ EOF
 	ok(kill('CHLD', $pid), 'sent SIGCHLD to child');
 	is(readline($rd), "HI\n", '$SIG{CHLD} works in child');
 	ok(close $rd, 'popen_rd close works');
-	PublicInbox::Sigfd::sig_setmask($oldset);
+	PublicInbox::DS::sig_setmask($oldset);
 }
 
 {
@@ -98,6 +98,44 @@ EOF
 	isnt($?, 0, '$? set properly: '.$?);
 }
 
+{ # ->CLOSE vs ->DESTROY waitpid caller distinction
+	my @c; # caller() seen by the callback: (package, filename, line)
+	my $fh = popen_rd(['true'], undef, { cb => sub { @c = caller } });
+	ok(close($fh), '->CLOSE fired and successful');
+	ok(scalar(@c), 'callback fired by ->CLOSE');
+	ok(!grep(m[/PublicInbox/DS\.pm\z], @c), 'callback not invoked by DS');
+
+	@c = (); # forget the ->CLOSE frame; no element may match below either
+	$fh = popen_rd(['true'], undef, { cb => sub { @c = caller } });
+	undef $fh; # ->DESTROY
+	ok(scalar(@c), 'callback fired by ->DESTROY');
+	ok(!grep(m[/PublicInbox/ProcessPipe\.pm\z], @c),
+		'callback not invoked by ProcessPipe');
+}
+
+{ # children don't wait on siblings
+	use POSIX qw(_exit);
+	pipe(my ($r, $w)) or BAIL_OUT $!;
+	my $cb = sub { warn "x=$$\n" }; # $$ expands at call time: the caller's PID
+	my $fh = popen_rd(['cat'], undef, { 0 => $r, cb => $cb });
+	my $pp = tied *$fh; # NOTE(review): never read below; confirm it is needed
+	my $pid = fork // BAIL_OUT $!;
+	local $SIG{__WARN__} = sub { _exit(1) }; # any warn (e.g. $cb) exits 1
+	if ($pid == 0) {
+		local $SIG{__DIE__} = sub { _exit(2) };
+		undef $fh; # drop the forked copy; exit 0 shows $cb did not warn here
+		_exit(0);
+	}
+	waitpid($pid, 0);
+	is($?, 0, 'forked process exited');
+	my @w;
+	local $SIG{__WARN__} = sub { push @w, @_ }; # from here on, collect warnings
+	close $w;
+	close $fh;
+	is($?, 0, 'cat exited');
+	is_deeply(\@w, [ "x=$$\n" ], 'callback fired from owner');
+}
+
 SKIP: {
 	eval {
 		require BSD::Resource;
diff --git a/t/thread-cycle.t b/t/thread-cycle.t
index 484ea443..613c142e 100644
--- a/t/thread-cycle.t
+++ b/t/thread-cycle.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/thread-index-gap.t b/t/thread-index-gap.t
index 49f254e9..83c3707d 100644
--- a/t/thread-index-gap.t
+++ b/t/thread-index-gap.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
@@ -45,12 +45,15 @@ for my $msgs (['orig', reverse @msgs], ['shuffle', shuffle(@msgs)]) {
 	}
 	$im->done;
 	my $over = $ibx->over;
-	my @tid = $over->dbh->selectall_array('SELECT DISTINCT(tid) FROM over');
+	my $dbh = $over->dbh;
+	my $tid = $dbh->selectall_arrayref('SELECT DISTINCT(tid) FROM over');
+	my @tid = map { $_->[0] } @$tid;
 	is(scalar(@tid), 1, "only one thread initially ($desc)");
 	$over->dbh_close;
 	run_script([qw(-index --reindex --rethread), $ibx->{inboxdir}]) or
 		BAIL_OUT 'rethread';
-	@tid = $over->dbh->selectall_array('SELECT DISTINCT(tid) FROM over');
+	$tid = $dbh->selectall_arrayref('SELECT DISTINCT(tid) FROM over');
+	@tid = map { $_->[0] } @$tid;
 	is(scalar(@tid), 1, "only one thread after rethread ($desc)");
 }
 
diff --git a/t/time.t b/t/time.t
index b491711d..fae20897 100644
--- a/t/time.t
+++ b/t/time.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/uri_imap.t b/t/uri_imap.t
index a2e86a7e..6c4207c3 100644
--- a/t/uri_imap.t
+++ b/t/uri_imap.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/v1-add-remove-add.t b/t/v1-add-remove-add.t
index 2cd45f60..a94bf7fd 100644
--- a/t/v1-add-remove-add.t
+++ b/t/v1-add-remove-add.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v1reindex.t b/t/v1reindex.t
index e66d89e5..36cefda5 100644
--- a/t/v1reindex.t
+++ b/t/v1reindex.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v2-add-remove-add.t b/t/v2-add-remove-add.t
index cfdc8cf1..b325e521 100644
--- a/t/v2-add-remove-add.t
+++ b/t/v2-add-remove-add.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v2dupindex.t b/t/v2dupindex.t
index b1abccd9..4b20c8e0 100644
--- a/t/v2dupindex.t
+++ b/t/v2dupindex.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # we can index a message from a mirror which bypasses dedupe.
diff --git a/t/v2mda.t b/t/v2mda.t
index abbdc8e4..38aea0c1 100644
--- a/t/v2mda.t
+++ b/t/v2mda.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 81b9544d..ebad2566 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v2reindex.t b/t/v2reindex.t
index ae1570ed..05ea952f 100644
--- a/t/v2reindex.t
+++ b/t/v2reindex.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/v2writable.t b/t/v2writable.t
index 2f71fafa..f5c8313a 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -274,14 +274,13 @@ EOF
 	$mime->header_set('Message-ID', "<$y>");
 	$mime->header_set('References', "<$x>");
 	ok($im->add($mime), 'add excessively long References');
-	$im->barrier;
+	$im->done;
 
 	my $msgs = $ibx->over->get_thread('x'x244);
 	is(2, scalar(@$msgs), 'got both messages');
 	is($msgs->[0]->{mid}, 'x'x244, 'stored truncated mid');
 	is($msgs->[1]->{references}, '<'.('x'x244).'>', 'stored truncated ref');
 	is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)');
-	$im->done;
 }
 
 my $tmp = {
diff --git a/t/view.t b/t/view.t
index 462667f1..114c3304 100644
--- a/t/view.t
+++ b/t/view.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2013-2020 all contributors 
+# Copyright (C) 2013-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/watch_filter_rubylang.t b/t/watch_filter_rubylang.t
index 6513f30b..29a9f793 100644
--- a/t/watch_filter_rubylang.t
+++ b/t/watch_filter_rubylang.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -72,11 +72,11 @@ $cfgpfx.filter=PublicInbox::Filter::RubyLang
 $cfgpfx.altid=serial:alerts:file=msgmap.sqlite3
 publicinboxwatch.watchspam=maildir:$spamdir
 EOF
-	my $config = PublicInbox::Config->new(\$orig);
-	my $ibx = $config->lookup_name($v);
+	my $cfg = PublicInbox::Config->new(\$orig);
+	my $ibx = $cfg->lookup_name($v);
 	ok($ibx, 'found inbox by name');
 
-	my $w = PublicInbox::Watch->new($config);
+	my $w = PublicInbox::Watch->new($cfg);
 	for my $i (1..2) {
 		$w->scan('full');
 	}
@@ -101,8 +101,8 @@ EOF
 	}
 	$w->scan('full');
 
-	$config = PublicInbox::Config->new(\$orig);
-	$ibx = $config->lookup_name($v);
+	$cfg = PublicInbox::Config->new(\$orig);
+	$ibx = $cfg->lookup_name($v);
 	is($ibx->search->reopen->mset('b:spam')->size, 0, 'spam removed');
 
 	is_deeply([], \@warn, 'no warnings');
diff --git a/t/watch_imap.t b/t/watch_imap.t
index fb71d3df..eeda29eb 100644
--- a/t/watch_imap.t
+++ b/t/watch_imap.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index ae53caf9..e74b512f 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
@@ -34,13 +34,13 @@ my $sem = PublicInbox::Emergency->new($spamdir); # create dirs
 {
 	my @w;
 	local $SIG{__WARN__} = sub { push @w, @_ };
-	my $config = PublicInbox::Config->new(\<new(\<new($config);
+	my $wm = PublicInbox::Watch->new($cfg);
 	is(scalar grep(/is a spam folder/, @w), 1, 'got warning about spam');
 	is_deeply($wm->{mdmap}, { "$spamdir/cur" => 'watchspam' },
 		'only got the spam folder to watch');
@@ -61,8 +61,8 @@ EOF
 	close $fh or BAIL_OUT $!;
 }
 
-my $config = PublicInbox::Config->new($cfg_path);
-PublicInbox::Watch->new($config)->scan('full');
+my $cfg = PublicInbox::Config->new($cfg_path);
+PublicInbox::Watch->new($cfg)->scan('full');
 my $git = PublicInbox::Git->new($git_dir);
 my @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 1, 'one revision in rev-list');
@@ -79,7 +79,7 @@ my $write_spam = sub {
 };
 $write_spam->();
 is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 2, 'two revisions in rev-list');
 @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
@@ -93,7 +93,7 @@ To unsubscribe from this list: send the line "unsubscribe git" in
 the body of a message to majordomo\@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file');
 	my $mref = $git->cat_file('HEAD:'.$list[0]);
@@ -101,7 +101,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 
 	is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
 	$write_spam->();
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree is empty');
 	@list = $git->qx(qw(rev-list refs/heads/master));
@@ -115,10 +115,10 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $fail_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	{
 		local $SIG{__WARN__} = sub {}; # quiet spam check warning
-		PublicInbox::Watch->new($config)->scan('full');
+		PublicInbox::Watch->new($cfg)->scan('full');
 	}
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree has no files spamc checked');
@@ -131,9 +131,9 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $main_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file after spamc checked');
 
@@ -166,9 +166,9 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 		$delivered++;
 	};
 	PublicInbox::DS->Reset;
-	my $ii = PublicInbox::InboxIdle->new($config);
+	my $ii = PublicInbox::InboxIdle->new($cfg);
 	my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup';
-	$config->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
+	$cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
 	PublicInbox::DS->SetPostLoopCallback(sub { $delivered == 0 });
 
 	# wait for -watch to setup inotify watches
diff --git a/t/watch_maildir_v2.t b/t/watch_maildir_v2.t
index 12546418..195e238b 100644
--- a/t/watch_maildir_v2.t
+++ b/t/watch_maildir_v2.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
@@ -44,11 +44,11 @@ $cfgpfx.watch=maildir:$maildir
 $cfgpfx.filter=PublicInbox::Filter::Vger
 publicinboxlearn.watchspam=maildir:$spamdir
 EOF
-my $config = PublicInbox::Config->new(\$orig);
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$orig);
+my $ibx = $cfg->lookup_name('test');
 ok($ibx, 'found inbox by name');
 
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 my $total = scalar @{$ibx->over->recent};
 is($total, 1, 'got one revision');
 
@@ -68,7 +68,7 @@ my $write_spam = sub {
 };
 $write_spam->();
 is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
 is_deeply($ibx->over->recent, [], 'deleted file');
 is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
 
@@ -79,7 +79,7 @@ To unsubscribe from this list: send the line "unsubscribe git" in
 the body of a message to majordomo\@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 1, 'got one file back');
 	my $mref = $ibx->msg_by_smsg($msgs->[0]);
@@ -87,7 +87,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 
 	is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
 	$write_spam->();
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	$msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 0, 'inbox is empty again');
 	is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
@@ -99,10 +99,10 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $fail_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
 	{
 		local $SIG{__WARN__} = sub {}; # quiet spam check warning
-		PublicInbox::Watch->new($config)->scan('full');
+		PublicInbox::Watch->new($cfg)->scan('full');
 	}
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 0, 'inbox is still empty');
@@ -115,13 +115,13 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
 	local $ENV{PATH} = $main_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
-	PublicInbox::Watch->new($config)->scan('full');
+	$cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $msgs = $ibx->over->recent;
 	is(scalar(@$msgs), 1, 'inbox has one mail after spamc OK-ed a message');
 	my $mref = $ibx->msg_by_smsg($msgs->[0]);
 	like($$mref, qr/something\n\z/s, 'message scrubbed on import');
-	delete $config->{'publicinboxwatch.spamcheck'};
+	delete $cfg->{'publicinboxwatch.spamcheck'};
 }
 
 {
@@ -129,7 +129,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	open my $fh, '<', $patch or die "failed to open $patch: $!\n";
 	$msg = do { local $/; <$fh> };
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $post = $ibx->search->reopen->mset('dfpost:6e006fd7');
 	is($post->size, 1, 'diff postimage found');
 	my $pre = $ibx->search->mset('dfpre:090d998b6c2c');
@@ -146,12 +146,12 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	my $v1pfx = "publicinbox.v1";
 	my $v1addr = 'v1-public@example.com';
 	PublicInbox::Import::init_bare($v1repo);
-	my $cfg2 = <new(\$cfg2);
+	my $cfg = PublicInbox::Config->new(\$raw);
 	my $both = <new($maildir)->prepare(\$both);
-	PublicInbox::Watch->new($config)->scan('full');
+	PublicInbox::Watch->new($cfg)->scan('full');
 	my $mset = $ibx->search->reopen->mset('m:both@b.com');
 	my $msgs = $ibx->search->mset_to_smsg($ibx, $mset);
-	my $v1 = $config->lookup_name('v1');
+	my $v1 = $cfg->lookup_name('v1');
 	my $msg = $v1->git->cat_file($msgs->[0]->{blob});
 	is($both, $$msg, 'got original message back from v1');
 	$msg = $ibx->git->cat_file($msgs->[0]->{blob});
@@ -184,21 +184,21 @@ List-Id: 
 X-Mailing-List: no@example.com
 Message-ID: 
 EOF
-	my $cfg = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
+	my $raw = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
 	PublicInbox::Emergency->new($maildir)->prepare(\$want);
 	PublicInbox::Emergency->new($maildir)->prepare(\$do_not_want);
-	my $config = PublicInbox::Config->new(\$cfg);
-	PublicInbox::Watch->new($config)->scan('full');
-	$ibx = $config->lookup_name('test');
+	my $cfg = PublicInbox::Config->new(\$raw);
+	PublicInbox::Watch->new($cfg)->scan('full');
+	$ibx = $cfg->lookup_name('test');
 	my $num = $ibx->mm->num_for('do.want@example.com');
 	ok(defined $num, 'List-ID matched for watch');
 	$num = $ibx->mm->num_for('do.not.want@example.com');
 	is($num, undef, 'unaccepted List-ID matched for watch');
 
-	$cfg = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
-	$config = PublicInbox::Config->new(\$cfg);
-	PublicInbox::Watch->new($config)->scan('full');
-	$ibx = $config->lookup_name('test');
+	$raw = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
+	$cfg = PublicInbox::Config->new(\$raw);
+	PublicInbox::Watch->new($cfg)->scan('full');
+	$ibx = $cfg->lookup_name('test');
 	$num = $ibx->mm->num_for('do.not.want@example.com');
 	ok(defined $num, 'X-Mailing-List matched');
 }
diff --git a/t/watch_multiple_headers.t b/t/watch_multiple_headers.t
index a0813532..33ed0770 100644
--- a/t/watch_multiple_headers.t
+++ b/t/watch_multiple_headers.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
@@ -54,16 +54,16 @@ PublicInbox::Emergency->new($maildir)->prepare(\$msg_to);
 PublicInbox::Emergency->new($maildir)->prepare(\$msg_cc);
 PublicInbox::Emergency->new($maildir)->prepare(\$msg_none);
 
-my $cfg = <new(\$cfg);
-PublicInbox::Watch->new($config)->scan('full');
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$raw);
+PublicInbox::Watch->new($cfg)->scan('full');
+my $ibx = $cfg->lookup_name('test');
 ok($ibx, 'found inbox by name');
 
 my $num = $ibx->mm->num_for('to@a.com');
diff --git a/t/watch_nntp.t b/t/watch_nntp.t
index ce1a3153..c0ad3098 100644
--- a/t/watch_nntp.t
+++ b/t/watch_nntp.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/www_altid.t b/t/www_altid.t
index 337303d9..14eda030 100644
--- a/t/www_altid.t
+++ b/t/www_altid.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/t/www_listing.t b/t/www_listing.t
index 4309a5e1..1bcbaefb 100644
--- a/t/www_listing.t
+++ b/t/www_listing.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 # manifest.js.gz generation and grok-pull integration test
 use strict;
@@ -7,22 +7,19 @@ use Test::More;
 use PublicInbox::Spawn qw(which);
 use PublicInbox::TestCommon;
 use PublicInbox::Import;
-require_mods(qw(URI::Escape Plack::Builder Digest::SHA
+require_mods(qw(json URI::Escape Plack::Builder Digest::SHA
 		IO::Compress::Gzip IO::Uncompress::Gunzip HTTP::Tiny));
 require PublicInbox::WwwListing;
 require PublicInbox::ManifestJsGz;
-my $json = do {
-	no warnings 'once';
-	$PublicInbox::ManifestJsGz::json;
-} or plan skip_all => "JSON module missing";
+use PublicInbox::Config;
+my $json = PublicInbox::Config::json();
 
 use_ok 'PublicInbox::Git';
 
 my ($tmpdir, $for_destroy) = tmpdir();
 my $bare = PublicInbox::Git->new("$tmpdir/bare.git");
 PublicInbox::Import::init_bare($bare->{git_dir});
-is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
-	'empty repo has no fingerprint');
+is($bare->manifest_entry, undef, 'empty repo has no manifest entry');
 {
 	my $fi_data = './t/git.fast-import-data';
 	open my $fh, '<', $fi_data or die "open $fi_data: $!";
@@ -31,7 +28,7 @@ is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
 		'fast-import');
 }
 
-like(PublicInbox::ManifestJsGz::fingerprint($bare), qr/\A[a-f0-9]{40}\z/,
+like($bare->manifest_entry->{fingerprint}, qr/\A[a-f0-9]{40}\z/,
 	'got fingerprint with non-empty repo');
 
 sub tiny_test {
diff --git a/t/www_static.t b/t/www_static.t
index 364b9447..3281751c 100644
--- a/t/www_static.t
+++ b/t/www_static.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/t/xcpdb-reshard.t b/t/xcpdb-reshard.t
index ede736c1..1b726f1a 100644
--- a/t/xcpdb-reshard.t
+++ b/t/xcpdb-reshard.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/xt/cmp-msgstr.t b/xt/cmp-msgstr.t
index 0276f845..e0e8ed5a 100644
--- a/xt/cmp-msgstr.t
+++ b/xt/cmp-msgstr.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/xt/cmp-msgview.t b/xt/cmp-msgview.t
index 5bd7aa17..49dcbc9e 100644
--- a/xt/cmp-msgview.t
+++ b/xt/cmp-msgview.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
@@ -24,7 +24,7 @@ vec(my $vec = '', fileno($fh), 1) = 1;
 select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
 my $mime_ctx = {
 	env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
-	-inbox => $ibx,
+	ibx => $ibx,
 	www => Plack::Util::inline_object(style => sub {''}),
 	obuf => \(my $mime_buf = ''),
 	mhref => '../',
diff --git a/xt/create-many-inboxes.t b/xt/create-many-inboxes.t
new file mode 100644
index 00000000..0c2de40d
--- /dev/null
+++ b/xt/create-many-inboxes.t
@@ -0,0 +1,99 @@
+#!perl -w
+# Copyright (C) 2020-2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use File::Path qw(mkpath);
+use IO::Handle (); # autoflush
+use POSIX qw(_exit);
+use Cwd qw(getcwd abs_path);
+use File::Spec;
+my $many_root = $ENV{TEST_MANY_ROOT} or
+	plan skip_all => 'TEST_MANY_ROOT not defined';
+my $cwd = getcwd();
+mkpath($many_root);
+-d $many_root or BAIL_OUT "$many_root: $!";
+$many_root = abs_path($many_root); # resolved form for the prefix check below
+$many_root =~ m!\A\Q$cwd\E/! and BAIL_OUT "$many_root must not be in $cwd";
+require_git 2.6;
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::V2Writable';
+my $nr_inbox = $ENV{NR_INBOX} // 10; # `//': only an unset NR_INBOX gives 10
+my $nproc = $ENV{NPROC} || PublicInbox::V2Writable::detect_nproc() || 2;
+my $indexlevel = $ENV{TEST_INDEXLEVEL} // 'basic';
+diag "NR_INBOX=$nr_inbox NPROC=$nproc TEST_INDEXLEVEL=$indexlevel";
+diag "TEST_MANY_ROOT=$many_root";
+my $level_cfg = $indexlevel eq 'full' ? '' : "\tindexlevel = $indexlevel\n";
+my $pfx = "$many_root/$nr_inbox-$indexlevel";
+mkpath($pfx);
+open my $cfg_fh, '>>', "$pfx/config" or BAIL_OUT $!; # append, never truncate
+$cfg_fh->autoflush(1); # NOTE(review): presumably for the forked writers; confirm
+my $v2_init_add = sub {
+	my ($i) = @_;
+	my $ibx = PublicInbox::Inbox->new({
+		inboxdir => "$pfx/test-$i",
+		name => "test-$i",
+		newsgroup => "inbox.comp.test.foo.test-$i",
+		address => [ "test-$i\@example.com" ],
+		url => [ "//example.com/test-$i" ],
+		version => 2,
+	});
+	$ibx->{indexlevel} = $indexlevel if $level_cfg ne '';
+	my $entry = <<EOF;
+[publicinbox "$ibx->{name}"]
+	address = $ibx->{-primary_address}
+	url = $ibx->{url}->[0]
+	newsgroup = $ibx->{newsgroup}
+	inboxdir = $ibx->{inboxdir}
+EOF
+	$entry .= $level_cfg;
+	print $cfg_fh $entry or die $!;
+	my $v2w = PublicInbox::V2Writable->new($ibx, { nproc => 0 });
+	$v2w->init_inbox(0);
+	$v2w->add(PublicInbox::Eml->new(<
+To: test-$i\@example.com
+Message-ID: <20101002-000000-$i\@example.com>
+Subject: hello world $i
+
+hi
+EOM
+	$v2w->done;
+};
+
+my @children; # [ write end of the job pipe, PID ] for each worker
+for my $i (1..$nproc) {
+	pipe(my ($r, $w)) or BAIL_OUT $!;
+	# check fork before `== 0': undef would compare equal to 0 and
+	# send the parent down the child branch on failure
+	my $pid = fork // BAIL_OUT "fork: $!";
+	if ($pid == 0) {
+		close $w;
+		while (my $i = <$r>) { # one inbox number per line
+			chomp $i;
+			$v2_init_add->($i);
+		}
+		_exit(0);
+	}
+	close $r or BAIL_OUT $!;
+	push @children, [ $w, $pid ];
+	$w->autoflush(1);
+}
+
+for my $n (0..$nr_inbox) { # round-robin, one inbox number per line
+	my $wr = $children[$n % @children]->[0];
+	print $wr "$n\n" or BAIL_OUT $!;
+}
+for my $child (@children) { # closing the write end ends that worker's input
+	close $child->[0] or BAIL_OUT "close $!";
+}
+my $nr_reaped = 0;
+for my $child (@children) {
+	waitpid($child->[1], 0);
+	is($?, 0, ++$nr_reaped.' exited ok');
+}
+ok(close($cfg_fh), 'config written');
+done_testing;
diff --git a/xt/eml_check_limits.t b/xt/eml_check_limits.t
index 9f821946..536a25f1 100644
--- a/xt/eml_check_limits.t
+++ b/xt/eml_check_limits.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use v5.10.1;
diff --git a/xt/git-http-backend.t b/xt/git-http-backend.t
index 2f02725a..dcff72cc 100644
--- a/xt/git-http-backend.t
+++ b/xt/git-http-backend.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # Ensure buffering behavior in -httpd doesn't cause runaway memory use
diff --git a/xt/git_async_cmp.t b/xt/git_async_cmp.t
index f9c9ddef..a7a94c2d 100644
--- a/xt/git_async_cmp.t
+++ b/xt/git_async_cmp.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/xt/httpd-async-stream.t b/xt/httpd-async-stream.t
index 22a96875..f6715c58 100644
--- a/xt/httpd-async-stream.t
+++ b/xt/httpd-async-stream.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # Expensive test to validate compression and TLS.
 use strict;
diff --git a/xt/imapd-mbsync-oimap.t b/xt/imapd-mbsync-oimap.t
index f8641d06..5f671fc8 100644
--- a/xt/imapd-mbsync-oimap.t
+++ b/xt/imapd-mbsync-oimap.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # ensure mbsync and offlineimap compatibility
 use strict;
diff --git a/xt/imapd-validate.t b/xt/imapd-validate.t
index 3e445156..b6ac3e21 100644
--- a/xt/imapd-validate.t
+++ b/xt/imapd-validate.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # Expensive test to validate compression and TLS.
 use strict;
diff --git a/xt/lei-sigpipe.t b/xt/lei-sigpipe.t
new file mode 100644
index 00000000..448bd7db
--- /dev/null
+++ b/xt/lei-sigpipe.t
@@ -0,0 +1,35 @@
+#!perl -w
+# Copyright (C) 2021 all contributors 
+# License: AGPL-3.0+ 
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use POSIX qw(WTERMSIG WIFSIGNALED SIGPIPE);
+require_mods(qw(json DBD::SQLite Search::Xapian));
+# XXX this needs an already configured lei instance with many messages
+
+# read one byte of `lei q' output, close the pipe, expect death by SIGPIPE
+my $do_test = sub {
+	my $env = $_[0] // {};
+	for my $fmt_args ([], [qw(-f mboxcl2)]) {
+		pipe(my ($rd, $wr)) or BAIL_OUT $!;
+		open my $stderr, '+>', undef or BAIL_OUT $!;
+		my $tp = start_script([qw(lei q -t), @$fmt_args, 'bytes:1..'],
+				$env, { run_mode => 0, 1 => $wr, 2 => $stderr });
+		close $wr;
+		sysread($rd, my $buf, 1);
+		close $rd; # trigger SIGPIPE
+		$tp->join;
+		ok(WIFSIGNALED($?), "signaled @$fmt_args");
+		is(WTERMSIG($?), SIGPIPE, "got SIGPIPE @$fmt_args");
+		seek($stderr, 0, 0);
+		is_deeply([ grep(!m{mkdir /dev/null\b}, <$stderr>) ], [],
+			"no errors @$fmt_args");
+	}
+};
+
+$do_test->();
+$do_test->({XDG_RUNTIME_DIR => '/dev/null'});
+
+done_testing;
diff --git a/xt/mem-imapd-tls.t b/xt/mem-imapd-tls.t
index 3f1436c7..e4b3b8cd 100644
--- a/xt/mem-imapd-tls.t
+++ b/xt/mem-imapd-tls.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # Idle client memory usage test, particularly after EXAMINE when
 # Message Sequence Numbers are loaded
diff --git a/xt/mem-msgview.t b/xt/mem-msgview.t
index c09afde0..dceb24b2 100644
--- a/xt/mem-msgview.t
+++ b/xt/mem-msgview.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 # Note: this may be altered as-needed to demonstrate improvements.
 # See history in git for this file.
diff --git a/xt/msgtime_cmp.t b/xt/msgtime_cmp.t
index aa96be4d..ae9e4215 100644
--- a/xt/msgtime_cmp.t
+++ b/xt/msgtime_cmp.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;
diff --git a/xt/nntpd-validate.t b/xt/nntpd-validate.t
index 322e6f62..efe97c02 100644
--- a/xt/nntpd-validate.t
+++ b/xt/nntpd-validate.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 
 # Integration test to validate compression.
diff --git a/xt/perf-msgview.t b/xt/perf-msgview.t
index d99101a3..59980839 100644
--- a/xt/perf-msgview.t
+++ b/xt/perf-msgview.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 all contributors 
+# Copyright (C) 2019-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
@@ -29,7 +29,7 @@ select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
 
 my $ctx = {
 	env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
-	-inbox => $ibx,
+	ibx => $ibx,
 	www => Plack::Util::inline_object(style => sub {''}),
 };
 my ($mime, $res, $oid, $type);
diff --git a/xt/perf-nntpd.t b/xt/perf-nntpd.t
index f73afacc..cd0d4938 100644
--- a/xt/perf-nntpd.t
+++ b/xt/perf-nntpd.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 all contributors 
+# Copyright (C) 2018-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use warnings;
diff --git a/xt/perf-threading.t b/xt/perf-threading.t
index b27c9cbd..57e9db9b 100644
--- a/xt/perf-threading.t
+++ b/xt/perf-threading.t
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 all contributors 
+# Copyright (C) 2016-2021 all contributors 
 # License: AGPL-3.0+ 
 #
 # real-world testing of search threading
@@ -25,7 +25,7 @@ ok($n, 'got some messages');
 diag "enquire: ".timestr($elapsed)." for $n";
 
 $elapsed = timeit(1, sub {
-	PublicInbox::View::thread_results({-inbox => $ibx}, $msgs);
+	PublicInbox::View::thread_results({ibx => $ibx}, $msgs);
 });
 diag "thread_results ".timestr($elapsed);
 
diff --git a/xt/solver.t b/xt/solver.t
index 99fca0d3..2f2fcc44 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2020 all contributors 
+# Copyright (C) 2020-2021 all contributors 
 # License: AGPL-3.0+ 
 use strict;
 use Test::More;