+++ /dev/null
-From: Eric Wong <e@80x24.org>
-To: meta@public-inbox.org
-Subject: [ANNOUNCE] public-inbox 1.6.1
-Date: Thu, 31 Dec 2020 23:45:56 +0000
-Message-ID: <20201231234556.public-inbox-1.6.1-rele@sed>
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf-8
-Content-Disposition: inline
-
-A small, bugfix release on top of 1.6.0 from September 2020.
-
-Bug fixes:
-
-* MIME header decoding no longer warns on undefined variables,
- with Perl <5.28. Thanks to a bug report by Ali Alnubani.
- https://public-inbox.org/meta/DM6PR12MB49106F8E3BD697B63B943A22DADB0@DM6PR12MB4910.namprd12.prod.outlook.com/
-
-* Fixed a message threading bug thanks to a report from Kyle Meyer.
- "public-inbox-index --rethread --reindex" will be necessary
- in case certain messages arrive out-of-order.
- Link: https://public-inbox.org/meta/87360nlc44.fsf@kyleam.com/
-
-* WWW: per-inbox grokmirror manifests no longer return info
- for all inboxes, only the root /manifest.js.gz includes all
- inboxes. This regression appeared in 1.6.
-
-* public-inbox-mda matches List-Id headers insensitively,
- matching public-inbox-watch behavior. Similarly, List-Id
- is always indexed lower-cased for boolean matches to avoid
- matching an incorrect term.
-
-* Newsgroup and Path NNTP headers are now emitted in conformance
- with RFC 5536 3.1.[45]. Thanks to Andrey Melnikov for the report:
- https://public-inbox.org/meta/CA+PODjpUN5Q4gBFQhAzUNuMasVEdmp9f=8Uo0Ej0mFumdSwi4w@mail.gmail.com/
-
-* Inotify fixes for public-inbox-imapd users relying on SIGHUP
- reloads and thousands of watches.
-
-* Read-only daemon fixes around TLS and Linux <4.5 systems
-
-Bugfixes with minor behavior changes:
-
-* The X-Status mbox header is now excluded from imports,
- just like the Status: header has been for many years.
- They have no place in public archives and can be a privacy
- concern for people sharing archives.
-
-* WWW prevents deep-linking to attachments to limit abuse
- vectors. Noticed by Leah Neukirchen:
- https://public-inbox.org/meta/87imagyap9.fsf@vuxu.org/
-
-There are also several documentation fixes from Uwe Kleine-König
-and Kyle Meyer.
-
-Please report bugs via plain-text mail to: meta@public-inbox.org
-
-See archives at https://public-inbox.org/meta/ for all history.
);
$ibx->{-primary_address} = $addr;
my $ctx = {
- -inbox => $ibx,
+ ibx => $ibx,
-upfx => "$base_url/",
-hr => 1,
};
}
sub html_end {
- print $out <<EOF or die;
- git clone $PublicInbox::WwwStream::CODE_URL
-</pre></body></html>
-EOF
+ for (@$PublicInbox::WwwStream::CODE_URL) {
+ print $out " git clone $_\n" or die;
+ }
+ print $out "</pre></body></html>\n" or die;
}
sub atom_start {
# WwwAtomStream stats this dir for mtime
my $astream = PublicInbox::WwwAtomStream->new($ctx);
delete $astream->{emit_header};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $title = PublicInbox::WwwAtomStream::title_tag($ibx->description);
my $updated = PublicInbox::WwwAtomStream::feed_updated($mtime);
print $out <<EOF or die;
--- /dev/null
+% public-inbox developer manual
+
+=head1 NAME
+
+public-inbox extindex format description
+
+=head1 DESCRIPTION
+
+The extindex is an index-only evolution of the per-inbox
+SQLite and Xapian indices used by L<public-inbox-v2-format(5)>
+and L<public-inbox-v1-format(5)>. It exists to facilitate
+searches across multiple inboxes as well as to reduce index
+space when messages are cross-posted to several existing
+inboxes.
+
+It transparently indexes messages across any combination of v1 and v2
+inboxes and data about inboxes themselves.
+
+=head1 DIRECTORY LAYOUT
+
+While inspired by v2, there is no git blob storage nor
+C<msgmap.sqlite3> DB.
+
+Instead, there is an C<ALL.git> (all caps) git repo which treats
+every indexed v1 inbox or v2 epoch as a git alternate.
+
+As with v2 inboxes, it uses C<over.sqlite3> and Xapian "shards"
+for WWW and IMAP use. Several exclusive new tables are added
+to deal with L</XREF3 DEDUPLICATION> and metadata.
+
+Unlike v1 and v2 inboxes, it is NOT designed to map to a NNTP
+newsgroup. Thus it lacks C<msgmap.sqlite3> to enforce the
+unique Message-ID requirement of NNTP.
+
+=head2 INDEX OVERVIEW AND DEFINITIONS
+
+ $SCHEMA_VERSION - DB schema version (for Xapian)
+ $SHARD - Integer starting with 0 based on parallelism
+
+ foo/ # "foo" is the name of the index
+ - ei.lock # lock file to protect global state
+ - ALL.git # empty, alternates for inboxes
+ - ei$SCHEMA_VERSION/$SHARD # per-shard Xapian DB
+ - ei$SCHEMA_VERSION/over.sqlite3 # overview DB for WWW, IMAP
+ - ei$SCHEMA_VERSION/misc # misc Xapian DB
+
+File and directory names are intentionally different from
+analogous v2 names to ensure extindex and v2 inboxes can
+easily be distinguished from each other.
+
+=head2 XREF3 DEDUPLICATION
+
+Due to cross-posted messages being the norm in the large Linux kernel
+development community and Xapian indices being the primary consumer of
+storage, it makes sense to deduplicate indexing as much as possible.
+
+The internal storage format is based on the NNTP "Xref" tuple,
+but with the addition of a third element: the git blob OID.
+Thus the triple is expressed in string form as:
+
+ $NEWSGROUP_NAME:$ARTICLE_NUM:$OID
+
+If no C<newsgroup> is configured for an inbox, the C<inboxdir>
+of the inbox is used.
+
+This data is stored in the C<xref3> table of over.sqlite3.
+
+=head2 misc XAPIAN DB
+
+In addition to the numeric Xapian shards for indexing messages,
+there is a new, in-development Xapian index for storing data
+about inboxes themselves and other non-message data. This
+index allows us to speed up operations involving hundreds or
+thousands of inboxes.
+
+=head1 BENEFITS
+
+In addition to providing cross-inbox search capabilities, it can
+also replace per-inbox Xapian shards (but not per-inbox
+over.sqlite3). This allows reduction in disk space, open file
+handles, and associated memory use.
+
+=head1 CAVEATS
+
+Relocating v1 and v2 inboxes on the filesystem will require
+extindex to be garbage-collected and/or reindexed.
+
+Configuring and maintaining stable C<newsgroup> names before any
+messages are indexed from every inbox can avoid expensive
+reindexing and allow relying exclusively on GC.
+
+=head1 LOCKING
+
+L<flock(2)> locking exclusively locks the empty ei.lock file
+for all non-atomic operations.
+
+=head1 THANKS
+
+Thanks to the Linux Foundation for sponsoring the development
+and testing.
+
+=head1 COPYRIGHT
+
+Copyright 2020 all contributors L<mailto:meta@public-inbox.org>
+
+License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt>
+
+=head1 SEE ALSO
+
+L<public-inbox-v2-format(5)>
Available in public-inbox 1.6.0+.
+=item --update-extindex=EXTINDEX, -E
+
+Update the given external index (L<public-inbox-extindex-format(5)>).
+Either the configured section name (e.g. C<all>) or a directory name
+may be specified.
+
+Defaults to C<all> if C<[extindex "all"]> is configured,
+otherwise no external indices are updated.
+
+May be specified multiple times in rare cases where multiple
+external indices are configured.
+
+=item --no-update-extindex
+
+Do not update the C<all> external index by default. This negates
+all uses of C<-E> / C<--update-extindex=> on the command-line.
+
=back
=head1 FILES
=head1 SEE ALSO
-L<Search::Xapian>, L<DBD::SQLite>
+L<Search::Xapian>, L<DBD::SQLite>, L<public-inbox-extindex-format(5)>
1036 => 'Standard for Interchange of USENET Messages',
5536 => 'Netnews Article Format',
5537 => 'Netnews Architecture and Protocols',
+ 1738 => 'Uniform resource locators',
+ 5092 => 'IMAP URL scheme',
+ 5538 => 'NNTP URI schemes',
6048 => 'NNTP additions to LIST command (TODO)',
8054 => 'NNTP compression',
4642 => 'NNTP TLS',
Documentation/RelNotes/v1.4.0.eml
Documentation/RelNotes/v1.5.0.eml
Documentation/RelNotes/v1.6.0.eml
-Documentation/RelNotes/v1.6.1.eml
Documentation/RelNotes/v1.7.0.wip
Documentation/clients.txt
Documentation/dc-dlvr-spam-flow.txt
Documentation/public-inbox-convert.pod
Documentation/public-inbox-daemon.pod
Documentation/public-inbox-edit.pod
+Documentation/public-inbox-extindex-format.pod
Documentation/public-inbox-httpd.pod
Documentation/public-inbox-imapd.pod
Documentation/public-inbox-index.pod
lib/PublicInbox/Eml.pm
lib/PublicInbox/EmlContentFoo.pm
lib/PublicInbox/ExtMsg.pm
+lib/PublicInbox/ExtSearch.pm
+lib/PublicInbox/ExtSearchIdx.pm
lib/PublicInbox/FakeInotify.pm
lib/PublicInbox/Feed.pm
lib/PublicInbox/Filter/Base.pm
lib/PublicInbox/Filter/RubyLang.pm
lib/PublicInbox/Filter/SubjectTag.pm
lib/PublicInbox/Filter/Vger.pm
+lib/PublicInbox/Gcf2.pm
+lib/PublicInbox/Gcf2Client.pm
lib/PublicInbox/GetlineBody.pm
lib/PublicInbox/Git.pm
lib/PublicInbox/GitAsyncCat.pm
lib/PublicInbox/Inbox.pm
lib/PublicInbox/InboxIdle.pm
lib/PublicInbox/InboxWritable.pm
+lib/PublicInbox/Isearch.pm
lib/PublicInbox/KQNotify.pm
lib/PublicInbox/Linkify.pm
lib/PublicInbox/Listener.pm
lib/PublicInbox/ManifestJsGz.pm
lib/PublicInbox/Mbox.pm
lib/PublicInbox/MboxGz.pm
+lib/PublicInbox/MiscIdx.pm
+lib/PublicInbox/MiscSearch.pm
lib/PublicInbox/MsgIter.pm
lib/PublicInbox/MsgTime.pm
lib/PublicInbox/Msgmap.pm
lib/PublicInbox/WwwStream.pm
lib/PublicInbox/WwwText.pm
lib/PublicInbox/Xapcmd.pm
+lib/PublicInbox/gcf2_libgit2.h
sa_config/Makefile
sa_config/README
sa_config/root/etc/spamassassin/public-inbox.pre
script/public-inbox-compact
script/public-inbox-convert
script/public-inbox-edit
+script/public-inbox-extindex
script/public-inbox-httpd
script/public-inbox-imapd
script/public-inbox-index
t/eml_content_disposition.t
t/eml_content_type.t
t/epoll.t
+t/extsearch.t
t/fail-bin/spamc
t/fake_inotify.t
t/feed.t
t/filter_rubylang.t
t/filter_subjecttag.t
t/filter_vger.t
+t/gcf2.t
+t/gcf2_client.t
t/git-http-backend.psgi
t/git.fast-import-data
t/git.t
t/mda_filter_rubylang.t
t/mid.t
t/mime.t
+t/miscsearch.t
t/msg_iter-nested.eml
t/msg_iter-order.eml
t/msg_iter.t
t/xcpdb-reshard.t
xt/cmp-msgstr.t
xt/cmp-msgview.t
+xt/create-many-inboxes.t
xt/eml_check_limits.t
xt/git-http-backend.t
xt/git_async_cmp.t
my $t = {};
# do not sort
-my @RELEASES = qw(v1.6.1 v1.6.0 v1.5.0 v1.4.0 v1.3.0 v1.2.0 v1.1.0-pre1 v1.0.0);
+my @RELEASES = qw(v1.6.0 v1.5.0 v1.4.0 v1.3.0 v1.2.0 v1.1.0-pre1 v1.0.0);
$v->{news_deps} = [ map { "Documentation/RelNotes/$_.eml" } @RELEASES ];
$v->{txt} = [ qw(INSTALL README COPYING TODO HACKING) ];
# Drop SaPlugin files from the syntax-check list (@syn feeds the
# "$_.syntax" targets) when Mail::SpamAssassin cannot be loaded.
# The module name must be spelled "SpamAssassin"; a misspelling makes
# the require fail unconditionally and the plugins are never checked.
@syn = grep(!/SaPlugin/, @syn) if !eval { require Mail::SpamAssassin };
$v->{syn_files} = \@syn;
$v->{my_syntax} = [map { "$_.syntax" } @syn];
-$v->{-m1} = [ map { (split('/'))[-1] } @EXE_FILES ];
+# Build the list of section-1 manpages from @EXE_FILES: each entry is
+# reduced to its basename and kept only if Documentation/$basename.pod
+# exists.  Basenames lacking a .pod file are warned about and collected
+# in @no_pod instead.
+my @no_pod;
+$v->{-m1} = [ map {
+ my $x = (split('/'))[-1];
+ my $pod = "Documentation/$x.pod";
+ if (-f $pod) {
+ $x;
+ } else {
+ warn "W: $pod missing\n";
+ push @no_pod, $x;
+ # empty list: contribute nothing to the map result
+ ();
+ }
+ } @EXE_FILES ];
$v->{-m5} = [ qw(public-inbox-config public-inbox-v1-format
- public-inbox-v2-format) ];
+ public-inbox-v2-format public-inbox-extindex-format) ];
$v->{-m7} = [ qw(public-inbox-overview public-inbox-tuning) ];
$v->{-m8} = [ qw(public-inbox-daemon) ];
my @sections = (1, 5, 7, 8);
$mod =~ s/\.\w+\z//;
"lib/PublicInbox/$_" => "blib/man3/PublicInbox::$mod.\$(MAN3EXT)"
} qw(Git.pm Import.pm WWW.pod SaPlugin/ListMirror.pod);
+my $warn_no_pod = @no_pod ? "\n\t\@echo W: missing .pod: @no_pod\n" : '';
WriteMakefile(
NAME => 'PublicInbox', # n.b. camel-case is not our choice
# XXX drop "PENDING" in .pod before updating this!
- VERSION => '1.6.1',
+ VERSION => '1.6.0',
AUTHOR => 'Eric Wong <e@80x24.org>',
ABSTRACT => 'public-inbox server infrastructure',
-include Documentation/include.mk
$TGTS
+check-man ::$warn_no_pod
+
# syntax checks are currently GNU make only:
%.syntax :: %
@\$(PERL) -w -I lib -c \$<
git clone https://public-inbox.org/public-inbox.git
git clone https://repo.or.cz/public-inbox.git
+ torsocks git clone http://ou63pmih66umazou.onion/public-inbox.git
torsocks git clone http://hjrcffqmbrq6wope.onion/public-inbox
See below for contact info.
use Plack::Builder;
use PublicInbox::Cgit;
use PublicInbox::Config;
-my $pi_config = PublicInbox::Config->new;
-my $cgit = PublicInbox::Cgit->new($pi_config);
+my $pi_cfg = PublicInbox::Config->new;
+my $cgit = PublicInbox::Cgit->new($pi_cfg);
builder {
eval { enable 'ReverseProxy' };
package PublicInbox::Admin;
use strict;
use parent qw(Exporter);
-use Cwd qw(abs_path);
-use POSIX ();
-our @EXPORT_OK = qw(resolve_repo_dir setup_signals);
+our @EXPORT_OK = qw(setup_signals);
use PublicInbox::Config;
use PublicInbox::Inbox;
use PublicInbox::Spawn qw(popen_rd);
+*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed;
sub setup_signals {
my ($cb, $arg) = @_; # optional
+ require POSIX;
# we call exit() here instead of _exit() so DESTROY methods
# get called (e.g. File::Temp::Dir and PublicInbox::Msgmap)
};
}
-sub resolve_repo_dir {
+sub resolve_inboxdir {
my ($cd, $ver) = @_;
- my $prefix = defined $cd ? $cd : './';
- if (-d $prefix && -f "$prefix/inbox.lock") { # v2
- $$ver = 2 if $ver;
- return abs_path($prefix);
+ my $try = $cd // '.';
+ my $root_dev_ino;
+ while (1) { # favor v2, first
+ if (-f "$try/inbox.lock") {
+ $$ver = 2 if $ver;
+ return rel2abs_collapsed($try);
+ } elsif (-d $try) {
+ my @try = stat _;
+ $root_dev_ino //= do {
+ my @root = stat('/') or die "stat /: $!\n";
+ "$root[0]\0$root[1]";
+ };
+ last if "$try[0]\0$try[1]" eq $root_dev_ino;
+ $try .= '/..'; # continue, cd up
+ } else {
+ die "`$try' is not a directory\n";
+ }
}
+ # try v1 bare git dirs
my $cmd = [ qw(git rev-parse --git-dir) ];
my $fh = popen_rd($cmd, undef, {-C => $cd});
my $dir = do { local $/; <$fh> };
- close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n";
+ close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n";
chomp $dir;
$$ver = 1 if $ver;
- return abs_path($cd) if ($dir eq '.' && defined $cd);
- abs_path($dir);
+ rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir);
}
# for unconfigured inboxes
name => $name,
address => [ "$name\@example.com" ],
inboxdir => $dir,
- # TODO: consumers may want to warn on this:
- #-unconfigured => 1,
+ # consumers (-convert) warn on this:
+ -unconfigured => 1,
});
}
}
my $min_ver = $opt->{-min_inbox_version} || 0;
+ # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames,
+ # pathnames are not unique due to symlinks and bind mounts
my (@old, @ibxs);
- my %dir2ibx;
- if ($cfg) {
+ if ($opt->{all}) {
$cfg->each_inbox(sub {
my ($ibx) = @_;
- my $path = abs_path($ibx->{inboxdir});
- if (defined($path)) {
- $dir2ibx{$path} = $ibx;
+ if (-e $ibx->{inboxdir}) {
+ push(@ibxs, $ibx) if $ibx->version >= $min_ver;
} else {
- warn <<EOF;
-W: $ibx->{name} $ibx->{inboxdir}: $!
-EOF
+ warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n";
}
});
- }
- if ($opt->{all}) {
- my @all = values %dir2ibx;
- @all = grep { $_->version >= $min_ver } @all;
- push @ibxs, @all;
} else { # directories specified on the command-line
- my $i = 0;
my @dirs = @$argv;
- push @dirs, '.' unless @dirs;
- foreach (@dirs) {
- my $v;
- my $dir = resolve_repo_dir($_, \$v);
- if ($v < $min_ver) {
+ push @dirs, '.' if !@dirs && $opt->{-use_cwd};
+ my %s2i; # "st_dev\0st_ino" => array index
+ for (my $i = 0; $i <= $#dirs; $i++) {
+ my $dir = $dirs[$i];
+ my @st = stat($dir) or die "stat($dir): $!\n";
+ $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver));
+ if ($ver >= $min_ver) {
+ $s2i{"$st[0]\0$st[1]"} //= $i;
+ } else {
push @old, $dir;
- next;
}
- my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i);
- $i++;
- push @ibxs, $ibx;
}
+ my $done = \'done';
+ eval {
+ $cfg->each_inbox(sub {
+ my ($ibx) = @_;
+ return if $ibx->version < $min_ver;
+ my $dir = $ibx->{inboxdir};
+ if (my @s = stat $dir) {
+ my $i = delete($s2i{"$s[0]\0$s[1]"})
+ // return;
+ $ibxs[$i] = $ibx;
+ die $done if !keys(%s2i);
+ } else {
+ warn "W: $ibx->{name} $dir: $!\n";
+ }
+ });
+ };
+ die $@ if $@ && $@ ne $done;
+ for my $i (sort { $a <=> $b } values %s2i) {
+ $ibxs[$i] = unconfigured_ibx($dirs[$i], $i);
+ }
+ @ibxs = grep { defined } @ibxs; # duplicates are undef
}
if (@old) {
die "-V$min_ver inboxes not supported by $0\n\t",
sub index_inbox {
my ($ibx, $im, $opt) = @_;
+ require PublicInbox::InboxWritable;
my $jobs = delete $opt->{jobs} if $opt;
if (my $pr = $opt->{-progress}) {
$pr->("indexing $ibx->{inboxdir} ...\n");
}
local %SIG = %SIG;
setup_signals(\&index_terminate, $ibx);
+ my $warn_cb = $SIG{__WARN__} // \&CORE::warn;
+ my $idx = { current_info => $ibx->{inboxdir} };
+ my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore');
+ local $SIG{__WARN__} = sub {
+ return if $warn_ignore->(@_);
+ $warn_cb->($idx->{current_info}, ': ', @_);
+ };
if (ref($ibx) && $ibx->version == 2) {
eval { require PublicInbox::V2Writable };
die "v2 requirements not met: $@\n" if $@;
} else {
my $n = $v2w->{shards};
if ($jobs < ($n + 1) && !$opt->{reshard}) {
- warn
-"Unable to respect --jobs=$jobs on index, inbox was created with $n shards\n";
+ warn <<EOM;
+Unable to respect --jobs=$jobs on index, inbox was created with $n shards
+EOM
}
}
}
- my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
- local $SIG{__WARN__} = sub {
- $warn_cb->($v2w->{current_info}, ': ', @_);
- };
- $v2w->index_sync($opt);
+ $idx = $v2w;
} else {
require PublicInbox::SearchIdx;
- my $s = PublicInbox::SearchIdx->new($ibx, 1);
- $s->index_sync($opt);
+ $idx = PublicInbox::SearchIdx->new($ibx, 1);
}
+ $idx->index_sync($opt);
+ $idx->{nidx} // 0; # returns number processed
}
sub progress_prepare ($) {
use PublicInbox::WwwStatic qw(r);
sub locate_cgit ($) {
- my ($pi_config) = @_;
- my $cgit_bin = $pi_config->{'publicinbox.cgitbin'};
- my $cgit_data = $pi_config->{'publicinbox.cgitdata'};
+ my ($pi_cfg) = @_;
+ my $cgit_bin = $pi_cfg->{'publicinbox.cgitbin'};
+ my $cgit_data = $pi_cfg->{'publicinbox.cgitdata'};
# /var/www/htdocs/cgit is the default install path from cgit.git
# /usr/{lib,share}/cgit is where Debian puts cgit
}
sub new {
- my ($class, $pi_config) = @_;
- my ($cgit_bin, $cgit_data) = locate_cgit($pi_config);
+ my ($class, $pi_cfg) = @_;
+ my ($cgit_bin, $cgit_data) = locate_cgit($pi_cfg);
my $self = bless {
cmd => [ $cgit_bin ],
cgit_data => $cgit_data,
- pi_config => $pi_config,
+ pi_cfg => $pi_cfg,
}, $class;
- $pi_config->fill_all; # fill in -code_repos mapped to inboxes
+ $pi_cfg->fill_all; # fill in -code_repos mapped to inboxes
# some cgit repos may not be mapped to inboxes, so ensure those exist:
- my $code_repos = $pi_config->{-code_repos};
- foreach my $k (keys %$pi_config) {
+ my $code_repos = $pi_cfg->{-code_repos};
+ foreach my $k (keys %$pi_cfg) {
$k =~ /\Acoderepo\.(.+)\.dir\z/ or next;
- my $dir = $pi_config->{$k};
+ my $dir = $pi_cfg->{$k};
$code_repos->{$1} ||= PublicInbox::Git->new($dir);
}
while (my ($nick, $repo) = each %$code_repos) {
$self->{"\0$nick"} = $repo;
}
- my $cgit_static = $pi_config->{-cgit_static};
+ my $cgit_static = $pi_cfg->{-cgit_static};
my $static = join('|', map { quotemeta $_ } keys %$cgit_static);
$self->{static} = qr/\A($static)\z/;
$self;
my $rdr = input_prepare($env) or return r(500);
my $qsp = PublicInbox::Qspawn->new($self->{cmd}, $cgi_env, $rdr);
- my $limiter = $self->{pi_config}->limiter('-cgit');
+ my $limiter = $self->{pi_cfg}->limiter('-cgit');
$qsp->psgi_return($env, $limiter, $parse_cgi_headers);
}
$self->{-by_list_id} = {};
$self->{-by_name} = {};
$self->{-by_newsgroup} = {};
+ $self->{-by_eidx_key} = {};
$self->{-no_obfuscate} = {};
$self->{-limiters} = {};
$self->{-code_repos} = {}; # nick => PublicInbox::Git object
$self->{-by_name}->{$name} // _fill($self, "publicinbox.$name");
}
+# Returns the external index object for the "extindex.$name" config
+# section, as built by _fill_ei.  Results are cached in
+# $self->{-ei_by_name}; since `//=' only skips the call when a defined
+# value is cached, an undef result from _fill_ei is retried on the
+# next call.
+sub lookup_ei {
+ my ($self, $name) = @_;
+ $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name");
+}
+
+# special case for [extindex "all"]
+# Shorthand for lookup_ei($self, 'all'); returns whatever lookup_ei
+# returns for that name.
+sub ALL { lookup_ei($_[0], 'all') }
+
sub each_inbox {
my ($self, $cb, @arg) = @_;
# may auto-vivify if config file is non-existent:
sub config_fh_parse ($$$) {
my ($fh, $rs, $fs) = @_;
- my %rv;
- my (%section_seen, @section_order);
+ my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i);
local $/ = $rs;
- while (defined(my $line = <$fh>)) {
- chomp $line;
- my ($k, $v) = split($fs, $line, 2);
- my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/);
- unless (defined $section_seen{$section}) {
- $section_seen{$section} = 1;
- push @section_order, $section;
- }
-
- my $cur = $rv{$k};
- if (defined $cur) {
+ while (defined($line = <$fh>)) { # perf critical with giant configs
+ $i = index($line, $fs);
+ $k = substr($line, 0, $i);
+ $v = substr($line, $i + 1, -1); # chop off $fs
+ $section = substr($k, 0, rindex($k, '.'));
+ $seen{$section} //= push(@section_order, $section);
+
+ if (defined($cur = $rv{$k})) {
if (ref($cur) eq "ARRAY") {
push @$cur, $v;
} else {
sub git_config_dump {
my ($file) = @_;
return {} unless -e $file;
- my @cmd = (qw/git config -z -l --includes/, "--file=$file");
- my $cmd = join(' ', @cmd);
- my $fh = popen_rd(\@cmd);
+ my $cmd = [ qw(git config -z -l --includes), "--file=$file" ];
+ my $fh = popen_rd($cmd);
my $rv = config_fh_parse($fh, "\0", "\n");
- close $fh or die "failed to close ($cmd) pipe: $?";
+ close $fh or die "failed to close (@$cmd) pipe: $?";
$rv;
}
}
}
+# abs_path resolves symlinks, so we want to avoid it if rel2abs
+# is sufficient and doesn't leave "/.." or "/../"
+#
+# Takes a pathname as its LAST argument ($_[-1]) and returns an
+# absolute pathname.  File::Spec->rel2abs is tried first; Cwd::abs_path
+# is only used when the result ends in "/.." or contains "/../".
+# File::Spec and Cwd are loaded lazily via require.
+sub rel2abs_collapsed {
+ require File::Spec;
+ my $p = File::Spec->rel2abs($_[-1]);
+ # fast path: nothing left to collapse, symlinks stay unresolved
+ return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0;
+ require Cwd;
+ Cwd::abs_path($p);
+}
+
sub _fill {
my ($self, $pfx) = @_;
my $ibx = {};
}
}
- # backwards compatibility:
- $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"};
- if (($ibx->{inboxdir} // '') =~ /\n/s) {
- warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n";
+ # "mainrepo" is backwards compatibility:
+ my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return;
+ if (index($dir, "\n") >= 0) {
+ warn "E: `$dir' must not contain `\\n'\n";
return;
}
foreach my $k (qw(obfuscate)) {
}
}
- return unless defined($ibx->{inboxdir});
- my $name = $pfx;
- $name =~ s/\Apublicinbox\.//;
-
+ my $name = substr($pfx, length('publicinbox.'));
if (!valid_inbox_name($name)) {
warn "invalid inbox name: '$name'\n";
return;
}
$ibx->{name} = $name;
- $ibx->{-pi_config} = $self;
+ $ibx->{-pi_cfg} = $self;
$ibx = PublicInbox::Inbox->new($ibx);
foreach (@{$ibx->{address}}) {
my $lc_addr = lc($_);
$self->{-by_list_id}->{lc($list_id)} = $ibx;
}
}
- if (my $ng = $ibx->{newsgroup}) {
- $self->{-by_newsgroup}->{$ng} = $ibx;
+ if (defined(my $ngname = $ibx->{newsgroup})) {
+ if (ref($ngname)) {
+ delete $ibx->{newsgroup};
+ warn 'multiple newsgroups not supported: '.
+ join(', ', @$ngname). "\n";
+ # Newsgroup name needs to be compatible with RFC 3977
+ # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR.
+ # Leave out a few chars likely to cause problems or conflicts:
+ # '|', '<', '>', ';', '#', '$', '&',
+ } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! ||
+ $ngname eq '') {
+ delete $ibx->{newsgroup};
+ warn "newsgroup name invalid: `$ngname'\n";
+ } else {
+ # PublicInbox::NNTPD does stricter ->nntp_usable
+ # checks, keep this lean for startup speed
+ $self->{-by_newsgroup}->{$ngname} = $ibx;
+ }
+ }
+ unless (defined $ibx->{newsgroup}) { # for ->eidx_key
+ my $abs = rel2abs_collapsed($dir);
+ if ($abs ne $dir) {
+ warn "W: `$dir' canonicalized to `$abs'\n";
+ $ibx->{inboxdir} = $abs;
+ }
}
$self->{-by_name}->{$name} = $ibx;
if ($ibx->{obfuscate}) {
push @$repo_objs, $repo if $repo;
}
}
+ if (my $es = ALL($self)) {
+ require PublicInbox::Isearch;
+ $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es);
+ }
+ $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx;
+}
- $ibx
+# Instantiate the external index for a config prefix.
+# $pfx is a config key prefix such as "extindex.$name"; the directory
+# is read from the "$pfx.topdir" config value.  Returns a
+# PublicInbox::ExtSearch object if that value is defined and names an
+# existing directory, undef otherwise.
+sub _fill_ei ($$) {
+ my ($self, $pfx) = @_;
+ require PublicInbox::ExtSearch;
+ my $d = $self->{"$pfx.topdir"};
+ defined($d) && -d $d ? PublicInbox::ExtSearch->new($d) : undef;
+}
sub urlmatch {
}
}
+# Returns a JSON object created from the first module in the candidate
+# list below which can be loaded, with ->ascii(1) enabled.  The object
+# is kept in a `state' variable and reused by later calls.  If no
+# candidate loads, undef is returned and the search is repeated on the
+# next call since `//=' does not treat undef as cached.
+sub json {
+ state $json;
+ $json //= do {
+ for my $mod (qw(Cpanel::JSON::XS JSON::MaybeXS JSON JSON::PP)) {
+ # string eval so the module name in $mod is treated as a
+ # package name rather than a file name
+ eval "require $mod" or next;
+ # ->ascii encodes non-ASCII to "\uXXXX"
+ $json = $mod->new->ascii(1) and last;
+ }
+ $json;
+ };
+}
+
1;
$PostLoopCallback, # subref to call at the end of each loop, if defined (global)
$LoopTimeout, # timeout of event loop in milliseconds
- $DoneInit, # if we've done the one-time module init yet
@Timers, # timers
$in_loop,
);
@Timers = ();
$PostLoopCallback = undef;
- $DoneInit = 0;
$_io = undef; # closes real $Epoll FD
$Epoll = undef; # may call DSKQXS::DESTROY
-
- *EventLoop = *FirstTimeEventLoop;
}
=head2 C<< CLASS->SetLoopTimeout( $timeout ) >>
immediately.
=cut
-sub SetLoopTimeout {
- return $LoopTimeout = $_[1] + 0;
-}
+# Class method: $_[1] is the new timeout, `+ 0' forces it to a number.
+# The assigned value of $LoopTimeout is also the return value.
+sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 }
=head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >>
fcntl($_io, F_SETFD, $fl | FD_CLOEXEC);
}
+# caller sets return value to $Epoll
sub _InitPoller
{
- return if $DoneInit;
- $DoneInit = 1;
-
if (PublicInbox::Syscall::epoll_defined()) {
- $Epoll = epoll_create();
- set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0);
+ my $fd = epoll_create();
+ set_cloexec($fd) if (defined($fd) && $fd >= 0);
+ $fd;
} else {
my $cls;
for (qw(DSKQXS DSPoll)) {
last if eval "require $cls";
}
$cls->import(qw(epoll_ctl epoll_wait));
- $Epoll = $cls->new;
+ $cls->new;
}
- *EventLoop = *EpollEventLoop;
}
=head2 C<< CLASS->EventLoop() >>
C<PostLoopCallback> below for how to exit the loop.
=cut
-sub FirstTimeEventLoop {
- my $class = shift;
-
- _InitPoller();
-
- EventLoop($class);
-}
sub now () { clock_gettime(CLOCK_MONOTONIC) }
my $timeout = int(($Timers[0][0] - $now) * 1000) + 1;
# -1 is an infinite timeout, so prefer a real timeout
- return $timeout if $LoopTimeout == -1;
-
- # otherwise pick the lower of our regular timeout and time until
- # the next timer
- return $LoopTimeout if $LoopTimeout < $timeout;
- return $timeout;
+ ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout;
}
# We can't use waitpid(-1) safely here since it can hit ``, system(),
$PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1;
}
-sub EpollEventLoop {
+sub EventLoop {
+ $Epoll //= _InitPoller();
local $in_loop = 1;
+ my @events;
do {
- my @events;
- my $i;
my $timeout = RunTimers();
# get up to 1000 events
- my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events);
- for ($i=0; $i<$evcount; $i++) {
+ epoll_wait($Epoll, 1000, $timeout, \@events);
+ for my $fd (@events) {
# it's possible epoll_wait returned many events, including some at the end
# that ones in the front triggered unregister-interest actions. if we
# can't find the %sock entry, it's because we're no longer interested
# in that event.
- $DescriptorMap{$events[$i]->[0]}->event_step;
+ $DescriptorMap{$fd}->event_step;
}
} while (PostEventLoop());
_run_later();
$self->{sock} = $sock;
my $fd = fileno($sock);
- _InitPoller();
-
+ $Epoll //= _InitPoller();
retry:
if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) {
if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) {
}
}
# caller only cares for $events[$i]->[0]
- scalar(@$events);
+ $_ = $_->[0] for @$events;
}
# kqueue is close-on-fork (not exec), so we must not close it
my $fd = $pset[$i++];
my $revents = $pset[$i++] or next;
delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT;
- push @$events, [ $fd ];
+ push @$events, $fd;
}
my $nevents = scalar @$events;
if ($n != $nevents) {
warn "BUG? poll() returned $n, but got $nevents";
}
}
- $n;
}
1;
use POSIX qw(WNOHANG :signal_h);
use Socket qw(IPPROTO_TCP SOL_SOCKET);
sub SO_ACCEPTFILTER () { 0x1000 }
-use Cwd qw/abs_path/;
STDOUT->autoflush(1);
STDERR->autoflush(1);
use PublicInbox::DS qw(now);
require PublicInbox::Listener;
use PublicInbox::EOFpipe;
use PublicInbox::Sigfd;
+use PublicInbox::GitAsyncCat;
my @CMD;
my ($set_user, $oldset);
my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize);
sub daemonize () {
if ($daemonize) {
+ require Cwd;
foreach my $i (0..$#ARGV) {
my $arg = $ARGV[$i];
next unless -e $arg;
- $ARGV[$i] = abs_path($arg);
+ $ARGV[$i] = Cwd::abs_path($arg);
}
check_absolute('stdout', $stdout);
check_absolute('stderr', $stderr);
};
if ($daemonize) {
- my $pid = fork;
- die "could not fork: $!\n" unless defined $pid;
+ my $pid = fork // die "fork: $!";
exit if $pid;
open(STDIN, '+<', '/dev/null') or
open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n";
open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n";
POSIX::setsid();
- $pid = fork;
- die "could not fork: $!\n" unless defined $pid;
+ $pid = fork // die "fork: $!";
exit if $pid;
}
return unless defined $pid_file;
foreach my $fd (3..$end) {
my $s = IO::Handle->new_from_fd($fd, 'r');
if (my $k = sockname($s)) {
- if ($s->blocking) {
- $s->blocking(0);
- warn <<"";
+ my $prev_was_blocking = $s->blocking(0);
+ warn <<"" if $prev_was_blocking;
Inherited socket (fd=$fd) is blocking, making it non-blocking.
Set 'NonBlocking = true' in the systemd.service unit to avoid stalled
processes when multiple service instances start.
- }
$listener_names->{$k} = $s;
push @rv, $s;
} else {
}
sub kill_workers ($) {
- my ($s) = @_;
-
- while (my ($pid, $id) = each %pids) {
- kill $s, $pid;
- }
+ my ($sig) = @_;
+ kill $sig, keys(%pids);
}
sub upgrade_aborted ($) {
daemon_prepare($default);
my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef;
my $for_destroy = daemonize();
+
+ # localize GCF2C for tests:
+ local $PublicInbox::GitAsyncCat::GCF2C;
+
daemon_loop($refresh, $post_accept, $tlsd, $af_default);
PublicInbox::DS->Reset;
# ->DESTROY runs when $for_destroy goes out-of-scope
package PublicInbox::DummyInbox;
use strict;
-sub created_at { 0 } # Msgmap::created_at
+sub uidvalidity { 0 } # always 0; stands in for Msgmap::created_at
sub mm { shift }
sub uid_range { [] } # Over::uid_range
sub subscribe_unlock { undef };
no warnings 'once';
-*max = \&created_at;
+*max = \&uidvalidity; # ->max shares the uidvalidity sub, so it is 0, too
*query_xover = \&uid_range;
*over = \&mm;
-*search = *unsubscribe_unlock =
+*isrch = *search = *unsubscribe_unlock = # (continued on next line) all return undef
*get_art = *description = *base_url = \&subscribe_unlock;
1;
sub search_partial ($$) {
my ($ibx, $mid) = @_;
return if length($mid) < $MIN_PARTIAL_LEN;
- my $srch = $ibx->search or return;
- my $opt = { limit => PARTIAL_MAX, mset => 2 };
+ my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL
+ my $opt = { limit => PARTIAL_MAX, relevance => -1 };
my @try = ("m:$mid*");
my $chop = $mid;
if ($chop =~ s/(\W+)(\w*)\z//) {
sub ext_msg_i {
my ($other, $ctx) = @_;
- return if $other->{name} eq $ctx->{-inbox}->{name} || !$other->base_url;
+ return if $other->{name} eq $ctx->{ibx}->{name} || !$other->base_url;
my $mm = $other->mm or return;
}
}
+# Try to resolve $ctx->{mid} through the ->ALL extindex of the config.
+# Returns nothing when no ->ALL is configured (ext_msg then falls back
+# to walking every inbox), otherwise whatever exact() or
+# partial_response() returns.
+sub ext_msg_ALL ($) {
+	my ($ctx) = @_;
+	my $pi_cfg = $ctx->{www}->{pi_cfg};
+	my $ALL = $pi_cfg->ALL or return;
+	my $by_eidx_key = $pi_cfg->{-by_eidx_key};
+
+	# ->eidx_key failing presumably means $ctx->{ibx} is $ALL itself
+	my $cur_key = eval { $ctx->{ibx}->eidx_key } //
+		return partial_response($ctx);
+
+	# the current inbox is never reported as an "other" location
+	my %seen = ($cur_key => 1);
+	my ($id, $prev);
+	while (my $smsg = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) {
+		my $xr3 = $ALL->over->get_xref3($smsg->{num});
+		for my $xref (@$xr3) {
+			# strip ":$xnum:$oidhex" to leave only the eidx_key
+			$xref =~ s/:[0-9]+:$smsg->{blob}\z// or next;
+			next if $xref eq $cur_key;
+			my $ibx = $by_eidx_key->{$xref} // next;
+			$ibx->base_url or next;
+			next if $seen{$xref}++;
+			push(@{$ctx->{found}}, $ibx);
+		}
+	}
+	return exact($ctx) if $ctx->{found};
+
+	# no exact match anywhere: try partial Message-ID matches in the
+	# current inbox first, then in ->ALL
+	for my $ibxish ($ctx->{ibx}, $ALL) {
+		my $mids = search_partial($ibxish, $ctx->{mid}) or next;
+		push @{$ctx->{partial}}, [ $ibxish, $mids ];
+		$ctx->{n_partial} += scalar(@$mids);
+		last if $ctx->{n_partial} >= PARTIAL_MAX;
+	}
+	return partial_response($ctx);
+}
+
sub ext_msg {
my ($ctx) = @_;
- sub {
+ ext_msg_ALL($ctx) // sub {
$ctx->{-wcb} = $_[0]; # HTTP server write callback
if ($ctx->{env}->{'pi-httpd.async'}) {
require PublicInbox::ConfigIter;
my $iter = PublicInbox::ConfigIter->new(
- $ctx->{www}->{pi_config},
+ $ctx->{www}->{pi_cfg},
\&ext_msg_step, $ctx);
$iter->event_step;
} else {
- $ctx->{www}->{pi_config}->each_inbox(\&ext_msg_i, $ctx);
+ $ctx->{www}->{pi_cfg}->each_inbox(\&ext_msg_i, $ctx);
finalize_exact($ctx);
}
};
# fall back to partial MID matching
my $mid = $ctx->{mid};
- my $cur = $ctx->{-inbox};
+ my $cur = $ctx->{ibx};
my $mids = search_partial($cur, $mid);
if ($mids) {
$ctx->{n_partial} = scalar(@$mids);
finalize_partial($ctx);
}
-sub finalize_partial {
+sub partial_response ($) {
my ($ctx) = @_;
my $mid = $ctx->{mid};
my $code = 404;
my $es = $n_partial == 1 ? '' : 'es';
$n_partial .= '+' if ($n_partial == PARTIAL_MAX);
$s .= "\n$n_partial partial match$es found:\n\n";
- my $cur_name = $ctx->{-inbox}->{name};
+ my $cur_name = $ctx->{ibx}->{name};
foreach my $pair (@{$ctx->{partial}}) {
my ($ibx, $res) = @$pair;
my $env = $ctx->{env} if $ibx->{name} eq $cur_name;
$ctx->{-html_tip} = $s .= '</pre>';
$ctx->{-title_html} = $title;
$ctx->{-upfx} = '../';
- $ctx->{-wcb}->(html_oneshot($ctx, $code));
+ html_oneshot($ctx, $code);
}
+# hand the partial_response() result to the stored HTTP write callback
+sub finalize_partial ($) {
+	my ($ctx) = @_;
+	$ctx->{-wcb}->(partial_response($ctx));
+}
+
sub ext_urls {
my ($ctx, $mid, $href, $html) = @_;
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Read-only external (detached) index for cross inbox search.
+# This is a read-only counterpart to PublicInbox::ExtSearchIdx
+# and behaves like PublicInbox::Inbox AND PublicInbox::Search
+package PublicInbox::ExtSearch;
+use strict;
+use v5.10.1;
+use PublicInbox::Over;
+use PublicInbox::Inbox;
+use PublicInbox::MiscSearch;
+use DBI qw(:sql_types); # SQL_BLOB
+
+# for ->reopen, ->mset, ->mset_to_artnums
+use parent qw(PublicInbox::Search);
+
+# Constructor: $topdir is the top-level directory of the external index.
+# The class argument is ignored, objects are always blessed into this
+# package.
+sub new {
+	my (undef, $topdir) = @_;
+	# e.g. "$topdir/ei15" when SCHEMA_VERSION is 15
+	my $xpfx = "$topdir/ei".PublicInbox::Search::SCHEMA_VERSION;
+	my %self = (topdir => $topdir, xpfx => $xpfx);
+	bless \%self, __PACKAGE__;
+}
+
+# lazily creates and caches the PublicInbox::MiscSearch object
+sub misc {
+	my ($self) = @_;
+	return $self->{misc} if defined $self->{misc};
+	$self->{misc} = PublicInbox::MiscSearch->new("$self->{xpfx}/misc");
+}
+
+# overrides PublicInbox::Search::_xdb, always goes through ->xdb_sharded
+sub _xdb { $_[0]->xdb_sharded }
+
+# same as per-inbox ->over, for now...
+# The PublicInbox::Over object for over.sqlite3 is created on first use
+# and cached in {over}.
+sub over {
+	my ($self) = @_;
+	unless (defined $self->{over}) {
+		my $path = "$self->{xpfx}/over.sqlite3";
+		$self->{over} = PublicInbox::Over->new($path);
+	}
+	$self->{over};
+}
+
+# lazily creates and caches a PublicInbox::Git for "$topdir/ALL.git"
+sub git {
+	my ($self) = @_;
+	my $git = $self->{git};
+	return $git if defined $git;
+	$self->{git} = PublicInbox::Git->new("$self->{topdir}/ALL.git");
+}
+
+# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table
+#
+# $xibx is the inbox the article is being served from (its {newsgroup}
+# is looked up as eidx_key), $xsmsg the message with {blob} and {num}.
+# Returns nothing, after warning, if either the inbox or the message
+# cannot be found in this external index.
+sub nntp_xref_for { # NNTP only
+	my ($self, $xibx, $xsmsg) = @_;
+	my $dbh = over($self)->dbh;
+
+	my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
+
+	$sth->execute($xibx->{newsgroup});
+	my $xibx_id = $sth->fetchrow_array // do {
+		warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n";
+		return;
+	};
+
+	$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1
+
+	$sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB);
+
+	# NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2
+	$sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num});
+	$sth->bind_param(3, $xibx_id);
+	$sth->execute;
+	my $docid = $sth->fetchrow_array // do {
+		warn <<EOF;
+W: `$xibx->{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}
+EOF
+		return;
+	};
+
+	# no LIMIT clause here: the result is presumably bounded by the
+	# number of newsgroups on the server
+	$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ?
+
+	$sth->execute($docid, $xibx_id);
+	my $rows = $sth->fetchall_arrayref;
+
+	my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+
+	my %xref = map {
+		my ($ibx_id, $xnum) = @$_;
+
+		$eidx_key_sth->execute($ibx_id);
+		my $eidx_key = $eidx_key_sth->fetchrow_array;
+
+		# only include if there's a newsgroup name: skip rows whose
+		# inbox is missing from `inboxes' (undef/empty key) and keys
+		# containing '/' (presumably an inboxdir, not a newsgroup)
+		(($eidx_key // '') eq '' || index($eidx_key, '/') >= 0) ?
+			() : ($eidx_key => $xnum)
+	} @$rows;
+	# NOTE(review): {num} may be zero here (see above), confirm callers
+	# do not need {-orig_num} for the current newsgroup
+	$xref{$xibx->{newsgroup}} = $xsmsg->{num};
+	\%xref;
+}
+
+# there is no msgmap for an external index, always undef
+sub mm {
+	return undef;
+}
+
+# no altid mappings: a new, empty hashref on every call
+sub altid_map {
+	return {};
+}
+
+# Returns the cached description, loading "$topdir/description" via
+# PublicInbox::Inbox::cat_desc on first use.  A placeholder string is
+# returned (but not cached) when no description is available.
+sub description {
+	my ($self) = @_;
+	my $desc = ($self->{description} //=
+		PublicInbox::Inbox::cat_desc("$self->{topdir}/description"));
+	$desc // '$EXTINDEX_DIR/description missing';
+}
+
+# no clone URLs yet, a new empty arrayref on every call
+sub cloneurl { return [] } # TODO
+
+# placeholder URL until a real one can be configured
+sub base_url { return 'https://example.com/TODO/' }
+
+# no NNTP URLs, a new empty arrayref on every call
+sub nntp_url { return [] }
+
+no warnings 'once'; # each glob below is only mentioned once in this file
+*smsg_eml = \&PublicInbox::Inbox::smsg_eml; # the next five reuse PublicInbox::Inbox subs unchanged
+*smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid;
+*msg_by_mid = \&PublicInbox::Inbox::msg_by_mid;
+*modified = \&PublicInbox::Inbox::modified;
+*recent = \&PublicInbox::Inbox::recent;
+
+*max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # all three share mm() and return undef
+*isrch = *search = \&PublicInbox::Search::reopen; # ->search and ->isrch both run Search::reopen
+
+1;
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Detached/external index cross inbox search indexing support
+# read-write counterpart to PublicInbox::ExtSearch
+#
+# It's based on the same ideas as public-inbox-v2-format(5) using
+# over.sqlite3 for dedupe and sharded Xapian. msgmap.sqlite3 is
+# missing, so there is no Message-ID conflict resolution, meaning
+# no NNTP support for now.
+#
+# v2 has a 1:1 mapping of index:inbox or msgmap for NNTP support.
+# This is intended to be an M:N index:inbox mapping, but it'll likely
+# be 1:N in common practice (M==1)
+
+package PublicInbox::ExtSearchIdx;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::ExtSearch PublicInbox::Lock);
+use Carp qw(croak carp);
+use Sys::Hostname qw(hostname);
+use POSIX qw(strftime);
+use PublicInbox::Search;
+use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor
+ is_bad_blob);
+use PublicInbox::OverIdx;
+use PublicInbox::MiscIdx;
+use PublicInbox::MID qw(mids);
+use PublicInbox::V2Writable;
+use PublicInbox::InboxWritable;
+use PublicInbox::ContentHash qw(content_hash);
+use PublicInbox::Eml;
+use PublicInbox::DS qw(now);
+use DBI qw(:sql_types); # SQL_BLOB
+
+# Constructor for the read-write external index rooted at $dir.
+# Recognized keys of $opt in here: {indexlevel} (defaults to 'full',
+# 'basic' is rejected for now), {creat} and {fsync}.
+# Dies on an invalid or unsupported indexlevel.
+sub new {
+	my (undef, $dir, $opt) = @_;
+	my $level = $opt->{indexlevel} // 'full';
+	if ($level !~ $PublicInbox::SearchIdx::INDEXLEVELS) {
+		die "invalid indexlevel=$level\n";
+	}
+	die "E: indexlevel=basic not yet supported\n" if $level eq 'basic';
+	my $xpfx = "$dir/ei".PublicInbox::Search::SCHEMA_VERSION;
+	my $self = bless {
+		xpfx => $xpfx,
+		topdir => $dir,
+		creat => $opt->{creat},
+		ibx_map => {}, # (newsgroup//inboxdir) => $ibx
+		ibx_list => [],
+		indexlevel => $level,
+		transact_bytes => 0,
+		total_bytes => 0,
+		current_info => '',
+		parallel => 1,
+		lock_path => "$dir/ei.lock",
+	}, __PACKAGE__;
+
+	# prefer the shard count of an existing index
+	$self->{shards} = $self->count_shards || nproc_shards($opt->{creat});
+
+	my $oidx = PublicInbox::OverIdx->new("$xpfx/over.sqlite3");
+	unless ($opt->{fsync}) {
+		$self->{-no_fsync} = $oidx->{-no_fsync} = 1;
+	}
+	$self->{oidx} = $oidx;
+	$self;
+}
+
+# Registers $ibx under its ->eidx_key unless an inbox with the same key
+# is already attached.  Returns the inbox stored for that key.
+sub attach_inbox {
+	my ($self, $ibx) = @_;
+	my $ibx_map = $self->{ibx_map};
+	my $key = $ibx->eidx_key;
+	return $ibx_map->{$key} if defined $ibx_map->{$key};
+	push @{$self->{ibx_list}}, $ibx;
+	$ibx_map->{$key} = $ibx;
+}
+
+# each_inbox callback, called as ($ibx, $self): note the swapped order
+sub _ibx_attach { attach_inbox($_[1], $_[0]) }
+
+# remembers $cfg in {cfg} and attaches every inbox it iterates over
+sub attach_config {
+	my ($self, $cfg) = @_;
+	($self->{cfg} = $cfg)->each_inbox(\&_ibx_attach, $self);
+}
+
+# Accounts the size of $req->{new_smsg} against the running transaction
+# and raises the {need_checkpoint} flag once {batch_bytes} is reached.
+sub check_batch_limit ($) {
+	my ($req) = @_;
+	my $self = $req->{self};
+	my $new_smsg = $req->{new_smsg};
+
+	# {raw_bytes} may be unset, so just use {bytes}
+	$self->{transact_bytes} += $new_smsg->{bytes};
+
+	# set flag for PublicInbox::V2Writable::index_todo:
+	if ($self->{transact_bytes} >= $self->{batch_bytes}) {
+		${$req->{need_checkpoint}} = 1;
+	}
+}
+
+# Applies a cross-post change for an already indexed message ($smsg).
+# With $req->{new_smsg} set ('m'), the current inbox is added as another
+# reference to the document; otherwise ('d') its reference is removed
+# and the document is dropped once nothing references it anymore.
+sub do_xpost ($$) {
+	my ($req, $smsg) = @_;
+	my ($self, $oid, $xibx, $eml) = @$req{qw(self oid ibx eml)};
+	my $docid = $smsg->{num};
+	my $idx = $self->idx_shard($docid);
+	my $eidx_key = $xibx->eidx_key;
+	my $oidx = $self->{oidx};
+
+	if ($req->{new_smsg}) { # 'm' on cross-posted message
+		$oidx->add_xref3($docid, $req->{xnum}, $oid, $eidx_key);
+		$idx->shard_add_eidx_info($docid, $eidx_key, $eml);
+		return check_batch_limit($req);
+	}
+
+	# 'd'
+	my $rm_eidx_info;
+	my $nr = $oidx->remove_xref3($docid, $oid, $eidx_key, \$rm_eidx_info);
+	if ($nr == 0) {
+		$oidx->eidxq_del($docid);
+		$idx->shard_remove($docid);
+	} elsif ($rm_eidx_info) {
+		$idx->shard_remove_eidx_info($docid, $eidx_key, $eml);
+		$oidx->eidxq_add($docid); # yes, add
+	}
+}
+
+# called by V2Writable::sync_prepare, returns {oidx}->eidx_max
+sub artnum_max {
+	my ($self) = @_;
+	$self->{oidx}->eidx_max;
+}
+
+sub index_unseen ($) { # index $req->{new_smsg} as a new document; consumes {eml} and {ibx} from $req
+ my ($req) = @_;
+ my $new_smsg = $req->{new_smsg} or die 'BUG: {new_smsg} unset';
+ my $eml = delete $req->{eml};
+ $new_smsg->populate($eml, $req);
+ my $self = $req->{self};
+ my $docid = $self->{oidx}->adj_counter('eidx_docid', '+'); # counter result becomes the docid
+ $new_smsg->{num} = $docid;
+ my $idx = $self->idx_shard($docid);
+ $self->{oidx}->add_overview($eml, $new_smsg);
+ my $oid = $new_smsg->{blob};
+ my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset';
+ $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key); # tie docid to (inbox, xnum, blob)
+ $idx->index_raw(undef, $eml, $new_smsg, $ibx->eidx_key);
+ check_batch_limit($req); # may flag the need for a checkpoint
+}
+
+sub do_finalize ($) { # called by do_step once all {mids} of $req were checked
+ my ($req) = @_;
+ if (my $indexed = $req->{indexed}) { # existing docs with equal content, pushed by ck_existing
+ do_xpost($req, $_) for @$indexed;
+ } elsif (exists $req->{new_smsg}) { # totally unseen message
+ index_unseen($req);
+ } else {
+ # `d' message was already unindexed in the v1/v2 inboxes,
+ # so it's too noisy to warn, here.
+ }
+ # cur_cmt may be undef for unindex_oid, set by V2Writable::index_todo
+ if (defined(my $cur_cmt = $req->{cur_cmt})) {
+ ${$req->{latest_cmt}} = $cur_cmt; # publish through the {latest_cmt} scalar ref
+ }
+}
+
+sub do_step ($) { # main iterator for adding messages to the index
+ my ($req) = @_;
+ my $self = $req->{self} // die 'BUG: {self} missing';
+ while (1) {
+ if (my $next_arg = $req->{next_arg}) { # in the middle of iterating one Message-ID
+ if (my $smsg = $self->{oidx}->next_by_mid(@$next_arg)) {
+ $req->{cur_smsg} = $smsg;
+ $self->git->cat_async($smsg->{blob},
+ \&ck_existing, $req);
+ return; # ck_existing calls do_step
+ }
+ delete $req->{cur_smsg}; # no more candidates for this Message-ID
+ delete $req->{next_arg};
+ }
+ my $mid = shift(@{$req->{mids}}); # consumes {mids} one entry at a time
+ last unless defined $mid;
+ my ($id, $prev);
+ $req->{next_arg} = [ $mid, \$id, \$prev ]; # iteration state handed to next_by_mid
+ # loop again
+ }
+ do_finalize($req); # every Message-ID was checked
+}
+
+sub _blob_missing ($) { # called when req->{cur_smsg}->{blob} is bad
+ my ($req) = @_;
+ my $smsg = $req->{cur_smsg} or die 'BUG: {cur_smsg} missing';
+ my $self = $req->{self};
+ my $xref3 = $self->{oidx}->get_xref3($smsg->{num});
+ my @keep = grep(!/:$smsg->{blob}\z/, @$xref3); # xrefs which point at some other blob
+ if (@keep) { # switch the document over to the OID of the first remaining xref
+ $keep[0] =~ /:([a-f0-9]{40,}+)\z/ or
+ die "BUG: xref $keep[0] has no OID";
+ my $oidhex = $1;
+ $self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob});
+ my $upd = $self->{oidx}->update_blob($smsg, $oidhex); # NOTE(review): $upd is never read in this sub
+ my $saved = $self->{oidx}->get_art($smsg->{num}); # NOTE(review): $saved is never read in this sub
+ } else { # nothing else references this document
+ $self->{oidx}->delete_by_num($smsg->{num});
+ }
+}
+
+sub ck_existing { # git->cat_async callback
+ my ($bref, $oid, $type, $size, $req) = @_;
+ my $smsg = $req->{cur_smsg} or die 'BUG: {cur_smsg} missing';
+ if ($type eq 'missing') {
+ _blob_missing($req);
+ } elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) {
+ my $self = $req->{self} // die 'BUG: {self} missing';
+ local $self->{current_info} = "$self->{current_info} $oid";
+ my $cur = PublicInbox::Eml->new($bref);
+ if (content_hash($cur) eq $req->{chash}) { # same content as the message being indexed
+ push @{$req->{indexed}}, $smsg; # for do_xpost
+ } # else { index_unseen later }
+ }
+ do_step($req); # resume the iteration which scheduled this callback
+}
+
+# Is the message visible in the inbox currently being indexed?
+# Returns its article number in that inbox if so, undef otherwise.
+# As a side effect, {eml}, {chash} and {mids} of $req are filled in
+# from $bref.
+sub cur_ibx_xnum ($$) {
+	my ($req, $bref) = @_;
+	my $ibx = $req->{ibx} or die 'BUG: current {ibx} missing';
+
+	my $eml = $req->{eml} = PublicInbox::Eml->new($bref);
+	$req->{chash} = content_hash($eml);
+	my $mids = $req->{mids} = mids($eml);
+
+	my @todo = @$mids; # copy, {mids} is consumed later by do_step
+	for my $mid (@todo) {
+		last unless defined $mid;
+		my ($id, $prev);
+		while (my $x = $ibx->over->next_by_mid($mid, \$id, \$prev)) {
+			return $x->{num} if $x->{blob} eq $req->{oid};
+		}
+	}
+	undef;
+}
+
+# Starts indexing of one added blob.  Bad blobs and blobs which are not
+# (or no longer) visible in the current inbox are skipped silently.
+sub index_oid { # git->cat_async callback for 'm'
+	my ($bref, $oid, $type, $size, $req) = @_;
+	my $self = $req->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
+	return if is_bad_blob($oid, $type, $size, $req->{oid});
+
+	my $new_smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+	$req->{new_smsg} = $new_smsg;
+	$new_smsg->{bytes} = $size + crlf_adjust($$bref);
+
+	my $xnum = $req->{xnum} = cur_ibx_xnum($req, $bref);
+	return unless defined $xnum;
+	++${$req->{nr}};
+	do_step($req);
+}
+
+# Starts unindexing of one deleted blob, unless the blob is bad or the
+# message is visible in the current inbox again.
+sub unindex_oid { # git->cat_async callback for 'd'
+	my ($bref, $oid, $type, $size, $req) = @_;
+	my $self = $req->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
+	return if is_bad_blob($oid, $type, $size, $req->{oid});
+	my $xnum = cur_ibx_xnum($req, $bref);
+	return if defined $xnum; # was re-added
+	do_step($req);
+}
+
+# overrides V2Writable::last_commits, called by sync_ranges via sync_prepare
+# Returns an arrayref with one eidx_meta value per epoch (0..epoch_max),
+# looked up as "lc-v2:$eidx_key//$uidvalidity;$epoch".
+sub last_commits {
+	my ($self, $sync) = @_;
+	my $ibx = $sync->{ibx};
+	my $ekey = $ibx->eidx_key;
+	my $uv = $ibx->uidvalidity;
+	my @heads = map {
+		scalar($self->{oidx}->eidx_meta("lc-v2:$ekey//$uv;$_"))
+	} (0..$sync->{epoch_max});
+	\@heads;
+}
+
+# Returns a short reason why $ibx cannot be (re)indexed into the
+# external index, or undef if it is usable.
+sub _ibx_index_reject ($) {
+	my ($ibx) = @_;
+	return 'unindexed, no msgmap.sqlite3' unless defined($ibx->mm);
+	return 'no UIDVALIDITY' unless defined($ibx->uidvalidity);
+	return 'unindexed, no over.sqlite3' unless defined($ibx->over);
+	undef;
+}
+
+sub _sync_inbox ($$$) { # returns undef, or a "W:"/"E:" message which sync_inbox warns about
+ my ($self, $sync, $ibx) = @_;
+ my $ekey = $ibx->eidx_key;
+ if (defined(my $err = _ibx_index_reject($ibx))) {
+ return "W: skipping $ekey ($err)";
+ }
+ $sync->{ibx} = $ibx;
+ $sync->{nr} = \(my $nr = 0); # reference to a new counter starting at zero
+ my $v = $ibx->version;
+ if ($v == 2) {
+ $sync->{epoch_max} = $ibx->max_git_epoch // return; # NOTE(review): returns before the cleanup below
+ sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable
+ } elsif ($v == 1) {
+ my $uv = $ibx->uidvalidity;
+ my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); # last commit seen for this inbox, if any
+ my $head = $ibx->mm->last_commit //
+ return "E: $ibx->{inboxdir} is not indexed";
+ my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); # only the new range when $lc is set
+ my $unit = { stack => $stk, git => $ibx->git };
+ push @{$sync->{todo}}, $unit;
+ } else {
+ return "E: $ekey unsupported inbox version (v$v)";
+ }
+ for my $unit (@{delete($sync->{todo}) // []}) { # {todo} is consumed here
+ last if $sync->{quit};
+ index_todo($self, $sync, $unit);
+ }
+ $self->{midx}->index_ibx($ibx) unless $sync->{quit};
+ $ibx->git->cleanup; # done with this inbox, now
+ undef;
+}
+
+sub gc_unref_doc ($$$$) { # drop all xref3 rows of ($docid, $ibx_id); used by eidx_gc
+ my ($self, $ibx_id, $eidx_key, $docid) = @_;
+ my $dbh = $self->{oidx}->dbh;
+
+ # for debug/info purposes, oids may no longer be accessible
+ my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+ $sth->execute($docid, $ibx_id);
+ my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; # hex OIDs, only used in messages below
+
+ $dbh->prepare_cached(<<'')->execute($docid, $ibx_id);
+DELETE FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+ my $remain = $self->{oidx}->get_xref3($docid);
+ if (scalar(@$remain)) { # still referenced by another inbox
+ $self->{oidx}->eidxq_add($docid); # enqueue for reindex
+ for my $oid (@oid) {
+ warn "I: unref #$docid $eidx_key $oid\n";
+ }
+ } else { # no references left, remove from its shard
+ warn "I: remove #$docid $eidx_key @oid\n";
+ $self->idx_shard($docid)->shard_remove($docid);
+ }
+}
+
+# Garbage-collects every inbox which is listed in the `inboxes' table
+# but not attached to $self (absent from {ibx_map}).  For each of them
+# the misc index entry, its xref3 rows (via gc_unref_doc), the `inboxes'
+# row and the "lc-*" last-commit keys in eidx_meta are removed.
+# Afterwards each shard is checked against `over' and stale xref3 rows
+# are deleted.  Dies unless ->attach_config was called before.
+sub eidx_gc {
+	my ($self, $opt) = @_;
+	$self->{cfg} or die "E: GC requires ->attach_config\n";
+	$opt->{-idx_gc} = 1;
+	$self->idx_init($opt); # acquire lock via V2Writable::_idx_init
+
+	my $dbh = $self->{oidx}->dbh;
+	my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?');
+	my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes');
+
+	# SQLite LIKE has no default escape character, so an explicit
+	# ESCAPE is required for the escaped `_' and `%' below to be
+	# matched literally
+	my $lc_i = $dbh->prepare(<<'');
+SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ?
+
+	$ibx_ck->execute;
+	while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) {
+		next if $self->{ibx_map}->{$eidx_key};
+		$self->{midx}->remove_eidx_key($eidx_key);
+		warn "I: deleting messages for $eidx_key...\n";
+		$x3_doc->execute($ibx_id);
+		while (defined(my $docid = $x3_doc->fetchrow_array)) {
+			gc_unref_doc($self, $ibx_id, $eidx_key, $docid);
+		}
+		$dbh->prepare_cached(<<'')->execute($ibx_id);
+DELETE FROM inboxes WHERE ibx_id = ?
+
+		# drop last_commit info
+		my $pat = $eidx_key;
+		$pat =~ s/([_%\\])/\\$1/g; # escape LIKE wildcards and the escape itself
+		$lc_i->execute("lc-%:$pat//%", '\\');
+		while (my ($key) = $lc_i->fetchrow_array) {
+			# LIKE is only a coarse filter, be strict here
+			next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!;
+			warn "I: removing $key\n";
+			$dbh->prepare_cached(<<'')->execute($key);
+DELETE FROM eidx_meta WHERE key = ?
+
+		}
+
+		warn "I: $eidx_key removed\n";
+	}
+
+	# it's not real unless it's in `over', we use parallelism here,
+	# shards will be reading directly from over, so commit
+	$self->{oidx}->commit_lazy;
+	$self->{oidx}->begin_lazy;
+
+	for my $idx (@{$self->{idx_shards}}) {
+		warn "I: cleaning up shard #$idx->{shard}\n";
+		$idx->shard_over_check($self->{oidx});
+	}
+	my $nr = $dbh->do(<<'');
+DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over)
+
+	warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0;
+
+	done($self);
+}
+
+# Maps the {ibx_id} of $smsg (which is removed from $smsg) to the inbox
+# object in {ibx_list} by way of $sync->{id2pos}.  Dies if any step of
+# the lookup comes up empty.
+sub _ibx_for ($$$) {
+	my ($self, $sync, $smsg) = @_;
+	my $ibx_id = delete $smsg->{ibx_id};
+	defined($ibx_id) or die '{ibx_id} unset';
+	my $pos = $sync->{id2pos}->{$ibx_id};
+	defined($pos) or die "$ibx_id no pos";
+	my $ibx = $self->{ibx_list}->[$pos];
+	defined($ibx) or die "BUG: ibx for $smsg->{blob} not mapped";
+	$ibx;
+}
+
+# Returns true if the soft RLIMIT_NOFILE looks too low to keep DB handles
+# of all inboxes in {ibx_list} open at once (estimate: one FD per inbox
+# plus 64).  The answer is computed once and cached in {-fd_constrained}.
+# BSD::Resource is used when available, `ulimit -n' of sh(1) otherwise.
+# If the limit cannot be determined, we warn and assume the worst.
+sub _fd_constrained ($) {
+	my ($self) = @_;
+	$self->{-fd_constrained} //= do {
+		my $soft;
+		if (eval { require BSD::Resource; 1 }) {
+			my $NOFILE = BSD::Resource::RLIMIT_NOFILE();
+			($soft, undef) = BSD::Resource::getrlimit($NOFILE);
+		} else {
+			chomp(my $out = `sh -c 'ulimit -n'`);
+			if ($out eq 'unlimited') {
+				$soft = 9**9**9; # infinity, never constrained
+			} elsif ($out =~ /\A[0-9]+\z/) {
+				$soft = $out;
+			} # else: sh failed or gave garbage, leave $soft undef
+		}
+		if (defined($soft)) {
+			my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate
+			my $ret = $want > $soft;
+			if ($ret) {
+				warn <<EOF;
+RLIMIT_NOFILE=$soft insufficient (want: $want), will close DB handles early
+EOF
+			}
+			$ret;
+		} else {
+			warn "Unable to determine RLIMIT_NOFILE: $@\n";
+			1;
+		}
+	};
+}
+
+sub _reindex_finalize ($$$) { # called by _reindex_oid after the last xref3 blob of a docid was read
+ my ($req, $smsg, $eml) = @_;
+ my $sync = $req->{sync};
+ my $self = $sync->{self};
+ my $by_chash = delete $req->{by_chash} or die 'BUG: no {by_chash}'; # content hash => [ smsg, ... ]
+ my $nr = scalar(keys(%$by_chash)) or die 'BUG: no content hashes';
+ my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}';
+ my $docid = $smsg->{num} = $orig_smsg->{num}; # the docid is kept across the reindex
+ $self->{oidx}->add_overview($eml, $smsg); # may rethread
+ check_batch_limit({ %$sync, new_smsg => $smsg });
+ my $chash0 = $smsg->{chash} // die "BUG: $smsg->{blob} no {chash}";
+ my $stable = delete($by_chash->{$chash0}) //
+ die "BUG: $smsg->{blob} chash missing";
+ my $idx = $self->idx_shard($docid);
+ my $top_smsg = pop @$stable; # $smsg was pushed last for its content hash
+ $top_smsg == $smsg or die 'BUG: top_smsg != smsg';
+ my $ibx = _ibx_for($self, $sync, $smsg);
+ $idx->index_raw(undef, $eml, $smsg, $ibx->eidx_key);
+ for my $x (reverse @$stable) { # same content seen through other xrefs
+ $ibx = _ibx_for($self, $sync, $x);
+ my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}';
+ $idx->shard_add_eidx_info($docid, $ibx->eidx_key, $hdr);
+ }
+ return if $nr == 1; # likely, all good
+
+ warn "W: #$docid split into $nr due to deduplication change\n";
+ my @todo; # flat list of ($ibx, $smsg) pairs
+ for my $ary (values %$by_chash) { # remaining hashes differ from $chash0
+ for my $x (reverse @$ary) {
+ warn "removing #$docid xref3 $x->{blob}\n";
+ my $n = $self->{oidx}->remove_xref3($docid, $x->{blob});
+ die "BUG: $x->{blob} invalidated #$docid" if $n == 0;
+ }
+ my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty";
+ $x->{num} = delete($x->{xnum}) // die '{xnum} unset';
+ $ibx = _ibx_for($self, $sync, $x);
+ if (my $over = $ibx->over) {
+ my $e = $over->get_art($x->{num});
+ $e->{blob} eq $x->{blob} or die <<EOF;
+$x->{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num});
+EOF
+ push @todo, $ibx, $e;
+ $over->dbh_close if _fd_constrained($self);
+ } else {
+ die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n";
+ }
+ }
+ undef $by_chash;
+ while (my ($ibx, $e) = splice(@todo, 0, 2)) { # index each split-off message on its own
+ reindex_unseen($self, $sync, $ibx, $e);
+ }
+}
+
+sub _reindex_oid { # git->cat_async callback
+ my ($bref, $oid, $type, $size, $req) = @_;
+ my $sync = $req->{sync};
+ my $self = $sync->{self};
+ my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}';
+ my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2]; # {xr3r} rows are [ ibx_id, xnum, oidhex ]
+ my $docid = $orig_smsg->{num};
+ if (is_bad_blob($oid, $type, $size, $expect_oid)) { # drop this xref, try the next one
+ my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid);
+ if ($remain == 0) {
+ warn "W: #$docid gone or corrupted\n";
+ $self->idx_shard($docid)->shard_remove($docid);
+ } elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) {
+ $self->git->cat_async($next_oid, \&_reindex_oid, $req);
+ } else {
+ warn "BUG: #$docid gone (UNEXPECTED)\n";
+ $self->idx_shard($docid)->shard_remove($docid);
+ }
+ return;
+ }
+ my $ci = $self->{current_info};
+ local $self->{current_info} = "$ci #$docid $oid";
+ my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+ $re_smsg->{bytes} = $size + crlf_adjust($$bref);
+ my $eml = PublicInbox::Eml->new($bref);
+ $re_smsg->populate($eml, { autime => $orig_smsg->{ds},
+ cotime => $orig_smsg->{ts} });
+ my $chash = content_hash($eml);
+ $re_smsg->{chash} = $chash;
+ $re_smsg->{xnum} = $req->{xr3r}->[$req->{ix}]->[1];
+ $re_smsg->{ibx_id} = $req->{xr3r}->[$req->{ix}]->[0];
+ $re_smsg->{hdr} = $eml->header_obj;
+ push @{$req->{by_chash}->{$chash}}, $re_smsg; # grouped by content for _reindex_finalize
+ if (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { # more xrefs to read for this docid
+ $self->git->cat_async($next_oid, \&_reindex_oid, $req);
+ } else { # last $re_smsg is the highest priority xref3
+ local $self->{current_info} = "$ci #$docid";
+ _reindex_finalize($req, $re_smsg, $eml);
+ }
+}
+
+sub _reindex_smsg ($$$) { # schedule the reindex of one docid via its xref3 rows
+ my ($self, $sync, $smsg) = @_;
+ my $docid = $smsg->{num};
+ my $xr3 = $self->{oidx}->get_xref3($docid, 1); # rows are used as [ ibx_id, xnum, oidbin ] below
+ if (scalar(@$xr3) == 0) { # _reindex_check_stale should've covered this
+ warn <<"";
+BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex
+
+ $self->{oidx}->delete_by_num($docid);
+ $self->idx_shard($docid)->shard_remove($docid);
+ return;
+ }
+
+ # we sort {xr3r} in the reverse order of {ibx_list} so we can
+ # hit the common case in _reindex_finalize without rereading
+ # from git (or holding multiple messages in memory).
+ my $id2pos = $sync->{id2pos}; # index in {ibx_list}
+ @$xr3 = sort {
+ $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
+ ||
+ $b->[1] <=> $a->[1] # break ties with {xnum}
+ } @$xr3;
+ @$xr3 = map { [ $_->[0], $_->[1], unpack('H*', $_->[2]) ] } @$xr3; # binary OID to hex
+ my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 };
+ $self->git->cat_async($xr3->[$req->{ix}]->[2], \&_reindex_oid, $req); # _reindex_oid walks the rest
+}
+
+# true if a checkpoint was requested through the {need_checkpoint}
+# scalar ref or if the {next_check} time has passed
+sub checkpoint_due ($) {
+	my ($sync) = @_;
+	my $need = ${$sync->{need_checkpoint}};
+	return $need if $need;
+	now() > $sync->{next_check};
+}
+
+# Returns "$HOSTNAME-$MACHINE_ID", identifying the host in eidxq_lock.
+sub host_ident () {
+	# I've copied FS images and only changed the hostname before,
+	# so prepend hostname.  Use `state' since a BOFH can change
+	# these while this process is running and we always want to be
+	# able to release locks taken by this process.
+	state $retval = do {
+		my $host = hostname;
+		my $m; # machine-id(5) is systemd
+		if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> }
+		# (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs
+		$m ||= `{ sysctl -n kern.hostid ||
+			hostid || ghostid; } 2>/dev/null`
+			|| "no-machine-id-or-hostid-on-$^O";
+		chomp($m);
+		$host . '-' . $m;
+	};
+}
+
+# Releases the eidxq_lock taken by this process, if any.
+# Returns 1 if the lock was ours and got cleared, nothing if we hold
+# no lock or it was taken by another PID (e.g. before a fork), and undef
+# after warning if eidx_meta no longer holds our value.
+sub eidxq_release {
+	my ($self) = @_;
+	my $expect = delete($self->{-eidxq_locked}) or return;
+	my ($owner_pid) = split(/-/, $expect);
+	return if $owner_pid != $$; # shards may fork
+	my $oidx = $self->{oidx};
+	$oidx->begin_lazy;
+	my $cur = $oidx->eidx_meta('eidxq_lock') // '';
+	if ($cur eq $expect) {
+		$oidx->eidx_meta('eidxq_lock', '');
+		return 1;
+	}
+	if ($cur ne '') {
+		warn "E: eidxq_lock($expect) stolen by $cur\n";
+	} else {
+		warn "E: eidxq_lock($expect) released by another process\n";
+	}
+	undef;
+}
+
+# give up our eidxq_lock (if held) and commit that change on destruction
+sub DESTROY {
+	my ($self) = @_;
+	$self->{oidx}->commit_lazy if eidxq_release($self);
+}
+
+# Stores "$PID-$TIME-$EUID-$HOST_IDENT" as `eidxq_lock' in eidx_meta and
+# remembers it in {-eidxq_locked}; returns that value.
+sub _eidxq_take ($) {
+	my ($self) = @_;
+	my $val = join('-', $$, time, $>, host_ident());
+	$self->{oidx}->eidx_meta('eidxq_lock', $val);
+	$self->{-eidxq_locked} = $val;
+}
+
+# Tries to take the `eidxq_lock' in eidx_meta for this process.
+# Returns the lock value (true) if we hold the lock afterwards.
+# Returns false after warning if another process holds it: either a
+# live one of our user on this host, or one we cannot verify (other
+# user, other host, or kill(0) failing with something besides ESRCH).
+# A lock left behind by a dead local process of ours is clobbered.
+sub eidxq_lock_acquire ($) {
+	my ($self) = @_;
+	my $oidx = $self->{oidx};
+	$oidx->begin_lazy;
+	my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self);
+	if (my $locked = $self->{-eidxq_locked}) { # be lazy
+		return $locked if $locked eq $cur;
+	}
+	# see _eidxq_take for the format, $ident may contain `-' itself
+	my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4);
+
+	# %H instead of the %k extension: only C89 conversions are
+	# portable with POSIX::strftime
+	my $t = strftime('%Y-%m-%d %H:%M:%S', gmtime($time));
+	if ($euid == $> && $ident eq host_ident()) {
+		if (kill(0, $pid)) {
+			warn <<EOM; return;
+I: PID:$pid (re)indexing Xapian since $t, it will continue our work
+EOM
+		}
+		if ($!{ESRCH}) {
+			warn "I: eidxq_lock is stale ($cur), clobbering\n";
+			return _eidxq_take($self);
+		}
+		warn "E: kill(0, $pid) failed: $!\n"; # fall-through:
+	}
+	my $fn = $oidx->dbh->sqlite_db_filename;
+	warn <<EOF;
+W: PID:$pid, UID:$euid on $ident is indexing Xapian since $t
+W: If this is unexpected, delete `eidxq_lock' from the `eidx_meta' table:
+W:	sqlite3 $fn 'DELETE FROM eidx_meta WHERE key = "eidxq_lock"'
+EOF
+	undef;
+}
+
+sub eidxq_process ($$) { # for reindexing
+ my ($self, $sync) = @_;
+
+ return unless eidxq_lock_acquire($self);
+ my $dbh = $self->{oidx}->dbh;
+ my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; # nothing queued
+ ${$sync->{nr}} = 0;
+ local $sync->{-regen_fmt} = "%u/$tot\n";
+ my $pr = $sync->{-opt}->{-progress};
+ if ($pr) {
+ my $min = $dbh->selectrow_array('SELECT MIN(docid) FROM eidxq');
+ my $max = $dbh->selectrow_array('SELECT MAX(docid) FROM eidxq');
+ $pr->("Xapian indexing $min..$max (total=$tot)\n");
+ }
+ $sync->{id2pos} //= do { # {-ibx_id} => index in {ibx_list}
+ my %id2pos;
+ my $pos = 0;
+ $id2pos{$_->{-ibx_id}} = $pos++ for @{$self->{ibx_list}};
+ \%id2pos;
+ };
+ my ($del, $iter);
+restart:
+ $del = $dbh->prepare('DELETE FROM eidxq WHERE docid = ?');
+ $iter = $dbh->prepare('SELECT docid FROM eidxq ORDER BY docid ASC');
+ $iter->execute;
+ while (defined(my $docid = $iter->fetchrow_array)) {
+ last if $sync->{quit};
+ if (my $smsg = $self->{oidx}->get_art($docid)) {
+ _reindex_smsg($self, $sync, $smsg);
+ } else {
+ warn "E: #$docid does not exist in over\n";
+ }
+ $del->execute($docid); # dequeue, whether or not it could be reindexed
+ ++${$sync->{nr}};
+
+ if (checkpoint_due($sync)) {
+ $dbh = $del = $iter = undef; # drop all handles before the checkpoint
+ reindex_checkpoint($self, $sync); # release lock
+ $dbh = $self->{oidx}->dbh;
+ goto restart; # prepare fresh statements on the new handle
+ }
+ }
+ $self->git->async_wait_all;
+ $pr->("reindexed ${$sync->{nr}}/$tot\n") if $pr;
+}
+
+# Prepares $req for a message found by reindex_unseen and feeds it into
+# the normal indexing flow (do_step).  Bad blobs are ignored.
+sub _reindex_unseen { # git->cat_async callback
+	my ($bref, $oid, $type, $size, $req) = @_;
+	return if is_bad_blob($oid, $type, $size, $req->{oid});
+	my $self = $req->{self};
+	defined($self) or die 'BUG: {self} unset';
+	local $self->{current_info} = "$self->{current_info} $oid";
+
+	# the size must be adjusted before $bref is handed to Eml->new
+	my $bytes = $size + crlf_adjust($$bref);
+	my $new_smsg = bless { blob => $oid, bytes => $bytes },
+		'PublicInbox::Smsg';
+	my $eml = PublicInbox::Eml->new($bref);
+	$req->{eml} = $eml;
+	$req->{new_smsg} = $new_smsg;
+	$req->{chash} = content_hash($eml);
+	$req->{mids} = mids($eml); # do_step iterates through this
+	do_step($req); # enter the normal indexing flow
+}
+
+# --reindex may catch totally unseen messages, this handles them
+# $xsmsg is the message as known to $ibx; its blob is fetched
+# asynchronously and indexed through _reindex_unseen.
+sub reindex_unseen ($$$$) {
+	my ($self, $sync, $ibx, $xsmsg) = @_;
+	my ($oid, $xnum) = @$xsmsg{qw(blob num)};
+	my %req = (
+		%$sync, # has {self}
+		autime => $xsmsg->{ds},
+		cotime => $xsmsg->{ts},
+		oid => $oid,
+		ibx => $ibx,
+		xnum => $xnum,
+		# {mids} and {chash} will be filled in at _reindex_unseen
+	);
+	my $ekey = $ibx->eidx_key;
+	warn "I: reindex_unseen $ekey:$xnum:$oid\n";
+	$self->git->cat_async($oid, \&_reindex_unseen, \%req);
+}
+
+sub _reindex_check_unseen ($$$) { # walk $ibx in slices, looking each message up in xref3
+ my ($self, $sync, $ibx) = @_;
+ my $ibx_id = $ibx->{-ibx_id};
+ my $slice = 1000;
+ my ($beg, $end) = (1, $slice);
+
+ # first, check if we missed any messages in target $ibx
+ my $msgs;
+ my $pr = $sync->{-opt}->{-progress}; # NOTE(review): $pr is not used again in this sub
+ my $ekey = $ibx->eidx_key;
+ local $sync->{-regen_fmt} =
+ "$ekey checking unseen %u/".$ibx->over->max."\n";
+ ${$sync->{nr}} = 0;
+
+ while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) {
+ ${$sync->{nr}} = $beg;
+ $beg = $msgs->[-1]->{num} + 1; # next slice starts after the last article seen
+ $end = $beg + $slice;
+ if (checkpoint_due($sync)) {
+ reindex_checkpoint($self, $sync); # release lock
+ }
+
+ my $inx3 = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
+SELECT DISTINCT(docid) FROM xref3 WHERE
+ibx_id = ? AND xnum = ? AND oidbin = ?
+
+ for my $xsmsg (@$msgs) {
+ my $oidbin = pack('H*', $xsmsg->{blob});
+ $inx3->bind_param(1, $ibx_id);
+ $inx3->bind_param(2, $xsmsg->{num});
+ $inx3->bind_param(3, $oidbin, SQL_BLOB);
+ $inx3->execute;
+ my $docids = $inx3->fetchall_arrayref;
+ # index messages which were totally missed
+ # the first time around ASAP:
+ if (scalar(@$docids) == 0) {
+ reindex_unseen($self, $sync, $ibx, $xsmsg);
+ } else { # already seen, reindex later
+ for my $r (@$docids) {
+ $self->{oidx}->eidxq_add($r->[0]);
+ }
+ }
+ last if $sync->{quit};
+ }
+ last if $sync->{quit};
+ }
+}
+
+sub _reindex_check_stale ($$$) { # drop xref3 rows of $ibx which no longer match its over.sqlite3
+ my ($self, $sync, $ibx) = @_;
+ my $min = 0;
+ my $pr = $sync->{-opt}->{-progress}; # NOTE(review): $pr is not used again in this sub
+ my $fetching;
+ my $ekey = $ibx->eidx_key;
+ local $sync->{-regen_fmt} =
+ "$ekey check stale/missing %u/".$ibx->over->max."\n";
+ ${$sync->{nr}} = 0;
+ do {
+ if (checkpoint_due($sync)) {
+ reindex_checkpoint($self, $sync); # release lock
+ }
+ # now, check if there's stale xrefs
+ my $iter = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
+SELECT docid,xnum,oidbin FROM xref3 WHERE ibx_id = ? AND docid > ?
+ORDER BY docid,xnum ASC LIMIT 10000
+
+ $iter->execute($ibx->{-ibx_id}, $min);
+ $fetching = undef; # stays undef if this batch is empty, ending the outer loop
+
+ while (my ($docid, $xnum, $oidbin) = $iter->fetchrow_array) {
+ return if $sync->{quit};
+ ${$sync->{nr}} = $xnum;
+
+ $fetching = $min = $docid; # NOTE(review): next batch uses docid > $min, confirm rows of one docid cannot straddle the LIMIT
+ my $smsg = $ibx->over->get_art($xnum);
+ my $oidhex = unpack('H*', $oidbin);
+ my $err;
+ if (!$smsg) {
+ $err = 'stale';
+ } elsif ($smsg->{blob} ne $oidhex) {
+ $err = "mismatch (!= $smsg->{blob})";
+ } else {
+ next; # likely, all good
+ }
+ # current_info already has eidx_key
+ warn "$xnum:$oidhex (#$docid): $err\n";
+ my $del = $self->{oidx}->dbh->prepare_cached(<<'');
+DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
+
+ $del->bind_param(1, $ibx->{-ibx_id});
+ $del->bind_param(2, $xnum);
+ $del->bind_param(3, $oidbin, SQL_BLOB);
+ $del->execute;
+
+ # get_xref3 over-fetches, but this is a rare path:
+ my $xr3 = $self->{oidx}->get_xref3($docid);
+ my $idx = $self->idx_shard($docid);
+ if (scalar(@$xr3) == 0) { # all gone
+ $self->{oidx}->delete_by_num($docid);
+ $self->{oidx}->eidxq_del($docid);
+ $idx->shard_remove($docid);
+ } else { # enqueue for reindex of remaining messages
+ $idx->shard_remove_eidx_info($docid,
+ $ibx->eidx_key);
+ $self->{oidx}->eidxq_add($docid); # yes, add
+ }
+ }
+ } while (defined $fetching);
+}
+
+# Reindex a single inbox: look for unseen messages first, then for stale
+# xrefs (unless a quit was requested in between).  Inboxes rejected by
+# _ibx_index_reject only produce a warning.  Either way the cached
+# over/mm/search/git handles of $ibx are dropped afterwards.
+sub _reindex_inbox ($$$) {
+	my ($self, $sync, $ibx) = @_;
+	my $ekey = $ibx->eidx_key;
+	local $self->{current_info} = $ekey; # prefix for our warnings
+	my $reject = _ibx_index_reject($ibx);
+	if (!defined($reject)) {
+		_reindex_check_unseen($self, $sync, $ibx);
+		$sync->{quit} or _reindex_check_stale($self, $sync, $ibx);
+	} else {
+		warn "W: cannot reindex $ekey ($reject)\n";
+	}
+	delete @$ibx{qw(over mm search git)}; # won't need these for a bit
+}
+
+# Run a full --reindex over every inbox in {ibx_list}, then process the
+# resulting eidxq.  Gives up early (with a warning) if the eidxq lock
+# cannot be acquired, and stops between inboxes once $sync->{quit} is set.
+sub eidx_reindex {
+	my ($self, $sync) = @_;
+
+	# a full reindex takes forever and incremental -extindex processes
+	# can run during our checkpoints, so grab eidxq_lock up front
+	unless (eidxq_lock_acquire($self)) {
+		warn "E: aborting --reindex\n";
+		return;
+	}
+	foreach my $ibx (@{$self->{ibx_list}}) {
+		_reindex_inbox($self, $sync, $ibx);
+		$sync->{quit} and last;
+	}
+	$self->git->async_wait_all; # ensure eidxq gets filled completely
+	$sync->{quit} or eidxq_process($self, $sync);
+}
+
+# Sync one inbox via _sync_inbox, drop its cached {mm} and {over}
+# handles, and warn about the error string _sync_inbox returned, if any.
+sub sync_inbox {
+	my ($self, $sync, $ibx) = @_;
+	my $error = _sync_inbox($self, $sync, $ibx);
+	delete @$ibx{qw(mm over)};
+	if (defined($error)) {
+		warn $error, "\n";
+	}
+}
+
+# Performs one complete sync pass of the external index.
+#
+# $opt is the option hash; the keys read here are {reindex} (deleted from
+# $opt once seen), {scan} (defaults to true) and whatever idx_init,
+# rethread_prepare and rethread_done consume.
+#
+# Returns the $sync state hash so eidx_watch can keep using it; callers
+# should check $sync->{quit} to learn whether the pass was interrupted.
+sub eidx_sync { # main entry point
+ my ($self, $opt) = @_;
+
+ # prefix every warning with {current_info} for the duration of the sync
+ my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
+ local $self->{current_info} = '';
+ local $SIG{__WARN__} = sub {
+ $warn_cb->($self->{current_info}, ': ', @_);
+ };
+ $self->idx_init($opt); # acquire lock via V2Writable::_idx_init
+ $self->{oidx}->rethread_prepare($opt);
+ my $sync = {
+ need_checkpoint => \(my $need_checkpoint = 0),
+ check_intvl => 10,
+ next_check => now() + 10,
+ -opt => $opt,
+ # DO NOT SET {reindex} here, it's incompatible with reused
+ # V2Writable code, reindex is totally different here
+ # compared to v1/v2 inboxes because we have multiple histories
+ self => $self,
+ -regen_fmt => "%u/?\n",
+ };
+ # SIGUSR1 flips the scalar referenced by {need_checkpoint};
+ # QUIT/INT/TERM all share the callback built by quit_cb($sync)
+ local $SIG{USR1} = sub { $need_checkpoint = 1 };
+ my $quit = PublicInbox::SearchIdx::quit_cb($sync);
+ local $SIG{QUIT} = $quit;
+ local $SIG{INT} = $quit;
+ local $SIG{TERM} = $quit;
+ # make sure every inbox has its {-ibx_id} before any pass needs it
+ for my $ibx (@{$self->{ibx_list}}) {
+ $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key);
+ }
+ # {reindex} is removed from $opt so the code below never sees it
+ if (delete($opt->{reindex})) {
+ local $sync->{checkpoint_unlocks} = 1;
+ eidx_reindex($self, $sync);
+ }
+
+ # don't use $_ here, it'll get clobbered by reindex_checkpoint
+ if ($opt->{scan} // 1) {
+ for my $ibx (@{$self->{ibx_list}}) {
+ last if $sync->{quit};
+ sync_inbox($self, $sync, $ibx);
+ }
+ }
+ $self->{oidx}->rethread_done($opt) unless $sync->{quit};
+ eidxq_process($self, $sync) unless $sync->{quit};
+
+ # these two run even when the pass was interrupted
+ eidxq_release($self);
+ done($self);
+ $sync; # for eidx_watch
+}
+
+# Store $latest_cmt in eidx_meta under a key derived from the inbox
+# (eidx_key + uidvalidity) and, for v2 inboxes, the epoch of the unit.
+# Does nothing when $sync has no {unit} or no latest commit is known.
+# The stored value is left untouched when the previous value is an
+# ancestor of $latest_cmt and rev-list counts zero commits between them.
+# Dies on a missing {ibx}, an epoch/version mismatch, or an unsupported
+# inbox version.
+sub update_last_commit { # overrides V2Writable
+	my ($self, $sync, $stk) = @_;
+	my $unit = $sync->{unit} // return;
+	my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}};
+	return unless defined($latest_cmt);
+	my $ibx = $sync->{ibx} or die 'BUG: {ibx} missing';
+	my ($ekey, $uv) = ($ibx->eidx_key, $ibx->uidvalidity);
+	my $epoch = $unit->{epoch};
+	my $ver = $ibx->version;
+	my $meta_key;
+	if ($ver == 1) {
+		defined($epoch) and die 'Unexpected {epoch} for v1 unit';
+		$meta_key = "lc-v1:$ekey//$uv";
+	} elsif ($ver == 2) {
+		defined($epoch) or die 'No {epoch} for v2 unit';
+		$meta_key = "lc-v2:$ekey//$uv;$epoch";
+	} else {
+		die "Unsupported inbox version: $ver";
+	}
+	my $oidx = $self->{oidx};
+	my $prev = $oidx->eidx_meta($meta_key);
+	if (defined($prev) && is_ancestor($self->git, $prev, $latest_cmt)) {
+		my $range = "$prev..$latest_cmt";
+		chomp(my $n = $unit->{git}->qx(qw(rev-list --count), $range));
+		# nothing new since the recorded commit
+		return if $n ne '' && $n == 0;
+	}
+	$oidx->eidx_meta($meta_key, $latest_cmt);
+}
+
+# Delegates to PublicInbox::V2Writable::_idx_init, then stores a new
+# PublicInbox::MiscIdx in {midx}.  idx_init invokes this through
+# with_umask.
+sub _idx_init { # with_umask callback
+ my ($self, $opt) = @_;
+ PublicInbox::V2Writable::_idx_init($self, $opt);
+ $self->{midx} = PublicInbox::MiscIdx->new($self);
+}
+
+# Prepare the extindex for writing; a no-op once {idx_shards} is set.
+#
+# Creates ALL.git if needed and makes sure its objects/info/alternates
+# lists the objects directory of every inbox in {ibx_list}.  Directories
+# are de-duplicated by device+inode; ones which cannot be stat()-ed are
+# warned about and, when $opt->{-idx_gc} is set, left out.  The
+# alternates file is only rewritten when an inbox contributes a new
+# entry.  Finally the shards are initialized and transactions are started
+# on {oidx} and {midx}.
+sub idx_init { # similar to V2Writable
+	my ($self, $opt) = @_;
+	return if $self->{idx_shards};
+
+	$self->git->cleanup;
+
+	my $ALL = $self->git->{git_dir}; # ALL.git
+	-d $ALL or PublicInbox::Import::init_bare($ALL);
+	my $info_dir = "$ALL/objects/info";
+	my $alt = "$info_dir/alternates";
+	my $mode = 0644;
+	my (@old, @new, %seen); # seen: st_dev + st_ino
+
+	# true if the objects directory $dir (found via $from) should be
+	# kept: first sighting of its dev+ino, or unstat-able without gc
+	my $wanted = sub {
+		my ($dir, $from) = @_;
+		if (my @st = stat($dir)) {
+			return !$seen{"$st[0]\0$st[1]"}++;
+		}
+		warn "W: stat($dir) failed (from $from): $!\n";
+		!$opt->{-idx_gc};
+	};
+	if (-e $alt) {
+		open(my $fh, '<', $alt) or die "open $alt: $!";
+		$mode = (stat($fh))[2] & 07777; # keep existing permissions
+		while (my $line = <$fh>) {
+			chomp(my $dir = $line);
+			push(@old, $line) if $wanted->($dir, $alt);
+		}
+	}
+	foreach my $ibx (@{$self->{ibx_list}}) {
+		my $line = $ibx->git->{git_dir} . "/objects\n";
+		chomp(my $dir = $line);
+		push(@new, $line) if $wanted->($dir, $ibx->{inboxdir});
+	}
+	if (@new) {
+		push @old, @new;
+		PublicInbox::V2Writable::write_alternates($info_dir, $mode,
+								\@old);
+	}
+	$self->parallel_init($self->{indexlevel});
+	$self->with_umask(\&_idx_init, $self, $opt);
+	my $oidx = $self->{oidx};
+	$oidx->begin_lazy;
+	$oidx->eidx_prep;
+	$self->{midx}->begin_txn;
+}
+
+# Commit step of --watch mode: armed as a timer by on_inbox_unlock and
+# also called directly by event_step once the resync queue is drained.
+# Clears {-commit_timer}, calls eidxq_process and eidxq_release, runs a
+# checkpoint, then requeues $self so event_step gets another turn.
+sub _watch_commit { # PublicInbox::DS::add_timer callback
+ my ($self) = @_;
+ delete $self->{-commit_timer};
+ eidxq_process($self, $self->{-watch_sync});
+ eidxq_release($self);
+ # "delete local" hides {-regen_fmt} for the rest of this sub only;
+ # the previous value is restored when the sub returns
+ delete local $self->{-watch_sync}->{-regen_fmt};
+ reindex_checkpoint($self, $self->{-watch_sync});
+
+ # call event_step => done unless commit_timer is armed
+ PublicInbox::DS::requeue($self);
+}
+
+# Index whatever is new in $ibx and make sure a commit timer is armed.
+# The process title ($0) reads "sync <eidx_key>" while this runs.  The
+# timer fires _watch_commit after $opt->{'commit-interval'} (default 10);
+# an already-armed timer is left as is.
+sub on_inbox_unlock { # called by PublicInbox::InboxIdle
+	my ($self, $ibx) = @_;
+	my $opt = $self->{-watch_sync}->{-opt};
+	my $progress = $opt->{-progress};
+	my $ekey = $ibx->eidx_key;
+	local $0 = "sync $ekey";
+	$progress and $progress->("indexing $ekey\n");
+	$self->idx_init($opt);
+	sync_inbox($self, $self->{-watch_sync}, $ibx);
+	my $intvl = $opt->{'commit-interval'} // 10;
+	$self->{-commit_timer} //=
+		PublicInbox::DS::add_timer($intvl, \&_watch_commit, $self);
+}
+
+# Reload the configuration in --watch mode.  Only possible when $self was
+# set up with a {cfg}; otherwise a warning is emitted.  On reload the
+# pending resync queue, the inbox list/map and the cached {id2pos} of the
+# watch sync state are discarded before a fresh PublicInbox::Config is
+# attached and handed to $idler->refresh.
+sub eidx_reload { # -extindex --watch SIGHUP handler
+	my ($self, $idler) = @_;
+	if (!$self->{cfg}) {
+		warn "reload not supported without --all\n";
+	} else {
+		my $progress = $self->{-watch_sync}->{-opt}->{-progress};
+		$progress and $progress->('reloading ...');
+		delete $self->{-resync_queue};
+		# empty the containers in place, keeping the same refs
+		@{$self->{ibx_list}} = ();
+		%{$self->{ibx_map}} = ();
+		delete $self->{-watch_sync}->{id2pos};
+		my $new_cfg = PublicInbox::Config->new;
+		attach_config($self, $new_cfg);
+		$idler->refresh($new_cfg);
+		$progress->(" done\n") if $progress;
+	}
+}
+
+# Queues a copy of {ibx_list} as {-resync_queue} (a queue which is already
+# pending is left untouched) and requeues $self so event_step starts
+# working through it.
+sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler
+ my ($self) = @_;
+ $self->{-resync_queue} //= [ @{$self->{ibx_list}} ];
+ PublicInbox::DS::requeue($self); # trigger our ->event_step
+}
+
+# One step of --watch mode work.  While a resync queue exists, one inbox
+# is taken off it per call (and $self is requeued); once the queue is
+# empty it is dropped and a commit is forced.  Without a resync queue,
+# done() is called unless a commit timer is still armed.
+sub event_step { # PublicInbox::DS::requeue callback
+	my ($self) = @_;
+	my $queue = $self->{-resync_queue};
+	if (!$queue) {
+		$self->{-commit_timer} or done($self);
+	} elsif (my $ibx = shift(@$queue)) {
+		on_inbox_unlock($self, $ibx);
+		PublicInbox::DS::requeue($self);
+	} else { # queue drained
+		delete $self->{-resync_queue};
+		_watch_commit($self);
+	}
+}
+
+# Runs an initial eidx_sync, then enters the PublicInbox::DS event loop
+# and keeps the external index current until a quit signal arrives.
+# Returns early, without entering the loop, when the initial sync was
+# interrupted ($sync->{quit}).
+sub eidx_watch { # public-inbox-extindex --watch main loop
+ my ($self, $opt) = @_;
+ # work on a private copy of %SIG; restored when this sub returns
+ local %SIG = %SIG;
+ # placeholder handlers which only warn; eidx_sync localizes its own
+ # USR1/QUIT/INT/TERM handlers on top of these while it runs
+ for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) {
+ $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" };
+ }
+ require PublicInbox::InboxIdle;
+ require PublicInbox::DS;
+ require PublicInbox::Syscall;
+ require PublicInbox::Sigfd;
+ my $idler = PublicInbox::InboxIdle->new($self->{cfg});
+ # without a {cfg}, each inbox is handed to the idler explicitly
+ if (!$self->{cfg}) {
+ $idler->watch_inbox($_) for @{$self->{ibx_list}};
+ }
+ $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}};
+ my $pr = $opt->{-progress};
+ $pr->("performing initial scan ...\n") if $pr;
+ my $sync = eidx_sync($self, $opt); # initial sync
+ return if $sync->{quit};
+ # block signals while the real handlers below are being set up
+ my $oldset = PublicInbox::Sigfd::block_signals();
+ local $self->{current_info} = '';
+ my $cb = $SIG{__WARN__} || \&CORE::warn;
+ local $SIG{__WARN__} = sub { $cb->($self->{current_info}, ': ', @_) };
+ my $sig = {
+ HUP => sub { eidx_reload($self, $idler) },
+ USR1 => sub { eidx_resync_start($self) },
+ TSTP => sub { kill('STOP', $$) },
+ };
+ my $quit = PublicInbox::SearchIdx::quit_cb($sync);
+ $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit;
+ my $sigfd = PublicInbox::Sigfd->new($sig,
+ $PublicInbox::Syscall::SFD_NONBLOCK);
+ # no Sigfd object: fall back to ordinary %SIG handlers
+ %SIG = (%SIG, %$sig) if !$sigfd;
+ local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock
+ if (!$sigfd) {
+ # wake up every second to accept signals if we don't
+ # have signalfd or IO::KQueue:
+ PublicInbox::Sigfd::sig_setmask($oldset);
+ PublicInbox::DS->SetLoopTimeout(1000);
+ }
+ # the callback result is true for as long as no quit was requested
+ PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} });
+ $pr->("initial scan complete, entering event loop\n") if $pr;
+ PublicInbox::DS->EventLoop; # calls InboxIdle->event_step
+ done($self);
+}
+
+# The subs below are not defined in this package; they are aliased from
+# PublicInbox::V2Writable and PublicInbox::InboxWritable through glob
+# assignment.  The 'once' warning category is disabled for these
+# assignments.
+no warnings 'once';
+*done = \&PublicInbox::V2Writable::done;
+*with_umask = \&PublicInbox::InboxWritable::with_umask;
+*parallel_init = \&PublicInbox::V2Writable::parallel_init;
+*nproc_shards = \&PublicInbox::V2Writable::nproc_shards;
+*sync_prepare = \&PublicInbox::V2Writable::sync_prepare;
+*index_todo = \&PublicInbox::V2Writable::index_todo;
+*count_shards = \&PublicInbox::V2Writable::count_shards;
+*atfork_child = \&PublicInbox::V2Writable::atfork_child;
+*idx_shard = \&PublicInbox::V2Writable::idx_shard;
+*reindex_checkpoint = \&PublicInbox::V2Writable::reindex_checkpoint;
+
+1;
sub generate_thread_atom {
my ($ctx) = @_;
- my $msgs = $ctx->{msgs} = $ctx->{-inbox}->over->get_thread($ctx->{mid});
+ my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid});
return _no_thread() unless @$msgs;
PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i);
}
# if the 'r' query parameter is given, it is a legacy permalink
# which we must continue supporting:
my $qp = $ctx->{qp};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
if ($qp && !$qp->{r} && $ibx->over) {
return PublicInbox::View::index_topics($ctx);
}
sub recent_msgs {
my ($ctx) = @_;
- my $ibx = $ctx->{-inbox};
- my $max = $ibx->{feedmax};
+ my $ibx = $ctx->{ibx};
+ my $max = $ibx->{feedmax} // 25;
return PublicInbox::View::paginate_recent($ctx, $max) if $ibx->over;
# only for rare v1 inboxes which aren't indexed at all
my ($class, %opts) = @_;
my $altid = delete $opts{-altid};
my $self = $class->SUPER::new(%opts);
- my $ibx = $self->{-inbox};
+ my $ibx = $self->{ibx};
# altid = serial:ruby-core:file=msgmap.sqlite3
if (!$altid && $ibx && $ibx->{altid}) {
$altid ||= $ibx->{altid}->[0];
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# backend for a git-cat-file-workalike based on libgit2,
+# other libgit2 stuff may go here, too.
+package PublicInbox::Gcf2;
+use strict;
+use PublicInbox::Spawn qw(which popen_rd);
+use Fcntl qw(LOCK_EX);
+use IO::Handle; # autoflush
+my (%CFG, $c_src, $lockfh);
+# Compile-time setup for the Inline::C build below:
+# - opens the lock file inside PERL_INLINE_DIRECTORY
+# - asks pkg-config for the libgit2 linker and compiler flags
+# - slurps the C source (gcf2_libgit2.h, next to this file) into $c_src
+# - stores the flags in %CFG and takes an exclusive lock on the lock file
+# Dies if PERL_INLINE_DIRECTORY is unset, pkg-config cannot be found,
+# libgit2 is not known to pkg-config, or the C source cannot be read.
+BEGIN {
+	# PublicInbox::Spawn will set PERL_INLINE_DIRECTORY
+	# to ~/.cache/public-inbox/inline-c if it exists
+	my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} //
+		die 'PERL_INLINE_DIRECTORY not defined';
+	my $f = "$inline_dir/.public-inbox.lock";
+	open $lockfh, '>', $f or die "failed to open $f: $!\n";
+
+	# fail with a clear message instead of handing an undefined
+	# command to popen_rd when pkg-config is not installed
+	# NOTE(review): assumes which() returns undef when nothing is found
+	my $pc = which($ENV{PKG_CONFIG} // 'pkg-config') //
+		die "E: pkg-config missing for libgit2\n";
+	my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!);
+	my $rdr = {};
+	open $rdr->{2}, '>', '/dev/null' or die "open /dev/null: $!";
+	for my $x (qw(libgit2)) {
+		# overwriting $l/$c drops the pipe handle, after which $?
+		# is inspected for the pkg-config exit status
+		my $l = popen_rd([$pc, '--libs', $x], undef, $rdr);
+		$l = do { local $/; <$l> };
+		next if $?;
+		my $c = popen_rd([$pc, '--cflags', $x], undef, $rdr);
+		$c = do { local $/; <$c> };
+		next if $?;
+
+		# note: we name C source files .h to prevent
+		# ExtUtils::MakeMaker from automatically trying to
+		# build them.
+		my $src = "$dir/gcf2_$x.h";
+		open(my $fh, '<', $src) or die "E: $src: $!\n";
+		chomp($l, $c); # must happen before $/ is undefined below
+		local $/;
+		defined($c_src = <$fh>) or die "read $src: $!\n";
+		$CFG{LIBS} = $l;
+		$CFG{CCFLAGSEX} = $c;
+		last;
+	}
+	die "E: libgit2 not installed\n" unless $c_src;
+
+	# CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking
+	flock($lockfh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n";
+}
+
+# we use Capitalized and ALLCAPS for compatibility with old Inline::C
+use Inline C => Config => %CFG, BOOT => 'git_libgit2_init();';
+use Inline C => $c_src;
+undef $c_src;
+undef %CFG;
+undef $lockfh;
+
+# Register $objdir with $gcf2 as an alternate object directory.  Every
+# absolute path listed in $objdir/info/alternates is registered first;
+# relative entries in that file are not passed on.
+sub add_alt ($$) {
+	my ($gcf2, $objdir) = @_;
+
+	# libgit2 (tested 0.27.7+dfsg.1-0.2 and 0.28.3+dfsg.1-1~bpo10+1
+	# in Debian) doesn't handle relative epochs properly when nested
+	# multiple levels.  Add all the absolute paths to workaround it,
+	# since $EXTINDEX_DIR/ALL.git/objects/info/alternates uses absolute
+	# paths to reference $V2INBOX_DIR/all.git/objects and
+	# $V2INBOX_DIR/all.git/objects/info/alternates uses relative paths
+	# to refer to $V2INBOX_DIR/git/$EPOCH.git/objects
+	#
+	# See https://bugs.debian.org/975607
+	if (open(my $fh, '<', "$objdir/info/alternates")) {
+		my @abs_alt;
+		while (defined(my $line = <$fh>)) {
+			push(@abs_alt, $line) if $line =~ m!^/!;
+		}
+		chomp(@abs_alt);
+		foreach my $alt (@abs_alt) {
+			$gcf2->add_alternate($alt);
+		}
+	}
+	$gcf2->add_alternate($objdir);
+}
+
+# Usage: $^X -MPublicInbox::Gcf2 -e 'PublicInbox::Gcf2::loop()'
+# (see lib/PublicInbox/Gcf2Client.pm)
+# Request loop of the child process.  Each line on STDIN has the form
+# "<oid> <git_dir>".  The objects directory of a git_dir is registered the
+# first time it is seen, then the object is written out via cat_oid.  A
+# failed lookup is retried once with a brand new Gcf2 object; if that
+# fails too, "<oid> missing" is printed like git-cat-file would.
+sub loop {
+	my $gcf2 = new();
+	my %seen;
+	STDERR->autoflush(1);
+	STDOUT->autoflush(1);
+
+	while (<STDIN>) {
+		chomp;
+		my ($oid, $git_dir) = split(/ /, $_, 2);
+		add_alt($gcf2, "$git_dir/objects") unless $seen{$git_dir}++;
+		next if $gcf2->cat_oid(1, $oid);
+
+		# retry once if missing.  We only get unabbreviated OIDs
+		# from SQLite or Xapian DBs, here, so malicious clients
+		# can't trigger excessive retries:
+		warn "I: $$ $oid missing, retrying in $git_dir\n";
+
+		# start over with a fresh object knowing only this git_dir
+		$gcf2 = new();
+		%seen = ($git_dir => 1);
+		add_alt($gcf2, "$git_dir/objects");
+
+		if (!$gcf2->cat_oid(1, $oid)) {
+			warn "W: $$ $oid missing after retry\n";
+			print "$oid missing\n"; # mimic git-cat-file
+		} else {
+			warn "I: $$ $oid found after retry\n";
+		}
+	}
+}
+
+1;
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# connects public-inbox processes to PublicInbox::Gcf2::loop()
+package PublicInbox::Gcf2Client;
+use strict;
+use parent qw(PublicInbox::DS);
+use PublicInbox::Git;
+use PublicInbox::Spawn qw(popen_rd);
+use IO::Handle ();
+use PublicInbox::Syscall qw(EPOLLONESHOT);
+# fields:
+# async_cat => GitAsyncCat ref (read-only pipe)
+# sock => writable pipe to Gcf2::loop
+
+
+# Spawn a child perl running PublicInbox::Gcf2::loop() and return the
+# client object for it.  Called as a plain function; the optional $rdr
+# hashref of redirects has its {0} slot overwritten with the read end of
+# the request pipe.  The write end becomes the socket handed to
+# PublicInbox::DS, while {in} and {pid} take whatever popen_rd returns.
+sub new {
+	my ($rdr) = @_;
+	my $self = bless {}, __PACKAGE__;
+	# ensure the child process has the same @INC we do:
+	my %env = (PERL5LIB => join(':', @INC));
+	pipe(my ($out_r, $out_w)) or die "pipe failed: $!";
+	($rdr //= {})->{0} = $out_r;
+	my @cmd = ($^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop()]);
+	my ($in, $pid) = popen_rd(\@cmd, \%env, $rdr);
+	@$self{qw(in pid)} = ($in, $pid);
+	$^O eq 'linux' and fcntl($out_w, 1031, 4096); # 1031: F_SETPIPE_SZ
+	$out_w->autoflush(1);
+	$out_w->blocking(0);
+	$self->{inflight} = [];
+	$self->SUPER::new($out_w, EPOLLONESHOT); # detect errors once
+}
+
+# Close our end of the pipe, then let PublicInbox::Git::fail handle the
+# rest with the remaining arguments.
+sub fail {
+	my ($self, @why) = @_;
+	$self->close; # PublicInbox::DS::close
+	PublicInbox::Git::fail($self, @why);
+}
+
+# Send the request line $req to the child and remember ($req, $cb, $arg)
+# in {inflight}.  If a write buffer is pending, one cat_async_step is
+# taken first.  A failed write only counts as fatal once {sock} is gone.
+sub cat_async ($$$;$) {
+	my ($self, $req, $cb, $arg) = @_;
+	my $inflight = $self->{inflight};
+
+	# {wbuf} is rare, I hope:
+	$self->{wbuf} and cat_async_step($self, $inflight);
+
+	$self->write(\"$req\n") or $self->{sock} or
+		$self->fail("gcf2c write: $!");
+	push @$inflight, $req, $cb, $arg;
+}
+
+# ensure PublicInbox::Git::cat_async_step never calls cat_async_retry
+# (the empty body yields a false value, and cat_async_step only retries
+# a "missing" result when ->alternates_changed is true)
+sub alternates_changed {}
+
+# We own the write-only end of the pipe; DS->EventLoop calls this.
+# Flushes buffered writes and closes the socket once {in} is gone.
+sub event_step {
+	my ($self) = @_;
+	$self->flush_write;
+	if (!$self->{in}) { # process died
+		$self->close;
+	}
+}
+
+no warnings 'once';
+
+# used by GitAsyncCat
+*cat_async_step = \&PublicInbox::Git::cat_async_step;
+
+1;
use parent qw(Exporter);
use POSIX ();
use IO::Handle; # ->autoflush
-use Errno qw(EINTR);
+use Errno qw(EINTR EAGAIN);
use File::Glob qw(bsd_glob GLOB_NOSORT);
+use File::Spec ();
use Time::HiRes qw(stat);
use PublicInbox::Spawn qw(popen_rd);
use PublicInbox::Tmpfile;
+use IO::Poll qw(POLLIN);
use Carp qw(croak);
+use Digest::SHA ();
our @EXPORT_OK = qw(git_unquote git_quote);
our $PIPE_BUFSIZ = 65536; # Linux default
our $in_cleanup;
+our $RDTIMEO = 60_000; # milliseconds
use constant MAX_INFLIGHT =>
(($^O eq 'linux' ? 4096 : POSIX::_POSIX_PIPE_BUF()) * 3)
sub last_check_err {
my ($self) = @_;
my $fh = $self->{err_c} or return;
- sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!");
+ sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!");
defined(sysread($fh, my $buf, -s $fh)) or
- fail($self, "sysread failed: $!");
+ $self->fail("sysread failed: $!");
$buf;
}
if ($self->{$pid}) {
if (defined $err) { # "err_c"
my $fh = $self->{$err};
- sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!");
- truncate($fh, 0) or fail($self, "truncate failed: $!");
+ sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!");
+ truncate($fh, 0) or $self->fail("truncate failed: $!");
}
return;
}
my ($out_r, $out_w);
- pipe($out_r, $out_w) or fail($self, "pipe failed: $!");
+ pipe($out_r, $out_w) or $self->fail("pipe failed: $!");
my @cmd = (qw(git), "--git-dir=$self->{git_dir}",
qw(-c core.abbrev=40 cat-file), $batch);
my $redir = { 0 => $out_r };
if ($err) {
my $id = "git.$self->{git_dir}$batch.err";
- my $fh = tmpfile($id) or fail($self, "tmpfile($id): $!");
+ my $fh = tmpfile($id) or $self->fail("tmpfile($id): $!");
$self->{$err} = $fh;
$redir->{2} = $fh;
}
$self->{$in} = $in_r;
}
+sub poll_in ($) { IO::Poll::_poll($RDTIMEO, fileno($_[0]), my $ev = POLLIN) }
+
sub my_read ($$$) {
my ($fh, $rbuf, $len) = @_;
my $left = $len - length($$rbuf);
$r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf));
if ($r) {
$left -= $r;
+ } elsif (defined($r)) { # EOF
+ return 0;
} else {
- next if (!defined($r) && $! == EINTR);
- return $r;
+ next if ($! == EAGAIN and poll_in($fh));
+ next if $! == EINTR; # may be set by sysread or poll_in
+ return; # unrecoverable error
}
}
\substr($$rbuf, 0, $len, '');
if ((my $n = index($$rbuf, "\n")) >= 0) {
return substr($$rbuf, 0, $n + 1, '');
}
- my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf));
- next if $r || (!defined($r) && $! == EINTR);
- return defined($r) ? '' : undef; # EOF or error
+ my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf))
+ and next;
+
+ # return whatever's left on EOF
+ return substr($$rbuf, 0, length($$rbuf)+1, '') if defined($r);
+
+ next if ($! == EAGAIN and poll_in($fh));
+ next if $! == EINTR; # may be set by sysread or poll_in
+ return; # unrecoverable error
}
}
for (my $i = 0; $i < @$inflight; $i += 3) {
$buf .= "$inflight->[$i]\n";
}
- print { $self->{out} } $buf or fail($self, "write error: $!");
+ print { $self->{out} } $buf or $self->fail("write error: $!");
unshift(@$inflight, \$req, $cb, $arg); # \$ref to indicate retried
cat_async_step($self, $inflight); # take one step
my $rbuf = delete($self->{cat_rbuf}) // \(my $new = '');
my ($bref, $oid, $type, $size);
my $head = my_readline($self->{in}, $rbuf);
+ # ->fail may be called via Gcf2Client.pm
if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) {
($oid, $type, $size) = ($1, $2, $3 + 0);
$bref = my_read($self->{in}, $rbuf, $size + 1) or
- fail($self, defined($bref) ? 'read EOF' : "read: $!");
- chop($$bref) eq "\n" or fail($self, 'LF missing after blob');
- } elsif ($head =~ / missing$/) {
+ $self->fail(defined($bref) ? 'read EOF' : "read: $!");
+ chop($$bref) eq "\n" or $self->fail('LF missing after blob');
+ } elsif ($head =~ s/ missing\n//s) {
+ $oid = $head;
# ref($req) indicates it's already been retried
- if (!ref($req) && !$in_cleanup && alternates_changed($self)) {
+ # -gcf2 retries internally, so it never hits this path:
+ if (!ref($req) && !$in_cleanup && $self->alternates_changed) {
return cat_async_retry($self, $inflight,
$req, $cb, $arg);
}
$type = 'missing';
- $oid = ref($req) ? $$req : $req;
+ $oid = ref($req) ? $$req : $req if $oid eq '';
} else {
- fail($self, "Unexpected result from async git cat-file: $head");
+ my $err = $! ? " ($!)" : '';
+ $self->fail("bad result from async cat-file: $head$err");
}
- eval { $cb->($bref, $oid, $type, $size, $arg) };
$self->{cat_rbuf} = $rbuf if $$rbuf ne '';
+ eval { $cb->($bref, $oid, $type, $size, $arg) };
warn "E: $oid: $@\n" if $@;
}
sub cat_async_wait ($) {
my ($self) = @_;
- my $inflight = delete $self->{inflight} or return;
+ my $inflight = $self->{inflight} or return;
while (scalar(@$inflight)) {
cat_async_step($self, $inflight);
}
my ($self, $inflight_c) = @_;
die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3;
my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3);
- my $rbuf = delete($self->{rbuf_c}) // \(my $new = '');
+ my $rbuf = delete($self->{chk_rbuf}) // \(my $new = '');
chomp(my $line = my_readline($self->{in_c}, $rbuf));
my ($hex, $type, $size) = split(/ /, $line);
# https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/
if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') {
my $ret = my_read($self->{in_c}, $rbuf, $type + 1);
- fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret;
+ $self->fail(defined($ret) ? 'read EOF' : "read: $!") if !$ret;
}
+ $self->{chk_rbuf} = $rbuf if $$rbuf ne '';
eval { $cb->($hex, $type, $size, $arg, $self) };
warn "E: check($req) $@\n" if $@;
- $self->{rbuf_c} = $rbuf if $$rbuf ne '';
}
sub check_async_wait ($) {
my ($self) = @_;
- my $inflight_c = delete $self->{inflight_c} or return;
+ my $inflight_c = $self->{inflight_c} or return;
while (scalar(@$inflight_c)) {
check_async_step($self, $inflight_c);
}
sub check_async ($$$$) {
my ($self, $oid, $cb, $arg) = @_;
my $inflight_c = $self->{inflight_c} // check_async_begin($self);
- if (scalar(@$inflight_c) >= MAX_INFLIGHT) {
+ while (scalar(@$inflight_c) >= MAX_INFLIGHT) {
check_async_step($self, $inflight_c);
}
- print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!");
+ print { $self->{out_c} } $oid, "\n" or $self->fail("write error: $!");
push(@$inflight_c, $oid, $cb, $arg);
}
sub _destroy {
my ($self, $rbuf, $in, $out, $pid, $err) = @_;
- my $p = delete $self->{$pid} or return;
delete @$self{($rbuf, $in, $out)};
delete $self->{$err} if $err; # `err_c'
+ # GitAsyncCat::event_step may delete {pid}
+ my $p = delete $self->{$pid} or return;
+
# PublicInbox::DS may not be loaded
eval { PublicInbox::DS::dwaitpid($p, undef, undef) };
waitpid($p, 0) if $@; # wait synchronously if not in event loop
sub cat_async_abort ($) {
my ($self) = @_;
- my $inflight = delete $self->{inflight} or die 'BUG: not in async';
+ # Every pending callback is invoked with an undef blob (and the
+ # request's OID) before the read buffer and queue are dropped and
+ # cleanup runs.
+ if (my $inflight = $self->{inflight}) {
+ while (@$inflight) {
+ my ($req, $cb, $arg) = splice(@$inflight, 0, 3);
+ # a retried request is queued as a scalar ref (\$req),
+ # unwrap it so the callback and warning see the OID
+ $req = $$req if ref($req);
+ $req =~ s/ .*//; # drop git_dir for Gcf2Client
+ eval { $cb->(undef, $req, undef, undef, $arg) };
+ warn "E: $req: $@ (in abort)\n" if $@;
+ }
+ delete $self->{cat_rbuf};
+ delete $self->{inflight};
+ }
cleanup($self);
}
-sub fail {
+sub fail { # may be augmented in subclasses
my ($self, $msg) = @_;
- $self->{inflight} ? cat_async_abort($self) : cleanup($self);
- croak("git $self->{git_dir}: $msg");
+ cat_async_abort($self);
+ croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg");
}
sub popen {
sub qx {
my ($self, @cmd) = @_;
my $fh = $self->popen(@cmd);
- local $/ = "\n";
- return <$fh> if wantarray;
- local $/;
- <$fh>
+ local $/ = wantarray ? "\n" : undef;
+ <$fh>;
+}
+
+# check_async and cat_async may trigger the other, so ensure they're
+# both completely done by using this:
+sub async_wait_all ($) {
+ my ($self) = @_;
+ while (scalar(@{$self->{inflight_c} // []}) ||
+ scalar(@{$self->{inflight} // []})) {
+ $self->check_async_wait;
+ $self->cat_async_wait;
+ }
}
# returns true if there are pending "git cat-file" processes
my ($self) = @_;
local $in_cleanup = 1;
delete $self->{async_cat};
- check_async_wait($self);
- cat_async_wait($self);
+ async_wait_all($self);
+ delete $self->{inflight};
+ delete $self->{inflight_c};
_destroy($self, qw(cat_rbuf in out pid));
_destroy($self, qw(chk_rbuf in_c out_c pid_c err_c));
!!($self->{pid} || $self->{pid_c});
}
+
# assuming a well-maintained repo, this should be a somewhat
# accurate estimation of its size
# TODO: show this in the WWW UI as a hint to potential cloners
sub cat_async_begin {
my ($self) = @_;
- cleanup($self) if alternates_changed($self);
- batch_prepare($self);
+ cleanup($self) if $self->alternates_changed;
+ $self->batch_prepare;
die 'BUG: already in async' if $self->{inflight};
$self->{inflight} = [];
}
sub cat_async ($$$;$) {
my ($self, $oid, $cb, $arg) = @_;
my $inflight = $self->{inflight} // cat_async_begin($self);
- if (scalar(@$inflight) >= MAX_INFLIGHT) {
+ while (scalar(@$inflight) >= MAX_INFLIGHT) {
cat_async_step($self, $inflight);
}
-
- print { $self->{out} } $oid, "\n" or fail($self, "write error: $!");
+ print { $self->{out} } $oid, "\n" or $self->fail("write error: $!");
push(@$inflight, $oid, $cb, $arg);
}
-# this is safe to call inside $cb, but not guaranteed to enqueue
-# returns true if successful, undef if not.
sub async_prefetch {
my ($self, $oid, $cb, $arg) = @_;
- if (defined($self->{async_cat}) && (my $inflight = $self->{inflight})) {
+ if (my $inflight = $self->{inflight}) {
# we could use MAX_INFLIGHT here w/o the halving,
# but lets not allow one client to monopolize a git process
if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) {
print { $self->{out} } $oid, "\n" or
- fail($self, "write error: $!");
+ $self->fail("write error: $!");
return push(@$inflight, $oid, $cb, $arg);
}
}
$modified || time;
}
+# Build one manifest entry (a hashref) for this git repository.
+#
+# for grokmirror, which doesn't read gitweb.description
+# templates/hooks--update.sample and git-multimail in git.git
+# only match "Unnamed repository", not the full contents of
+# templates/this--description in git.git
+#
+# $epoch        - epoch number; when defined, a description starting
+#                 with "Unnamed repository" is replaced by
+#                 "$default_desc [epoch $epoch]"
+# $default_desc - text used for that replacement
+#
+# Returns nothing if "git show-ref" exits non-zero.  Otherwise returns
+# { fingerprint, reference, modified, owner, description } where
+# fingerprint is the SHA-1 hex digest of the show-ref output, owner is
+# undef when gitweb.owner is empty, and reference is only set when
+# objects/info/alternates contains exactly one line.
+sub manifest_entry {
+	my ($self, $epoch, $default_desc) = @_;
+	my ($fh, $pid) = $self->popen('show-ref');
+	my $dig = Digest::SHA->new(1);
+	while (read($fh, my $buf, 65536)) {
+		$dig->add($buf);
+	}
+	close $fh;
+	waitpid($pid, 0);
+	return if $?; # empty, uninitialized git repo
+	my $git_dir = $self->{git_dir};
+	my $ent = {
+		fingerprint => $dig->hexdigest,
+		reference => undef,
+		modified => modified($self),
+	};
+	chomp(my $owner = $self->qx('config', 'gitweb.owner'));
+	utf8::decode($owner);
+	$ent->{owner} = $owner eq '' ? undef : $owner;
+	my $desc = '';
+	if (open($fh, '<', "$git_dir/description")) {
+		local $/ = "\n";
+		# readline returns undef for an empty file; keep $desc
+		# defined so chomp and the comparisons below don't warn
+		chomp($desc = (<$fh> // ''));
+		utf8::decode($desc);
+	}
+	$desc = 'Unnamed repository' if $desc eq '';
+	if (defined $epoch && $desc =~ /\AUnnamed repository/) {
+		$desc = "$default_desc [epoch $epoch]";
+	}
+	$ent->{description} = $desc;
+	if (open($fh, '<', "$git_dir/objects/info/alternates")) {
+		# n.b.: GitPython doesn't seem to handle comments or C-quoted
+		# strings like native git does; and we don't for now, either.
+		local $/ = "\n";
+		chomp(my @alt = <$fh>);
+
+		# grokmirror only supports 1 alternate for "reference",
+		if (scalar(@alt) == 1) {
+			my $objdir = "$git_dir/objects";
+			my $ref = File::Spec->rel2abs($alt[0], $objdir);
+			# strip the last path component, leaving the
+			# directory which contains it
+			$ref =~ s!/[^/]+/?\z!!;
+			$ent->{reference} = $ref;
+		}
+	}
+	$ent;
+}
+
1;
__END__
=pod
#
# internal class used by PublicInbox::Git + PublicInbox::DS
# This parses the output pipe of "git cat-file --batch"
-#
-# Note: this does NOT set the non-blocking flag, we expect `git cat-file'
-# to be a local process, and git won't start writing a blob until it's
-# fully read. So minimize context switching and read as much as possible
-# and avoid holding a buffer in our heap any longer than it has to live.
package PublicInbox::GitAsyncCat;
use strict;
use parent qw(PublicInbox::DS Exporter);
+use POSIX qw(WNOHANG);
use PublicInbox::Syscall qw(EPOLLIN EPOLLET);
-our @EXPORT = qw(git_async_cat);
-
-sub _add {
- my ($class, $git) = @_;
- $git->batch_prepare;
- my $self = bless { git => $git }, $class;
- $self->SUPER::new($git->{in}, EPOLLIN|EPOLLET);
- \undef; # this is a true ref()
+our @EXPORT = qw(git_async_cat git_async_prefetch);
+use PublicInbox::Git ();
+
+our $GCF2C; # singleton PublicInbox::Gcf2Client
+
+# Detach from the git-like object (if still attached), pass it to
+# PublicInbox::Git::cat_async_abort, then run the parent class close.
+sub close {
+	my ($self) = @_;
+	my $gitish = delete $self->{gitish};
+	PublicInbox::Git::cat_async_abort($gitish) if $gitish;
+	$self->SUPER::close; # PublicInbox::DS::close
}
+# Called when the read side of the cat-file pipe has something to do.
+# Does nothing once {gitish} has been removed (see close()).  Closes this
+# watcher when the {in} handle of $gitish is no longer the socket we
+# registered; the 0 and 1 defaults make the comparison fail whenever
+# either side is missing.  Otherwise runs one cat_async_step on pending
+# requests, or reaps the child if it already exited.
sub event_step {
my ($self) = @_;
- my $git = $self->{git};
- return $self->close if ($git->{in} // 0) != ($self->{sock} // 1);
- my $inflight = $git->{inflight};
+ my $gitish = $self->{gitish} or return;
+ return $self->close if ($gitish->{in} // 0) != ($self->{sock} // 1);
+ my $inflight = $gitish->{inflight};
if ($inflight && @$inflight) {
- $git->cat_async_step($inflight);
- $self->requeue if @$inflight || exists $git->{cat_rbuf};
+ $gitish->cat_async_step($inflight);
+
+ # child death?
+ if (($gitish->{in} // 0) != ($self->{sock} // 1)) {
+ $self->close;
+ } elsif (@$inflight || exists $gitish->{cat_rbuf}) {
+ # ok, more to do, requeue for fairness
+ $self->requeue;
+ }
+ } elsif ((my $pid = waitpid($gitish->{pid}, WNOHANG)) > 0) {
+ # May happen if the child process is killed by a BOFH
+ # (or segfaults)
+ delete $gitish->{pid};
+ warn "E: gitish $pid exited with \$?=$?\n";
+ $self->close;
}
}
+# Queues an asynchronous cat-file request for $oid; $cb and $arg are
+# passed through to ->cat_async unchanged.
+# The request goes to the shared Gcf2Client ($GCF2C) when it can be loaded,
+# in which case the git_dir of $git is appended to the request; otherwise
+# it goes to $git itself.  The first request on a given object also
+# registers its {in} handle (made non-blocking) with the event loop.
sub git_async_cat ($$$$) {
my ($git, $oid, $cb, $arg) = @_;
- $git->cat_async($oid, $cb, $arg);
- $git->{async_cat} //= _add(__PACKAGE__, $git);
+ my $gitish = $GCF2C //= eval {
+ require PublicInbox::Gcf2;
+ require PublicInbox::Gcf2Client;
+ PublicInbox::Gcf2Client::new();
+ } // 0; # 0: do not retry if libgit2 or Inline::C are missing
+ if ($gitish) { # Gcf2 active, {inflight} may be unset due to errors
+ # replace the shared client when its {inflight} is gone
+ $GCF2C->{inflight} or
+ $gitish = $GCF2C = PublicInbox::Gcf2Client::new();
+ $oid .= " $git->{git_dir}";
+ } else {
+ $gitish = $git;
+ }
+ $gitish->cat_async($oid, $cb, $arg);
+ $gitish->{async_cat} //= do {
+ # read-only end of pipe (Gcf2Client is write-only end)
+ my $self = bless { gitish => $gitish }, __PACKAGE__;
+ $gitish->{in}->blocking(0);
+ $self->SUPER::new($gitish->{in}, EPOLLIN|EPOLLET);
+ \undef; # this is a true ref()
+ };
+}
+
+# this is safe to call inside $cb, but not guaranteed to enqueue
+# returns true if successful, undef if not.
+sub git_async_prefetch {
+	my ($git, $oid, $cb, $arg) = @_;
+
+	# shared Gcf2 client: only enqueue once it is registered with the
+	# event loop and has no pending write buffer
+	if ($GCF2C) {
+		return undef unless $GCF2C->{async_cat} && !$GCF2C->{wbuf};
+		return $GCF2C->cat_async($oid . " $git->{git_dir}", $cb, $arg);
+	}
+
+	# per-repo git process: it must already be running asynchronously
+	my $inflight = $git->{async_cat} ? $git->{inflight} : undef;
+	return undef unless $inflight;
+
+	# we could use MAX_INFLIGHT here w/o the halving,
+	# but let's not allow one client to monopolize a git process
+	return undef if @$inflight >= int(PublicInbox::Git::MAX_INFLIGHT/2);
+
+	print { $git->{out} } $oid, "\n" or $git->fail("write error: $!");
+	return push(@$inflight, $oid, $cb, $arg);
}
1;
if (!defined($oid)) {
# it's possible to have TOCTOU if an admin runs
# public-inbox-(edit|purge), just move onto the next message
- warn "E: $smsg->{blob} missing in $self->{-inbox}->{inboxdir}\n";
+ warn "E: $smsg->{blob} missing in $self->{ibx}->{inboxdir}\n";
return $http->next_step($self->can('async_next'));
}
$smsg->{blob} eq $oid or bail($self, "BUG: $smsg->{blob} != $oid");
sub smsg_blob {
my ($self, $smsg) = @_;
- git_async_cat($self->{-inbox}->git, $smsg->{blob},
+ git_async_cat($self->{ibx}->git, $smsg->{blob},
\&async_blob_cb, $self);
}
}
my $pre;
if (!$self->{wbuf} && (my $nxt = $msgs->[0])) {
- $pre = $ibx->git->async_prefetch($nxt->{blob},
+ $pre = git_async_prefetch($ibx->git, $nxt->{blob},
\&fetch_blob_cb, $fetch_arg);
}
fetch_run_ops($self, $smsg, $bref, $ops, $partial);
1; # more
}
-sub parse_query ($$) {
+sub parse_imap_query ($$) {
my ($self, $query) = @_;
my $q = PublicInbox::IMAPsearchqp::parse($self, $query);
if (ref($q)) {
$q;
}
-sub refill_xap ($$$$) {
- my ($self, $uids, $range_info, $q) = @_;
- my ($beg, $end) = @$range_info;
- my $srch = $self->{ibx}->search;
- my $opt = { mset => 2, limit => 1000 };
- my $mset = $srch->mset("$q uid:$beg..$end", $opt);
- @$uids = @{$srch->mset_to_artnums($mset)};
- if (@$uids) {
- $range_info->[0] = $uids->[-1] + 1; # update $beg
- return; # possibly more
- }
- 0; # all done
-}
-
-sub search_xap_range { # long_response
- my ($self, $tag, $q, $range_info, $want_msn) = @_;
- my $uids = [];
- if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) {
- $err ||= 'OK Search done';
- $self->write("\r\n$tag $err\r\n");
- return;
- }
- msn_convert($self, $uids) if $want_msn;
- $self->msg_more(join(' ', '', @$uids));
- 1; # more
-}
-
sub search_common {
my ($self, $tag, $query, $want_msn) = @_;
my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n";
- my $q = parse_query($self, $query);
+ my $q = parse_imap_query($self, $query);
return "$tag $q\r\n" if !ref($q);
my ($sql, $range_info) = delete @$q{qw(sql range_info)};
if (!scalar(keys %$q)) { # overview.sqlite3
long_response($self, \&search_uid_range,
$tag, $sql, $range_info, $want_msn);
} elsif ($q = $q->{xap}) {
- $self->{ibx}->search or
+ my $srch = $self->{ibx}->isrch or
return "$tag BAD search not available for mailbox\r\n";
- $self->msg_more('* SEARCH');
- long_response($self, \&search_xap_range,
- $tag, $q, $range_info, $want_msn);
+ my $opt = {
+ relevance => -1,
+ limit => UID_SLICE,
+ uid_range => $range_info
+ };
+ my $mset = $srch->mset($q, $opt);
+ my $uids = $srch->mset_to_artnums($mset, $opt);
+ msn_convert($self, $uids) if $want_msn;
+ "* SEARCH @$uids\r\n$tag OK Search done\r\n";
} else {
"$tag BAD Error\r\n";
}
err => \*STDERR,
out => \*STDOUT,
# accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... }
- # pi_config => PublicInbox::Config
+ # pi_cfg => PublicInbox::Config
# idler => PublicInbox::InboxIdle
}, $class;
}
-sub imapd_refresh_ibx { # pi_config->each_inbox cb
+sub imapd_refresh_ibx { # pi_cfg->each_inbox cb
my ($ibx, $imapd) = @_;
my $ngname = $ibx->{newsgroup} or return;
- if (ref $ngname) {
- warn 'multiple newsgroups not supported: '.
- join(', ', @$ngname). "\n";
- return;
- } elsif ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! ||
- $ngname =~ /\.[0-9]+\z/) {
+
+ # We require lower-case since IMAP mailbox names are
+ # case-insensitive (but -nntpd matches INN in being
+ # case-sensitive)
+ if ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! ||
+ # don't confuse with 50K slices
+ $ngname =~ /\.[0-9]+\z/) {
warn "mailbox name invalid: newsgroup=`$ngname'\n";
return;
}
$ibx->over or return;
$ibx->{over} = undef;
- my $mm = $ibx->mm or return;
- $ibx->{mm} = undef;
# RFC 3501 2.3.1.1 - "A good UIDVALIDITY value to use in
# this case is a 32-bit representation of the creation
# date/time of the mailbox"
- defined($ibx->{uidvalidity} = $mm->created_at) or return;
- PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max // 0);
+ eval { $ibx->uidvalidity };
+ my $mm = delete($ibx->{mm}) or return;
+ defined($ibx->{uidvalidity}) or return;
+ PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max);
# preload to avoid fragmentation:
$ibx->description;
}
sub imapd_refresh_finalize {
- my ($imapd, $pi_config) = @_;
+ my ($imapd, $pi_cfg) = @_;
my $mailboxes;
if (my $next = delete $imapd->{imapd_next}) {
$imapd->{mailboxes} = delete $next->{mailboxes};
qq[* LIST (\\Has${no}Children) "." $u\r\n]
} keys %$mailboxes
];
- $imapd->{pi_config} = $pi_config;
+ $imapd->{pi_cfg} = $pi_cfg;
if (my $idler = $imapd->{idler}) {
- $idler->refresh($pi_config);
+ $idler->refresh($pi_cfg);
}
}
-sub imapd_refresh_step { # pi_config->iterate_start cb
- my ($pi_config, $section, $imapd) = @_;
+sub imapd_refresh_step { # pi_cfg->iterate_start cb
+ my ($pi_cfg, $section, $imapd) = @_;
if (defined($section)) {
return if $section !~ m!\Apublicinbox\.([^/]+)\z!;
- my $ibx = $pi_config->lookup_name($1) or return;
+ my $ibx = $pi_cfg->lookup_name($1) or return;
imapd_refresh_ibx($ibx, $imapd->{imapd_next});
} else { # undef == "EOF"
- imapd_refresh_finalize($imapd, $pi_config);
+ imapd_refresh_finalize($imapd, $pi_cfg);
}
}
sub refresh_groups {
my ($self, $sig) = @_;
- my $pi_config = PublicInbox::Config->new;
+ my $pi_cfg = PublicInbox::Config->new;
if ($sig) { # SIGHUP is handled through the event loop
$self->{imapd_next} = { dummies => {}, mailboxes => {} };
- my $iter = PublicInbox::ConfigIter->new($pi_config,
+ my $iter = PublicInbox::ConfigIter->new($pi_cfg,
\&imapd_refresh_step, $self);
$iter->event_step;
} else { # initial start is synchronous
$self->{dummies} = {};
- $pi_config->each_inbox(\&imapd_refresh_ibx, $self);
- imapd_refresh_finalize($self, $pi_config);
+ $pi_cfg->each_inbox(\&imapd_refresh_ibx, $self);
+ imapd_refresh_finalize($self, $pi_cfg);
}
}
sub idler_start {
- $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config});
+ $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg});
}
1;
use v5.10.1;
use strict;
use Fcntl qw(:seek);
-use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*';
+use constant PACK_FMT => eval { pack('Q', 1) } ? 'A1QQH*H*' : 'A1IIH*H*';
# start off in write-only mode
sub new {
open(my $io, '+>', undef) or die "open: $!";
+ # latest_cmt is still useful when the newest revision is a `d'(elete),
+ # otherwise we favor $sync->{latest_cmt} for checkpoints and {quit}
bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__
}
# file_char = [d|m]
sub push_rec {
- my ($self, $file_char, $at, $ct, $blob_oid) = @_;
- my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid);
- $self->{rec_size} //= length($rec);
+ my ($self, $file_char, $at, $ct, $blob_oid, $cmt_oid) = @_;
+ my $rec = pack(PACK_FMT, $file_char, $at, $ct, $blob_oid, $cmt_oid);
+ $self->{unpack_fmt} //= do {
+ my $len = length($cmt_oid);
+ my $fmt = PACK_FMT;
+ $fmt =~ s/H\*/H$len/g;
+ $self->{rec_size} = length($rec);
+ $fmt;
+ };
print { $self->{wr} } $rec or die "print: $!";
$self->{tot_size} += length($rec);
}
my $r = read($io, my $buf, $sz);
defined($r) or die "read: $!";
$r == $sz or die "read($r != $sz)";
- unpack(FMT, $buf);
+ unpack($self->{unpack_fmt}, $buf);
}
1;
return ($self->{in}, $self->{out}) if $self->{pid};
- my (@ret, $out_r, $out_w);
+ my ($in_r, $pid, $out_r, $out_w);
pipe($out_r, $out_w) or die "pipe failed: $!";
$self->lock_acquire;
my ($git, $ref) = @$self{qw(git ref)};
local $/ = "\n";
chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref));
+ die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?;
if ($self->{path_type} ne '2/38' && $self->{tip}) {
local $/ = "\0";
my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref);
+ die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?;
chomp @t;
$self->{-tree} = { map { $_ => 1 } @t };
}
my @cmd = ('git', "--git-dir=$git->{git_dir}",
qw(fast-import --quiet --done --date-format=raw));
- my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r });
+ ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r });
$out_w->autoflush(1);
$self->{in} = $in_r;
$self->{out} = $out_w;
$self->{pid} = $pid;
$self->{nchg} = 0;
- @ret = ($in_r, $out_w);
};
if ($@) {
$self->lock_release;
die $@;
}
- @ret;
+ ($in_r, $out_w);
}
sub wfail () { die "write to fast-import failed: $!" }
# Represents a public-inbox (which may have multiple mailing addresses)
package PublicInbox::Inbox;
use strict;
-use warnings;
use PublicInbox::Git;
use PublicInbox::MID qw(mid2path);
use PublicInbox::Eml;
+use List::Util qw(max);
# Long-running "git-cat-file --batch" processes won't notice
# unlinked packs, so we need to restart those processes occasionally.
$CLEANUP->{"$self"} = $self;
}
-sub _set_uint ($$$) {
- my ($opts, $field, $default) = @_;
- my $val = $opts->{$field};
- if (defined $val) {
- $val = $val->[-1] if ref($val) eq 'ARRAY';
- $val = undef if $val !~ /\A[0-9]+\z/;
- }
- $opts->{$field} = $val || $default;
-}
-
sub _set_limiter ($$$) {
- my ($self, $pi_config, $pfx) = @_;
+ my ($self, $pi_cfg, $pfx) = @_;
my $lkey = "-${pfx}_limiter";
$self->{$lkey} ||= do {
# full key is: publicinbox.$NAME.httpbackendmax
require PublicInbox::Qspawn;
$lim = PublicInbox::Qspawn::Limiter->new($val);
} elsif ($val =~ /\A[a-z][a-z0-9]*\z/) {
- $lim = $pi_config->limiter($val);
+ $lim = $pi_cfg->limiter($val);
warn "$mkey limiter=$val not found\n" if !$lim;
} else {
warn "$mkey limiter=$val not understood\n";
my $v = $opts->{address} ||= [ 'public-inbox@example.com' ];
my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v;
$opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost';
- my $pi_config = delete $opts->{-pi_config};
- _set_limiter($opts, $pi_config, 'httpbackend');
- _set_uint($opts, 'feedmax', 25);
- $opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'};
- my $dir = $opts->{inboxdir};
- if (defined $dir && -f "$dir/inbox.lock") {
- $opts->{version} = 2;
+ my $pi_cfg = delete $opts->{-pi_cfg};
+ _set_limiter($opts, $pi_cfg, 'httpbackend');
+ my $fmax = $opts->{feedmax};
+ if (defined($fmax) && $fmax =~ /\A[0-9]+\z/) {
+ $opts->{feedmax} += 0;
+ } else {
+ delete $opts->{feedmax};
}
+ $opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'};
# allow any combination of multi-line or comma-delimited hide entries
my $hide = {};
bless $opts, $class;
}
-sub version { $_[0]->{version} // 1 }
+# Inbox format version: 2 when $inboxdir/inbox.lock is a plain file,
+# 1 otherwise.  The answer is cached in {version}.
+sub version {
+	my ($self) = @_;
+	$self->{version} //= do {
+		(-f "$self->{inboxdir}/inbox.lock") ? 2 : 1;
+	};
+}
sub git_epoch {
- my ($self, $epoch) = @_;
- $self->version == 2 or return;
+ my ($self, $epoch) = @_; # v2-only, callers always supply $epoch
$self->{"$epoch.git"} ||= do {
my $git_dir = "$self->{inboxdir}/git/$epoch.git";
+ return unless -d $git_dir;
my $g = PublicInbox::Git->new($git_dir);
$g->{-httpbackend_limiter} = $self->{-httpbackend_limiter};
- # no cleanup needed, we never cat-file off this, only clone
+ # caller must manually cleanup when done
$g;
};
}
my ($self) = @_;
return if $self->version < 2;
my $cur = $self->{-max_git_epoch};
- my $changed = git($self)->alternates_changed;
- if (!defined($cur) || $changed) {
+ my $changed;
+ if (!defined($cur) || ($changed = git($self)->alternates_changed)) {
git_cleanup($self) if $changed;
my $gits = "$self->{inboxdir}/git";
if (opendir my $dh, $gits) {
- my $max = -1;
- while (defined(my $git_dir = readdir($dh))) {
- $git_dir =~ m!\A([0-9]+)\.git\z! or next;
- $max = $1 if $1 > $max;
- }
- $cur = $self->{-max_git_epoch} = $max if $max >= 0;
- } else {
- warn "opendir $gits failed: $!\n";
+ my $max = max(map {
+ substr($_, 0, -4) + 0; # drop ".git" suffix
+ } grep(/\A[0-9]+\.git\z/, readdir($dh))) // return;
+ $cur = $self->{-max_git_epoch} = $max;
}
}
$cur;
};
}
-sub search ($;$$) {
- my ($self, $over_only, $ctx) = @_;
- my $srch = $self->{search} ||= eval {
+sub search {
+ my ($self) = @_;
+ my $srch = $self->{search} //= eval {
_cleanup_later($self);
require PublicInbox::Search;
PublicInbox::Search->new($self);
};
- ($over_only || eval { $srch->xdb }) ? $srch : do {
- $ctx and $ctx->{env}->{'psgi.errors'}->print(<<EOF);
-`$self->{name}' search went away unexpectedly
-EOF
- undef;
- };
+ (eval { $srch->xdb }) ? $srch : undef;
}
+# isrch is preferred for read-only interfaces if available since it
+# reduces kernel cache and FD overhead
+sub isrch {
+	my ($self) = @_;
+	# use {isrch} when it is defined, otherwise the regular ->search
+	$self->{isrch} // search($self);
+}
+
sub over {
$_[0]->{over} //= eval {
- my $srch = search($_[0], 1) or return;
+ my $srch = $_[0]->{search} //= eval {
+ _cleanup_later($_[0]);
+ require PublicInbox::Search;
+ PublicInbox::Search->new($_[0]);
+ };
my $over = PublicInbox::Over->new("$srch->{xpfx}/over.sqlite3");
$over->dbh; # may fail
$over;
};
}
+
sub try_cat {
my ($path) = @_;
- my $rv = '';
- if (open(my $fh, '<', $path)) {
- local $/;
- $rv = <$fh>;
- }
- $rv;
+ open(my $fh, '<', $path) or return '';
+ local $/;
+ <$fh> // '';
+}
+
+# Reads the description file at the given path and normalizes it: one
+# trailing newline removed, decoded as UTF-8, every whitespace run
+# squeezed to a single space.  Returns undef when the result is empty.
+sub cat_desc ($) {
+	my ($path) = @_;
+	my $text = try_cat($path);
+	{
+		local $/ = "\n"; # chomp depends on $/
+		chomp($text);
+	}
+	utf8::decode($text);
+	$text =~ s/\s+/ /smg;
+	return undef if $text eq '';
+	$text;
}
sub description {
my ($self) = @_;
- ($self->{description} //= do {
- my $desc = try_cat("$self->{inboxdir}/description");
- local $/ = "\n";
- chomp $desc;
- utf8::decode($desc);
- $desc =~ s/\s+/ /smg;
- $desc eq '' ? undef : $desc;
- }) // '($INBOX_DIR/description missing)';
+ ($self->{description} //= cat_desc("$self->{inboxdir}/description")) //
+ '($INBOX_DIR/description missing)';
}
sub cloneurl {
return unless defined $smsg;
defined(my $blob = $smsg->{blob}) or return;
- git($self)->cat_file($blob);
+ $self->git->cat_file($blob);
}
sub smsg_eml {
$eml;
}
-sub mid2num($$) {
- my ($self, $mid) = @_;
- my $mm = mm($self) or return;
- $mm->num_for($mid);
-}
-
sub smsg_by_mid ($$) {
my ($self, $mid) = @_;
- my $over = over($self) or return;
- # favor the Message-ID we used for the NNTP article number:
- defined(my $num = mid2num($self, $mid)) or return;
- my $smsg = $over->get_art($num) or return;
- PublicInbox::Smsg::psgi_cull($smsg);
+ my $over = $self->over or return;
+ my $smsg;
+ if (my $mm = $self->mm) {
+ # favor the Message-ID we used for the NNTP article number:
+ defined(my $num = $mm->num_for($mid)) or return;
+ $smsg = $over->get_art($num);
+ } else {
+ my ($id, $prev);
+ $smsg = $over->next_by_mid($mid, \$id, \$prev);
+ }
+ $smsg ? PublicInbox::Smsg::psgi_cull($smsg) : undef;
}
sub msg_by_mid ($$) {
my ($self, $mid) = @_;
-
- over($self) or
- return msg_by_path($self, mid2path($mid));
-
my $smsg = smsg_by_mid($self, $mid);
- $smsg ? msg_by_smsg($self, $smsg) : undef;
+ $smsg ? msg_by_smsg($self, $smsg) : msg_by_path($self, mid2path($mid));
}
sub recent {
my ($self, $opts, $after, $before) = @_;
- over($self)->recent($opts, $after, $before);
+ $self->over->recent($opts, $after, $before);
}
sub modified {
my ($self) = @_;
- if (my $over = over($self)) {
+ if (my $over = $self->over) {
my $msgs = $over->recent({limit => 1});
if (my $smsg = $msgs->[0]) {
return $smsg->{ts};
}
}
+# Cached {uidvalidity}; filled from ->mm->created_at on first use.
+# Failures inside the lookup are swallowed by eval and leave it undef.
+sub uidvalidity {
+	my ($self) = @_;
+	$self->{uidvalidity} //= eval { $self->mm->created_at };
+}
+
+# Key identifying this inbox: {newsgroup} when defined, else {inboxdir}.
+sub eidx_key {
+	my ($self) = @_;
+	$self->{newsgroup} // $self->{inboxdir};
+}
+
1;
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# fields:
-# pi_config: PublicInbox::Config ref
# inot: Linux::Inotify2-like object
# pathmap => { inboxdir => [ ibx, watch1, watch2, watch3... ] } mapping
package PublicInbox::InboxIdle;
use strict;
use parent qw(PublicInbox::DS);
-use Cwd qw(abs_path);
use PublicInbox::Syscall qw(EPOLLIN EPOLLET);
my $IN_MODIFY = 0x02; # match Linux inotify
my $ino_cls;
sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback
my ($ibx, $self) = @_;
- my $dir = abs_path($ibx->{inboxdir});
- if (!defined($dir)) {
- warn "W: $ibx->{inboxdir} not watched: $!\n";
- return;
- }
+ my $dir = $ibx->{inboxdir};
my $inot = $self->{inot};
my $cur = $self->{pathmap}->{$dir} //= [];
my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock');
}
sub refresh {
- my ($self, $pi_config) = @_;
- $pi_config->each_inbox(\&in2_arm, $self);
+ my ($self, $pi_cfg) = @_;
+ $pi_cfg->each_inbox(\&in2_arm, $self);
}
+# internal API for ease-of-use
+sub watch_inbox {
+	my ($self, $ibx) = @_;
+	# method-style wrapper; in2_arm takes ($ibx, $self)
+	in2_arm($ibx, $self);
+};
+
sub new {
- my ($class, $pi_config) = @_;
+ my ($class, $pi_cfg) = @_;
my $self = bless {}, $class;
my $inot;
if ($ino_cls) {
$self->{inot} = $inot;
$self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...]
$self->{on_unlock} = {}; # lock path => ibx
- refresh($self, $pi_config);
+ refresh($self, $pi_cfg) if $pi_cfg;
PublicInbox::FakeInotify::poll_once($self) if !$ino_cls;
$self;
}
my @events = $self->{inot}->read; # Linux::Inotify2::read
my $on_unlock = $self->{on_unlock};
for my $ev (@events) {
- if (my $ibx = $on_unlock->{$ev->fullname}) {
+ my $fn = $ev->fullname // next; # cancelled
+ if (my $ibx = $on_unlock->{$fn}) {
$ibx->on_unlock;
}
}
require PublicInbox::Msgmap;
my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create
$sidx->begin_txn_lazy;
+ my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1);
if (defined $skip_artnum) {
- my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1);
$mm->{dbh}->begin_work;
$mm->skip_artnum($skip_artnum);
$mm->{dbh}->commit;
}
+ undef $mm; # ->created_at set
$sidx->commit_txn_lazy;
} else {
open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or
if ($self->version == 1) {
my $dir = assert_usable_dir($self);
PublicInbox::Import::init_bare($dir);
- $self->umask_prepare;
$self->with_umask(\&_init_v1, $self, $skip_artnum);
} else {
my $v2w = importer($self);
$im->done;
}
- my @args = (-inbox => $self);
+ my @args = (ibx => $self);
# basic line splitting, only
# Perhaps we can have proper quote splitting one day...
($f, @args) = split(/\s+/, $f) if $f =~ /\s+/;
sub with_umask {
my ($self, $cb, @arg) = @_;
- my $old = umask $self->{umask};
+ my $old = umask($self->{umask} //= umask_prepare($self));
my $rv = eval { $cb->(@arg) };
my $err = $@;
umask $old;
sub umask_prepare {
my ($self) = @_;
my $perm = _git_config_perm($self);
- my $umask = _umask_for($perm);
- $self->{umask} = $umask;
+ _umask_for($perm);
}
sub cleanup ($) {
# PublicInbox::MsgTime
|| $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/
|| $s =~ /^bad Date: .+? in /
+ # Encode::Unicode::UTF7
+ || $s =~ /^Bad UTF7 data escape at /
}
# this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..."
sub warn_ignore_cb {
- my $cb = $SIG{__WARN__} // sub { print STDERR @_ };
+ my $cb = $SIG{__WARN__} // \&CORE::warn;
sub {
return if warn_ignore(@_);
$cb->(@_);
}
}
+# v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove
+# Stores ->max_git_epoch into the scalar referenced by $max and returns
+# the path of that epoch's git directory, or undef when there is no epoch.
+sub git_dir_latest {
+	my ($self, $max) = @_;
+	$$max = $self->max_git_epoch;
+	return undef unless defined $$max;
+	"$self->{inboxdir}/git/$$max.git";
+}
+
1;
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Provides everything the PublicInbox::Search object does;
+# but uses global ExtSearch (->ALL) with an eidx_key query to
+# emulate per-Inbox search using ->ALL.
+package PublicInbox::Isearch;
+use strict;
+use v5.10.1;
+use PublicInbox::ExtSearch;
+use PublicInbox::Search;
+
+# Constructor; the class argument is ignored and the object is always
+# blessed into this package.  Stores $es and the eidx_key of $ibx.
+sub new {
+	my (undef, $ibx, $es) = @_;
+	my %self = (es => $es, eidx_key => $ibx->eidx_key);
+	bless \%self, __PACKAGE__;
+}
+
+# Looks up the ibx_id stored for our eidx_key in the `inboxes' table of
+# the ExtSearch over DB.  Dies when no row matches.
+sub _ibx_id ($) {
+ my ($self) = @_;
+ my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
+
+ $sth->execute($self->{eidx_key});
+ # scalar context: the single selected column, or undef without a row
+ $sth->fetchrow_array //
+ die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n";
+}
+
+
+# Runs $str against the ExtSearch object, limited to our eidx_key.
+#   $opt - optional hashref of search options; it is copied, so the
+#          caller's hash is never modified.
+# When $opt->{uid_range} ([$beg, $end] in this inbox's article numbers) is
+# given, it is translated into the smallest and largest docid in xref3
+# for that range before being handed to ExtSearch.  If both bounds exist,
+# {limit} becomes the size of that docid span; otherwise the range falls
+# back to 0..0xffffffff.
+# Returns whatever ExtSearch->mset returns.
+sub mset {
+	my ($self, $str, $opt) = @_;
+	my %opt = $opt ? %$opt : ();
+	$opt{eidx_key} = $self->{eidx_key};
+	if (my $uid_range = $opt{uid_range}) {
+		my ($beg, $end) = @$uid_range;
+		my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
+		# use the ->over accessor like the other subs here; the raw
+		# {over} field is not guaranteed to be populated at this point
+		my $dbh = $self->{es}->over->dbh;
+		my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+		$sth->execute($ibx_id, $beg, $end);
+		my @r = ($sth->fetchrow_array);
+
+		$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+		$sth->execute($ibx_id, $beg, $end);
+		$r[1] = $sth->fetchrow_array;
+		if (defined($r[1]) && defined($r[0])) {
+			$opt{limit} = $r[1] - $r[0] + 1;
+		} else {
+			$r[1] //= 0xffffffff;
+			$r[0] //= 0;
+		}
+		$opt{uid_range} = \@r;
+	}
+	$self->{es}->mset($str, \%opt);
+}
+
+# Converts the docids of an ExtSearch $mset into article numbers (xnum)
+# of this inbox, using the xref3 table.  Returns an arrayref.
+# With $opt->{relevance} == -1 the result is sorted by ascending xnum and,
+# if $opt->{uid_range} is set, limited to that xnum range.  Otherwise the
+# result follows the docid order of the $mset; docids without a row for
+# this inbox are reported with a warning and left out.
+sub mset_to_artnums {
+ my ($self, $mset, $opt) = @_;
+ my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset);
+ my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
+ # one placeholder per docid for the IN (...) lists below
+ my $qmarks = join(',', map { '?' } @$docids);
+ if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING
+ my $range = '';
+ my @r;
+ if (my $r = $opt->{uid_range}) {
+ $range = 'AND xnum >= ? AND xnum <= ?';
+ @r = @$r;
+ }
+ my $rows = $self->{es}->over->dbh->
+ selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r);
+SELECT xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) $range
+ORDER BY xnum ASC
+
+ return [ map { $_->[0] } @$rows ];
+ }
+
+ my $rows = $self->{es}->over->dbh->
+ selectall_arrayref(<<"", undef, $ibx_id, @$docids);
+SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks)
+
+ # docid => position in the mset, so rows can be put back in mset order
+ my $i = -1;
+ my %order = map { $_ => ++$i } @$docids;
+ my @xnums;
+ for my $row (@$rows) { # @row = ($docid, $xnum)
+ # delete() so that only the first row seen for a docid is used
+ my $idx = delete($order{$row->[0]}) // next;
+ $xnums[$idx] = $row->[1];
+ }
+ # anything left in %order had no row for this inbox
+ if (scalar keys %order) {
+ warn "W: $self->{es}->{topdir} #",
+ join(', ', sort { $a <=> $b } keys %order),
+ " not mapped to `$self->{eidx_key}'\n";
+ warn "W: $self->{es}->{topdir} may need to be reindexed\n";
+ @xnums = grep { defined } @xnums;
+ }
+ \@xnums;
+}
+
+# Loads the messages of $mset from the over DB of the real inbox $ibx,
+# in the order given by mset_to_artnums.
+# In list context returns ($mset->get_matches_estimated, \@msgs); in
+# scalar context only \@msgs.  Article numbers that $ibx no longer has
+# are reported with a warning and left out of @msgs.
+sub mset_to_smsg {
+	my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx
+	my $xnums = mset_to_artnums($self, $mset);
+	my $i = -1;
+	my %order = map { $_ => ++$i } @$xnums;
+	my $unordered = $ibx->over->get_all(@$xnums);
+	my @msgs;
+	for my $smsg (@$unordered) {
+		my $idx = delete($order{$smsg->{num}}) // do {
+			warn "W: $ibx->{inboxdir} #$smsg->{num}\n";
+			next;
+		};
+		$msgs[$idx] = $smsg;
+	}
+	if (scalar keys %order) {
+		warn "W: $ibx->{inboxdir} #",
+			join(', ', sort { $a <=> $b } keys %order),
+			" no longer valid\n";
+		warn "W: $self->{es}->{topdir} may need to be reindexed\n";
+		# missing numbers leave undef slots; drop them so callers
+		# never get undef in place of a message (mset_to_artnums
+		# compacts its result the same way)
+		@msgs = grep { defined } @msgs;
+	}
+	wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs;
+}
+
+# always true for this class
+sub has_threadid {
+	1;
+}
+
+# forwards to the ->help method of the ExtSearch object
+sub help {
+	my ($self) = @_;
+	$self->{es}->help;
+}
+
+1;
}
sub inboxes_for_list_id ($$) {
- my ($klass, $config, $simple) = @_;
+ my ($klass, $pi_cfg, $simple) = @_;
# newer Email::Simple allows header_raw, as does Email::MIME:
my @list_ids = $simple->can('header_raw') ?
my @dests;
for my $list_id (@list_ids) {
$list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next;
- if (my $ibx = $config->lookup_list_id($1)) {
+ if (my $ibx = $pi_cfg->lookup_list_id($1)) {
push @dests, $ibx;
}
}
use strict;
use v5.10.1;
use parent qw(PublicInbox::WwwListing);
-use Digest::SHA ();
-use File::Spec ();
use bytes (); # length
-use PublicInbox::Inbox;
-use PublicInbox::Git;
+use PublicInbox::Config;
use IO::Compress::Gzip qw(gzip);
use HTTP::Date qw(time2str);
-*try_cat = \&PublicInbox::Inbox::try_cat;
-our $json;
-for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) {
- eval "require $mod" or next;
- # ->ascii encodes non-ASCII to "\uXXXX"
- $json = $mod->new->ascii(1) and last;
-}
+our $json = PublicInbox::Config::json();
# called by WwwListing
sub url_regexp {
$ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain');
}
-sub fingerprint ($) {
- my ($git) = @_;
- # TODO: convert to qspawn for fairness when there's
- # thousands of repos
- my ($fh, $pid) = $git->popen('show-ref');
- my $dig = Digest::SHA->new(1);
- while (read($fh, my $buf, 65536)) {
- $dig->add($buf);
- }
- close $fh;
- waitpid($pid, 0);
- return if $?; # empty, uninitialized git repo
- $dig->hexdigest;
+sub inject_entry ($$$;$) { # store $ent as the manifest entry for $url_path in $ctx
+ my ($ctx, $url_path, $ent, $git_dir) = @_;
+ $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path; # without $git_dir, the key is taken (and removed) from $ent->{git_dir}
+ my $modified = $ent->{modified};
+ $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0); # track the largest {modified} seen so far
+ $ctx->{manifest}->{$url_path} = $ent;
 }
sub manifest_add ($$;$$) {
my ($ctx, $ibx, $epoch, $default_desc) = @_;
my $url_path = "/$ibx->{name}";
- my $git_dir = $ibx->{inboxdir};
+ my $git;
if (defined $epoch) {
- $git_dir .= "/git/$epoch.git";
$url_path .= "/git/$epoch.git";
+ $git = $ibx->git_epoch($epoch) or return;
+ } else {
+ $git = $ibx->git;
}
- return unless -d $git_dir;
- my $git = PublicInbox::Git->new($git_dir);
- my $fingerprint = fingerprint($git) or return; # no empty repos
-
- chomp(my $owner = $git->qx('config', 'gitweb.owner'));
- chomp(my $desc = try_cat("$git_dir/description"));
- utf8::decode($owner);
- utf8::decode($desc);
- $owner = undef if $owner eq '';
- $desc = 'Unnamed repository' if $desc eq '';
-
- # templates/hooks--update.sample and git-multimail in git.git
- # only match "Unnamed repository", not the full contents of
- # templates/this--description in git.git
- if ($desc =~ /\AUnnamed repository/) {
- $desc = "$default_desc [epoch $epoch]" if defined($epoch);
- }
-
- my $reference;
- chomp(my $alt = try_cat("$git_dir/objects/info/alternates"));
- if ($alt) {
- # n.b.: GitPython doesn't seem to handle comments or C-quoted
- # strings like native git does; and we don't for now, either.
- my @alt = split(/\n+/, $alt);
-
- # grokmirror only supports 1 alternate for "reference",
- if (scalar(@alt) == 1) {
- my $objdir = "$git_dir/objects";
- $reference = File::Spec->rel2abs($alt[0], $objdir);
- $reference =~ s!/[^/]+/?\z!!; # basename
- }
- }
- $ctx->{-abs2urlpath}->{$git_dir} = $url_path;
- my $modified = $git->modified;
- if ($modified > ($ctx->{-mtime} // 0)) {
- $ctx->{-mtime} = $modified;
- }
- $ctx->{manifest}->{$url_path} = {
- owner => $owner,
- reference => $reference,
- description => $desc,
- modified => $modified,
- fingerprint => $fingerprint,
- };
+ my $ent = $git->manifest_entry($epoch, $default_desc) or return;
+ inject_entry($ctx, $url_path, $ent, $git->{git_dir});
}
-sub ibx_entry {
+sub slow_manifest_add ($$) {
my ($ctx, $ibx) = @_;
eval {
if (defined(my $max = $ibx->max_git_epoch)) {
manifest_add($ctx, $ibx);
}
};
+}
+
+# Fill $ctx with the manifest entries stored for $ibx in $ALL->misc.
+# The stored data is JSON mapping URL paths to entries; each entry is
+# handed to inject_entry.  Warns when nothing is stored for $ibx.
+sub eidx_manifest_add ($$$) {
+	my ($ctx, $ALL, $ibx) = @_;
+	my $raw = $ALL->misc->inbox_data($ibx) or return warn(
+		"E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n");
+	my $manifest = $json->decode($raw);
+	delete $manifest->{''}; # private
+	for my $url_path (keys %$manifest) {
+		inject_entry($ctx, $url_path, $manifest->{$url_path});
+	}
+}
+
+sub ibx_entry { # add the manifest entries of $ibx to $ctx
+ my ($ctx, $ibx) = @_;
+ my $ALL = $ctx->{www}->{pi_cfg}->ALL;
+ if ($ALL) { # use the data stored via $ALL->misc when ALL is available
+ eidx_manifest_add($ctx, $ALL, $ibx); # NOTE(review): $@ is checked below but no eval is visible in this sub; confirm it cannot be stale on this path
+ } else {
+ slow_manifest_add($ctx, $ibx);
+ }
 warn "E: $@" if $@;
 }
sub per_inbox {
my ($ctx) = @_;
- ibx_entry($ctx, $ctx->{-inbox});
+ # only one inbox, slow is probably OK
+ slow_manifest_add($ctx, $ctx->{ibx});
psgi_triple($ctx);
}
sub getline {
my ($ctx) = @_; # ctx
my $smsg = $ctx->{smsg} or return;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $eml = $ibx->smsg_eml($smsg) or return;
my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}});
- $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid}));
+ $ctx->zmore(msg_hdr($ctx, $eml));
if ($n) {
$ctx->translate(msg_body($eml));
} else { # last message
my ($ctx, $eml) = @_;
my $smsg = delete $ctx->{smsg};
# next message
- $ctx->{smsg} = $ctx->{-inbox}->over->next_by_mid(@{$ctx->{next_arg}});
+ $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}});
- $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid}));
+ $ctx->zmore(msg_hdr($ctx, $eml));
$ctx->{http_out}->write($ctx->translate(msg_body($eml)));
}
$fn =~ s/^re:\s+//i;
$fn = to_filename($fn) // 'no-subject';
my @hdr = ('Content-Type');
- if ($ctx->{-inbox}->{obfuscate}) {
+ if ($ctx->{ibx}->{obfuscate}) {
# obfuscation is stupid, but maybe scrapers are, too...
push @hdr, 'application/mbox';
$fn .= '.mbox';
# for rare cases where v1 inboxes aren't indexed w/ ->over at all
sub no_over_raw ($) {
my ($ctx) = @_;
- my $mref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return;
+ my $mref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return;
my $eml = PublicInbox::Eml->new($mref);
[ 200, res_hdr($ctx, $eml->header_str('Subject')),
- [ msg_hdr($ctx, $eml, $ctx->{mid}) . msg_body($eml) ] ]
+ [ msg_hdr($ctx, $eml) . msg_body($eml) ] ]
}
# /$INBOX/$MESSAGE_ID/raw
sub emit_raw {
my ($ctx) = @_;
- $ctx->{base_url} = $ctx->{-inbox}->base_url($ctx->{env});
- my $over = $ctx->{-inbox}->over or return no_over_raw($ctx);
+ $ctx->{base_url} = $ctx->{ibx}->base_url($ctx->{env});
+ my $over = $ctx->{ibx}->over or return no_over_raw($ctx);
my ($id, $prev);
my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ];
my $smsg = $ctx->{smsg} = $over->next_by_mid(@$mip) or return;
$ctx->psgi_response(200, $res_hdr);
}
-sub msg_hdr ($$;$) {
- my ($ctx, $eml, $mid) = @_;
+sub msg_hdr ($$) {
+ my ($ctx, $eml) = @_;
my $header_obj = $eml->header_obj;
# drop potentially confusing headers, ssoma already should've dropped
foreach my $d (qw(Lines Bytes Content-Length Status)) {
$header_obj->header_set($d);
}
- my $ibx = $ctx->{-inbox};
- my $base = $ctx->{base_url};
- $mid = $ctx->{mid} unless defined $mid;
- $mid = mid_escape($mid);
- my @append = (
- 'Archived-At', "<$base$mid/>",
- 'List-Archive', "<$base>",
- 'List-Post', "<mailto:$ibx->{-primary_address}>",
- );
my $crlf = $header_obj->crlf;
my $buf = $header_obj->as_string;
# fixup old bug from import (pre-a0c07cba0e5d8b6a)
$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
- $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf;
-
- for (my $i = 0; $i < @append; $i += 2) {
- my $k = $append[$i];
- my $v = $append[$i + 1];
- my @v = $header_obj->header_raw($k);
- foreach (@v) {
- if ($v eq $_) {
- $v = undef;
- last;
- }
- }
- $buf .= "$k: $v$crlf" if defined $v;
- }
- $buf .= $crlf;
+ "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . $crlf;
}
sub msg_body ($) {
sub mbox_all_ids {
my ($ctx) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $prev = 0;
my $mm = $ctx->{mm} = $ibx->mm;
my $ids = $mm->ids_after(\$prev) or return
PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all');
}
+# Warn that $what of the current inbox disappeared and hand back undef,
+# for use as `... or return gone($ctx, 'over')'.
+sub gone ($$) {
+	my $ctx = shift;
+	my $what = shift;
+	warn "W: `$ctx->{ibx}->{inboxdir}' $what went away unexpectedly\n";
+	return undef; # callers need a false scalar, not an empty list
+}
+
sub results_cb {
my ($ctx) = @_;
- my $over = $ctx->{-inbox}->over or return;
+ my $over = $ctx->{ibx}->over or return gone($ctx, 'over');
while (1) {
while (defined(my $num = shift(@{$ctx->{ids}}))) {
my $smsg = $over->get_art($num) or next;
return $smsg;
}
# refill result set
- my $srch = $ctx->{-inbox}->search(undef, $ctx) or return;
+ my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search');
my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
my $size = $mset->size or return;
$ctx->{qopts}->{offset} += $size;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
}
}
sub results_thread_cb {
my ($ctx) = @_;
- my $over = $ctx->{-inbox}->over or return;
+ my $over = $ctx->{ibx}->over or return gone($ctx, 'over');
while (1) {
while (defined(my $num = shift(@{$ctx->{xids}}))) {
my $smsg = $over->get_art($num) or next;
next if $over->expand_thread($ctx);
# refill result set
- my $srch = $ctx->{-inbox}->search(undef, $ctx) or return;
+ my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search');
my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
my $size = $mset->size or return;
$ctx->{qopts}->{offset} += $size;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
}
}
my ($ctx, $q) = @_;
my $q_string = $q->{'q'};
return mbox_all_ids($ctx) if $q_string !~ /\S/;
- my $srch = $ctx->{-inbox}->search or
+ my $srch = $ctx->{ibx}->isrch or
return PublicInbox::WWW::need($ctx, 'Search');
- my $over = $ctx->{-inbox}->over or
+ my $over = $ctx->{ibx}->over or
return PublicInbox::WWW::need($ctx, 'Overview');
- my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid
+ my $qopts = $ctx->{qopts} = { relevance => -1 }; # ORDER BY docid ASC
$qopts->{thread} = 1 if $q->{t};
my $mset = $srch->mset($q_string, $qopts);
$qopts->{offset} = $mset->size or
return [404, [qw(Content-Type text/plain)],
["No results found\n"]];
$ctx->{query} = $q_string;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts);
require PublicInbox::MboxGz;
my $fn;
if ($q->{t} && $srch->has_threadid) {
sub mbox_gz {
my ($self, $cb, $fn) = @_;
$self->{cb} = $cb;
- $self->{base_url} = $self->{-inbox}->base_url($self->{env});
+ $self->{base_url} = $self->{ibx}->base_url($self->{env});
$self->{gz} = PublicInbox::GzipFilter::gzip_or_die();
$fn = to_filename($fn // '') // 'no-subject';
# http://www.iana.org/assignments/media-types/application/gzip
my ($self) = @_;
my $cb = $self->{cb} or return;
while (my $smsg = $cb->($self)) {
- my $eml = $self->{-inbox}->smsg_eml($smsg) or next;
- $self->zmore(msg_hdr($self, $eml, $smsg->{mid}));
+ my $eml = $self->{ibx}->smsg_eml($smsg) or next;
+ $self->zmore(msg_hdr($self, $eml));
return $self->translate(msg_body($eml));
}
# signal that we're done and can return undef next call:
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# like PublicInbox::SearchIdx, but for searching for non-mail messages.
+# Things indexed include:
+# * inboxes themselves
+# * epoch information
+# * (maybe) git code repository information
+# Expect ~100K-1M documents with no parallelism opportunities,
+# so no sharding, here.
+#
+# See MiscSearch for read-only counterpart
+package PublicInbox::MiscIdx;
+use strict;
+use v5.10.1;
+use PublicInbox::InboxWritable;
+use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
+use PublicInbox::SearchIdx qw(index_text term_generator add_val);
+use PublicInbox::Spawn qw(nodatacow_dir);
+use Carp qw(croak);
+use File::Path ();
+use PublicInbox::MiscSearch;
+use PublicInbox::Config;
+my $json;
+
+# Constructor: $eidx supplies {xpfx} (parent of the "misc" directory)
+# and the optional {-no_fsync} flag.  Creates the directory, but does
+# not open the Xapian DB; begin_txn does that.
+sub new {
+	my ($class, $eidx) = @_;
+	PublicInbox::SearchIdx::load_xapian_writable();
+	my $self = bless {
+		mi_dir => "$eidx->{xpfx}/misc",
+		indexlevel => 'full', # small DB, no point in medium?
+	}, $class;
+	File::Path::mkpath($self->{mi_dir});
+	nodatacow_dir($self->{mi_dir});
+	$self->{flags} = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
+	if ($eidx->{-no_fsync}) {
+		$self->{flags} |= $PublicInbox::SearchIdx::DB_NO_SYNC;
+	}
+	$json //= PublicInbox::Config::json();
+	$self;
+}
+
+# Open the writable DB in {mi_dir} with {flags}, remember it in {xdb}
+# and start a transaction on it.  Croaks if {xdb} is already set or
+# if opening fails.
+sub begin_txn {
+	my ($self) = @_;
+	$self->{xdb} and croak 'BUG: already in txn'; # XXX make lazy?
+	my ($dir, $flags) = @$self{qw(mi_dir flags)};
+	my $xdb = eval {
+		$PublicInbox::Search::X{WritableDatabase}->new($dir, $flags);
+	};
+	$@ and croak "Failed opening $self->{mi_dir}: $@";
+	($self->{xdb} = $xdb)->begin_transaction;
+}
+
+# Commit the transaction started by begin_txn and forget {xdb}.
+# Croaks when no transaction is open.
+sub commit_txn {
+	my ($self) = @_;
+	# XXX make lazy?
+	my $xdb = delete($self->{xdb}) or croak 'BUG: not in txn';
+	$xdb->commit_transaction;
+}
+
+# Delete every document carrying the unique (`Q') term of $eidx_key
+# from {xdb}, announcing each removal via warn.
+sub remove_eidx_key {
+	my ($self, $eidx_key) = @_;
+	my $xdb = $self->{xdb};
+	my $term = 'Q'.$eidx_key;
+	my $cur = $xdb->postlist_begin($term);
+	my $end = $xdb->postlist_end($term);
+	# collect all docids first, delete afterwards
+	my @stale; # only one, unless we had bugs
+	while ($cur != $end) {
+		push @stale, $cur->get_docid;
+		$cur++;
+	}
+	foreach my $docid (@stale) {
+		$xdb->delete_document($docid);
+		warn "I: remove inbox docid #$docid ($eidx_key)\n";
+	}
+}
+
+# adds or updates according to $eidx_key; extra documents sharing the key are deleted
+sub index_ibx {
+ my ($self, $ibx) = @_;
+ my $eidx_key = $ibx->eidx_key;
+ my $xdb = $self->{xdb};
+ # Q = uniQue in Xapian terminology
+ my $head = $xdb->postlist_begin('Q'.$eidx_key);
+ my $tail = $xdb->postlist_end('Q'.$eidx_key);
+ my ($docid, @drop); # first docid found is kept for replacement, any others are dropped
+ for (; $head != $tail; $head++) {
+ if (defined $docid) {
+ my $i = $head->get_docid;
+ push @drop, $i;
+ warn <<EOF;
+W: multiple inboxes keyed to `$eidx_key', deleting #$i
+EOF
+ } else {
+ $docid = $head->get_docid;
+ }
+ }
+ $xdb->delete_document($_) for @drop; # just in case
+
+ my $doc = $PublicInbox::Search::X{Document}->new; # always build a fresh document, even when updating
+ term_generator($self)->set_document($doc);
+
+ # allow sorting by modified and uidvalidity (created at)
+ add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
+ add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);
+
+ $doc->add_boolean_term('Q'.$eidx_key); # uniQue id
+ $doc->add_boolean_term('T'.'inbox'); # Type
+
+ if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
+ $doc->add_boolean_term('T'.'newsgroup'); # additional Type
+ }
+
+ # force reread from disk, {description} could be loaded from {misc}
+ delete $ibx->{description};
+ my $desc = $ibx->description;
+
+ # description = S/Subject (or title)
+ # address = A/Author
+ index_text($self, $desc, 1, 'S');
+ index_text($self, $ibx->{name}, 1, 'XNAME');
+ my %map = ( # inbox field => term prefix, for array-valued fields
+ address => 'A',
+ listid => 'XLISTID',
+ infourl => 'XINFOURL',
+ url => 'XURL'
+ );
+ while (my ($f, $pfx) = each %map) {
+ for my $v (@{$ibx->{$f} // []}) {
+ index_text($self, $v, 1, $pfx);
+ }
+ }
+ my $data = {}; # URL path => manifest entry, stored as JSON document data
+ if (defined(my $max = $ibx->max_git_epoch)) { # v2
+ my $pfx = "/$ibx->{name}/git/";
+ for my $epoch (0..$max) {
+ my $git = $ibx->git_epoch($epoch) or return; # NOTE(review): returns before set_data/replace_document below, so nothing is written for this inbox -- confirm intended
+ if (my $ent = $git->manifest_entry($epoch, $desc)) {
+ $data->{"$pfx$epoch.git"} = $ent;
+ $ent->{git_dir} = $git->{git_dir};
+ }
+ $git->cleanup; # ->modified starts cat-file --batch
+ }
+ } elsif (my $ent = $ibx->git->manifest_entry) { # v1
+ $ent->{git_dir} = $ibx->{inboxdir};
+ $data->{"/$ibx->{name}"} = $ent;
+ }
+ $doc->set_data($json->encode($data));
+ if (defined $docid) { # an existing document was found above: update it in place
+ $xdb->replace_document($docid, $doc);
+ } else {
+ $xdb->add_document($doc);
+ }
+}
+
+1;
--- /dev/null
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only counterpart to MiscIdx
+package PublicInbox::MiscSearch;
+use strict;
+use v5.10.1;
+use PublicInbox::Search qw(retry_reopen int_val);
+my $json;
+
+# Xapian value columns:
+our $MODIFIED = 0;
+our $UIDVALIDITY = 1; # (created time)
+
+# avoid conflicting with message Search::prob_prefix for UI/UX reasons
+my %PROB_PREFIX = (
+ description => 'S', # $INBOX_DIR/description
+ address => 'A',
+ listid => 'XLISTID',
+ url => 'XURL',
+ infourl => 'XINFOURL',
+ name => 'XNAME',
+ '' => 'S A XLISTID XNAME XURL XINFOURL'
+);
+
+# Constructor: opens the read-only Xapian database in $dir and keeps
+# it in {xdb}.
+sub new {
+	my ($class, $dir) = @_;
+	PublicInbox::Search::load_xapian();
+	$json //= PublicInbox::Config::json();
+	my $xdb = $PublicInbox::Search::X{Database}->new($dir);
+	bless { xdb => $xdb }, $class;
+}
+
+# read-only
+# Build the query parser for {xdb}: AND as the default operator,
+# stemming, a wildcard expansion limit of 100, one prefix per entry
+# of %PROB_PREFIX and the boolean "type:" prefix.
+sub mi_qp_new ($) {
+	my ($self) = @_;
+	my $xdb = $self->{xdb};
+	my $qp = $PublicInbox::Search::X{QueryParser}->new;
+	$qp->set_default_op(PublicInbox::Search::OP_AND());
+	$qp->set_database($xdb);
+	$qp->set_stemmer(PublicInbox::Search::stemmer($self));
+	$qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME());
+	my $cb = $qp->can('set_max_wildcard_expansion') //
+		$qp->can('set_max_expansion'); # Xapian 1.5.0+
+	$cb->($qp, 100);
+	# no range processors are registered here, so the former lookup of
+	# add_valuerangeprocessor/add_rangeprocessor was a dead store
+	while (my ($name, $prefix) = each %PROB_PREFIX) {
+		# the '' entry lists several prefixes, separated by spaces
+		$qp->add_prefix($name, $_) for split(/ /, $prefix);
+	}
+	$qp->add_boolean_prefix('type', 'T');
+	$qp;
+}
+
+# Run $qr against {xdb} and return the resulting mset.  Honored keys
+# of $opt: {relevance} (-1 orders by docid; true sorts by relevance,
+# then the MODIFIED value; false the other way around), {asc},
+# {offset} (default 0) and {limit} (default 200).
+sub misc_enquire_once { # retry_reopen callback
+	my ($self, $qr, $opt) = @_;
+	my $enq = $PublicInbox::Search::X{Enquire}->new($self->{xdb});
+	$enq->set_query($qr);
+	my $rel = $opt->{relevance} // 0;
+	if ($rel == -1) { # ORDER BY docid/UID
+		$enq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
+		$enq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new);
+	} else {
+		my $sorter = $rel ? 'set_sort_by_relevance_then_value'
+				: 'set_sort_by_value_then_relevance';
+		$enq->$sorter($MODIFIED, !$opt->{asc}); # descending by default
+	}
+	my $offset = $opt->{offset} || 0;
+	my $limit = $opt->{limit} || 200;
+	$enq->get_mset($offset, $limit);
+}
+
+# Parse the query string $qs (an empty string means all inboxes) and
+# return the matching mset.  $opt is optional and passed on to
+# misc_enquire_once; {relevance} is set to 1 in it unless the caller
+# already supplied that key.
+sub mset {
+	my ($self, $qs, $opt) = @_;
+	$opt = {} unless $opt;
+	reopen($self);
+	$self->{qp} //= mi_qp_new($self);
+	my $query = $self->{qp}->parse_query($qs eq '' ? 'type:inbox' : $qs,
+					$PublicInbox::Search::QP_FLAGS);
+	exists($opt->{relevance}) or $opt->{relevance} = 1;
+	retry_reopen($self, \&misc_enquire_once, $query, $opt);
+}
+
+# Page through all results of $qr and return a hashref mapping each
+# matching eidx_key found in $by_newsgroup to its inbox object.
+# Returns an empty hashref when $by_newsgroup is empty.
+sub ibx_matches_once { # retry_reopen callback
+	my ($self, $qr, $by_newsgroup) = @_;
+	my $ret = {}; # newsgroup => $ibx of matches
+	# Nothing can match an empty map.  A $limit of zero must also be
+	# avoided: misc_enquire_once replaces it with its own default,
+	# so `$nr < $limit' below would never become true.
+	my $nr_groups = scalar(keys %$by_newsgroup) or return $ret;
+	# double in case no newsgroups are configured:
+	my $limit = $nr_groups * 2;
+	my $opt = { limit => $limit, offset => 0, relevance => -1 };
+	while (1) {
+		my $mset = misc_enquire_once($self, $qr, $opt);
+		for my $mi ($mset->items) {
+			my $doc = $mi->get_document;
+			my $end = $doc->termlist_end;
+			my $cur = $doc->termlist_begin;
+			$cur->skip_to('Q');
+			if ($cur != $end) {
+				my $ng = $cur->get_termname; # eidx_key
+				$ng =~ s/\AQ// or warn "BUG: no `Q': $ng";
+				if (my $ibx = $by_newsgroup->{$ng}) {
+					$ret->{$ng} = $ibx;
+				}
+			} else {
+				warn <<EOF;
+W: docid=${\$mi->get_docid} has no `Q' (eidx_key) term
+EOF
+			}
+		}
+		my $nr = $mset->size;
+		return $ret if $nr < $limit; # short page: no more results
+		$opt->{offset} += $nr;
+	}
+}
+
+# returns a newsgroup => PublicInbox::Inbox mapping for the inboxes
+# in $pi_cfg->{-by_newsgroup} which match the query string $qs
+sub newsgroup_matches {
+	my ($self, $qs, $pi_cfg) = @_;
+	$self->{qp} //= mi_qp_new($self);
+	$qs .= ' type:inbox'; # restrict results to inbox documents
+	my $query = $self->{qp}->parse_query($qs,
+					$PublicInbox::Search::QP_FLAGS);
+	my $by_newsgroup = $pi_cfg->{-by_newsgroup};
+	retry_reopen($self, \&ibx_matches_once, $query, $by_newsgroup);
+}
+
+# Look up the document of $ibx by its unique term.  When found, fill
+# in $ibx->{uidvalidity} (unless already defined) and
+# $ibx->{-modified} from the document values and return the document
+# data; otherwise return undef.
+sub ibx_data_once {
+	my ($self, $ibx) = @_;
+	my $xdb = $self->{xdb};
+	my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private
+	my $cur = $xdb->postlist_begin($term);
+	my $end = $xdb->postlist_end($term);
+	return undef unless $cur != $end; # no document for this key
+	my $doc = $xdb->get_document($cur->get_docid);
+	$ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY);
+	$ibx->{-modified} = int_val($doc, $MODIFIED);
+	$doc->get_data;
+}
+
+# ibx_data_once for $ibx, run through retry_reopen
+sub inbox_data {
+	my $self = shift;
+	my $ibx = shift;
+	return retry_reopen($self, \&ibx_data_once, $ibx);
+}
+
+# Add an entry for the inbox described by $doc to %$cache, keyed by
+# its eidx_key, holding {uidvalidity}, {-modified} and {description}.
+# Does nothing when $doc has no usable `Q' term.
+sub ibx_cache_load {
+	my ($doc, $cache) = @_;
+	my $end = $doc->termlist_end;
+	my $cur = $doc->termlist_begin;
+	$cur->skip_to('Q');
+	return if $cur == $end;
+	my $eidx_key = $cur->get_termname;
+	return unless $eidx_key =~ s/\AQ//; # expired
+	my $entry = $cache->{$eidx_key} = {};
+	$entry->{uidvalidity} = int_val($doc, $UIDVALIDITY);
+	$entry->{-modified} = int_val($doc, $MODIFIED);
+
+	# extract description from manifest.js.gz epoch description
+	my $desc;
+	my $manifest = $json->decode($doc->get_data);
+	for my $ent (values %$manifest) {
+		next unless defined $ent->{description};
+		$desc = $ent->{description};
+		# stop at the first description carrying an epoch suffix
+		last if $desc =~ s/ \[epoch [0-9]+\]\z//;
+	}
+	$entry->{description} = $desc;
+}
+
+# Build the cache hashref from all documents matching both
+# type:newsgroup and type:inbox, via ibx_cache_load.
+sub _nntpd_cache_load { # retry_reopen callback
+	my ($self) = @_;
+	my $limit = $self->{xdb}->get_doccount * 10;
+	my $mset = mset($self, 'type:newsgroup type:inbox',
+			{ limit => $limit, relevance => -1 });
+	my %cache;
+	ibx_cache_load($_->get_document, \%cache) for $mset->items;
+	\%cache;
+}
+
+# returns a { newsgroup => $cache_entry } mapping; $cache_entry holds
+# anything which may trigger seeks at startup, currently: description,
+# -modified, and uidvalidity.
+sub nntpd_cache_load {
+	my $self = shift;
+	return retry_reopen($self, \&_nntpd_cache_load);
+}
+
+no warnings 'once';
+*reopen = \&PublicInbox::Search::reopen;
+
+1;
create_tables($dbh);
$self->created_at(time) unless $self->created_at;
- my $max = $self->max // 0;
- $self->num_highwater($max);
+ $self->num_highwater(max($self));
$dbh->commit;
}
$self;
my $sth = $_[0]->{dbh}->prepare_cached('SELECT MAX(num) FROM msgmap',
undef, 1);
$sth->execute;
- $sth->fetchrow_array;
+ $sth->fetchrow_array // 0;
}
sub minmax {
my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap',
undef, 1);
$sth->execute;
- ($sth->fetchrow_array, max($_[0]));
+ ($sth->fetchrow_array // 0, max($_[0]));
}
sub mid_delete {
# fields:
# nntpd: PublicInbox::NNTPD ref
# article: per-session current article number
-# ng: PublicInbox::Inbox ref
+# ibx: PublicInbox::Inbox ref
# long_cb: long_response private data
package PublicInbox::NNTP;
use strict;
use Digest::SHA qw(sha1_hex);
use Time::Local qw(timegm timelocal);
use PublicInbox::GitAsyncCat;
+use PublicInbox::Address;
+
use constant {
LINE_MAX => 512, # RFC 977 section 2.3
r501 => '501 command syntax error',
my $ONE_MSGID = qr/\A$MID_EXTRACT\z/;
my @OVERVIEW = qw(Subject From Date Message-ID References);
my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') .
- "Xref:full\r\n";
+ "Xref:full\r\n.";
my $LIST_HEADERS = join("\r\n", @OVERVIEW,
- qw(:bytes :lines Xref To Cc)) . "\r\n";
+ qw(:bytes :lines Xref To Cc)) . "\r\n.";
my $CAPABILITIES = <<"";
101 Capability list:\r
VERSION 2\r
err($self, 'error from: %s (%s)', $l, $err);
$res = '503 program fault - command not performed';
}
- return 0 unless defined $res;
- res($self, $res);
+ defined($res) ? res($self, $res) : 0;
}
# The keyword argument is not used (rfc3977 5.2.2)
sub cmd_mode ($$) {
my ($self, $arg) = @_;
- $arg = uc $arg;
- return r501 unless $arg eq 'READER';
- '201 Posting prohibited';
+ uc($arg) eq 'READER' ? '201 Posting prohibited' : r501;
}
sub cmd_slave ($) { '202 slave status noted' }
my ($self, $wildmat) = @_;
more($self, '282 list of groups and descriptions follows');
list_newsgroups($self, $wildmat);
- '.'
}
-sub list_overview_fmt ($) {
- my ($self) = @_;
- $self->msg_more($OVERVIEW_FMT);
-}
+sub list_overview_fmt ($) { $OVERVIEW_FMT }
-sub list_headers ($;$) {
- my ($self) = @_;
- $self->msg_more($LIST_HEADERS);
+sub list_headers ($;$) { $LIST_HEADERS }
+
+sub list_active_i { # "LIST ACTIVE" and also just "LIST" (no args)
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0; # consume up to 100 names per call; 0 once the list is empty
+ my $ibx;
+ my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup};
+ for my $ngname (@window) {
+ $ibx = $groups->{$ngname} and group_line($self, $ibx); # names missing from the map are skipped
+ }
+ scalar(@$groupnames); # continue if there's more
 }
-sub list_active ($;$) {
+sub list_active ($;$) { # called by cmd_list
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- group_line($self, $ng);
+ long_response($self, \&list_active_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
+}
+
+sub list_active_times_i { # long_response callback of list_active_times
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0; # consume up to 100 names per call; 0 once the list is empty
+ my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup};
+ for my $ngname (@window) {
+ my $ibx = $groups->{$ngname} or next;
+ my $c = eval { $ibx->uidvalidity } // time; # current time when uidvalidity dies or is undef
+ more($self, "$ngname $c <$ibx->{-primary_address}>");
 }
+ scalar(@$groupnames); # continue if there's more
 }
-sub list_active_times ($;$) {
+sub list_active_times ($;$) { # called by cmd_list
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- my $c = eval { $ng->mm->created_at } || time;
- more($self, "$ng->{newsgroup} $c $ng->{-primary_address}");
+ long_response($self, \&list_active_times_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
+}
+
+sub list_newsgroups_i { # long_response callback of list_newsgroups
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0; # consume up to 100 names per call; 0 once the list is empty
+ my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup};
+ my $ibx;
+ for my $ngname (@window) {
+ $ibx = $groups->{$ngname} and # names missing from the map are skipped
+ more($self, "$ngname ".$ibx->description);
 }
+ scalar(@$groupnames); # continue if there's more
 }
-sub list_newsgroups ($;$) {
+sub list_newsgroups ($;$) { # called by cmd_list
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- my $d = $ng->description;
- more($self, "$ng->{newsgroup} $d");
- }
+ long_response($self, \&list_newsgroups_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
}
# LIST SUBSCRIPTIONS, DISTRIB.PATS are not supported
if (scalar @args) {
my $arg = shift @args;
$arg =~ tr/A-Z./a-z_/;
+ my $ret = $arg eq 'active';
$arg = "list_$arg";
$arg = $self->can($arg);
return r501 unless $arg && args_ok($arg, scalar @args);
$arg->($self, @args);
} else {
more($self, '215 list of newsgroups follows');
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- group_line($self, $ng);
- }
+ long_response($self, \&list_active_i, [ # copy array
+ @{$self->{nntpd}->{groupnames}} ]);
}
- '.'
}
sub listgroup_range_i {
my ($self, $beg, $end) = @_;
- my $r = $self->{ng}->mm->msg_range($beg, $end, 'num');
+ my $r = $self->{ibx}->mm->msg_range($beg, $end, 'num');
scalar(@$r) or return;
- more($self, join("\r\n", map { $_->[0] } @$r));
+ $self->msg_more(join('', map { "$_->[0]\r\n" } @$r));
1;
}
sub listgroup_all_i {
my ($self, $num) = @_;
- my $ary = $self->{ng}->mm->ids_after($num);
+ my $ary = $self->{ibx}->mm->ids_after($num);
scalar(@$ary) or return;
more($self, join("\r\n", @$ary));
1;
return $res if ($res !~ /\A211 /);
more($self, $res);
}
- $self->{ng} or return '412 no newsgroup selected';
+ $self->{ibx} or return '412 no newsgroup selected';
if (defined $range) {
my $r = get_range($self, $range);
return $r unless ref $r;
}
sub group_line ($$) {
- my ($self, $ng) = @_;
- my ($min, $max) = $ng->mm->minmax;
- more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max;
+ my ($self, $ibx) = @_;
+ my ($min, $max) = $ibx->mm->minmax;
+ more($self, "$ibx->{newsgroup} $max $min n");
+}
+
+sub newgroups_i { # long_response callback of cmd_newgroups
+ my ($self, $ts, $i, $groupnames) = @_; # $i is a scalar ref holding the position in @$groupnames
+ my $end = $$i + 100; # look at no more than 100 names per call
+ my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup};
+ while ($$i < $end) {
+ my $ngname = $groupnames->[$$i++] // return; # past the last name: done
+ my $ibx = $groups->{$ngname} or next; # expired on reload
+ next unless (eval { $ibx->uidvalidity } // 0) > $ts; # only groups with uidvalidity newer than $ts
+ group_line($self, $ibx);
+ }
+ 1;
 }
sub cmd_newgroups ($$$;$$) {
# TODO dists
more($self, '231 list of new newsgroups follows');
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- my $c = eval { $ng->mm->created_at } || 0;
- next unless $c > $ts;
- group_line($self, $ng);
- }
- '.'
+ long_response($self, \&newgroups_i, $ts, \(my $i = 0),
+ $self->{nntpd}->{groupnames});
}
sub wildmat2re (;$) {
}
sub newnews_i {
- my ($self, $overs, $ts, $prev) = @_;
- my $over = $overs->[0];
- my $msgs = $over->query_ts($ts, $$prev);
- if (scalar @$msgs) {
- more($self, '<' .
- join(">\r\n<", map { $_->{mid} } @$msgs ).
- '>');
- $$prev = $msgs->[-1]->{num};
- } else {
- shift @$overs;
- if (@$overs) { # continue onto next newsgroup
- $$prev = 0;
- return 1;
- } else { # break out of the long response.
- return;
+ my ($self, $names, $ts, $prev) = @_;
+ my $ngname = $names->[0];
+ if (my $ibx = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}->{$ngname}) {
+ if (my $over = $ibx->over) {
+ my $msgs = $over->query_ts($ts, $$prev);
+ if (scalar @$msgs) {
+ $self->msg_more(join('', map {
+ "<$_->{mid}>\r\n";
+ } @$msgs));
+ $$prev = $msgs->[-1]->{num};
+ return 1; # continue on current group
+ }
}
}
+ shift @$names;
+ if (@$names) { # continue onto next newsgroup
+ $$prev = 0;
+ 1;
+ } else { # all done, break out of the long_response
+ undef;
+ }
}
sub cmd_newnews ($$$$;$$) {
my ($keep, $skip) = split('!', $newsgroups, 2);
ngpat2re($keep);
ngpat2re($skip);
- my @overs;
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $keep or next;
- $ng->{newsgroup} =~ $skip and next;
- my $over = $ng->over or next;
- push @overs, $over;
- };
- return '.' unless @overs;
-
+ my @names = grep(!/$skip/, grep(/$keep/,
+ @{$self->{nntpd}->{groupnames}}));
+ return '.' unless scalar(@names);
my $prev = 0;
- long_response($self, \&newnews_i, \@overs, $ts, \$prev);
+ long_response($self, \&newnews_i, \@names, $ts, \$prev);
}
sub cmd_group ($$) {
my ($self, $group) = @_;
- my $no_such = '411 no such news group';
my $nntpd = $self->{nntpd};
- my $ng = $nntpd->{groups}->{$group} or return $no_such;
+ my $ibx = $nntpd->{pi_cfg}->{-by_newsgroup}->{$group} or
+ return '411 no such news group';
$nntpd->idler_start;
- $self->{ng} = $ng;
- my ($min, $max) = $ng->mm->minmax;
- $min ||= 0;
- $max ||= 0;
+ $self->{ibx} = $ibx;
+ my ($min, $max) = $ibx->mm->minmax;
$self->{article} = $min;
my $est_size = $max - $min;
"211 $est_size $min $max $group";
sub article_adj ($$) {
my ($self, $off) = @_;
- my $ng = $self->{ng} or return '412 no newsgroup selected';
+ my $ibx = $self->{ibx} or return '412 no newsgroup selected';
my $n = $self->{article};
defined $n or return '420 no current article has been selected';
$n += $off;
- my $mid = $ng->mm->mid_for($n);
+ my $mid = $ibx->mm->mid_for($n);
unless ($mid) {
$n = $off > 0 ? 'next' : 'previous';
return "421 no $n article in this group";
# the single-point-of-failure a single server provides.
sub cmd_post ($) {
my ($self) = @_;
- my $ng = $self->{ng};
- $ng ? "440 mailto:$ng->{-primary_address} to post"
+ my $ibx = $self->{ibx};
+ $ibx ? "440 mailto:$ibx->{-primary_address} to post"
: '440 posting not allowed'
}
$hdr->header_set($k, @v, $v);
}
-sub xref ($$$$) {
- my ($self, $ng, $n, $mid) = @_;
- my $ret = $self->{nntpd}->{servername} . " $ng->{newsgroup}:$n";
+# Extend %$xref (newsgroup => article number) with the inboxes found
+# via the To: and Cc: addresses of $smsg, using the address => inbox
+# map in $pi_cfg->{-by_addr}.  Entries already defined in %$xref are
+# kept.  A newsgroup is only added when a number can be found for it,
+# so no undef values are introduced.
+sub xref_by_tc ($$$) {
+	my ($xref, $pi_cfg, $smsg) = @_;
+	my $by_addr = $pi_cfg->{-by_addr};
+	my $mid = $smsg->{mid};
+	for my $f (qw(to cc)) {
+		my @ibxs = map {
+			$by_addr->{lc($_)} // ()
+		} (PublicInbox::Address::emails($smsg->{$f} // ''));
+		for my $ibx (@ibxs) {
+			my $ngname = $ibx->{newsgroup} // next;
+			next if defined $xref->{$ngname};
+			# a failed or empty lookup must not leave an undef
+			# entry behind: xref() emits every key of %$xref
+			my $num = eval { $ibx->mm->num_for($mid) } // next;
+			$xref->{$ngname} = $num;
+		}
+	}
+}
- # num_for is pretty cheap and sometimes we'll lookup the existence
- # of an article without getting even the OVER info. In other words,
- # I'm not sure if its worth optimizing by scanning To:/Cc: and
- # PublicInbox::ExtMsg on the PSGI end is just as expensive
- foreach my $other (@{$self->{nntpd}->{grouplist}}) {
- next if $ng eq $other;
- my $num = eval { $other->mm->num_for($mid) } or next;
- $ret .= " $other->{newsgroup}:$num";
+sub xref ($$$) { # build the Xref header value for $smsg; the current newsgroup is listed first
+ my ($self, $cur_ibx, $smsg) = @_;
+ my $nntpd = $self->{nntpd};
+ my $cur_ng = $cur_ibx->{newsgroup};
+ my $xref; # presumably a newsgroup => article number hashref on both paths -- confirm against nntp_xref_for
+ if (my $ALL = $nntpd->{pi_cfg}->ALL) {
+ $xref = $ALL->nntp_xref_for($cur_ibx, $smsg);
+ xref_by_tc($xref, $nntpd->{pi_cfg}, $smsg); # also consider inboxes found via To:/Cc: addresses
+ } else { # slow path
+ $xref = { $cur_ng => $smsg->{num} };
+ my $mid = $smsg->{mid};
+ for my $ibx (values %{$nntpd->{pi_cfg}->{-by_newsgroup}}) {
+ next if defined($xref->{$ibx->{newsgroup}});
+ my $num = eval { $ibx->mm->num_for($mid) } // next; # skip inboxes without this Message-ID
+ $xref->{$ibx->{newsgroup}} = $num;
+ }
 }
+ my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng}); # remove the current group so it is not repeated below
+ $ret .= " $_:$xref->{$_}" for (sort keys %$xref); # remaining groups in sorted order
 $ret;
 }
# clobber some existing headers
my $ibx = $smsg->{-ibx};
- my $xref = xref($smsg->{nntp}, $ibx, $smsg->{num}, $mid);
+ my $xref = xref($smsg->{nntp}, $ibx, $smsg);
$hdr->header_set('Xref', $xref);
# RFC 5536 3.1.4
# *something* here is required for leafnode, try to follow
# RFC 5536 3.1.5...
$hdr->header_set('Path', $server_name . '!not-for-mail');
-
- header_append($hdr, 'List-Post', "<mailto:$ibx->{-primary_address}>");
- if (my $url = $ibx->base_url) {
- $mid = mid_escape($mid);
- header_append($hdr, 'Archived-At', "<$url$mid/>");
- header_append($hdr, 'List-Archive', "<$url>");
- }
}
sub art_lookup ($$$) {
my ($self, $art, $code) = @_;
- my $ng = $self->{ng};
- my ($n, $mid);
+ my ($ibx, $n);
my $err;
if (defined $art) {
if ($art =~ /\A[0-9]+\z/) {
$err = '423 no such article number in this group';
$n = int($art);
- goto find_mid;
+ goto find_ibx;
} elsif ($art =~ $ONE_MSGID) {
- $mid = $1;
- $err = r430;
- $n = $ng->mm->num_for($mid) if $ng;
- goto found if defined $n;
- foreach my $g (values %{$self->{nntpd}->{groups}}) {
- $n = $g->mm->num_for($mid);
- if (defined $n) {
- $ng = $g;
- goto found;
- }
- }
- return $err;
+ ($ibx, $n) = mid_lookup($self, $1);
+ goto found if $ibx;
+ return r430;
} else {
return r501;
}
} else {
$err = '420 no current article has been selected';
- $n = $self->{article};
- defined $n or return $err;
-find_mid:
- $ng or return '412 no newsgroup has been selected';
- $mid = $ng->mm->mid_for($n);
- defined $mid or return $err;
+ $n = $self->{article} // return $err;
+find_ibx:
+ $ibx = $self->{ibx} or
+ return '412 no newsgroup has been selected';
}
found:
- my $smsg = $ng->over->get_art($n) or return $err;
- $smsg->{-ibx} = $ng;
+ my $smsg = $ibx->over->get_art($n) or return $err;
+ $smsg->{-ibx} = $ibx;
if ($code == 223) { # STAT
set_art($self, $n);
"223 $n <$smsg->{mid}> article retrieved - " .
$smsg->{nntp_code} = $code;
set_art($self, $art);
# this dereferences to `undef'
- ${git_async_cat($ng->git, $smsg->{blob}, \&blob_cb, $smsg)};
+ ${git_async_cat($ibx->git, $smsg->{blob}, \&blob_cb, $smsg)};
}
}
sub get_range ($$) {
my ($self, $range) = @_;
- my $ng = $self->{ng} or return '412 no news group has been selected';
+ my $ibx = $self->{ibx} or return '412 no news group has been selected';
defined $range or return '420 No article(s) selected';
my ($beg, $end);
- my ($min, $max) = $ng->mm->minmax;
+ my ($min, $max) = $ibx->mm->minmax;
if ($range =~ /\A([0-9]+)\z/) {
$beg = $end = $1;
} elsif ($range =~ /\A([0-9]+)-\z/) {
sub hdr_msgid_range_i {
my ($self, $beg, $end) = @_;
- my $r = $self->{ng}->mm->msg_range($beg, $end);
+ my $r = $self->{ibx}->mm->msg_range($beg, $end);
@$r or return;
- more($self, join("\r\n", map { "$_->[0] <$_->[1]>" } @$r));
+ $self->msg_more(join('', map { "$_->[0] <$_->[1]>\r\n" } @$r));
1;
}
my ($self, $xhdr, $range) = @_;
if (defined $range && $range =~ $ONE_MSGID) {
- my ($ng, $n) = mid_lookup($self, $1);
+ my ($ibx, $n) = mid_lookup($self, $1);
return r430 unless $n;
- hdr_mid_response($self, $xhdr, $ng, $n, $range, $range);
+ hdr_mid_response($self, $xhdr, $ibx, $n, $range, $range);
} else { # numeric range
$range = $self->{article} unless defined $range;
my $r = get_range($self, $range);
sub mid_lookup ($$) {
my ($self, $mid) = @_;
- my $self_ng = $self->{ng};
- if ($self_ng) {
- my $n = $self_ng->mm->num_for($mid);
- return ($self_ng, $n) if defined $n;
+ my $cur_ibx = $self->{ibx};
+ if ($cur_ibx) {
+ my $n = $cur_ibx->mm->num_for($mid);
+ return ($cur_ibx, $n) if defined $n;
}
- foreach my $ng (values %{$self->{nntpd}->{groups}}) {
- next if defined $self_ng && $ng eq $self_ng;
- my $n = $ng->mm->num_for($mid);
- return ($ng, $n) if defined $n;
+ my $pi_cfg = $self->{nntpd}->{pi_cfg};
+ if (my $ALL = $pi_cfg->ALL) {
+ my ($id, $prev);
+ while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) {
+ my $xr3 = $ALL->over->get_xref3($smsg->{num});
+ if (my @x = grep(/:$smsg->{blob}\z/, @$xr3)) {
+ my ($ngname, $xnum) = split(/:/, $x[0]);
+ my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname};
+ return ($ibx, $xnum) if $ibx;
+ # fall through to trying all xref3s
+ } else {
+ warn <<EOF;
+W: xref3 missing for <$mid> ($smsg->{blob}) in $ALL->{topdir}, -extindex bug?
+EOF
+ }
+ # try all xref3s
+ for my $x (@$xr3) {
+ my ($ngname, $xnum) = split(/:/, $x);
+ my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname};
+ return ($ibx, $xnum) if $ibx;
+ warn "W: `$ngname' does not exist for #$xnum\n";
+ }
+ }
+ # no warning here, $mid is just invalid
+ } else { # slow path for non-ALL users
+ for my $ibx (values %{$pi_cfg->{-by_newsgroup}}) {
+ next if defined $cur_ibx && $ibx eq $cur_ibx;
+ my $n = $ibx->mm->num_for($mid);
+ return ($ibx, $n) if defined $n;
+ }
}
(undef, undef);
}
sub xref_range_i {
my ($self, $beg, $end) = @_;
- my $ng = $self->{ng};
- my $r = $ng->mm->msg_range($beg, $end);
- @$r or return;
- more($self, join("\r\n", map {
- my $num = $_->[0];
- "$num ".xref($self, $ng, $num, $_->[1]);
- } @$r));
+ my $ibx = $self->{ibx};
+ my $msgs = $ibx->over->query_xover($$beg, $end);
+ scalar(@$msgs) or return;
+ $$beg = $msgs->[-1]->{num} + 1;
+ $self->msg_more(join('', map {
+ "$_->{num} ".xref($self, $ibx, $_) . "\r\n";
+ } @$msgs));
1;
}
if (defined $range && $range =~ $ONE_MSGID) {
my $mid = $1;
- my ($ng, $n) = mid_lookup($self, $mid);
+ my ($ibx, $n) = mid_lookup($self, $mid);
return r430 unless $n;
- hdr_mid_response($self, $xhdr, $ng, $n, $range,
- xref($self, $ng, $n, $mid));
+ my $smsg = $ibx->over->get_art($n) or return;
+ hdr_mid_response($self, $xhdr, $ibx, $n, $range,
+ xref($self, $ibx, $smsg));
} else { # numeric range
$range = $self->{article} unless defined $range;
my $r = get_range($self, $range);
sub smsg_range_i {
my ($self, $beg, $end, $field) = @_;
- my $over = $self->{ng}->over;
+ my $over = $self->{ibx}->over;
my $msgs = $over->query_xover($$beg, $end);
scalar(@$msgs) or return;
my $tmp = '';
sub hdr_smsg ($$$$) {
my ($self, $xhdr, $field, $range) = @_;
if (defined $range && $range =~ $ONE_MSGID) {
- my ($ng, $n) = mid_lookup($self, $1);
+ my ($ibx, $n) = mid_lookup($self, $1);
return r430 unless defined $n;
- my $v = over_header_for($ng->over, $n, $field);
- hdr_mid_response($self, $xhdr, $ng, $n, $range, $v);
+ my $v = over_header_for($ibx->over, $n, $field);
+ hdr_mid_response($self, $xhdr, $ibx, $n, $range, $v);
} else { # numeric range
$range = $self->{article} unless defined $range;
my $r = get_range($self, $range);
}
sub hdr_mid_prefix ($$$$$) {
- my ($self, $xhdr, $ng, $n, $mid) = @_;
+ my ($self, $xhdr, $ibx, $n, $mid) = @_;
return $mid if $xhdr;
# HDR for RFC 3977 users
- if (my $self_ng = $self->{ng}) {
- ($self_ng eq $ng) ? $n : '0';
+ if (my $cur_ibx = $self->{ibx}) {
+ ($cur_ibx eq $ibx) ? $n : '0';
} else {
'0';
}
}
sub hdr_mid_response ($$$$$$) {
- my ($self, $xhdr, $ng, $n, $mid, $v) = @_;
+ my ($self, $xhdr, $ibx, $n, $mid, $v) = @_;
my $res = '';
if ($xhdr) {
$res .= r221 . "\r\n";
$res .= "$mid $v\r\n";
} else {
$res .= r225 . "\r\n";
- my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid);
+ my $pfx = hdr_mid_prefix($self, $xhdr, $ibx, $n, $mid);
$res .= "$pfx $v\r\n";
}
res($self, $res .= '.');
sub xrover_i {
my ($self, $beg, $end) = @_;
- my $h = over_header_for($self->{ng}->over, $$beg, 'references');
+ my $h = over_header_for($self->{ibx}->over, $$beg, 'references');
more($self, "$$beg $h") if defined($h);
$$beg++ < $end;
}
sub cmd_xrover ($;$) {
my ($self, $range) = @_;
- my $ng = $self->{ng} or return '412 no newsgroup selected';
+ my $ibx = $self->{ibx} or return '412 no newsgroup selected';
(defined $range && $range =~ /[<>]/) and
return '420 No article(s) selected'; # no message IDs
long_response($self, \&xrover_i, @$r);
}
-sub over_line ($$$$) {
- my ($self, $ng, $num, $smsg) = @_;
+sub over_line ($$$) {
+ my ($self, $ibx, $smsg) = @_;
# n.b. field access and procedural calls can be
# 10%-15% faster than OO method calls:
- my $s = join("\t", $num,
+ my $s = join("\t", $smsg->{num},
$smsg->{subject},
$smsg->{from},
PublicInbox::Smsg::date($smsg),
$smsg->{references},
$smsg->{bytes},
$smsg->{lines},
- "Xref: " . xref($self, $ng, $num, $smsg->{mid}));
+ "Xref: " . xref($self, $ibx, $smsg));
utf8::encode($s);
- $s
+ $s .= "\r\n";
}
sub cmd_over ($;$) {
my ($self, $range) = @_;
if ($range && $range =~ $ONE_MSGID) {
- my ($ng, $n) = mid_lookup($self, $1);
+ my ($ibx, $n) = mid_lookup($self, $1);
defined $n or return r430;
- my $smsg = $ng->over->get_art($n) or return r430;
+ my $smsg = $ibx->over->get_art($n) or return r430;
more($self, '224 Overview information follows (multi-line)');
# Only set article number column if it's the current group
- my $self_ng = $self->{ng};
- $n = 0 if (!$self_ng || $self_ng ne $ng);
- more($self, over_line($self, $ng, $n, $smsg));
+ # (RFC 3977 8.3.2)
+ my $cur_ibx = $self->{ibx};
+ if (!$cur_ibx || $cur_ibx ne $ibx) {
+ # set {-orig_num} for nntp_xref_for
+ $smsg->{-orig_num} = $smsg->{num};
+ $smsg->{num} = 0;
+ }
+ $self->msg_more(over_line($self, $ibx, $smsg));
'.';
} else {
cmd_xover($self, $range);
sub xover_i {
my ($self, $beg, $end) = @_;
- my $ng = $self->{ng};
- my $msgs = $ng->over->query_xover($$beg, $end);
+ my $ibx = $self->{ibx};
+ my $msgs = $ibx->over->query_xover($$beg, $end);
my $nr = scalar @$msgs or return;
# OVERVIEW.FMT
- more($self, join("\r\n", map {
- over_line($self, $ng, $_->{num}, $_);
+ $self->msg_more(join('', map {
+ over_line($self, $ibx, $_);
} @$msgs));
$$beg = $msgs->[-1]->{num} + 1;
}
return r501 unless $mid =~ $ONE_MSGID;
$mid = $1;
my @paths;
- foreach my $ng (values %{$self->{nntpd}->{groups}}) {
- my $n = $ng->mm->num_for($mid);
- push @paths, "$ng->{newsgroup}/$n" if defined $n;
+ my $pi_cfg = $self->{nntpd}->{pi_cfg};
+ my $groups = $pi_cfg->{-by_newsgroup};
+ if (my $ALL = $pi_cfg->ALL) {
+ my ($id, $prev, %seen);
+ while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) {
+ my $xr3 = $ALL->over->get_xref3($smsg->{num});
+ for my $x (@$xr3) {
+ my ($ngname, $n) = split(/:/, $x);
+ $x = "$ngname/$n";
+ if ($groups->{$ngname} && !$seen{$x}++) {
+ push(@paths, $x);
+ }
+ }
+ }
+ } else { # slow path, no point in using long_response
+ for my $ibx (values %$groups) {
+ my $n = $ibx->mm->num_for($mid) // next;
+ push @paths, "$ibx->{newsgroup}/$n";
+ }
}
return '430 no such article on server' unless @paths;
- '223 '.join(' ', @paths);
+ '223 '.join(' ', sort(@paths));
}
sub res ($$) { do_write($_[0], $_[1] . "\r\n") }
sub new {
my ($class) = @_;
- my $pi_config = PublicInbox::Config->new;
- my $name = $pi_config->{'publicinbox.nntpserver'};
+ my $pi_cfg = PublicInbox::Config->new;
+ my $name = $pi_cfg->{'publicinbox.nntpserver'};
if (!defined($name) or $name eq '') {
$name = hostname;
} elsif (ref($name) eq 'ARRAY') {
groups => {},
err => \*STDERR,
out => \*STDOUT,
- grouplist => [],
- pi_config => $pi_config,
+ pi_cfg => $pi_cfg,
servername => $name,
greet => \"201 $name ready - post via email\r\n",
# accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... }
sub refresh_groups {
my ($self, $sig) = @_;
- my $pi_config = $sig ? PublicInbox::Config->new : $self->{pi_config};
- my $new = {};
- my @list;
- $pi_config->each_inbox(sub {
- my ($ng) = @_;
- my $ngname = $ng->{newsgroup} or return;
- if (ref $ngname) {
- warn 'multiple newsgroups not supported: '.
- join(', ', @$ngname). "\n";
- # Newsgroup name needs to be compatible with RFC 3977
- # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR.
- # Leave out a few chars likely to cause problems or conflicts:
- # '|', '<', '>', ';', '#', '$', '&',
- } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) {
- warn "newsgroup name invalid: `$ngname'\n";
- } elsif ($ng->nntp_usable) {
- # Only valid if msgmap and search works
- $new->{$ngname} = $ng;
- push @list, $ng;
-
+ my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg};
+ my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox
+ my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {};
+ $pi_cfg->each_inbox(sub {
+ my ($ibx) = @_;
+ my $ngname = $ibx->{newsgroup} // return;
+ my $ce = $cache->{$ngname};
+ if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) {
+ # only valid if msgmap and over works
# preload to avoid fragmentation:
- $ng->description;
- $ng->base_url;
+ $ibx->description;
+ $ibx->base_url;
+ } else {
+ delete $groups->{$ngname};
+ delete $ibx->{newsgroup};
+ # Note: don't be tempted to delete more for memory
+ # savings just yet: NNTP, IMAP, and WWW may all
+ # run in the same process someday.
}
});
- @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list;
- $self->{grouplist} = \@list;
- $self->{pi_config} = $pi_config;
+ $self->{groupnames} = [ sort(keys %$groups) ];
# this will destroy old groups that got deleted
- %{$self->{groups}} = %$new;
+ $self->{pi_cfg} = $pi_cfg;
}
sub idler_start {
- $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config});
+ $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg});
}
1;
use PublicInbox::Hval qw(prurl);
sub new {
- my ($class, $pi_config) = @_;
- $pi_config ||= PublicInbox::Config->new;
- bless { pi_config => $pi_config }, $class;
+ my ($class, $pi_cfg) = @_;
+ bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class;
}
sub redirect ($$) {
# /inbox.foo.bar/123456
my (undef, @parts) = split(m!/!, $env->{PATH_INFO});
my ($ng, $article) = @parts;
- my $pi_config = $self->{pi_config};
- if (my $ibx = $pi_config->lookup_newsgroup($ng)) {
+ my $pi_cfg = $self->{pi_cfg};
+ if (my $ibx = $pi_cfg->lookup_newsgroup($ng)) {
my $url = prurl($env, $ibx->{url});
my $code = 301;
if (defined $article && $article =~ /\A[0-9]+\z/) {
return redirect($code, $url);
}
- my $res;
my @try = (join('/', @parts));
# trailing slash is in the rest of our WWW, so maybe some users
pop @parts;
push @try, join('/', @parts);
}
-
- foreach my $mid (@try) {
- my $arg = [ $mid ];
- $pi_config->each_inbox(\&try_inbox, $arg);
- defined($res = $arg->[1]) and last;
+ my $ALL = $pi_cfg->ALL;
+ if (my $over = $ALL ? $ALL->over : undef) {
+ my $by_eidx_key = $pi_cfg->{-by_eidx_key};
+ for my $mid (@try) {
+ my ($id, $prev);
+ while (my $x = $over->next_by_mid($mid, \$id, \$prev)) {
+ my $xr3 = $over->get_xref3($x->{num});
+ for (@$xr3) {
+ s/:[0-9]+:$x->{blob}\z// or next;
+ my $ibx = $by_eidx_key->{$_} // next;
+ my $url = $ibx->base_url or next;
+ $url .= mid_escape($mid) . '/';
+ return redirect(302, $url);
+ }
+ }
+ }
+ } else { # slow path, scan every inbox
+ for my $mid (@try) {
+ my $arg = [ $mid ]; # [1] => result
+ $pi_cfg->each_inbox(\&try_inbox, $arg);
+ return $arg->[1] if $arg->[1];
+ }
}
- $res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ];
+ [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ];
}
1;
$smsg ? load_from_row($smsg) : undef;
}
+sub get_xref3 {
+ my ($self, $num, $raw) = @_;
+ my $dbh = dbh($self);
+ my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id,xnum,oidbin FROM xref3 WHERE docid = ? ORDER BY ibx_id,xnum ASC
+
+ $sth->execute($num);
+ my $rows = $sth->fetchall_arrayref;
+ return $rows if $raw;
+ my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ?
+
+ [ map {
+ my $r = $_;
+ $eidx_key_sth->execute($r->[0]);
+ my $eidx_key = $eidx_key_sth->fetchrow_array;
+ $eidx_key //= "missing://ibx_id=$r->[0]";
+ "$eidx_key:$r->[1]:".unpack('H*', $r->[2]);
+ } @$rows ];
+}
+
sub next_by_mid {
my ($self, $mid, $id, $prev) = @_;
my $dbh = dbh($self);
}
}
+sub ibx_id {
+ my ($self, $eidx_key) = @_;
+ id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key);
+}
+
sub sid {
my ($self, $path) = @_;
return unless defined $path && $path ne '';
lc($subj);
}
+sub ddd_for ($) {
+ my ($smsg) = @_;
+ my $dd = $smsg->to_doc_data;
+ utf8::encode($dd);
+ compress($dd);
+}
+
sub add_overview {
my ($self, $eml, $smsg) = @_;
$smsg->{lines} = $eml->body_raw =~ tr!\n!\n!;
$xpath = subject_path($subj);
$xpath = id_compress($xpath);
}
- my $dd = $smsg->to_doc_data;
- utf8::encode($dd);
- $dd = compress($dd);
- add_over($self, $smsg, $mids, $refs, $xpath, $dd);
+ add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg));
}
sub _add_over {
$dbh->do(<<'');
CREATE TABLE IF NOT EXISTS over (
- num INTEGER NOT NULL, /* NNTP article number == IMAP UID */
+ num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */
tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */
sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */
ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */
ds INTEGER, /* RFC-2822 sent Date: header, git author time */
- ddd VARBINARY, /* doc-data-deflated (->to_doc_data, ->load_from_data) */
- UNIQUE (num)
+ ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */
)
$dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)');
sub create {
my ($self) = @_;
- unless (-r $self->{filename}) {
+ my $fn = $self->{filename} // do {
+ Carp::confess('BUG: no {filename}') unless $self->{dbh};
+ return;
+ };
+ unless (-r $fn) {
require File::Path;
require File::Basename;
- File::Path::mkpath(File::Basename::dirname($self->{filename}));
+ File::Path::mkpath(File::Basename::dirname($fn));
}
# create the DB:
PublicInbox::Over::dbh($self);
$pr->("I: rethread culled $total ghosts\n") if $pr && $total;
}
+# used for cross-inbox search
+sub eidx_prep ($) {
+ my ($self) = @_;
+ $self->{-eidx_prep} //= do {
+ my $dbh = $self->dbh;
+ $dbh->do(<<"");
+INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid')
+
+ $dbh->do(<<'');
+CREATE TABLE IF NOT EXISTS inboxes (
+ ibx_id INTEGER PRIMARY KEY AUTOINCREMENT,
+ eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */
+ UNIQUE (eidx_key)
+)
+
+ $dbh->do(<<'');
+CREATE TABLE IF NOT EXISTS xref3 (
+ docid INTEGER NOT NULL, /* <=> over.num */
+ ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */
+ xnum INTEGER NOT NULL, /* NNTP article number in ibx */
+ oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */
+ UNIQUE (docid, ibx_id, xnum, oidbin)
+)
+
+ $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
+
+ # performance critical, this is not UNIQUE since we may need to
+ # tolerate some old bugs from indexing mirrors
+ $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
+ 'xref3 (oidbin,xnum,ibx_id)');
+
+ $dbh->do(<<'');
+CREATE TABLE IF NOT EXISTS eidx_meta (
+ key VARCHAR(255) PRIMARY KEY,
+ val VARCHAR(255) NOT NULL
+)
+
+ # A queue of current docids which need reindexing.
+ # eidxq persists across aborted -extindex invocations
+ # Currently used for "-extindex --reindex" for Xapian
+ # data, but may be used in more places down the line.
+ $dbh->do(<<'');
+CREATE TABLE IF NOT EXISTS eidxq (
+ docid INTEGER PRIMARY KEY NOT NULL
+)
+
+ $dbh;
+ };
+}
+
+sub eidx_meta { # requires transaction
+ my ($self, $key, $val) = @_;
+
+ my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1';
+ my $dbh = $self->{dbh};
+ defined($val) or return $dbh->selectrow_array($sql, undef, $key);
+
+ my $prev = $dbh->selectrow_array($sql, undef, $key);
+ if (defined $prev) {
+ $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?';
+ $dbh->do($sql, undef, $val, $key);
+ } else {
+ $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)';
+ $dbh->do($sql, undef, $key, $val);
+ }
+ $prev;
+}
+
+sub eidx_max {
+ my ($self) = @_;
+ get_counter($self->{dbh}, 'eidx_docid');
+}
+
+sub add_xref3 {
+ my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_;
+ begin_lazy($self);
+ my $ibx_id = ibx_id($self, $eidx_key);
+ my $oidbin = pack('H*', $oidhex);
+ my $sth = $self->{dbh}->prepare_cached(<<'');
+INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
+
+ $sth->bind_param(1, $docid);
+ $sth->bind_param(2, $ibx_id);
+ $sth->bind_param(3, $xnum);
+ $sth->bind_param(4, $oidbin, SQL_BLOB);
+ $sth->execute;
+}
+
+# returns remaining reference count to $docid
+sub remove_xref3 {
+ my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
+ begin_lazy($self);
+ my $oidbin = pack('H*', $oidhex);
+ my ($sth, $ibx_id);
+ if (defined $eidx_key) {
+ $ibx_id = ibx_id($self, $eidx_key);
+ $sth = $self->{dbh}->prepare_cached(<<'');
+DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
+
+ $sth->bind_param(1, $docid);
+ $sth->bind_param(2, $ibx_id);
+ $sth->bind_param(3, $oidbin, SQL_BLOB);
+ } else {
+ $sth = $self->{dbh}->prepare_cached(<<'');
+DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
+
+ $sth->bind_param(1, $docid);
+ $sth->bind_param(2, $oidbin, SQL_BLOB);
+ }
+ $sth->execute;
+ $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ?
+
+ $sth->execute($docid);
+ my $nr = $sth->fetchrow_array;
+ if ($nr == 0) {
+ delete_by_num($self, $docid);
+ } elsif (defined($ibx_id) && $rm_eidx_info) {
+ # if deduplication rules in ContentHash change, it's
+ # possible a docid can have multiple rows with the
+ # same ibx_id. This governs whether or not we call
+ # ->shard_remove_eidx_info in ExtSearchIdx.
+ $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+ $sth->execute($docid, $ibx_id);
+ my $count = $sth->fetchrow_array;
+ $$rm_eidx_info = ($count == 0);
+ }
+ $nr;
+}
+
+# for when an xref3 goes missing, this does NOT update {ts}
+sub update_blob {
+ my ($self, $smsg, $oidhex) = @_;
+ my $sth = $self->{dbh}->prepare(<<'');
+UPDATE over SET ddd = ? WHERE num = ?
+
+ $smsg->{blob} = $oidhex;
+ $sth->bind_param(1, ddd_for($smsg), SQL_BLOB);
+ $sth->bind_param(2, $smsg->{num});
+ $sth->execute;
+}
+
+sub eidxq_add {
+ my ($self, $docid) = @_;
+ $self->dbh->prepare_cached(<<'')->execute($docid);
+INSERT OR IGNORE INTO eidxq (docid) VALUES (?)
+
+}
+
+sub eidxq_del {
+ my ($self, $docid) = @_;
+ $self->dbh->prepare_cached(<<'')->execute($docid);
+DELETE FROM eidxq WHERE docid = ?
+
+}
+
1;
}
sub setup_rlimit {
- my ($self, $name, $config) = @_;
+ my ($self, $name, $cfg) = @_;
foreach my $rlim (@PublicInbox::Spawn::RLIMITS) {
my $k = lc($rlim);
$k =~ tr/_//d;
$k = "publicinboxlimiter.$name.$k";
- defined(my $v = $config->{$k}) or next;
+ defined(my $v = $cfg->{$k}) or next;
my @rlimit = split(/\s*,\s*/, $v);
if (scalar(@rlimit) == 1) {
push @rlimit, $rlimit[0];
package PublicInbox::Search;
use strict;
use parent qw(Exporter);
-our @EXPORT_OK = qw(mdocid);
+our @EXPORT_OK = qw(retry_reopen int_val);
use List::Util qw(max);
# values for searching, changing the numeric value breaks
use PublicInbox::Smsg;
use PublicInbox::Over;
-my $QP_FLAGS;
-our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem);
+our $QP_FLAGS;
+our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query);
our $Xap; # 'Search::Xapian' or 'Xapian'
-my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
-my $ENQ_ASCENDING;
+our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
+
+# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16,
+# let's hope the ABI is stable
+our $ENQ_DESCENDING = 0;
+our $ENQ_ASCENDING = 1;
sub load_xapian () {
return 1 if defined $Xap;
'NumberRangeProcessor' : 'NumberValueRangeProcessor');
$X{$_} = $Xap.'::'.$_ for (keys %X);
- # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm,
- # so lets hope this part of the ABI is stable because it's
- # just an integer:
- $ENQ_ASCENDING = $x eq 'Xapian' ?
- 1 : Search::Xapian::ENQ_ASCENDING();
-
- # for Smsg:
- *PublicInbox::Smsg::sortable_unserialise =
- $Xap.'::sortable_unserialise';
+ *sortable_serialise = $x.'::sortable_serialise';
+ *sortable_unserialise = $x.'::sortable_unserialise';
# n.b. FLAG_PURE_NOT is expensive not suitable for a public
# website as it could become a denial-of-service vector
# FLAG_PHRASE also seems to cause performance problems chert
}
}
-sub _xdb ($) {
+sub xdb_sharded {
+ my ($self) = @_;
+ opendir(my $dh, $self->{xpfx}) or return; # not initialized yet
+
+ # We need numeric sorting so shard[0] is first for reading
+ # Xapian metadata, if needed
+ my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return;
+ my (@xdb, $slow_phrase);
+ for (0..$last) {
+ my $shard_dir = "$self->{xpfx}/$_";
+ if (-d $shard_dir && -r _) {
+ push @xdb, $X{Database}->new($shard_dir);
+ $slow_phrase ||= -f "$shard_dir/iamchert";
+ } else { # gaps from missing epochs throw off mdocid()
+ warn "E: $shard_dir missing or unreadable\n";
+ return;
+ }
+ }
+ $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase;
+ $self->{nshard} = scalar(@xdb);
+ my $xdb = shift @xdb;
+ $xdb->add_database($_) for @xdb;
+ $xdb;
+}
+
+sub _xdb {
my ($self) = @_;
my $dir = xdir($self, 1);
- my ($xdb, $slow_phrase);
- my $qpf = \($self->{qp_flags} ||= $QP_FLAGS);
+ $self->{qp_flags} //= $QP_FLAGS;
if ($self->{ibx_ver} >= 2) {
- my @xdb;
- opendir(my $dh, $dir) or return; # not initialized yet
-
- # We need numeric sorting so shard[0] is first for reading
- # Xapian metadata, if needed
- my $last = max(grep(/\A[0-9]+\z/, readdir($dh)));
- return if !defined($last);
- for (0..$last) {
- my $shard_dir = "$dir/$_";
- if (-d $shard_dir && -r _) {
- push @xdb, $X{Database}->new($shard_dir);
- $slow_phrase ||= -f "$shard_dir/iamchert";
- } else { # gaps from missing epochs throw off mdocid()
- warn "E: $shard_dir missing or unreadable\n";
- return;
- }
- }
- $self->{nshard} = scalar(@xdb);
- $xdb = shift @xdb;
- $xdb->add_database($_) for @xdb;
+ xdb_sharded($self);
} else {
- $slow_phrase = -f "$dir/iamchert";
- $xdb = $X{Database}->new($dir);
+ $self->{qp_flags} |= FLAG_PHRASE() if !-f "$dir/iamchert";
+ $X{Database}->new($dir);
}
- $$qpf |= FLAG_PHRASE() unless $slow_phrase;
- $xdb;
}
# v2 Xapian docids don't conflict, so they're identical to
sub xdb ($) {
my ($self) = @_;
- $self->{xdb} ||= do {
+ $self->{xdb} //= do {
load_xapian();
- _xdb($self);
+ $self->_xdb;
};
}
$opts ||= {};
my $qp = $self->{qp} //= qparse_new($self);
my $query = $qp->parse_query($query_string, $self->{qp_flags});
- $opts->{relevance} = 1 unless exists $opts->{relevance};
_do_enquire($self, $query, $opts);
}
sub retry_reopen {
- my ($self, $cb, $arg) = @_;
+ my ($self, $cb, @arg) = @_;
for my $i (1..10) {
if (wantarray) {
my @ret;
- eval { @ret = $cb->($arg) };
+ eval { @ret = $cb->($self, @arg) };
return @ret unless $@;
} else {
my $ret;
- eval { $ret = $cb->($arg) };
+ eval { $ret = $cb->($self, @arg) };
return $ret unless $@;
}
# Exception: The revision being read has been discarded -
sub _do_enquire {
my ($self, $query, $opts) = @_;
- retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]);
+ retry_reopen($self, \&_enquire_once, $query, $opts);
}
# returns true if all docs have the THREADID value
}
sub _enquire_once { # retry_reopen callback
- my ($self, $query, $opts) = @{$_[0]};
+ my ($self, $query, $opts) = @_;
my $xdb = xdb($self);
+ if (defined(my $eidx_key = $opts->{eidx_key})) {
+ $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key);
+ }
+ if (defined(my $uid_range = $opts->{uid_range})) {
+ my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
+ sortable_serialise($uid_range->[0]),
+ sortable_serialise($uid_range->[1]));
+ $query = $X{Query}->new(OP_FILTER(), $query, $range);
+ }
my $enquire = $X{Enquire}->new($xdb);
$enquire->set_query($query);
$opts ||= {};
my $desc = !$opts->{asc};
- if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID
+ my $rel = $opts->{relevance} // 0;
+ if ($rel == -1) { # ORDER BY docid/UID
+ $enquire->set_weighting_scheme($X{BoolWeight}->new);
$enquire->set_docid_order($ENQ_ASCENDING);
+ } elsif ($rel == 0) {
+ $enquire->set_sort_by_value_then_relevance(TS, $desc);
+ } elsif ($rel == -2) {
$enquire->set_weighting_scheme($X{BoolWeight}->new);
- } elsif ($opts->{relevance}) {
+ $enquire->set_docid_order($ENQ_DESCENDING);
+ } else { # rel > 0
$enquire->set_sort_by_relevance_then_value(TS, $desc);
- } else {
- $enquire->set_sort_by_value_then_relevance(TS, $desc);
}
# `mairix -t / --threads' or JMAP collapseThreads
\@ret;
}
+sub int_val ($$) {
+ my ($doc, $col) = @_;
+ my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
+ sortable_unserialise($val) + 0; # PV => IV conversion
+}
+
1;
use PublicInbox::MID qw(mids_for_index mids);
use PublicInbox::MsgIter;
use PublicInbox::IdxStack;
-use Carp qw(croak);
+use Carp qw(croak carp);
use POSIX qw(strftime);
+use Time::Local qw(timegm);
use PublicInbox::OverIdx;
use PublicInbox::Spawn qw(spawn nodatacow_dir);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size);
+our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
+ index_text term_generator add_val is_bad_blob);
my $X = \%PublicInbox::Search::X;
-my ($DB_CREATE_OR_OPEN, $DB_OPEN);
+our ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000;
use constant DEBUG => !!$ENV{DEBUG};
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
+our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
sub new {
my ($class, $ibx, $creat, $shard) = @_;
ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx";
- my $levels = qr/\A(?:full|medium|basic)\z/;
my $inboxdir = $ibx->{inboxdir};
my $version = $ibx->version;
my $indexlevel = 'full';
$altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ];
}
if ($ibx->{indexlevel}) {
- if ($ibx->{indexlevel} =~ $levels) {
+ if ($ibx->{indexlevel} =~ $INDEXLEVELS) {
$indexlevel = $ibx->{indexlevel};
} else {
die("Invalid indexlevel $ibx->{indexlevel}\n");
$self->{-set_skip_docdata_once} = 1;
$self->{-skip_docdata} = 1;
}
- $ibx->umask_prepare;
if ($version == 1) {
$self->{lock_path} = "$inboxdir/ssoma.lock";
my $dir = $self->xdir;
}
}
return unless defined $flag;
- $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync};
+ $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
croak "Failed opening $dir: $@" if $@;
$self->{xdb} = $xdb;
$self->{term_generator} //= do {
my $tg = $X->{TermGenerator}->new;
- $tg->set_stemmer($self->stemmer);
+ $tg->set_stemmer(PublicInbox::Search::stemmer($self));
$tg;
}
}
}
}
+sub index_list_id ($$$) {
+ my ($self, $doc, $hdr) = @_;
+ for my $l ($hdr->header_raw('List-Id')) {
+ $l =~ /<([^>]+)>/ or next;
+ my $lid = lc $1;
+ $doc->add_boolean_term('G' . $lid);
+ index_text($self, $lid, 1, 'XL'); # probabilistic
+ }
+}
+
sub index_ids ($$$$) {
my ($self, $doc, $hdr, $mids) = @_;
for my $mid (@$mids) {
}
}
$doc->add_boolean_term('Q' . $_) for @$mids;
- for my $l ($hdr->header_raw('List-Id')) {
- $l =~ /<([^>]+)>/ or next;
- my $lid = lc $1;
- $doc->add_boolean_term('G' . $lid);
- index_text($self, $lid, 1, 'XL'); # probabilistic
- }
+ index_list_id($self, $doc, $hdr);
}
-sub add_xapian ($$$$) {
+sub eml2doc ($$$;$) {
my ($self, $eml, $smsg, $mids) = @_;
+ $mids //= mids_for_index($eml);
my $doc = $X->{Document}->new;
add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
my @ds = gmtime($smsg->{ds});
$tg->set_document($doc);
index_headers($self, $smsg);
+ if (defined(my $eidx_key = $smsg->{eidx_key})) {
+ $doc->add_boolean_term('O'.$eidx_key);
+ }
msg_iter($eml, \&index_xapian, [ $self, $doc ]);
index_ids($self, $doc, $eml, $mids);
}
}
}
+ $doc;
+}
+
+sub add_xapian ($$$$) {
+ my ($self, $eml, $smsg, $mids) = @_;
+ my $doc = eml2doc($self, $eml, $smsg, $mids);
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
$smsg->{num};
}
+sub _get_doc ($$) {
+ my ($self, $docid) = @_;
+ my $doc = eval { $self->{xdb}->get_document($docid) };
+ $doc // do {
+ warn "E: $@\n" if $@;
+ warn "E: #$docid missing in Xapian\n";
+ undef;
+ }
+}
+
+sub add_eidx_info {
+ my ($self, $docid, $eidx_key, $eml) = @_;
+ begin_txn_lazy($self);
+ my $doc = _get_doc($self, $docid) or return;
+ term_generator($self)->set_document($doc);
+ $doc->add_boolean_term('O'.$eidx_key);
+ index_list_id($self, $doc, $eml);
+ $self->{xdb}->replace_document($docid, $doc);
+}
+
+sub remove_eidx_info {
+ my ($self, $docid, $eidx_key, $eml) = @_;
+ begin_txn_lazy($self);
+ my $doc = _get_doc($self, $docid) or return;
+ eval { $doc->remove_term('O'.$eidx_key) };
+ warn "W: ->remove_term O$eidx_key: $@\n" if $@;
+ for my $l ($eml ? $eml->header_raw('List-Id') : ()) {
+ $l =~ /<([^>]+)>/ or next;
+ my $lid = lc $1;
+ eval { $doc->remove_term('G' . $lid) };
+ warn "W: ->remove_term G$lid: $@\n" if $@;
+
+ # nb: we don't remove the XL probabilistic terms
+ # since terms may overlap if cross-posted.
+ #
+ # IOW, a message which has both <foo.example.com>
+ # and <bar.example.com> would have overlapping
+ # "XLexample" and "XLcom" as terms, and we
+ # wouldn't know if they're safe to remove if we just
+ # unindex <foo.example.com> while preserving
+ # <bar.example.com>.
+ #
+ # In any case, this entire sub will likely never
+ # be needed and users using the "l:" prefix are probably
+ # rarer.
+ }
+ $self->{xdb}->replace_document($docid, $doc);
+}
+
+sub smsg_from_doc ($) {
+ my ($doc) = @_;
+ my $data = $doc->get_data or return;
+ my $smsg = bless {}, 'PublicInbox::Smsg';
+ $smsg->{ts} = int_val($doc, PublicInbox::Search::TS());
+ my $dt = int_val($doc, PublicInbox::Search::DT());
+ my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt);
+ $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy);
+ $smsg->load_from_data($data);
+ $smsg;
+}
+
sub xdb_remove {
- my ($self, $oid, @removed) = @_;
+ my ($self, @docids) = @_;
my $xdb = $self->{xdb} or return;
- for my $num (@removed) {
- my $doc = eval { $xdb->get_document($num) };
- unless ($doc) {
- warn "E: $@\n" if $@;
- warn "E: #$num $oid missing in Xapian\n";
- next;
- }
- my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->load_expand($doc);
- my $blob = $smsg->{blob} // '(unset)';
- if ($blob eq $oid) {
- $xdb->delete_document($num);
- } else {
- warn "E: #$num $oid != $blob in Xapian\n";
- }
+ for my $docid (@docids) {
+ eval { $xdb->delete_document($docid) };
+ warn "E: #$docid not in in Xapian? $@\n" if $@;
}
}
-sub remove_by_oid {
- my ($self, $oid, $num) = @_;
- die "BUG: remove_by_oid is v2-only\n" if $self->{oidx};
+sub remove_by_docid {
+ my ($self, $num) = @_;
+ die "BUG: remove_by_docid is v2-only\n" if $self->{oidx};
$self->begin_txn_lazy;
- xdb_remove($self, $oid, $num) if need_xapian($self);
+ xdb_remove($self, $num) if need_xapian($self);
}
sub index_git_blob_id {
$tmp{$_}++ for @removed;
}
if (!$nr) {
- $mids = join('> <', @$mids);
- warn "W: <$mids> missing for removal from overview\n";
+ my $m = join('> <', @$mids);
+ warn "W: <$m> missing for removal from overview\n";
}
while (my ($num, $nr) = each %tmp) {
warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1;
} else { # just in case msgmap and over.sqlite3 become desynched:
$self->{mm}->mid_delete($mids->[0]);
}
- xdb_remove($self, $oid, keys %tmp) if need_xapian($self);
+ xdb_remove($self, keys %tmp) if need_xapian($self);
}
sub index_mm {
}
}
+# Tell a cat_async callback whether the object it was handed must be
+# skipped.  Returns 1 for non-blobs (after a warning) and for
+# zero-sized (purged) blobs, 0 otherwise.  Croaks if git returned a
+# different OID than the one requested.
+sub is_bad_blob ($$$$) {
+	my ($oid, $type, $size, $expect_oid) = @_;
+	unless ($type eq 'blob') {
+		carp "W: $expect_oid is not a blob (type=$type)";
+		return 1;
+	}
+	$oid eq $expect_oid or croak "BUG: $oid != $expect_oid";
+	return 1 if $size == 0; # size == 0 means purged
+	0;
+}
+
sub index_both { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
+ return if is_bad_blob($oid, $type, $size, $sync->{oid});
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
$size += crlf_adjust($$bref);
my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
my $self = $sync->{sidx};
+ local $self->{current_info} = "$self->{current_info}: $oid";
my $eml = PublicInbox::Eml->new($bref);
$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
die "E: could not generate NNTP article number for $oid";
add_message($self, $eml, $smsg, $sync);
+ ++$self->{nidx};
+ my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing';
+ ${$sync->{latest_cmt}} = $cur_cmt;
}
sub unindex_both { # git->cat_async callback
- my ($bref, $oid, $type, $size, $self) = @_;
+ my ($bref, $oid, $type, $size, $sync) = @_;
+ return if is_bad_blob($oid, $type, $size, $sync->{oid});
+ my $self = $sync->{sidx};
+ local $self->{current_info} = "$self->{current_info}: $oid";
unindex_eml($self, $oid, PublicInbox::Eml->new($bref));
+ # may be undef if leftover
+ if (defined(my $cur_cmt = $sync->{cur_cmt})) {
+ ${$sync->{latest_cmt}} = $cur_cmt;
+ }
+ ++$self->{nidx};
+}
+
+sub with_umask { # forwards all remaining args to the owner's ->with_umask
+ my $self = shift;
+ ($self->{ibx} // $self->{eidx})->with_umask(@_); # {ibx} when set, else {eidx}
}
# called by public-inbox-index
sub index_sync {
my ($self, $opt) = @_;
delete $self->{lock_path} if $opt->{-skip_lock};
- $self->{ibx}->with_umask(\&_index_sync, $self, $opt);
- if ($opt->{reindex}) {
+ $self->with_umask(\&_index_sync, $self, $opt);
+ if ($opt->{reindex} && !$opt->{quit}) {
my %again = %$opt;
delete @again{qw(rethread reindex)};
index_sync($self, \%again);
+ $opt->{quit} = $again{quit}; # propagate to caller
}
}
sub v1_checkpoint ($$;$) {
my ($self, $sync, $stk) = @_;
- $self->{ibx}->git->check_async_wait;
- $self->{ibx}->git->cat_async_wait;
+ $self->{ibx}->git->async_wait_all;
- # latest_cmt may be undef
- my $newest = $stk ? $stk->{latest_cmt} : undef;
- if ($newest) {
+ # $newest may be undef
+ my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}};
+ if (defined($newest)) {
my $cur = $self->{mm}->last_commit || '';
if (need_update($self, $cur, $newest)) {
$self->{mm}->last_commit($newest);
}
- } else {
- ${$sync->{max}} = $self->{batch_bytes};
}
+ ${$sync->{max}} = $self->{batch_bytes};
$self->{mm}->{dbh}->commit;
- if ($newest && need_xapian($self)) {
- my $xdb = $self->{xdb};
+ my $xdb = need_xapian($self) ? $self->{xdb} : undef;
+ if ($newest && $xdb) {
my $cur = $xdb->get_metadata('last_commit');
if (need_update($self, $cur, $newest)) {
$xdb->set_metadata('last_commit', $newest);
}
-
+ }
+ if ($stk) { # all done if $stk is passed
# let SearchView know a full --reindex was done so it can
# generate ->has_threadid-dependent links
- if ($sync->{reindex} && !ref($sync->{reindex})) {
+ if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) {
my $n = $xdb->get_metadata('has_threadid');
$xdb->set_metadata('has_threadid', '1') if $n ne '1';
}
+ $self->{oidx}->rethread_done($sync->{-opt}); # all done
}
-
- $self->{oidx}->rethread_done($sync->{-opt}) if $newest; # all done
commit_txn_lazy($self);
- $self->{ibx}->git->cleanup;
+ $sync->{ibx}->git->cleanup;
my $nr = ${$sync->{nr}};
idx_release($self, $nr);
# let another process do some work...
if (my $pr = $sync->{-opt}->{-progress}) {
$pr->("indexed $nr/$sync->{ntodo}\n") if $nr;
}
- if (!$stk) { # more to come
+ if (!$stk && !$sync->{quit}) { # more to come
begin_txn_lazy($self);
$self->{mm}->{dbh}->begin_work;
}
# only for v1
sub process_stack {
my ($self, $sync, $stk) = @_;
- my $git = $self->{ibx}->git;
+ my $git = $sync->{ibx}->git;
my $max = $self->{batch_bytes};
my $nr = 0;
$sync->{nr} = \$nr;
$sync->{max} = \$max;
$sync->{sidx} = $self;
+ $sync->{latest_cmt} = \(my $latest_cmt);
$self->{mm}->{dbh}->begin_work;
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
for my $oid (@leftovers) {
+ last if $sync->{quit};
$oid = unpack('H*', $oid);
- $git->cat_async($oid, \&unindex_both, $self);
+ $git->cat_async($oid, \&unindex_both, $sync);
}
}
if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
$sync->{index_oid} = \&index_both;
}
- while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
+ while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) {
+ my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid };
+ last if $sync->{quit};
if ($f eq 'm') {
- my $arg = { %$sync, autime => $at, cotime => $ct };
+ $arg->{autime} = $at;
+ $arg->{cotime} = $ct;
if ($sync->{max_size}) {
$git->check_async($oid, \&check_size, $arg);
} else {
}
v1_checkpoint($self, $sync) if $max <= 0;
} elsif ($f eq 'd') {
- $git->cat_async($oid, \&unindex_both, $self);
+ $git->cat_async($oid, \&unindex_both, $arg);
}
}
- v1_checkpoint($self, $sync, $stk);
+ v1_checkpoint($self, $sync, $sync->{quit} ? undef : $stk);
}
-sub log2stack ($$$$) {
- my ($sync, $git, $range, $ibx) = @_;
+sub log2stack ($$$) {
+ my ($sync, $git, $range) = @_;
my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise)
my ($add, $del);
- if ($ibx->version == 1) {
+ if ($sync->{ibx}->version == 1) {
my $path = $hex.'{2}/'.$hex.'{38}';
$add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!;
$del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!;
my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
--no-notes --no-color --no-renames --no-abbrev),
$range);
- my ($at, $ct, $stk);
+ my ($at, $ct, $stk, $cmt);
while (<$fh>) {
+ return if $sync->{quit};
if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
- ($at, $ct) = ($1 + 0, $2 + 0);
- $stk //= PublicInbox::IdxStack->new($3);
+ ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3);
+ $stk //= PublicInbox::IdxStack->new($cmt);
} elsif (/$del/) {
my $oid = $1;
if ($D) { # reindex case
$D->{pack('H*', $oid)}++;
} else { # non-reindex case:
- $stk->push_rec('d', $at, $ct, $oid);
+ $stk->push_rec('d', $at, $ct, $oid, $cmt);
}
} elsif (/$add/) {
my $oid = $1;
my $oid_bin = pack('H*', $oid);
my $nr = --$D->{$oid_bin};
delete($D->{$oid_bin}) if $nr <= 0;
-
# nr < 0 (-1) means it never existed
- $stk->push_rec('m', $at, $ct, $oid) if $nr < 0;
- } else {
- $stk->push_rec('m', $at, $ct, $oid);
+ next if $nr >= 0;
}
+ $stk->push_rec('m', $at, $ct, $oid, $cmt);
}
}
close $fh or die "git log failed: \$?=$?";
$stk->read_prepare;
}
-sub prepare_stack ($$$) {
- my ($self, $sync, $range) = @_;
- my $git = $self->{ibx}->git;
+sub prepare_stack ($$) {
+ my ($sync, $range) = @_;
+ my $git = $sync->{ibx}->git;
if (index($range, '..') < 0) {
# don't show annoying git errors to users who run -index
return PublicInbox::IdxStack->new->read_prepare if $?;
}
$sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
- log2stack($sync, $git, $range, $self->{ibx});
+ log2stack($sync, $git, $range);
}
# --is-ancestor requires git 1.8.0+
ref($reindex) eq 'HASH' ? $reindex->{from} : '';
}
+# Returns a closure suitable for %SIG which flags $sync (and the
+# options hash it carries) so indexing loops can stop gracefully.
+sub quit_cb ($) {
+	my ($sync) = @_;
+	return sub {
+		# {-opt}->{quit} is flagged as well, that lets ->index_sync
+		# callers break out of multi-inbox loops
+		$sync->{-opt}->{quit} = 1;
+		$sync->{quit} = 1;
+		warn "gracefully quitting\n";
+	};
+}
+
# indexes all unindexed messages (v1 only)
sub _index_sync {
my ($self, $opt) = @_;
my $tip = $opt->{ref} || 'HEAD';
- my $git = $self->{ibx}->git;
+ my $ibx = $self->{ibx};
+ local $self->{current_info} = "$ibx->{inboxdir}";
$self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES;
- $git->batch_prepare;
+ $ibx->git->batch_prepare;
my $pr = $opt->{-progress};
- my $sync = { reindex => $opt->{reindex}, -opt => $opt };
+ my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx };
+ my $quit = quit_cb($sync);
+ local $SIG{QUIT} = $quit;
+ local $SIG{INT} = $quit;
+ local $SIG{TERM} = $quit;
my $xdb = $self->begin_txn_lazy;
$self->{oidx}->rethread_prepare($opt);
my $mm = _msgmap_init($self);
my $lx = reindex_from($sync->{reindex}, $last_commit);
my $range = $lx eq '' ? $tip : "$lx..$tip";
$pr->("counting changes\n\t$range ... ") if $pr;
- my $stk = prepare_stack($self, $sync, $range);
+ my $stk = prepare_stack($sync, $range);
$sync->{ntodo} = $stk ? $stk->num_records : 0;
$pr->("$sync->{ntodo}\n") if $pr; # continue previous line
- process_stack($self, $sync, $stk);
+ process_stack($self, $sync, $stk) if !$sync->{quit};
}
sub DESTROY {
sub begin_txn_lazy {
my ($self) = @_;
- $self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn};
+ $self->with_umask(\&_begin_txn, $self) if !$self->{txn};
}
# store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard)
sub _commit_txn {
my ($self) = @_;
+ if (my $eidx = $self->{eidx}) {
+ $eidx->git->async_wait_all;
+ $eidx->{transact_bytes} = 0;
+ }
if (my $xdb = $self->{xdb}) {
set_metadata_once($self);
$xdb->commit_transaction;
sub commit_txn_lazy {
my ($self) = @_;
delete($self->{txn}) and
- $self->{ibx}->with_umask(\&_commit_txn, $self);
+ $self->with_umask(\&_commit_txn, $self);
}
sub worker_done {
die "$$ $0 still in transaction\n" if $self->{txn};
}
+# Alternate constructor for a shard owned by an $eidx object instead
+# of an inbox: copies {xpfx} and {indexlevel} from $eidx, always sets
+# {creat} and {-skip_docdata}, and sets {-set_indexlevel_once} only
+# for the "medium" level.
+sub eidx_shard_new {
+	my ($class, $eidx, $shard) = @_;
+	my $level = $eidx->{indexlevel};
+	my %fields = (
+		eidx => $eidx,
+		xpfx => $eidx->{xpfx},
+		indexlevel => $level,
+		-skip_docdata => 1,
+		shard => $shard,
+		creat => 1,
+	);
+	$fields{-set_indexlevel_once} = 1 if $level eq 'medium';
+	bless \%fields, $class;
+}
+
+# ensure there are no stale Xapian docs by treating $over as canonical:
+# every docid found in this shard but absent from the `over' table is deleted
+sub over_check {
+ my ($self, $over) = @_;
+ begin_txn_lazy($self);
+ my $sth = $over->dbh->prepare(<<''); # heredoc ends at the next empty line
+SELECT COUNT(*) FROM over WHERE num = ?
+
+ my $xdb = $self->{xdb};
+ my $cur = $xdb->postlist_begin(''); # empty term: presumably walks every doc, see Xapian docs
+ my $end = $xdb->postlist_end('');
+ my $xdir = $self->xdir; # only used for the informational warning below
+ for (; $cur != $end; $cur++) {
+ my $docid = $cur->get_docid;
+ $sth->execute($docid); # the docid is looked up as over.num
+ my $x = $sth->fetchrow_array; # the COUNT(*) result
+ next if $x > 0; # still known to over, keep it
+ warn "I: removing $xdir #$docid, not in `over'\n";
+ $xdb->delete_document($docid);
+ }
+}
+
1;
use strict;
use v5.10.1;
use parent qw(PublicInbox::SearchIdx);
+use bytes qw(length);
use IO::Handle (); # autoflush
use PublicInbox::Eml;
+use PublicInbox::Sigfd;
sub new {
- my ($class, $v2w, $shard) = @_;
+ my ($class, $v2w, $shard) = @_; # v2w may be ExtSearchIdx
my $ibx = $v2w->{ibx};
- my $self = $class->SUPER::new($ibx, 1, $shard);
+ my $self = $ibx ? $class->SUPER::new($ibx, 1, $shard)
+ : $class->eidx_shard_new($v2w, $shard);
# create the DB before forking:
$self->idx_acquire;
$self->set_metadata_once;
my ($r, $w);
pipe($r, $w) or die "pipe failed: $!\n";
$w->autoflush(1);
+ my $oldset = PublicInbox::Sigfd::block_signals();
my $pid = fork;
defined $pid or die "fork failed: $!\n";
if ($pid == 0) {
+ # these signals are localized in parent
+ $SIG{$_} = 'IGNORE' for (qw(TERM INT QUIT));
+ PublicInbox::Sigfd::sig_setmask($oldset);
my $bnote = $v2w->atfork_child;
close $w or die "failed to close: $!";
die "unexpected MM $self->{mm}" if $self->{mm};
exit;
}
+ PublicInbox::Sigfd::sig_setmask($oldset);
$self->{pid} = $pid;
$self->{w} = $w;
close $r or die "failed to close: $!";
}
+# Read exactly $len bytes of message data from the handle $r and
+# parse them into a PublicInbox::Eml object.
+# Returns nothing when $len is 0 (the command carried no payload).
+# Dies with "read: ..." on an I/O error, and with "short read: ..."
+# when fewer than $len bytes are available (including EOF).
+sub eml ($$) {
+	my ($r, $len) = @_;
+	return if $len == 0;
+	my $n = read($r, my $bref, $len);
+	# read() yields undef on error but 0 at EOF; $! is only
+	# meaningful in the former case, so report EOF as a short read
+	defined($n) or die "read: $!\n";
+	$n == $len or die "short read: $n != $len\n";
+	PublicInbox::Eml->new(\$bref);
+}
+
# this reads all the writes to $self->{w} from the parent process
sub shard_worker_loop ($$$$$) {
my ($self, $v2w, $r, $shard, $bnote) = @_;
- $0 = "pi-v2-shard[$shard]";
+ $0 = "shard[$shard]";
$self->begin_txn_lazy;
while (my $line = readline($r)) {
+ chomp $line;
$v2w->{current_info} = "[$shard] $line";
- if ($line eq "commit\n") {
+ if ($line eq 'commit') {
$self->commit_txn_lazy;
- } elsif ($line eq "close\n") {
+ } elsif ($line eq 'close') {
$self->idx_release;
- } elsif ($line eq "barrier\n") {
+ } elsif ($line eq 'barrier') {
$self->commit_txn_lazy;
# no need to lock < 512 bytes is atomic under POSIX
print $bnote "barrier $shard\n" or
die "write failed for barrier $!\n";
- } elsif ($line =~ /\AD ([a-f0-9]{40,}) ([0-9]+)\n\z/s) {
- $self->remove_by_oid($1, $2 + 0);
+ } elsif ($line =~ /\AD ([0-9]+)\z/s) {
+ $self->remove_by_docid($1 + 0);
+ } elsif ($line =~ s/\A\+X //) {
+ my ($len, $docid, $eidx_key) = split(/ /, $line, 3);
+ $self->add_eidx_info($docid, $eidx_key, eml($r, $len));
+ } elsif ($line =~ s/\A-X //) {
+ my ($len, $docid, $eidx_key) = split(/ /, $line, 3);
+ $self->remove_eidx_info($docid, $eidx_key,
+ eml($r, $len));
+ } elsif ($line =~ s/\AO ([^\n]+)//) {
+ my $over_fn = $1;
+ $over_fn =~ tr/\0/\n/;
+ $self->over_check(PublicInbox::Over->new($over_fn));
} else {
- chomp $line;
+ my $eidx_key;
+ if ($line =~ s/\AX=(.+)\0//) {
+ $eidx_key = $1;
+ $v2w->{current_info} =~ s/\0/\\0 /;
+ }
# n.b. $mid may contain spaces(!)
- my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid)
+ my ($len, $bytes, $num, $oid, $ds, $ts, $tid, $mid)
= split(/ /, $line, 8);
$self->begin_txn_lazy;
- my $n = read($r, my $msg, $to_read) or die "read: $!\n";
- $n == $to_read or die "short read: $n != $to_read\n";
- my $mime = PublicInbox::Eml->new(\$msg);
my $smsg = bless {
bytes => $bytes,
num => $num + 0,
- blob => $blob,
+ blob => $oid,
mid => $mid,
tid => $tid,
ds => $ds,
ts => $ts,
}, 'PublicInbox::Smsg';
- $self->add_message($mime, $smsg);
+ $smsg->{eidx_key} = $eidx_key if defined($eidx_key);
+ $self->add_message(eml($r, $len), $smsg);
}
}
$self->worker_done;
}
sub index_raw {
- my ($self, $msgref, $eml, $smsg) = @_;
+ my ($self, $msgref, $eml, $smsg, $eidx_key) = @_;
if (my $w = $self->{w}) {
+ my @ekey = defined($eidx_key) ? ("X=$eidx_key\0") : ();
+ $msgref //= \($eml->as_string);
+ $smsg->{raw_bytes} //= length($$msgref);
# mid must be last, it can contain spaces (but not LF)
- print $w join(' ', @$smsg{qw(raw_bytes bytes
+ print $w @ekey, join(' ', @$smsg{qw(raw_bytes bytes
num blob ds ts tid mid)}),
"\n", $$msgref or die "failed to write shard $!\n";
} else {
if ($eml) {
- undef $$msgref;
+ undef($$msgref) if $msgref;
} else { # --xapian-only + --sequential-shard:
$eml = PublicInbox::Eml->new($msgref);
}
$self->begin_txn_lazy;
+ $smsg->{eidx_key} = $eidx_key if defined $eidx_key;
$self->add_message($eml, $smsg);
}
}
+# Associate $eidx_key with $docid in this shard.  With a worker pipe
+# ({w}) the request is serialized as a "+X" command line followed by
+# the message header; without one, add_eidx_info runs in-process.
+sub shard_add_eidx_info {
+	my ($self, $docid, $eidx_key, $eml) = @_;
+	my $w = $self->{w};
+	return $self->add_eidx_info($docid, $eidx_key, $eml) unless $w;
+
+	# only the header is sent, its length prefixes the command
+	my $hdr = $eml->header_obj->as_string;
+	my $cmd = join(' ', '+X', length($hdr), $docid, $eidx_key) . "\n";
+	print $w $cmd, $hdr or die "failed to write shard: $!";
+}
+
+# Drop the association of $eidx_key with $docid in this shard.  With
+# a worker pipe ({w}) the request is serialized as a "-X" command
+# line followed by the message header (empty when no $eml is given);
+# without one, remove_eidx_info runs in-process.
+sub shard_remove_eidx_info {
+	my ($self, $docid, $eidx_key, $eml) = @_;
+	my $w = $self->{w};
+	return $self->remove_eidx_info($docid, $eidx_key, $eml) unless $w;
+
+	my $hdr = '';
+	$hdr = $eml->header_obj->as_string if $eml;
+	my $cmd = join(' ', '-X', length($hdr), $docid, $eidx_key) . "\n";
+	print $w $cmd, $hdr or die "failed to write shard: $!";
+}
+
sub atfork_child {
close $_[0]->{w} or die "failed to close write pipe: $!\n";
}
}
sub shard_remove {
- my ($self, $oid, $num) = @_;
- if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child
- print $w "D $oid $num\n" or die "failed to write remove $!";
+ my ($self, $num) = @_;
+ if (my $w = $self->{w}) { # triggers remove_by_docid in a shard child
+ print $w "D $num\n" or die "failed to write remove $!";
} else { # same process
- $self->remove_by_oid($oid, $num);
+ $self->remove_by_docid($num);
+ }
+}
+
+sub shard_over_check {
+ my ($self, $over) = @_;
+ if (my $w = $self->{w}) { # triggers over_check in a shard child
+ my ($over_fn) = $over->{dbh}->sqlite_db_filename;
+ $over_fn =~ tr/\n/\0/; # the command is LF-terminated; the reader maps \0 back to LF
+ print $w "O $over_fn\n" or die "failed to write over $!";
+ } else {
+ $self->over_check($over);
}
}
# We'll trust the client Date: header here instead of the Received:
# time since this is for display (and not retrieval)
_set_parent(\%id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $rootset = [ grep {
!delete($_->{parent}) && $_->visible($ibx)
} values %id_table ];
my %seen = ($cur => 1); # self-referential loop prevention
my @q = ($cur);
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
while (defined($cur = shift @q)) {
my $c = $cur->{children}; # The hashref here...
use PublicInbox::WwwStream qw(html_oneshot);
use PublicInbox::SearchThread;
use PublicInbox::SearchQuery;
-use PublicInbox::Search qw(mdocid);
+use PublicInbox::Search;
my %rmap_inc;
sub mbox_results {
sub sres_top_html {
my ($ctx) = @_;
- my $srch = $ctx->{-inbox}->search or
+ my $srch = $ctx->{ibx}->isrch or
return PublicInbox::WWW::need($ctx, 'Search');
my $q = PublicInbox::SearchQuery->new($ctx->{qp});
my $x = $q->{x};
my $pad = length("$total");
my $pfx = ' ' x $pad;
my $res = \($ctx->{-html_tip});
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
- my @nums = @{$ibx->search->mset_to_artnums($mset)};
+ my @nums = @{$ibx->isrch->mset_to_artnums($mset)};
my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)};
my ($min, $max);
sub err_txt {
my ($ctx, $err) = @_;
- my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/';
+ my $u = $ctx->{ibx}->base_url($ctx->{env}) . '_/text/help/';
$err =~ s/^\s*Exception:\s*//; # bad word to show users :P
$err =~ s!(\S+)!path2inc($1)!sge;
$err = ascii_html($err);
}
my $A = $q->qs_html(x => 'A', r => undef);
$rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
- if ($ctx->{-inbox}->search->has_threadid) {
+ if ($ctx->{ibx}->isrch->has_threadid) {
$rv .= qq{\n\t\t\tdownload mbox.gz: } .
# we set name=z w/o using it since it seems required for
# lynx (but works fine for w3m).
sub mset_thread {
my ($ctx, $mset, $q) = @_;
- my $ibx = $ctx->{-inbox};
- my $nshard = $ibx->search->{nshard} // 1;
- my %pct = map { mdocid($nshard, $_) => get_pct($_) } $mset->items;
- my $msgs = $ibx->over->get_all(keys %pct);
- $_->{pct} = $pct{$_->{num}} for @$msgs;
+ my $ibx = $ctx->{ibx};
+ my @pct = map { get_pct($_) } $mset->items;
+ my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset);
+ my $i = 0;
+ $_->{pct} = $pct[$i++] for @$msgs;
my $r = $q->{r};
if ($r) { # for descriptions in search_nav_bot
- my @pct = values %pct;
$q->{-min_pct} = min(@pct);
$q->{-max_pct} = max(@pct);
}
sub adump {
my ($cb, $mset, $q, $ctx) = @_;
- $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset);
+ $ctx->{ids} = $ctx->{ibx}->isrch->mset_to_artnums($mset);
$ctx->{search_query} = $q; # used by WwwAtomStream::atom_header
PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i);
}
sub adump_i {
my ($ctx) = @_;
while (my $num = shift @{$ctx->{ids}}) {
- my $smsg = eval { $ctx->{-inbox}->over->get_art($num) } or next;
+ my $smsg = eval { $ctx->{ibx}->over->get_art($num) } or next;
return $smsg;
}
}
use PublicInbox::MID qw(mids);
use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-use Time::Local qw(timegm);
-
-sub get_val ($$) {
- my ($doc, $col) = @_;
- # sortable_unserialise is defined by PublicInbox::Search::load_xapian()
- sortable_unserialise($doc->get_value($col));
-}
sub to_doc_data {
my ($self) = @_;
) = split(/\n/, $_[1]);
}
-sub load_expand {
- my ($self, $doc) = @_;
- my $data = $doc->get_data or return;
- $self->{ts} = get_val($doc, PublicInbox::Search::TS());
- my $dt = get_val($doc, PublicInbox::Search::DT());
- my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt);
- $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy);
- load_from_data($self, $data);
- $self;
-}
-
sub psgi_cull ($) {
my ($self) = @_;
sub find_smsgs ($$$) {
my ($self, $ibx, $want) = @_;
- my $srch = $ibx->search or return;
+ my $srch = $ibx->isrch or return;
my $post = $want->{oid_b} or die 'BUG: no {oid_b}';
$post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post";
use warnings;
sub get {
- my ($config, $key, $default) = @_;
- my $spamcheck = $config->{$key};
+ my ($cfg, $key, $default) = @_;
+ my $spamcheck = $cfg->{$key};
$spamcheck = $default unless $spamcheck;
return if !$spamcheck || $spamcheck eq 'none';
our $epoll_wait_events;
our $epoll_wait_size = 0;
sub epoll_wait_mod4 {
- # resize our static buffer if requested size is bigger than we've ever done
- if ($_[1] > $epoll_wait_size) {
- $epoll_wait_size = $_[1];
- $epoll_wait_events = "\0" x 12 x $epoll_wait_size;
- }
- my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0);
- for (0..$ct-1) {
- @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 12*$_, 8));
- }
- return $ct;
+ my ($epfd, $maxevents, $timeout_msec, $events) = @_;
+ # resize our static buffer if maxevents bigger than we've ever done
+ if ($maxevents > $epoll_wait_size) {
+ $epoll_wait_size = $maxevents;
+ vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0;
+ }
+ @$events = ();
+ my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events,
+ $maxevents, $timeout_msec);
+ for (0..$ct - 1) {
+ # 12-byte struct epoll_event
+ # 4 bytes uint32_t events mask (skipped, useless to us)
+ # 8 bytes: epoll_data_t union (first 4 bytes are the fd)
+ # So we skip the first 4 bytes and take the middle 4:
+ $events->[$_] = unpack('L', substr($epoll_wait_events,
+ 12 * $_ + 4, 4));
+ }
}
sub epoll_wait_mod8 {
- # resize our static buffer if requested size is bigger than we've ever done
- if ($_[1] > $epoll_wait_size) {
- $epoll_wait_size = $_[1];
- $epoll_wait_events = "\0" x 16 x $epoll_wait_size;
- }
- my $ct;
- if ($no_deprecated) {
- $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef);
- } else {
- $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0);
- }
- for (0..$ct-1) {
- # 16 byte epoll_event structs, with format:
- # 4 byte mask [idx 1]
- # 4 byte padding (we put it into idx 2, useless)
- # 8 byte data (first 4 bytes are fd, into idx 0)
- @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12));
- }
- return $ct;
+ my ($epfd, $maxevents, $timeout_msec, $events) = @_;
+
+ # resize our static buffer if maxevents bigger than we've ever done
+ if ($maxevents > $epoll_wait_size) {
+ $epoll_wait_size = $maxevents;
+ vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0;
+ }
+ @$events = ();
+ my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events,
+ $maxevents, $timeout_msec,
+ $no_deprecated ? undef : ());
+ for (0..$ct - 1) {
+ # 16-byte struct epoll_event
+ # 4 bytes uint32_t events mask (skipped, useless to us)
+ # 4 bytes padding (skipped, useless)
+ # 8 bytes epoll_data_t union (first 4 bytes are the fd)
+ # So skip the first 8 bytes, take 4, and ignore the last 4:
+ $events->[$_] = unpack('L', substr($epoll_wait_events,
+ 16 * $_ + 8, 4));
+ }
}
sub signalfd ($$$) {
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
package PublicInbox::Tmpfile;
use strict;
-use warnings;
-use base qw(Exporter);
+use v5.10.1;
+use parent qw(Exporter);
our @EXPORT = qw(tmpfile);
use Fcntl qw(:DEFAULT);
use Errno qw(EEXIST);
# unlinked filename which makes sense when viewed with lsof
# (at least on Linux)
# And if we ever stop caring to have debuggable filenames, O_TMPFILE :)
+#
+# This is also for Perl <5.32 which lacks: open(..., '+>>', undef)
+# <https://rt.perl.org/Ticket/Display.html?id=134221>
sub tmpfile ($;$$) {
my ($id, $sock, $append) = @_;
if (defined $sock) {
use Crypt::CBC;
use Plack::Util;
use MIME::Base64 qw(decode_base64url);
-my $CODE_URL = 'https://public-inbox.org/public-inbox.git';
+my @CODE_URL = qw(http://ou63pmih66umazou.onion/public-inbox.git
+ https://public-inbox.org/public-inbox.git);
my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8');
sub new {
my $unsubscribe = $opt{unsubscribe} or
die "`unsubscribe' callback not given\n";
+ my $code_url = $opt{code_url} || \@CODE_URL;
+ $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY';
bless {
- pi_config => $opt{pi_config}, # PublicInbox::Config
+ pi_cfg => $opt{pi_config}, # PublicInbox::Config
owner_email => $opt{owner_email},
cipher => $cipher,
unsubscribe => $unsubscribe,
contact => qq(<a\nhref="mailto:$e">$e</a>),
- code_url => $opt{code_url} || $CODE_URL,
+ code_url => $code_url,
confirm => $opt{confirm},
}, $class;
}
"<html><head><title>$title</title></head><body><pre>".
join("\n", "<b>$title</b>\n", @body) . '</pre><hr>'.
"<pre>This page is available under AGPL-3.0+\n" .
- "git clone $self->{code_url}\n" .
+ join('', map { "git clone $_\n" } @{$self->{code_url}}) .
qq(Email $self->{contact} if you have any questions).
'</pre></body></html>'
] ];
my $archive_url = $self->{archive_urls}->{$list_addr};
unless ($archive_url) {
- if (my $config = $self->{pi_config}) {
+ if (my $cfg = $self->{pi_cfg}) {
# PublicInbox::Config::lookup
- my $ibx = $config->lookup($list_addr);
+ my $ibx = $cfg->lookup($list_addr);
# PublicInbox::Inbox::base_url
$archive_url = $ibx->base_url if $ibx;
}
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd);
-use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size);
+use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size
+ is_bad_blob);
use IO::Handle; # ->autoflush
use File::Temp ();
sub count_shards ($) {
my ($self) = @_;
- # always load existing shards in case core count changes:
- # Also, shard count may change while -watch is running
- my $srch = $self->{ibx}->search or return 0;
- delete $self->{ibx}->{search};
- $srch->{nshard} // 0
+ if (my $ibx = $self->{ibx}) {
+ # always load existing shards in case core count changes:
+ # Also, shard count may change while -watch is running
+ my $srch = $ibx->search or return 0;
+ delete $ibx->{search};
+ $srch->{nshard} // 0
+ } else { # ExtSearchIdx
+ $self->{nshard} // do {
+ if ($self->xdb_sharded) {
+ $self->{nshard} // die 'BUG: {nshard} unset';
+ } else {
+ 0;
+ }
+ }
+ }
}
sub new {
die "$dir does not exist\n";
}
}
- $v2ibx->umask_prepare;
-
my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION;
my $self = {
ibx => $v2ibx,
}
$self->idx_init;
$self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum;
- my $epoch_max = -1;
- git_dir_latest($self, \$epoch_max);
- if (defined $skip_epoch && $epoch_max == -1) {
- $epoch_max = $skip_epoch;
- }
- $self->git_init($epoch_max >= 0 ? $epoch_max : 0);
+ my $max = $self->{ibx}->max_git_epoch;
+ $max = $skip_epoch if (defined($skip_epoch) && !defined($max));
+ $self->git_init($max // 0);
$self->done;
}
$self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb);
}
+# Map an article number onto its shard object: shards are picked
+# round-robin by taking $num modulo the number of {idx_shards}.
+sub idx_shard ($$) {
+	my ($self, $num) = @_;
+	my $shards = $self->{idx_shards};
+	my $nshard = scalar(@$shards);
+	$shards->[$num % $nshard];
+}
+
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
$self->{oidx}->add_overview($mime, $smsg);
- my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
+ my $idx = idx_shard($self, $smsg->{num});
$idx->index_raw($msgref, $mime, $smsg);
my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
$n >= $self->{batch_bytes};
($num, $mid0);
}
-sub idx_shard {
- my ($self, $shard_i) = @_;
- $self->{idx_shards}->[$shard_i];
-}
-
sub _idx_init { # with_umask callback
my ($self, $opt) = @_;
$self->lock_acquire unless $opt && $opt->{-skip_lock};
$self->{shards} = $nshards if $nshards && $nshards != $self->{shards};
$self->{batch_bytes} = $opt->{batch_size} //
$PublicInbox::SearchIdx::BATCH_BYTES;
- $self->{batch_bytes} *= $self->{shards} if $self->{parallel};
# need to create all shards before initializing msgmap FD
# idx_shards must be visible to all forked processes
my $idx = $self->{idx_shards} = [];
push @$idx, PublicInbox::SearchIdxShard->new($self, $_) for (0..$max);
+ # SearchIdxShard may do their own flushing, so don't scale
+ # until after forking
+ $self->{batch_bytes} *= $self->{shards} if $self->{parallel};
+
+ my $ibx = $self->{ibx} or return; # ExtSearchIdx
+
# Now that all subprocesses are up, we can open the FDs
# for SQLite:
my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
- "$self->{ibx}->{inboxdir}/msgmap.sqlite3",
- $self->{ibx}->{-no_fsync} ? 2 : 1);
+ "$ibx->{inboxdir}/msgmap.sqlite3",
+ $ibx->{-no_fsync} ? 2 : 1);
$mm->{dbh}->begin_work;
}
+# Prepare $self for (non-)parallel indexing.  The "basic" level
+# forces {parallel} off; any other level (undef counts as "full")
+# gets a {bnote} pipe pair for barrier notifications.
+sub parallel_init ($$) {
+	my ($self, $indexlevel) = @_;
+	my $level = $indexlevel // 'full';
+	return($self->{parallel} = 0) if $level eq 'basic';
+
+	my ($r, $w);
+	pipe($r, $w) or die "pipe failed: $!";
+	# barrier notifications are tiny, so shrink the pipe where
+	# possible (1031: F_SETPIPE_SZ); failure is deliberately ignored
+	fcntl($w, 1031, 4096) if $^O eq 'linux';
+	$self->{bnote} = [ $r, $w ];
+	$w->autoflush(1);
+}
+
# idempotent
sub idx_init {
my ($self, $opt) = @_;
delete @$ibx{qw(mm search)};
$ibx->git->cleanup;
- $self->{parallel} = 0 if ($ibx->{indexlevel}//'') eq 'basic';
- if ($self->{parallel}) {
- pipe(my ($r, $w)) or die "pipe failed: $!";
- # pipe for barrier notifications doesn't need to be big,
- # 1031: F_SETPIPE_SZ
- fcntl($w, 1031, 4096) if $^O eq 'linux';
- $self->{bnote} = [ $r, $w ];
- $w->autoflush(1);
- }
-
- $ibx->umask_prepare;
+ parallel_init($self, $ibx->{indexlevel});
$ibx->with_umask(\&_idx_init, $self, $opt);
}
sub _replace_oids ($$$) {
my ($self, $mime, $replace_map) = @_;
$self->done;
- my $pfx = "$self->{ibx}->{inboxdir}/git";
+ my $ibx = $self->{ibx};
+ my $pfx = "$ibx->{inboxdir}/git";
my $rewrites = []; # epoch => commit
- my $max = $self->{epoch_max};
-
- unless (defined($max)) {
- defined(my $latest = git_dir_latest($self, \$max)) or return;
- $self->{epoch_max} = $max;
- }
+ my $max = $self->{epoch_max} //= $ibx->max_git_epoch // return;
foreach my $i (0..$max) {
my $git_dir = "$pfx/$i.git";
} else { # ->purge or ->remove
$self->{mm}->num_delete($num);
}
- unindex_oid_remote($self, $oid, $mid);
+ unindex_oid_aux($self, $oid, $mid);
}
}
my ($self, $raw) = @_;
# grab the expected OID we have to reindex:
pipe(my($in, $w)) or die "pipe: $!";
- my $git_dir = $self->{ibx}->git->{git_dir};
+ my $git_dir = $self->git->{git_dir};
my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)];
my $r = popen_rd($cmd, undef, { 0 => $in });
print $w $$raw or die "print \$w: $!";
}
# make sure we really got the OID:
- my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid);
+ my ($blob, $type, $bytes) = $self->git->check($expect_oid);
$blob eq $expect_oid or die "BUG: $expect_oid not found after replace";
# don't leak FDs to Xapian:
- $self->{ibx}->git->cleanup;
+ $self->git->cleanup;
# reindex modified messages:
for my $smsg (@$need_reindex) {
$self->{mm}->last_commit_xap($v, $i, $cmt);
}
-sub set_last_commits ($) {
+sub set_last_commits ($) { # this is NOT for ExtSearchIdx
my ($self) = @_;
defined(my $epoch_max = $self->{epoch_max}) or return;
my $last_commit = $self->{last_commit};
}
my $shards = $self->{idx_shards};
if ($shards) {
- my $dbh = $self->{mm}->{dbh};
+ my $mm = $self->{mm};
+ my $dbh = $mm->{dbh} if $mm;
# SQLite msgmap data is second in importance
- $dbh->commit;
+ $dbh->commit if $dbh;
# SQLite overview is third
$self->{oidx}->commit_lazy;
# Now deal with Xapian
if ($wait) {
- my $barrier = $self->barrier_init(scalar @$shards);
+ my $barrier = barrier_init($self, scalar @$shards);
# each shard needs to issue a barrier command
$_->shard_barrier for @$shards;
# wait for each Xapian shard
- $self->barrier_wait($barrier);
+ barrier_wait($self, $barrier);
} else {
$_->shard_commit for @$shards;
}
+ my $midx = $self->{midx}; # misc index
+ $midx->commit_txn if $midx;
+
# last_commit is special, don't commit these until
- # remote shards are done:
- $dbh->begin_work;
+ # Xapian shards are done:
+ $dbh->begin_work if $dbh;
set_last_commits($self);
- $dbh->commit;
-
- $dbh->begin_work;
+ if ($dbh) {
+ $dbh->commit;
+ $dbh->begin_work;
+ }
+ $midx->begin_txn if $midx;
}
$self->{total_bytes} += $self->{transact_bytes};
$self->{transact_bytes} = 0;
}
eval { $self->{oidx}->dbh_close };
$err .= "over close: $@\n" if $@;
+ delete $self->{midx};
delete $self->{bnote};
my $nbytes = $self->{total_bytes};
$self->{total_bytes} = 0;
$self->lock_release(!!$nbytes) if $shards;
- $self->{ibx}->git->cleanup;
+ $self->git->cleanup;
die $err if $err;
}
+sub write_alternates ($$$) { # write @$out to "$info_dir/alternates" via temp file + rename
+ my ($info_dir, $mode, $out) = @_;
+ my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); # same directory as the rename() target
+ my $tmp = $fh->filename;
+ print $fh @$out or die "print $tmp: $!\n";
+ chmod($mode, $fh) or die "fchmod $tmp: $!\n"; # chmod on the open handle, hence "fchmod" in the message
+ close $fh or die "close $tmp $!\n";
+ my $alt = "$info_dir/alternates";
+ rename($tmp, $alt) or die "rename $tmp => $alt: $!\n";
+ $fh->unlink_on_destroy(0); # only reached after a successful rename; $tmp no longer names our file
+}
+
sub fill_alternates ($$) {
my ($self, $epoch) = @_;
}
}
return unless $new;
-
- my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir);
- my $tmp = $fh->filename;
- print $fh join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n"
- or die "print $tmp: $!\n";
- chmod($mode, $fh) or die "fchmod $tmp: $!\n";
- close $fh or die "close $tmp $!\n";
- rename($tmp, $alt) or die "rename $tmp => $alt: $!\n";
- $fh->unlink_on_destroy(0);
+ write_alternates($info_dir, $mode,
+ [join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n"]);
}
sub git_init {
$git_dir
}
-sub git_dir_latest {
- my ($self, $max) = @_;
- $$max = -1;
- my $pfx = "$self->{ibx}->{inboxdir}/git";
- return unless -d $pfx;
- my $latest;
- opendir my $dh, $pfx or die "opendir $pfx: $!\n";
- while (defined(my $git_dir = readdir($dh))) {
- $git_dir =~ m!\A([0-9]+)\.git\z! or next;
- if ($1 > $$max) {
- $$max = $1;
- $latest = "$pfx/$git_dir";
- }
- }
- $latest;
-}
-
sub importer {
my ($self) = @_;
my $im = $self->{im};
}
my $epoch = 0;
my $max;
- my $latest = git_dir_latest($self, \$max);
+ my $latest = $self->{ibx}->git_dir_latest(\$max);
if (defined $latest) {
my $git = PublicInbox::Git->new($latest);
my $packed_bytes = $git->packed_bytes;
sub reindex_checkpoint ($$) {
my ($self, $sync) = @_;
- $self->{ibx}->git->cleanup; # *async_wait
+ $self->git->async_wait_all;
+ $self->update_last_commit($sync); # no $stk: uses ${$sync->{latest_cmt}}
${$sync->{need_checkpoint}} = 0;
my $mm_tmp = $sync->{mm_tmp};
$mm_tmp->atfork_prepare if $mm_tmp;
- $self->done; # release lock
+ die 'BUG: {im} during reindex' if $self->{im};
+ if ($self->{ibx_map} && !$sync->{checkpoint_unlocks}) {
+ checkpoint($self, 1); # no need to release lock on pure index
+ } else {
+ $self->done; # release lock
+ }
- if (my $pr = $sync->{-opt}->{-progress}) {
+ if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { # no progress output unless a format is set
$pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}}));
}
# allow -watch or -mda to write...
$self->idx_init($sync->{-opt}); # reacquire lock
+ if (my $intvl = $sync->{check_intvl}) { # eidx
+ $sync->{next_check} = PublicInbox::DS::now() + $intvl;
+ }
$mm_tmp->atfork_parent if $mm_tmp;
}
+sub index_finalize ($$) { # $index is 1 when called from index_oid, 0 from unindex_oid
+ my ($arg, $index) = @_;
+ ++$arg->{self}->{nidx}; # bumped on every call, including the is_bad_blob early returns
+ if (defined(my $cur = $arg->{cur_cmt})) {
+ ${$arg->{latest_cmt}} = $cur; # scalar ref shared via $sync; later read by update_last_commit
+ } elsif ($index) {
+ die 'BUG: {cur_cmt} missing';
+ } # else { unindexing @leftovers doesn't set {cur_cmt}
+}
+
sub index_oid { # cat_async callback
my ($bref, $oid, $type, $size, $arg) = @_;
- return if $size == 0; # purged
+ is_bad_blob($oid, $type, $size, $arg->{oid}) and
+ return index_finalize($arg, 1); # size == 0 purged returns here
+ my $self = $arg->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
my ($num, $mid0);
my $eml = PublicInbox::Eml->new($$bref);
my $mids = mids($eml);
my $chash = content_hash($eml);
- my $self = $arg->{v2w};
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
if (do_idx($self, $bref, $eml, $smsg)) {
${$arg->{need_checkpoint}} = 1;
}
+ index_finalize($arg, 1);
}
# only update last_commit for $i on reindex iff newer than current
-sub update_last_commit ($$$$) {
- my ($self, $git, $i, $cmt) = @_;
- my $last = last_epoch_commit($self, $i);
- if (defined $last && is_ancestor($git, $last, $cmt)) {
- my @cmd = (qw(rev-list --count), "$last..$cmt");
- chomp(my $n = $git->qx(@cmd));
+sub update_last_commit { # $stk is optional; when given, its {latest_cmt} is used instead of ${$sync->{latest_cmt}}
+ my ($self, $sync, $stk) = @_;
+ my $unit = $sync->{unit} // return; # nothing to record without a current unit
+ my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}};
+ defined($latest_cmt) or return;
+ my $last = last_epoch_commit($self, $unit->{epoch});
+ if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) {
+ my @cmd = (qw(rev-list --count), "$last..$latest_cmt");
+ chomp(my $n = $unit->{git}->qx(@cmd));
return if $n ne '' && $n == 0;
}
- last_epoch_commit($self, $i, $cmt);
+ last_epoch_commit($self, $unit->{epoch}, $latest_cmt); # presumably the 3-arg form stores $latest_cmt -- confirm in last_epoch_commit
}
-sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" }
-
-sub last_commits ($$) {
- my ($self, $epoch_max) = @_;
+sub last_commits { # returns an arrayref: $heads->[$epoch] = last_epoch_commit($self, $epoch); invoked as a method by sync_prepare/sync_ranges
+ my ($self, $sync) = @_;
my $heads = [];
- for (my $i = $epoch_max; $i >= 0; $i--) {
+ for (my $i = $sync->{epoch_max}; $i >= 0; $i--) {
$heads->[$i] = last_epoch_commit($self, $i);
}
$heads;
}
# returns a revision range for git-log(1)
-sub log_range ($$$$$) {
- my ($self, $sync, $git, $i, $tip) = @_;
+sub log_range ($$$) {
+ my ($sync, $unit, $tip) = @_;
my $opt = $sync->{-opt};
my $pr = $opt->{-progress} if (($opt->{verbose} || 0) > 1);
+ my $i = $unit->{epoch};
my $cur = $sync->{ranges}->[$i] or do {
$pr->("$i.git indexing all of $tip\n") if $pr;
return $tip; # all of it
my $range = "$cur..$tip";
$pr->("$i.git checking contiguity... ") if $pr;
- if (is_ancestor($git, $cur, $tip)) { # common case
+ my $git = $unit->{git};
+ if (is_ancestor($sync->{self}->git, $cur, $tip)) { # common case
$pr->("OK\n") if $pr;
my $n = $git->qx(qw(rev-list --count), $range);
chomp($n);
warn "discarding history at $cur\n";
}
warn <<"";
-reindexing $git->{git_dir} starting at
-$range
-
- $sync->{unindex_range}->{$i} = "$base..$cur";
+reindexing $git->{git_dir}
+starting at $range
+
+ # $cur^0 may no longer exist if pruned by git
+ if ($git->qx(qw(rev-parse -q --verify), "$cur^0")) {
+ $unit->{unindex_range} = "$base..$cur";
+ } elsif ($base && $git->qx(qw(rev-parse -q --verify), $base)) {
+ $unit->{unindex_range} = "$base..";
+ } else {
+ warn "W: unable to unindex before $range\n";
+ }
}
$range;
}
-sub sync_prepare ($$$) {
- my ($self, $sync, $epoch_max) = @_;
+# overridden by ExtSearchIdx
+sub artnum_max { $_[0]->{mm}->num_highwater } # added to $regen_max for sync_prepare's non-reindex return value
+
+sub sync_prepare ($$) {
+ my ($self, $sync) = @_;
+ $sync->{ranges} = sync_ranges($self, $sync);
my $pr = $sync->{-opt}->{-progress};
my $regen_max = 0;
- my $head = $self->{ibx}->{ref_head} || 'refs/heads/master';
-
- # reindex stops at the current heads and we later rerun index_sync
- # without {reindex}
- my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex};
-
- for (my $i = $epoch_max; $i >= 0; $i--) {
- my $git_dir = git_dir_n($self, $i);
+ my $head = $sync->{ibx}->{ref_head} || 'HEAD';
+ my $pfx;
+ if ($pr) {
+ ($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g);
+ $pfx //= $sync->{ibx}->{inboxdir};
+ }
+
+ my $reindex_heads;
+ if ($self->{ibx_map}) {
+ # ExtSearchIdx won't index messages unless they're in
+ # over.sqlite3 for a given inbox, so don't read beyond
+ # what's in the per-inbox index.
+ $reindex_heads = [];
+ my $v = PublicInbox::Search::SCHEMA_VERSION;
+ my $mm = $sync->{ibx}->mm;
+ for my $i (0..$sync->{epoch_max}) {
+ $reindex_heads->[$i] = $mm->last_commit_xap($v, $i);
+ }
+ } elsif ($sync->{reindex}) { # V2 inbox
+ # reindex stops at the current heads and we later
+ # rerun index_sync without {reindex}
+ $reindex_heads = $self->last_commits($sync);
+ }
+ if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
+ $sync->{index_oid} = $self->can('index_oid');
+ }
+ my $git_pfx = "$sync->{ibx}->{inboxdir}/git";
+ for (my $i = $sync->{epoch_max}; $i >= 0; $i--) {
+ my $git_dir = "$git_pfx/$i.git";
-d $git_dir or next; # missing epochs are fine
my $git = PublicInbox::Git->new($git_dir);
+ my $unit = { git => $git, epoch => $i };
+ my $tip;
if ($reindex_heads) {
- $head = $reindex_heads->[$i] or next;
+ $tip = $head = $reindex_heads->[$i] or next;
+ } else {
+ $tip = $git->qx(qw(rev-parse -q --verify), $head);
+ next if $?; # new repo
+ chomp $tip;
}
- chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head));
-
- next if $?; # new repo
- my $range = log_range($self, $sync, $git, $i, $tip) or next;
+ my $range = log_range($sync, $unit, $tip) or next;
# can't use 'rev-list --count' if we use --diff-filter
- $pr->("$i.git counting $range ... ") if $pr;
+ $pr->("$pfx $i.git counting $range ... ") if $pr;
# Don't bump num_highwater on --reindex by using {D}.
# We intentionally do NOT use {D} in the non-reindex case
# because we want NNTP article number gaps from unindexed
# messages to show up in mirrors, too.
$sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
- my $stk = log2stack($sync, $git, $range, $self->{ibx});
+ my $stk = log2stack($sync, $git, $range);
+ return 0 if $sync->{quit};
my $nr = $stk ? $stk->num_records : 0;
$pr->("$nr\n") if $pr;
- $sync->{stacks}->[$i] = $stk if $stk;
+ $unit->{stack} = $stk; # may be undef
+ unshift @{$sync->{todo}}, $unit;
$regen_max += $nr;
}
+ return 0 if $sync->{quit};
# XXX this should not happen unless somebody bypasses checks in
# our code and blindly injects "d" file history into git repos
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
- my $arg = { v2w => $self };
- my $all = $self->{ibx}->git;
+ local $self->{current_info} = 'leftover ';
+ my $unindex_oid = $self->can('unindex_oid');
for my $oid (@leftovers) {
+ last if $sync->{quit};
$oid = unpack('H*', $oid);
- $self->{current_info} = "leftover $oid";
- $all->cat_async($oid, \&unindex_oid, $arg);
+ my $req = { %$sync, oid => $oid };
+ $self->git->cat_async($oid, $unindex_oid, $req);
}
- $all->cat_async_wait;
+ $self->git->cat_async_wait;
}
- if (!$regen_max && !keys(%{$self->{unindex_range}})) {
+ return 0 if $sync->{quit};
+ if (!$regen_max) {
$sync->{-regen_fmt} = "%u/?\n";
return 0;
}
$sync->{-regen_fmt} = "% ${pad}u/$regen_max\n";
$sync->{nr} = \(my $nr = 0);
return -1 if $sync->{reindex};
- $regen_max + $self->{mm}->num_highwater() || 0;
+ $regen_max + $self->artnum_max || 0;
}
-sub unindex_oid_remote ($$$) {
+sub unindex_oid_aux ($$$) { # remove $oid/$mid from {oidx}, then each removed article number from its shard
my ($self, $oid, $mid) = @_;
my @removed = $self->{oidx}->remove_oid($oid, $mid);
for my $num (@removed) {
- my $idx = idx_shard($self, $num % $self->{shards});
- $idx->shard_remove($oid, $num);
+ my $idx = idx_shard($self, $num); # idx_shard() is now handed the article number itself; the caller-side "% {shards}" is gone
+ $idx->shard_remove($num);
}
}
sub unindex_oid ($$;$) { # git->cat_async callback
- my ($bref, $oid, $type, $size, $sync) = @_;
- my $self = $sync->{v2w};
- my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;
+ my ($bref, $oid, $type, $size, $arg) = @_;
+ is_bad_blob($oid, $type, $size, $arg->{oid}) and
+ return index_finalize($arg, 0);
+ my $self = $arg->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
+ my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef;
my $mm = $self->{mm};
my $mids = mids(PublicInbox::Eml->new($bref));
undef $$bref;
}
$mm->num_delete($num);
}
- unindex_oid_remote($self, $oid, $mid);
+ unindex_oid_aux($self, $oid, $mid);
}
+ index_finalize($arg, 0);
}
+sub git { $_[0]->{ibx}->git } # shorthand for $self->{ibx}->git, called as $self->git
+
# this is rare, it only happens when we get discontiguous history in
# a mirror because the source used -purge or -edit
-sub unindex ($$$$) {
- my ($self, $sync, $git, $unindex_range) = @_;
+sub unindex_todo ($$$) { # unindex blobs found in $unit->{unindex_range}; no-op when that key is unset
+ my ($self, $sync, $unit) = @_;
+ my $unindex_range = delete($unit->{unindex_range}) // return; # set by log_range on discontiguous history
my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num
my $before = scalar keys %$unindexed;
# order does not matter, here:
- my @cmd = qw(log --raw -r
- --no-notes --no-color --no-abbrev --no-renames);
- my $fh = $git->popen(@cmd, $unindex_range);
- my $all = $self->{ibx}->git;
+ my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color
+ --no-abbrev --no-renames), $unindex_range);
local $sync->{in_unindex} = 1;
+ my $unindex_oid = $self->can('unindex_oid'); # looked up via can(); presumably so a subclass can supply its own -- confirm
while (<$fh>) {
/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next;
- $all->cat_async($1, \&unindex_oid, $sync);
+ $self->git->cat_async($1, $unindex_oid, { %$sync, oid => $1 }); # per-request shallow copy of %$sync carrying the expected OID
}
close $fh or die "git log failed: \$?=$?";
- $all->cat_async_wait;
+ $self->git->cat_async_wait;
return unless $sync->{-opt}->{prune};
my $after = scalar keys %$unindexed;
return if $before == $after;
# ensure any blob can not longer be accessed via dumb HTTP
- PublicInbox::Import::run_die(['git', "--git-dir=$git->{git_dir}",
+ PublicInbox::Import::run_die(['git',
+ "--git-dir=$unit->{git}->{git_dir}",
qw(-c gc.reflogExpire=now gc --prune=all --quiet)]);
}
-sub sync_ranges ($$$) {
- my ($self, $sync, $epoch_max) = @_;
+sub sync_ranges ($$) {
+ my ($self, $sync) = @_;
my $reindex = $sync->{reindex};
-
- return last_commits($self, $epoch_max) unless $reindex;
+ return $self->last_commits($sync) unless $reindex;
return [] if ref($reindex) ne 'HASH';
my $ranges = $reindex->{from}; # arrayref;
sub index_xap_only { # git->cat_async callback
my ($bref, $oid, $type, $size, $smsg) = @_;
- my $self = $smsg->{v2w};
- my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
+ my $self = $smsg->{self};
+ my $idx = idx_shard($self, $smsg->{num});
$smsg->{raw_bytes} = $size;
$idx->index_raw($bref, undef, $smsg);
$self->{transact_bytes} += $size;
"$beg..$end (% $step)\n");
}
for (my $num = $beg; $num <= $end; $num += $step) {
+ last if $sync->{quit};
my $smsg = $ibx->over->get_art($num) or next;
- $smsg->{v2w} = $self;
+ $smsg->{self} = $self;
$ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
if ($self->{transact_bytes} >= $self->{batch_bytes}) {
${$sync->{nr}} = $num;
}
}
-sub index_epoch ($$$) {
- my ($self, $sync, $i) = @_;
-
- my $git_dir = git_dir_n($self, $i);
- -d $git_dir or return; # missing epochs are fine
- my $git = PublicInbox::Git->new($git_dir);
- if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare
- unindex($self, $sync, $git, $unindex_range);
- }
- defined(my $stk = $sync->{stacks}->[$i]) or return;
- $sync->{stacks}->[$i] = undef;
- my $all = $self->{ibx}->git;
- while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
- $self->{current_info} = "$i.git $oid";
+sub index_todo ($$$) { # process one todo unit: unindex its range (if any), then drain its stack
+ my ($self, $sync, $unit) = @_;
+ return if $sync->{quit}; # NOTE(review): presumably set by the quit_cb handlers index_sync installs -- confirm
+ unindex_todo($self, $sync, $unit);
+ my $stk = delete($unit->{stack}) or return;
+ my $all = $self->git;
+ my $index_oid = $self->can('index_oid');
+ my $unindex_oid = $self->can('unindex_oid');
+ my $pfx;
+ if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) {
+ $pfx = "$1 $2"; # v2
+ } else { # v1
+ ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g);
+ $pfx //= $unit->{git}->{git_dir};
+ }
+ local $self->{current_info} = "$pfx ";
+ local $sync->{latest_cmt} = \(my $latest_cmt); # written by index_finalize, read by update_last_commit
+ local $sync->{unit} = $unit; # update_last_commit takes the epoch and git handle from here
+ while (my ($f, $at, $ct, $oid, $cmt) = $stk->pop_rec) {
+ if ($sync->{quit}) {
+ warn "waiting to quit...\n";
+ $all->async_wait_all;
+ $self->update_last_commit($sync); # no $stk: records ${$sync->{latest_cmt}} as finalized so far
+ return;
+ }
+ my $req = { # per-blob shallow copy of %$sync plus blob-specific fields
+ %$sync,
+ autime => $at,
+ cotime => $ct,
+ oid => $oid,
+ cur_cmt => $cmt
+ };
if ($f eq 'm') {
- my $arg = { %$sync, autime => $at, cotime => $ct };
if ($sync->{max_size}) {
- $all->check_async($oid, \&check_size, $arg);
+ $all->check_async($oid, \&check_size, $req);
} else {
- $all->cat_async($oid, \&index_oid, $arg);
+ $all->cat_async($oid, $index_oid, $req);
}
} elsif ($f eq 'd') {
- $all->cat_async($oid, \&unindex_oid, $sync);
+ $all->cat_async($oid, $unindex_oid, $req);
}
if (${$sync->{need_checkpoint}}) {
reindex_checkpoint($self, $sync);
}
}
- $all->check_async_wait;
- $all->cat_async_wait;
- update_last_commit($self, $git, $i, $stk->{latest_cmt});
+ $all->async_wait_all;
+ $self->update_last_commit($sync, $stk); # stack drained: $stk->{latest_cmt} is used
}
sub xapian_only {
$sync //= {
need_checkpoint => \(my $bool = 0),
-opt => $opt,
- v2w => $self,
+ self => $self,
nr => \(my $nr = 0),
-regen_fmt => "%u/?\n",
};
if ($seq || !$self->{parallel}) {
my $shard_end = $self->{shards} - 1;
for my $i (0..$shard_end) {
+ last if $sync->{quit};
index_xap_step($self, $sync, $art_beg + $i);
if ($i != $shard_end) {
reindex_checkpoint($self, $sync);
index_xap_step($self, $sync, $art_beg, 1);
}
}
- $self->{ibx}->git->cat_async_wait;
+ $self->git->cat_async_wait;
$self->done;
}
$opt //= {};
return xapian_only($self, $opt) if $opt->{xapian_only};
- my $pr = $opt->{-progress};
my $epoch_max;
- my $latest = git_dir_latest($self, \$epoch_max);
- return unless defined $latest;
+ my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return;
+ if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison
+ use Time::HiRes qw(stat);
+ if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) {
+ my $c = $mm[10]; # 10 = ctime (nsec NV)
+ my @hd = stat("$latest/refs/heads");
+ my @pr = stat("$latest/packed-refs");
+ return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0);
+ }
+ }
+ my $pr = $opt->{-progress};
my $seq = $opt->{sequential_shard};
my $art_beg; # the NNTP article number we start xapian_only at
my $idxlevel = $self->{ibx}->{indexlevel};
$self->{oidx}->rethread_prepare($opt);
my $sync = {
need_checkpoint => \(my $bool = 0),
- unindex_range => {}, # EPOCH => oid_old..oid_new
reindex => $opt->{reindex},
-opt => $opt,
- v2w => $self,
+ self => $self,
+ ibx => $self->{ibx},
+ epoch_max => $epoch_max,
};
- $sync->{ranges} = sync_ranges($self, $sync, $epoch_max);
- if (sync_prepare($self, $sync, $epoch_max)) {
+ my $quit = PublicInbox::SearchIdx::quit_cb($sync);
+ local $SIG{QUIT} = $quit;
+ local $SIG{INT} = $quit;
+ local $SIG{TERM} = $quit;
+
+ if (sync_prepare($self, $sync)) {
# tmp_clone seems to fail if inside a transaction, so
# we rollback here (because we opened {mm} for reading)
# Note: we do NOT rely on DBI transactions for atomicity;
# xapian_only works incrementally w/o --reindex
if ($seq && !$opt->{reindex}) {
- $art_beg = $sync->{mm_tmp}->max;
- $art_beg++ if defined($art_beg);
+ $art_beg = $sync->{mm_tmp}->max || -1;
+ $art_beg++;
}
}
- if ($sync->{max_size} = $opt->{max_size}) {
- $sync->{index_oid} = \&index_oid;
- }
# work forwards through history
- index_epoch($self, $sync, $_) for (0..$epoch_max);
- $self->{oidx}->rethread_done($opt);
+ index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []};
+ $self->{oidx}->rethread_done($opt) unless $sync->{quit};
$self->done;
if (my $nr = $sync->{nr}) {
$pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr;
}
+ my $quit_warn;
# deal with Xapian shards sequentially
if ($seq && delete($sync->{mm_tmp})) {
- $self->{ibx}->{indexlevel} = $idxlevel;
- xapian_only($self, $opt, $sync, $art_beg);
+ if ($sync->{quit}) {
+ $quit_warn = 1;
+ } else {
+ $self->{ibx}->{indexlevel} = $idxlevel;
+ xapian_only($self, $opt, $sync, $art_beg);
+ $quit_warn = 1 if $sync->{quit};
+ }
}
# --reindex on the command-line
- if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
+ if (!$sync->{quit} && $opt->{reindex} &&
+ !ref($opt->{reindex}) && $idxlevel ne 'basic') {
$self->lock_acquire;
my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
if (my $xdb = $s0->idx_acquire) {
}
# reindex does not pick up new changes, so we rerun w/o it:
- if ($opt->{reindex}) {
+ if ($opt->{reindex} && !$sync->{quit}) {
my %again = %$opt;
$sync = undef;
delete @again{qw(rethread reindex -skip_lock)};
index_sync($self, \%again);
+ $opt->{quit} = $again{quit}; # propagate to caller
}
+ warn <<EOF if $quit_warn;
+W: interrupted, --xapian-only --reindex required upon restart
+EOF
}
1;
# /$INBOX/$MSGID/ for unindexed v1 inboxes
sub no_over_html ($) {
my ($ctx) = @_;
- my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; # 404
+ my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; # 404
my $eml = PublicInbox::Eml->new($bref);
$ctx->{mhref} = '';
PublicInbox::WwwStream::init($ctx);
sub msg_page {
my ($ctx) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
$ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
my $over = $ctx->{over} = $ibx->over or return no_over_html($ctx);
my ($id, $prev);
'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style';
my $info = '';
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
if (my $url = $ibx->{infourl}) {
$url = prurl($ctx->{env}, $url);
$info = qq(\n List information: <a\nhref="$url">$url</a>\n);
sub thread_html {
my ($ctx) = @_;
my $mid = $ctx->{mid};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my ($nr, $msgs) = $ibx->over->get_thread($mid);
return missing_thread($ctx) if $nr == 0;
sub add_text_body { # callback for each_part
my ($p, $ctx) = @_;
my $upfx = $ctx->{mhref};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
# $p - from each_part: [ Email::MIME-like, depth, $idx ]
my ($part, $depth, $idx) = @$p;
sub _msg_page_prepare_obuf {
my ($eml, $ctx) = @_;
- my $over = $ctx->{-inbox}->over;
+ my $over = $ctx->{ibx}->over;
my $obfs_ibx = $ctx->{-obfs_ibx};
my $rv = '';
my $mids = mids_for_index($eml);
sub thread_skel ($$$) {
my ($skel, $ctx, $hdr) = @_;
my $mid = mids($hdr)->[0];
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my ($nr, $msgs) = $ibx->over->get_thread($mid);
my $parent = in_reply_to($hdr);
$$skel .= "\n<b>Thread overview: </b>";
# returns a string buffer
sub html_footer {
my ($ctx, $hdr) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $upfx = '../';
my $skel;
my $rv = '<pre>';
my ($ctx, $level, $smsg) = @_;
my $mid = $smsg->{mid};
my $has_blob = $smsg->{blob} // do {
- if (my $by_mid = $ctx->{-inbox}->smsg_by_mid($mid)) {
+ if (my $by_mid = $ctx->{ibx}->smsg_by_mid($mid)) {
%$smsg = (%$smsg, %$by_mid);
1;
}
}
my @out;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
# sort by recency, this allows new posts to "bump" old topics...
$t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1);
$t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1);
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $msgs = $ibx->recent($opts, $after, $before);
my $nr = scalar @$msgs;
if ($nr < $lim && defined($after)) {
$ctx->{'log'} = tmpfile("solve.$oid_b");
$ctx->{fn} = $fn;
- my $solver = PublicInbox::SolverGit->new($ctx->{-inbox},
+ my $solver = PublicInbox::SolverGit->new($ctx->{ibx},
\&solve_result, $ctx);
# PSGI server will call this immediately and give us a callback (-wcb)
sub {
our $OID_RE = qr![a-f0-9]{7,}!;
sub new {
- my ($class, $pi_config) = @_;
- $pi_config ||= PublicInbox::Config->new;
- bless { pi_config => $pi_config }, $class;
+ my ($class, $pi_cfg) = @_;
+ bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class; # default config built only when none is passed; // tests definedness, not truth
}
# backwards compatibility, do not use
eval "require PublicInbox::$_;";
}
if (ref($self)) {
- my $pi_config = $self->{pi_config};
- if (defined($pi_config->{'publicinbox.cgitrc'})) {
- $pi_config->limiter('-cgit');
+ my $pi_cfg = $self->{pi_cfg};
+ if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
+ $pi_cfg->limiter('-cgit');
}
$self->cgit;
$self->stylesheets_prepare($_) for ('', '../', '../../');
$self->news_www;
- $pi_config->each_inbox(\&preload_inbox);
+ $pi_cfg->each_inbox(\&preload_inbox);
}
}
# returns undef if valid, array ref response if invalid
sub invalid_inbox ($$) {
my ($ctx, $inbox) = @_;
- my $ibx = $ctx->{www}->{pi_config}->lookup_name($inbox);
+ my $ibx = $ctx->{www}->{pi_cfg}->lookup_name($inbox) //
+ $ctx->{www}->{pi_cfg}->lookup_ei($inbox);
if (defined $ibx) {
- $ctx->{-inbox} = $ibx;
+ $ctx->{ibx} = $ibx;
return;
}
return $ret if $ret;
my $mid = $ctx->{mid} = uri_unescape($mid_ue);
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) {
my ($x2, $x38) = ($1, $2);
# this is horrifically wasteful for legacy URLs:
- my $str = $ctx->{-inbox}->msg_by_path("$x2/$x38") or return;
+ my $str = $ctx->{ibx}->msg_by_path("$x2/$x38") or return;
my $s = PublicInbox::Eml->new($str);
$mid = PublicInbox::MID::mid_clean($s->header_raw('Message-ID'));
return r301($ctx, $inbox, mid_escape($mid));
# /$INBOX/$MESSAGE_ID/t/
sub get_thread {
my ($ctx, $flat) = @_;
- $ctx->{-inbox}->over or return need($ctx, 'Overview');
+ $ctx->{ibx}->over or return need($ctx, 'Overview');
$ctx->{flat} = $flat;
require PublicInbox::View;
PublicInbox::View::thread_html($ctx);
# especially on older systems. Stick to zlib since that's what git uses.
sub get_thread_mbox {
my ($ctx, $sfx) = @_;
- my $over = $ctx->{-inbox}->over or return need($ctx, 'Overview');
+ my $over = $ctx->{ibx}->over or return need($ctx, 'Overview');
require PublicInbox::Mbox;
PublicInbox::Mbox::thread_mbox($ctx, $over, $sfx);
}
# /$INBOX/$MESSAGE_ID/t.atom -> thread as Atom feed
sub get_thread_atom {
my ($ctx) = @_;
- $ctx->{-inbox}->over or return need($ctx, 'Overview');
+ $ctx->{ibx}->over or return need($ctx, 'Overview');
require PublicInbox::Feed;
PublicInbox::Feed::generate_thread_atom($ctx);
}
sub r301 {
my ($ctx, $inbox, $mid_ue, $suffix) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
unless ($ibx) {
my $r404 = invalid_inbox($ctx, $inbox);
return $r404 if $r404;
- $ibx = $ctx->{-inbox};
+ $ibx = $ctx->{ibx};
}
my $url = $ibx->base_url($ctx->{env});
my $qs = $ctx->{env}->{QUERY_STRING};
sub serve_git {
my ($ctx, $epoch, $path) = @_;
my $env = $ctx->{env};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $git = defined $epoch ? $ibx->git_epoch($epoch) : $ibx->git;
$git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404();
}
sub mbox_results {
my ($ctx) = @_;
if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
- $ctx->{-inbox}->search or return need($ctx, 'search');
+ $ctx->{ibx}->isrch or return need($ctx, 'search');
require PublicInbox::SearchView;
return PublicInbox::SearchView::mbox_results($ctx);
}
my ($self) = @_;
$self->{news_www} ||= do {
require PublicInbox::NewsWWW;
- PublicInbox::NewsWWW->new($self->{pi_config});
+ PublicInbox::NewsWWW->new($self->{pi_cfg});
}
}
sub cgit {
my ($self) = @_;
$self->{cgit} ||= do {
- my $pi_config = $self->{pi_config};
+ my $pi_cfg = $self->{pi_cfg};
- if (defined($pi_config->{'publicinbox.cgitrc'})) {
+ if (defined($pi_cfg->{'publicinbox.cgitrc'})) {
require PublicInbox::Cgit;
- PublicInbox::Cgit->new($pi_config);
+ PublicInbox::Cgit->new($pi_cfg);
} else {
require Plack::Util;
Plack::Util::inline_object(call => sub { r404() });
} || sub { $_[0] };
my $css_map = {};
- my $stylesheets = $self->{pi_config}->{css} || [];
+ my $stylesheets = $self->{pi_cfg}->{css} || [];
my $links = [];
my $inline_ok = 1;
my $css = $css_map->{$key};
if (!defined($css) && $key eq 'userContent') {
my $env = $ctx->{env};
- $css = PublicInbox::UserContent::sample($ctx->{-inbox}, $env);
+ $css = PublicInbox::UserContent::sample($ctx->{ibx}, $env);
}
defined $css or return r404();
my $h = [ 'Content-Length', bytes::length($css),
sub get_description {
my ($ctx, $inbox) = @_;
invalid_inbox($ctx, $inbox) || do {
- my $d = $ctx->{-inbox}->description . "\n";
+ my $d = $ctx->{ibx}->description . "\n";
[ 200, [ 'Content-Length', bytes::length($d),
'Content-Type', 'text/plain' ], [ $d ] ];
};
}
sub new {
- my ($class, $config) = @_;
+ my ($class, $cfg) = @_;
my (%mdmap, $spamc);
my (%imap, %nntp); # url => [inbox objects] or 'watchspam'
# indefinitely...
foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) {
my $k = "$pfx.watchspam";
- defined(my $dirs = $config->{$k}) or next;
+ defined(my $dirs = $cfg->{$k}) or next;
$dirs = PublicInbox::Config::_array($dirs);
for my $dir (@$dirs) {
my $url;
my $k = 'publicinboxwatch.spamcheck';
my $default = undef;
- my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default);
+ my $spamcheck = PublicInbox::Spamcheck::get($cfg, $k, $default);
$spamcheck = _spamcheck_cb($spamcheck) if $spamcheck;
- $config->each_inbox(sub {
+ $cfg->each_inbox(sub {
# need to make all inboxes writable for spam removal:
my $ibx = $_[0] = PublicInbox::InboxWritable->new($_[0]);
spamcheck => $spamcheck,
mdmap => \%mdmap,
mdre => $mdre,
- config => $config,
+ pi_cfg => $cfg,
imap => scalar keys %imap ? \%imap : undef,
nntp => scalar keys %nntp? \%nntp : undef,
importers => {},
$path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return;
my $eml = eml_from_path($path) or return;
local $SIG{__WARN__} = warn_ignore_cb();
- $self->{config}->each_inbox(\&remove_eml_i, $self, $eml, $path);
+ $self->{pi_cfg}->each_inbox(\&remove_eml_i, $self, $eml, $path);
}
sub import_eml ($$$) {
warn "unmappable dir: $1\n";
return;
}
- my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+ my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
local $SIG{__WARN__} = sub {
my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
$warn_cb->($pfx, "path: $path\n", @_);
# flesh out common IMAP-specific data structures
sub imap_common_init ($) {
my ($self) = @_;
- my $cfg = $self->{config};
+ my $cfg = $self->{pi_cfg};
my $mic_args = {}; # scheme://authority => Mail:IMAPClient arg
for my $url (sort keys %{$self->{imap}}) {
my $uri = PublicInbox::URIimap->new($url);
if ($flags =~ /\\Seen\b/) {
local $SIG{__WARN__} = warn_ignore_cb();
my $eml = PublicInbox::Eml->new($raw);
- $self->{config}->each_inbox(\&remove_eml_i,
+ $self->{pi_cfg}->each_inbox(\&remove_eml_i,
$self, $eml, "$url UID:$uid");
}
} else {
my $key = $req;
$key =~ s/\.PEEK//;
my ($uids, $batch);
- my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+ my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
local $SIG{__WARN__} = sub {
my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
$batch //= '?';
# flesh out common NNTP-specific data structures
sub nntp_common_init ($) {
my ($self) = @_;
- my $cfg = $self->{config};
+ my $cfg = $self->{pi_cfg};
my $nn_args = {}; # scheme://authority => Net::NNTP->new arg
for my $url (sort keys %{$self->{nntp}}) {
my $sec = uri_section(uri_new($url));
$beg = $l_art + 1;
warn "I: $url fetching ARTICLE $beg..$end\n";
- my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+ my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
my ($err, $art);
local $SIG{__WARN__} = sub {
my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
}
} elsif ($inboxes eq 'watchspam') {
my $eml = PublicInbox::Eml->new(\$raw);
- $self->{config}->each_inbox(\&remove_eml_i,
+ $self->{pi_cfg}->each_inbox(\&remove_eml_i,
$self, $eml, "$url ARTICLE $art");
} else {
die "BUG: destination unknown $inboxes";
sub sqldump ($$) {
my ($ctx, $altid_pfx) = @_;
my $env = $ctx->{env};
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $altid_map = $ibx->altid_map;
my $fn = $altid_map->{$altid_pfx};
unless (defined $fn) {
sub new {
my ($class, $ctx, $cb) = @_;
- $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env});
+ $ctx->{feed_base_url} = $ctx->{ibx}->base_url($ctx->{env});
$ctx->{cb} = $cb || \&PublicInbox::GzipFilter::close;
$ctx->{emit_header} = 1;
bless $ctx, $class;
my ($self) = @_;
my $cb = $self->{cb} or return;
while (my $smsg = $cb->($self)) {
- my $eml = $self->{-inbox}->smsg_eml($smsg) or next;
+ my $eml = $self->{ibx}->smsg_eml($smsg) or next;
return $self->translate(feed_entry($self, $smsg, $eml));
}
delete $self->{cb};
sub atom_header {
my ($ctx, $title) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $base_url = $ctx->{feed_base_url};
my $search_q = $ctx->{search_query};
my $self_url = $base_url;
$title = title_tag($title);
my $from = $eml->header('From') // $eml->header('Sender') //
- $ctx->{-inbox}->{-primary_address};
+ $ctx->{ibx}->{-primary_address};
my ($email) = PublicInbox::Address::emails($from);
my $name = ascii_html(join(', ', PublicInbox::Address::names($from)));
- $email = ascii_html($email // $ctx->{-inbox}->{-primary_address});
+ $email = ascii_html($email // $ctx->{ibx}->{-primary_address});
my $s = delete($ctx->{emit_header}) ? atom_header($ctx, $title) : '';
$s .= "<entry><author><name>$name</name><email>$email</email>" .
return 1 if $referer eq ''; # no referer is always OK for wget/curl
# prevent deep-linking from other domains on some browsers (Firefox)
- # n.b.: $ctx->{-inbox}->base_url($env) with INBOX_URL won't work
+ # n.b.: $ctx->{ibx}->base_url($env) with INBOX_URL won't work
# with dillo, we can only match "$url_scheme://$HTTP_HOST/" without
# path components
my $base_url = $env->{'psgi.url_scheme'} . '://' .
$ctx->{idx} = $idx;
bless $ctx, __PACKAGE__;
my $eml;
- if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) {
+ if ($ctx->{smsg} = $ctx->{ibx}->smsg_by_mid($ctx->{mid})) {
return sub { # public-inbox-httpd-only
$ctx->{wcb} = $_[0];
scan_attach($ctx);
} if $ctx->{env}->{'pi-httpd.async'};
# generic PSGI:
- $eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg});
- } elsif (!$ctx->{-inbox}->over) {
- if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) {
+ $eml = $ctx->{ibx}->smsg_eml($ctx->{smsg});
+ } elsif (!$ctx->{ibx}->over) {
+ if (my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid})) {
$eml = PublicInbox::Eml->new($bref);
}
}
my ($ctx, $key, $default) = @_;
$key //= 'publicInbox.wwwListing';
$default //= '404';
- my $v = $ctx->{www}->{pi_config}->{lc $key} // $default;
+ my $v = $ctx->{www}->{pi_cfg}->{lc $key} // $default;
again:
if ($v eq 'match=domain') {
my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME};
sub response {
my ($class, $ctx) = @_;
bless $ctx, $class;
+ if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) {
+ $ALL->misc->reopen;
+ }
my $re = $ctx->url_regexp or return $ctx->psgi_triple;
- my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_config},
+ my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg},
\&list_match_i, $re, $ctx);
sub {
$ctx->{-wcb} = $_[0]; # HTTP server callback
use bytes (); # length
use PublicInbox::Hval qw(ascii_html prurl ts2str);
our $TOR_URL = 'https://www.torproject.org/';
-our $CODE_URL = 'https://public-inbox.org/public-inbox.git';
+our $CODE_URL = [ qw(http://ou63pmih66umazou.onion/public-inbox.git
+ https://public-inbox.org/public-inbox.git) ];
sub base_url ($) {
my $ctx = shift;
- my $base_url = $ctx->{-inbox}->base_url($ctx->{env});
+ my $base_url = $ctx->{ibx}->base_url($ctx->{env});
chop $base_url; # no trailing slash for clone
$base_url;
}
sub html_top ($) {
my ($ctx) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $desc = ascii_html($ibx->description);
my $title = delete($ctx->{-title_html}) // $desc;
my $upfx = $ctx->{-upfx} || '';
qq(<a\nhref="$color">color</a> / ).
qq(<a\nhref=#mirror>mirror</a> / ).
qq(<a\nhref="$atom">Atom feed</a>);
- if ($ibx->search) {
+ if ($ibx->isrch) {
my $q_val = delete($ctx->{-q_value_html}) // '';
$q_val = qq(\nvalue="$q_val") if $q_val ne '';
# XXX gross, for SearchView.pm
sub coderepos ($) {
my ($ctx) = @_;
- my $ibx = $ctx->{-inbox};
+ my $cr = $ctx->{ibx}->{coderepo} // return ();
+ my $cfg = $ctx->{www}->{pi_cfg};
+ my $upfx = ($ctx->{-upfx} // ''). '../';
my @ret;
- if (defined(my $cr = $ibx->{coderepo})) {
- my $cfg = $ctx->{www}->{pi_config};
- my $env = $ctx->{env};
- for my $cr_name (@$cr) {
- my $urls = $cfg->{"coderepo.$cr_name.cgiturl"};
- if ($urls) {
- $ret[0] //= <<EOF;
+ for my $cr_name (@$cr) {
+ my $urls = $cfg->{"coderepo.$cr_name.cgiturl"} // next;
+ $ret[0] //= <<EOF;
code repositories for the project(s) associated with this inbox:
EOF
- $ret[0] .= "\n\t".prurl($env, $_) for @$urls;
- }
+ for (@$urls) {
+ # relative or absolute URL?, prefix relative "foo.git"
+ # with appropriate number of "../"
+ my $u = m!\A(?:[a-z\+]+:)?//! ? $_ : $upfx.$_;
+ $u = ascii_html(prurl($ctx->{env}, $u));
+ $ret[0] .= qq(\n\t<a\nhref="$u">$u</a>);
}
}
- @ret; # may be empty
+ @ret; # may be empty, this sub is called as an arg for join()
}
sub code_footer ($) {
id=mirror>This inbox may be cloned and mirrored by anyone:</a>
EOF
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $desc = ascii_html($ibx->description);
my @urls;
}
$urls .= "\n" . join('', map { "\tgit clone --mirror $_\n" } @urls);
- my $addrs = $ibx->{address};
- $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
- my $v = defined $max ? '-V2' : '-V1';
- $urls .= <<EOF;
+ if (my $addrs = $ibx->{address}) {
+ $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY';
+ my $v = defined $max ? '-V2' : '-V1';
+ $urls .= <<EOF;
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
$addrs
public-inbox-index $dir
EOF
+ }
my $cfg_link = ($ctx->{-upfx} // '').'_/text/config/raw';
$urls .= <<EOF;
my $cb = $ctx->{cb} or return;
while (defined(my $x = $cb->($ctx))) { # x = smsg or scalar non-ref
if (ref($x)) { # smsg
- my $eml = $ctx->{-inbox}->smsg_eml($x) or next;
+ my $eml = $ctx->{ibx}->smsg_eml($x) or next;
$ctx->{smsg} = $x;
return $ctx->translate($cb->($ctx, $eml));
} else { # scalar
# enforce trailing slash for "wget -r" compatibility
if (!$have_tslash && $code == 200) {
- my $url = $ctx->{-inbox}->base_url($env);
+ my $url = $ctx->{ibx}->base_url($env);
$url .= "_/text/$key/";
return [ 302, [ 'Content-Type', 'text/plain',
sub _colors_help ($$) {
my ($ctx, $txt) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $env = $ctx->{env};
my $base_url = $ibx->base_url($env);
$$txt .= "color customization for $base_url\n";
# n.b. this is a perfect candidate for memoization
sub inbox_config ($$$) {
my ($ctx, $hdr, $txt) = @_;
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
push @$hdr, 'Content-Disposition', 'inline; filename=inbox.config';
my $name = dq_escape($ibx->{name});
my $inboxdir = '/path/to/top-level-inbox';
; line number ranges in `[PATCH]' emails link to /$INBOX_NAME/$OID/s/,
; an HTTP endpoint which reconstructs git blobs via git-apply(1).
EOF
- my $pi_config = $ctx->{www}->{pi_config};
+ my $pi_cfg = $ctx->{www}->{pi_cfg};
for my $cr_name (@$cr) {
- my $urls = $pi_config->{"coderepo.$cr_name.cgiturl"};
+ my $urls = $pi_cfg->{"coderepo.$cr_name.cgiturl"};
my $path = "/path/to/$cr_name";
$cr_name = dq_escape($cr_name);
return inbox_config($ctx, $hdr, $txt) if $key eq 'config';
return if $key ne 'help'; # TODO more keys?
- my $ibx = $ctx->{-inbox};
+ my $ibx = $ctx->{ibx};
my $base_url = $ibx->base_url($ctx->{env});
$$txt .= "public-inbox help for $base_url\n";
$$txt .= <<EOF;
# n.b. we use the Xapian DB for any regeneratable,
# order-of-arrival-independent data.
- my $srch = $ibx->search;
+ my $srch = $ibx->isrch;
if ($srch) {
$$txt .= <<EOF;
search
$opt->{reindex}->{from} = $lc;
}
} else { # v2
- my $max;
- $im->git_dir_latest(\$max) or return;
+ my $max = $ibx->max_git_epoch // return;
my $from = $opt->{reindex}->{from};
my $mm = $ibx->mm;
my $v = PublicInbox::Search::SCHEMA_VERSION();
local %SIG = %SIG;
setup_signals();
- $ibx->umask_prepare;
$ibx->with_umask(\&_run, $ibx, $cb, $opt);
}
--- /dev/null
+/*
+ * Copyright (C) 2020 all contributors <meta@public-inbox.org>
+ * License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+ *
+ * libgit2 for Inline::C
+ * Avoiding Git::Raw since it doesn't guarantee a stable API,
+ * while libgit2 itself seems reasonably stable.
+ */
+#include <git2.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <poll.h>
+
+/* raise a Perl exception describing a failed libgit2 call; no-op on GIT_OK */
+static void croak_if_err(int rc, const char *msg)
+{
+	const git_error *last;
+	const char *detail = "unknown";
+
+	if (rc == GIT_OK)
+		return;
+
+	/* prefer the message libgit2 recorded for the most recent error */
+	last = giterr_last();
+	if (last)
+		detail = last->message;
+	croak("%d %s (%s)", rc, msg, detail);
+}
+
+/*
+ * Constructor: creates an empty libgit2 object database and returns it
+ * as a reference blessed into PublicInbox::Gcf2.  Croaks if
+ * git_odb_new() fails.  The inner scalar stores the git_odb pointer as
+ * an integer and is flagged read-only.
+ */
+SV *new()
+{
+	git_odb *odb;
+	SV *ref, *self;
+	int rc = git_odb_new(&odb);
+	croak_if_err(rc, "git_odb_new");
+
+	/* PTR2IV is the portable pointer-to-IV conversion (see perlguts) */
+	ref = newSViv(PTR2IV(odb));
+	self = newRV_noinc(ref);
+	sv_bless(self, gv_stashpv("PublicInbox::Gcf2", GV_ADD));
+	SvREADONLY_on(ref);
+
+	return self;
+}
+
+/* recover the git_odb pointer kept as an integer inside the blessed ref */
+static git_odb *odb_ptr(SV *self)
+{
+	/* INT2PTR is the portable IV-to-pointer conversion (see perlguts) */
+	return INT2PTR(git_odb *, SvIV(SvRV(self)));
+}
+
+void DESTROY(SV *self)
+{
+ git_odb_free(odb_ptr(self)); /* release the git_odb handle held by this object */
+}
+
+/*
+ * Adds an on-disk object directory to this object database as an
+ * alternate; croaks if libgit2 reports an error.
+ * needs "$GIT_DIR/objects", not $GIT_DIR
+ */
+void add_alternate(SV *self, const char *objects_path)
+{
+ int rc = git_odb_add_disk_alternate(odb_ptr(self), objects_path);
+ croak_if_err(rc, "git_odb_add_disk_alternate");
+}
+
+#define CAPA(v) (sizeof(v) / sizeof((v)[0]))
+
+/*
+ * Writes the object named by oidsv to fd using the same framing as
+ * "git cat-file --batch": a "$OID $TYPE $SIZE\n" header, the raw object
+ * data, then a trailing "\n".
+ *
+ * returns true on success, false on failure (nothing is written to fd
+ * when the OID cannot be parsed or read)
+ * this requires an unabbreviated git OID
+ * croaks on write errors other than EAGAIN/EINTR
+ */
+int cat_oid(SV *self, int fd, SV *oidsv)
+{
+	/*
+	 * adjust when libgit2 gets SHA-256 support, we return the
+	 * same header as git-cat-file --batch "$OID $TYPE $SIZE\n"
+	 *
+	 * The sizeof() literal includes the trailing "\n" so the longest
+	 * header, plus the NUL snprintf appends, always fits.
+	 */
+	char hdr[GIT_OID_HEXSZ + sizeof(" commit 18446744073709551615\n")];
+	struct iovec vec[3];
+	size_t nvec = CAPA(vec);
+	git_oid oid;
+	git_odb_object *object = NULL;
+	int rc, err = 0;
+	STRLEN oidlen;
+	char *oidptr = SvPV(oidsv, oidlen);
+
+	/* same trailer as git-cat-file --batch */
+	vec[2].iov_len = 1;
+	vec[2].iov_base = "\n";
+
+	rc = git_oid_fromstrn(&oid, oidptr, oidlen);
+	if (rc == GIT_OK)
+		rc = git_odb_read(&object, odb_ptr(self), &oid);
+	if (rc == GIT_OK) {
+		const size_t room = sizeof(hdr) - GIT_OID_HEXSZ;
+		int n;
+
+		vec[0].iov_base = hdr;
+		vec[1].iov_base = (void *)git_odb_object_data(object);
+		vec[1].iov_len = git_odb_object_size(object);
+
+		git_oid_nfmt(hdr, GIT_OID_HEXSZ, git_odb_object_id(object));
+		n = snprintf(hdr + GIT_OID_HEXSZ, room, " %s %zu\n",
+				git_object_type2string(
+					git_odb_object_type(object)),
+				vec[1].iov_len);
+		if (n < 0 || (size_t)n >= room) {
+			/* never emit a truncated or garbage header */
+			git_odb_object_free(object);
+			croak("cat_oid: header does not fit buffer");
+		}
+		vec[0].iov_len = GIT_OID_HEXSZ + (size_t)n;
+	} else { /* caller retries */
+		nvec = 0;
+	}
+	while (nvec && !err) {
+		/* resume at the first vec which is not fully written */
+		ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec);
+
+		if (w > 0) {
+			/* unsigned copy avoids signed/unsigned comparison */
+			size_t rem = (size_t)w;
+			size_t done = 0;
+			size_t i;
+
+			for (i = CAPA(vec) - nvec; i < CAPA(vec); i++) {
+				if (rem >= vec[i].iov_len) {
+					/* fully written vec */
+					rem -= vec[i].iov_len;
+					done++;
+				} else { /* partially written vec */
+					char *p = vec[i].iov_base;
+					vec[i].iov_base = p + rem;
+					vec[i].iov_len -= rem;
+					break;
+				}
+			}
+			nvec -= done;
+		} else if (w < 0) {
+			err = errno;
+			switch (err) {
+			case EAGAIN: {
+				/* wait until fd is writable, then retry */
+				struct pollfd pfd;
+				pfd.events = POLLOUT;
+				pfd.fd = fd;
+				poll(&pfd, 1, -1);
+			}
+				/* fall-through */
+			case EINTR:
+				err = 0;
+			}
+			/* any other errno stays in err and ends the loop */
+		} else { /* w == 0 */
+			err = ENOSPC;
+		}
+	}
+	if (object)
+		git_odb_object_free(object);
+	if (err)
+		croak("writev error: %s", strerror(err));
+
+	return rc == GIT_OK;
+}
die "$new_dir exists\n" if -d $new_dir;
die "$old_dir not a directory\n" unless -d $old_dir;
-require Cwd;
-Cwd->import('abs_path');
+require PublicInbox::Admin;
require PublicInbox::Config;
require PublicInbox::InboxWritable;
-my $abs = abs_path($old_dir);
-die "failed to resolve $old_dir: $!\n" if (!defined($abs));
-
my $cfg = PublicInbox::Config->new;
-my $old;
-$cfg->each_inbox(sub {
- $old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir;
-});
-if ($old) {
- $old = PublicInbox::InboxWritable->new($old);
-} else {
+my @old = PublicInbox::Admin::resolve_inboxes([$old_dir], undef, $cfg);
+@old > 1 and die "BUG: resolved several inboxes from $old_dir:\n",
+ map { "\t$_->{inboxdir}\n" } @old;
+my $old = PublicInbox::InboxWritable->new($old[0]);
+if (delete $old->{-unconfigured}) {
warn "W: $old_dir not configured in " .
PublicInbox::Config::default_file() . "\n";
- $old = PublicInbox::InboxWritable->new({
- inboxdir => $old_dir,
- name => 'ignored',
- -primary_address => 'old@example.com',
- address => [ 'old@example.com' ],
- });
}
die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2;
-require File::Spec;
require PublicInbox::Admin;
my $detected = PublicInbox::Admin::detect_indexlevel($old);
$old->{indexlevel} //= $detected;
}
local %ENV = (%$env, %ENV) if $env;
my $new = { %$old };
-$new->{inboxdir} = File::Spec->canonpath($new_dir);
+$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir);
$new->{version} = 2;
$new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} });
$new->{-no_fsync} = 1 if !$opt->{fsync};
my $v2w;
-$old->umask_prepare;
sub link_or_copy ($$) {
my ($src, $dst) = @_;
# rename/relink $edit_fn
open my $new_fh, '<', $edit_fn or
die "can't read edited file ($edit_fn): $!\n";
- my $new_raw = do { local $/; <$new_fh> };
+ defined(my $new_raw = do { local $/; <$new_fh> }) or die
+ "read $edit_fn: $!\n";
if (!$opt->{raw}) {
# get rid of the From we added
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# Basic tool to create and update external (detached) search indices.
+use strict;
+use v5.10.1;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
+usage: public-inbox-extindex [options] [EXTINDEX_DIR] [INBOX_DIR...]
+
+ Create and update external (detached) search indices
+
+ --no-fsync speed up indexing, risk corruption on power outage
+ --watch run persistently and watch for inbox updates
+ -L LEVEL `medium', or `full' (default: full)
+ --all index all configured inboxes
+ --jobs=NUM set or disable parallelization (NUM=0)
+ --batch-size=BYTES flush changes to OS after a given number of bytes
+ --max-size=BYTES do not index messages larger than the given size
+ --gc perform garbage collection instead of indexing
+ --verbose | -v increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-extindex(1) man page for full documentation.
+EOF
+my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 }; # defaults, GetOptions below may override
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
+ fsync|sync!
+ indexlevel|index-level|L=s max_size|max-size=s
+ batch_size|batch-size=s
+ gc commit-interval=i watch scan!
+ all help|h))
+ or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+my $cfg = PublicInbox::Config->new;
+my $eidx_dir = shift(@ARGV); # first positional argument: EXTINDEX_DIR
+unless (defined $eidx_dir) {
+ if ($opt->{all} && $cfg->ALL) { # with --all, fall back to the topdir of $cfg->ALL
+ $eidx_dir = $cfg->ALL->{topdir};
+ } else {
+ die "E: $help"; # no directory given and no fallback: show usage
+ }
+}
+my @ibxs;
+if ($opt->{gc}) { # --gc accepts no inbox paths and excludes --all and --watch
+ die "E: inbox paths must not be specified with --gc\n" if @ARGV;
+ die "E: --all not compatible with --gc\n" if $opt->{all};
+ die "E: --watch is not compatible with --gc\n" if $opt->{watch};
+} else {
+ @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); # remaining args are inboxes
+}
+PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env; # entries from $env override existing %ENV keys
+require PublicInbox::ExtSearchIdx;
+my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
+if ($opt->{gc}) {
+ $eidx->attach_config($cfg);
+ $eidx->eidx_gc($opt);
+} else {
+ if ($opt->{all}) {
+ $eidx->attach_config($cfg);
+ } else {
+ $eidx->attach_inbox($_) for @ibxs;
+ }
+ if ($opt->{watch}) {
+ $cfg = undef; # save memory only after SIGHUP
+ $eidx->eidx_watch($opt);
+ } else {
+ $eidx->eidx_sync($opt);
+ }
+}
require PublicInbox::HTTP;
require PublicInbox::HTTPD;
}
+
my %httpds;
my $app;
my $refresh = sub {
my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
usage: public-inbox-index [options] INBOX_DIR
- Create and update search indices
+ Create and update per-inbox search indices
options:
--no-fsync speed up indexing, risk corruption on power outage
-L LEVEL `basic', `medium', or `full' (default: full)
+ -E EXTINDEX update extindex (default: `all')
--all index all configured inboxes
--compact | -c run public-inbox-compact(1) after indexing
--sequential-shard index Xapian shards sequentially for slow storage
BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
See public-inbox-index(1) man page for full documentation.
EOF
-my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 };
+my $opt = {
+ quiet => -1, compact => 0, max_size => undef, fsync => 1,
+ 'update-extindex' => [], # ":s@" optional arg sets '' if no arg given
+};
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
fsync|sync! xapian_only|xapian-only
indexlevel|index-level|L=s max_size|max-size=s
batch_size|batch-size=s
sequential_shard|seq-shard|sequential-shard
- skip-docdata all help|h))
+ no-update-extindex update-extindex|E=s@
+ fast-noop|F skip-docdata all help|h))
or die $help;
if ($opt->{help}) { print $help; exit 0 };
die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
if ($opt->{xapian_only} && !$opt->{reindex}) {
die "--xapian-only requires --reindex\n";
}
+if ($opt->{reindex} && delete($opt->{'fast-noop'})) {
+ warn "--fast-noop ignored with --reindex\n";
+}
# require lazily to speed up --help
require PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
my $cfg = PublicInbox::Config->new; # Config is loaded by Admin
+$opt->{-use_cwd} = 1;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::Admin::require_or_die('-index');
unless (@ibxs) { print STDERR $help; exit 1 }
+my (@eidx, %eidx_seen);
+my $update_extindex = $opt->{'update-extindex'};
+if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) {
+ # extindex and normal inboxes may have different owners
+ push(@$update_extindex, 'all') if -w $ALL->{topdir};
+}
+@$update_extindex = () if $opt->{'no-update-extindex'};
+if (scalar @$update_extindex) {
+ PublicInbox::Admin::require_or_die('-search');
+ require PublicInbox::ExtSearchIdx;
+}
+for my $ei_name (@$update_extindex) {
+ my $es = $cfg->lookup_ei($ei_name);
+ my $topdir;
+ if (!$es && -d $ei_name) { # allow dirname or config section name
+ $topdir = $ei_name;
+ } elsif ($es) {
+ $topdir = $es->{topdir};
+ } else {
+ die "extindex `$ei_name' not configured or found\n";
+ }
+ my $o = { %$opt };
+ delete $o->{indexlevel} if ($o->{indexlevel}//'') eq 'basic';
+ $eidx_seen{$topdir} //=
+ push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $o));
+}
my $mods = {};
+my @eidx_unconfigured;
foreach my $ibx (@ibxs) {
# detect_indexlevel may also set $ibx->{-skip_docdata}
my $detected = PublicInbox::Admin::detect_indexlevel($ibx);
$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
'full' : $detected);
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
+ if (@eidx && $ibx->{-unconfigured}) {
+ push @eidx_unconfigured, " $ibx->{inboxdir}\n";
+ }
}
+warn <<EOF if @eidx_unconfigured;
+The following inboxes are unconfigured and will not be updated in
+@$update_extindex:\n@eidx_unconfigured
+EOF
# "Search::Xapian" includes SWIG "Xapian", too:
$opt->{compact} = 0 if !$mods->{'Search::Xapian'};
EOL
$ibx_opt = { %$opt, sequential_shard => $v };
}
- PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+ my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+ last if $ibx_opt->{quit};
if (my $copt = $opt->{compact_opt}) {
local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard};
PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
}
+ last if $ibx_opt->{quit};
+ next if $ibx->{-unconfigured} || !$nidx;
+ for my $eidx (@eidx) {
+ $eidx->attach_inbox($ibx);
+ }
+}
+my $pr = $opt->{-progress};
+for my $eidx (@eidx) {
+ $pr->("indexing $eidx->{topdir} ...\n") if $pr;
+ $eidx->eidx_sync($opt);
+ last if $opt->{quit};
}
defined $perm or die "(f)stat failed on $pi_config: $!\n";
chmod($perm & 07777, $fh) or
die "(f)chmod failed on future $pi_config: $!\n";
- my $old;
- {
- local $/;
- $old = <$oh>;
- }
+ defined(my $old = do { local $/; <$oh> }) or die "read $pi_config: $!\n";
print $fh $old or die "failed to write: $!\n";
close $oh or die "failed to close $pi_config: $!\n";
my $pfx = "publicinbox.$name";
my @x = (qw/git config/, "--file=$pi_config_tmp");
-require File::Spec;
-$inboxdir = File::Spec->canonpath($inboxdir);
+$inboxdir = PublicInbox::Config::rel2abs_collapsed($inboxdir);
+die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0;
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
if (-f "$inboxdir/inbox.lock") {
if (!defined $version) {
$version = 2;
$ibx->{-skip_docdata} = $skip_docdata;
}
$ibx->init_inbox(0, $skip_epoch, $skip_artnum);
-require Cwd;
-my $tmp = Cwd::abs_path($inboxdir);
-defined($tmp) or die "failed to resolve $inboxdir: $!\n";
-$inboxdir = $tmp;
-die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s;
# needed for git prior to v2.1.0
umask(0077) if defined $perm;
die "--all only works with `rm'\n" if $opt{all} && $train ne 'rm';
my $spamc = PublicInbox::Spamcheck::Spamc->new;
-my $pi_config = PublicInbox::Config->new;
+my $pi_cfg = PublicInbox::Config->new;
my $err;
my $mime = PublicInbox::Eml->new(do{
- local $/;
- my $data = <STDIN>;
+ defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
if ($train ne 'rm') {
# spam is removed from all known inboxes since it is often Bcc:-ed
if ($train eq 'spam' || ($train eq 'rm' && $opt{all})) {
- $pi_config->each_inbox(sub {
+ $pi_cfg->each_inbox(sub {
my ($ibx) = @_;
$ibx = PublicInbox::InboxWritable->new($ibx);
my $im = $ibx->importer(0);
for ($mime->header('Cc'), $mime->header('To')) {
foreach my $addr (PublicInbox::Address::emails($_)) {
$addr = lc($addr);
- $dests{$addr} //= $pi_config->lookup($addr) // 0;
+ $dests{$addr} //= $pi_cfg->lookup($addr) // 0;
}
}
next if $seen{"$ibx"}++;
remove_or_add($ibx, $train, $mime, $addr);
}
- my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime);
+ my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_cfg, $mime);
for my $ibx (@$dests) {
next if $seen{"$ibx"}++;
remove_or_add($ibx, $train, $mime, $ibx->{-primary_address});
$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
$ems->prepare(\$str);
my $eml = PublicInbox::Eml->new(\$str);
-my $config = PublicInbox::Config->new;
+my $cfg = PublicInbox::Config->new;
my $key = 'publicinboxmda.spamcheck';
my $default = 'PublicInbox::Spamcheck::Spamc';
-my $spamc = PublicInbox::Spamcheck::get($config, $key, $default);
+my $spamc = PublicInbox::Spamcheck::get($cfg, $key, $default);
my $dests = [];
my $recipient = $ENV{ORIGINAL_RECIPIENT};
if (defined $recipient) {
- my $ibx = $config->lookup($recipient); # first check
+ my $ibx = $cfg->lookup($recipient); # first check
push @$dests, $ibx if $ibx;
}
if (!scalar(@$dests)) {
- $dests = PublicInbox::MDA->inboxes_for_list_id($config, $eml);
+ $dests = PublicInbox::MDA->inboxes_for_list_id($cfg, $eml);
if (!scalar(@$dests) && !defined($recipient)) {
die "ORIGINAL_RECIPIENT not defined in ENV\n";
}
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt);
PublicInbox::AdminEdit::check_editable(\@ibxs);
-my $data = do { local $/; <STDIN> };
+defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
my $n_purged = 0;
my $spool = shift @ARGV or die usage();
my $recipient = $ENV{ORIGINAL_RECIPIENT};
defined $recipient or die usage();
-my $config = PublicInbox::Config->new;
-my $ibx = $config->lookup($recipient);
+my $cfg = PublicInbox::Config->new;
+my $ibx = $cfg->lookup($recipient);
my $git = $ibx->git;
my $im;
if ($ibx->version == 2) {
use Test::More;
use PublicInbox::TestCommon;
use PublicInbox::Import;
-use_ok 'PublicInbox::Admin', qw(resolve_repo_dir);
+use_ok 'PublicInbox::Admin';
my ($tmpdir, $for_destroy) = tmpdir();
my $git_dir = "$tmpdir/v1";
my $v2_dir = "$tmpdir/v2";
my ($res, $err, $v);
PublicInbox::Import::init_bare($git_dir);
+*resolve_inboxdir = \&PublicInbox::Admin::resolve_inboxdir;
# v1
-is(resolve_repo_dir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
-is(resolve_repo_dir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
+is(resolve_inboxdir($git_dir), $git_dir, 'top-level GIT_DIR resolved');
+is(resolve_inboxdir("$git_dir/objects"), $git_dir, 'GIT_DIR/objects resolved');
ok(chdir($git_dir), 'chdir GIT_DIR works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
ok(chdir("$git_dir/objects"), 'chdir GIT_DIR/objects works');
-is(resolve_repo_dir(), $git_dir, 'resolve_repo_dir works in GIT_DIR');
-$res = resolve_repo_dir(undef, \$v);
+is(resolve_inboxdir(), $git_dir, 'resolve_inboxdir works in GIT_DIR');
+$res = resolve_inboxdir(undef, \$v);
is($v, 1, 'version 1 detected');
is($res, $git_dir, 'detects directory along with version');
ok(chdir($no_vcs_dir), 'chdir to a non-inbox');
open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
- $res = eval { resolve_repo_dir() };
+ $res = eval { resolve_inboxdir() };
open STDERR, '>&', $olderr or die "restore stderr: $!";
is($res, undef, 'fails inside non-version-controlled dir');
ok(chdir($tmpdir), 'back to test-specific $tmpdir');
open STDERR, '>&', $null or die "redirect stderr to /dev/null: $!";
- $res = eval { resolve_repo_dir($no_vcs_dir) };
+ $res = eval { resolve_inboxdir($no_vcs_dir) };
$err = $@;
open STDERR, '>&', $olderr or die "restore stderr: $!";
is($res, undef, 'fails on non-version-controlled dir');
PublicInbox::V2Writable->new($ibx, 1)->idx_init;
ok(-e "$v2_dir/inbox.lock", 'exists');
- is(resolve_repo_dir($v2_dir), $v2_dir,
- 'resolve_repo_dir works on v2_dir');
- ok(chdir($v2_dir), 'chdir v2_dir OK');
- is(resolve_repo_dir(), $v2_dir, 'resolve_repo_dir works inside v2_dir');
- $res = resolve_repo_dir(undef, \$v);
+ is(resolve_inboxdir($v2_dir), $v2_dir,
+ 'resolve_inboxdir works on v2_dir');
+ chdir($v2_dir) or BAIL_OUT "chdir v2_dir: $!";
+ is(resolve_inboxdir(), $v2_dir, 'resolve_inboxdir works inside v2_dir');
+ $res = resolve_inboxdir(undef, \$v);
is($v, 2, 'version 2 detected');
is($res, $v2_dir, 'detects directory along with version');
# TODO: should work from inside Xapian dirs, and git dirs, here...
+ PublicInbox::Import::init_bare("$v2_dir/git/0.git");
+ my $objdir = "$v2_dir/git/0.git/objects";
+ is($v2_dir, resolve_inboxdir($objdir, \$v), 'at $objdir');
+ is($v, 2, 'version 2 detected at $objdir');
+ chdir($objdir) or BAIL_OUT "chdir objdir: $!";
+ is(resolve_inboxdir(undef, \$v), $v2_dir, 'inside $objdir');
+ is($v, 2, 'version 2 detected inside $objdir');
}
-chdir '/';
+chdir '/' or BAIL_OUT "chdir: $!";
my @pairs = (
'1g' => 1024 ** 3,
'url' => [ 'http://example.com/meta' ],
-primary_address => 'meta@public-inbox.org',
'name' => 'meta',
- feedmax => 25,
-httpbackend_limiter => undef,
nntpserver => undef,
}, "lookup matches expected output");
'inboxdir' => '/home/pi/test-main.git',
'domain' => 'public-inbox.org',
'name' => 'test',
- feedmax => 25,
'url' => [ 'http://example.com/test' ],
-httpbackend_limiter => undef,
nntpserver => undef,
pipe($x, $y) or die;
is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($r), EPOLLIN), 0, 'add EPOLLIN');
my $events = [];
-my $n = $p->epoll_wait(9, 0, $events);
+$p->epoll_wait(9, 0, $events);
is_deeply($events, [], 'no events set');
-is($n, 0, 'nothing ready, yet');
is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($w), EPOLLOUT|EPOLLONESHOT), 0,
'add EPOLLOUT|EPOLLONESHOT');
-$n = $p->epoll_wait(9, -1, $events);
-is($n, 1, 'got POLLOUT event');
-is($events->[0]->[0], fileno($w), '$w ready');
+$p->epoll_wait(9, -1, $events);
+is(scalar(@$events), 1, 'got POLLOUT event');
+is($events->[0], fileno($w), '$w ready');
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after oneshot');
+$p->epoll_wait(9, 0, $events);
+is(scalar(@$events), 0, 'nothing ready after oneshot');
is_deeply($events, [], 'no events set after oneshot');
syswrite($w, '1') == 1 or die;
for my $t (0..1) {
- $n = $p->epoll_wait(9, $t, $events);
- is($events->[0]->[0], fileno($r), "level-trigger POLLIN ready #$t");
- is($n, 1, "only event ready #$t");
+ $p->epoll_wait(9, $t, $events);
+ is($events->[0], fileno($r), "level-trigger POLLIN ready #$t");
+ is(scalar(@$events), 1, "only event ready #$t");
}
syswrite($y, '1') == 1 or die;
is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($x), EPOLLIN|EPOLLONESHOT), 0,
'EPOLLIN|EPOLLONESHOT add');
-is($p->epoll_wait(9, -1, $events), 2, 'epoll_wait has 2 ready');
-my @fds = sort(map { $_->[0] } @$events);
+$p->epoll_wait(9, -1, $events);
+is(scalar @$events, 2, 'epoll_wait has 2 ready');
+my @fds = sort @$events;
my @exp = sort((fileno($r), fileno($x)));
is_deeply(\@fds, \@exp, 'got both ready FDs');
is($p->epoll_ctl(EPOLL_CTL_DEL, fileno($r), 0), 0, 'EPOLL_CTL_DEL OK');
-$n = $p->epoll_wait(9, 0, $events);
-is($n, 0, 'nothing ready after EPOLL_CTL_DEL');
+$p->epoll_wait(9, 0, $events);
+is(scalar @$events, 0, 'nothing ready after EPOLL_CTL_DEL');
done_testing;
'epoll_ctl socket EPOLLOUT');
my @events;
-is(epoll_wait($epfd, 100, 10000, \@events), 1, 'epoll_wait returns');
+epoll_wait($epfd, 100, 10000, \@events);
is(scalar(@events), 1, 'got one event');
-is($events[0]->[0], fileno($w), 'got expected FD');
-is($events[0]->[1], EPOLLOUT, 'got expected event');
+is($events[0], fileno($w), 'got expected FD');
close $w;
-is(epoll_wait($epfd, 100, 0, \@events), 0, 'epoll_wait timeout');
+epoll_wait($epfd, 100, 0, \@events);
+is(@events, 0, 'epoll_wait timeout');
done_testing;
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Config;
+use PublicInbox::Search;
+use PublicInbox::InboxWritable;
+use Fcntl qw(:seek);
+my $json = PublicInbox::Config::json() or plan skip_all => 'JSON missing';
+require_git(2.6);
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::ExtSearch';
+use_ok 'PublicInbox::ExtSearchIdx';
+use_ok 'PublicInbox::OverIdx';
+my $sock = tcp_server();
+my $host_port = $sock->sockhost . ':' . $sock->sockport;
+my ($home, $for_destroy) = tmpdir();
+local $ENV{HOME} = $home;
+mkdir "$home/.public-inbox" or BAIL_OUT $!;
+my $cfg_path = "$home/.public-inbox/config";
+open my $fh, '>', $cfg_path or BAIL_OUT $!;
+print $fh <<EOF or BAIL_OUT $!;
+[publicinboxMda]
+ spamcheck = none
+EOF
+close $fh or BAIL_OUT $!;
+my $v2addr = 'v2test@example.com';
+my $v1addr = 'v1test@example.com';
+ok(run_script([qw(-init -Lbasic -V2 v2test --newsgroup v2.example),
+ "$home/v2test", 'http://example.com/v2test', $v2addr ]), 'v2test init');
+my $env = { ORIGINAL_RECIPIENT => $v2addr };
+my $eml = eml_load('t/utf8.eml');
+
+$eml->header_set('List-Id', '<v2.example.com>');
+open($fh, '+>', undef) or BAIL_OUT $!;
+$fh->autoflush(1);
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
+ 'http://example.com/v1test', $v1addr ]), 'v1test init');
+
+$eml->header_set('List-Id', '<v1.example.com>');
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+truncate($fh, 0) or BAIL_OUT $!;
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+$env = { ORIGINAL_RECIPIENT => $v1addr };
+run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
+run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?";
+
+ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
+{
+ my $es = PublicInbox::ExtSearch->new("$home/extindex");
+ ok($es->has_threadid, '->has_threadid');
+}
+
+{ # TODO: -extindex should write this to config
+ open $fh, '>>', $cfg_path or BAIL_OUT $!;
+ print $fh <<EOF or BAIL_OUT $!;
+; for ->ALL
+[extindex "all"]
+ topdir = $home/extindex
+EOF
+ close $fh or BAIL_OUT $!;
+
+ my $pi_cfg = PublicInbox::Config->new;
+ $pi_cfg->fill_all;
+ ok($pi_cfg->ALL, '->ALL');
+ my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
+ my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
+ is_deeply($ret, { 'v1.example' => 1, 'v2.example' => 1 },
+ '->nntp_xref_for');
+}
+
+SKIP: {
+ require_mods(qw(Net::NNTP), 1);
+ my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
+ my $cmd = [ '-nntpd', '-W0', "--stdout=$out", "--stderr=$err" ];
+ my $td = start_script($cmd, undef, { 3 => $sock });
+ my $n = Net::NNTP->new($host_port);
+ my @xp = $n->xpath('<testmessage@example.com>');
+ is_deeply(\@xp, [ qw(v1.example/1 v2.example/1) ]);
+ $n->group('v1.example');
+ my $res = $n->head(1);
+ @$res = grep(/^Xref: /, @$res);
+ like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works');
+}
+
+my $es = PublicInbox::ExtSearch->new("$home/extindex");
+{
+ my $smsg = $es->over->get_art(1);
+ ok($smsg, 'got first article');
+ is($es->over->get_art(2), undef, 'only one added');
+ my $xref3 = $es->over->get_xref3(1);
+ like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1');
+ like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2');
+ is(scalar(@$xref3), 2, 'only to entries');
+}
+
+if ('inbox edited') { # always-true label: the non-empty string only names this section
+ my ($in, $out, $err);
+ $in = $out = $err = '';
+ my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+ my $env = { MAIL_EDITOR => "$^X -i -p -e 's/test message/BEST MSG/'" }; # a perl one-liner stands in for an interactive editor
+ my $cmd = [ qw(-edit -Ft/utf8.eml), "$home/v2test" ];
+ ok(run_script($cmd, $env, $opt), '-edit');
+ ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+ 'extindex again');
+ like($err, qr/discontiguous range/, 'warned about discontiguous range');
+ my $msg1 = $es->over->get_art(1) or BAIL_OUT 'msg1 missing';
+ my $msg2 = $es->over->get_art(2) or BAIL_OUT 'msg2 missing';
+ is($msg1->{mid}, $msg2->{mid}, 'edited message indexed'); # same Message-ID, but two distinct articles
+ isnt($msg1->{blob}, $msg2->{blob}, 'blobs differ');
+ my $eml2 = $es->smsg_eml($msg2);
+ like($eml2->body, qr/BEST MSG/, 'edited body in #2');
+ unlike($eml2->body, qr/test message/, 'old body discarded in #2');
+ my $eml1 = $es->smsg_eml($msg1);
+ like($eml1->body, qr/test message/, 'original body in #1');
+ my $x1 = $es->over->get_xref3(1);
+ my $x2 = $es->over->get_xref3(2);
+ is(scalar(@$x1), 1, 'original only has one xref3');
+ is(scalar(@$x2), 1, 'new message has one xref3');
+ isnt($x1->[0], $x2->[0], 'xref3 differs');
+
+ my $mset = $es->mset('b:"BEST MSG"');
+ is($mset->size, 1, 'new message found');
+ $mset = $es->mset('b:"test message"');
+ is($mset->size, 1, 'old message found');
+ delete @$es{qw(git over xdb)}; # fork preparation
+
+ my $pi_cfg = PublicInbox::Config->new;
+ $pi_cfg->fill_all;
+ is(scalar($pi_cfg->ALL->mset('s:Testing')->items), 2,
+ '2 results in ->ALL');
+ my $res = {};
+ my $nr = 0;
+ $pi_cfg->each_inbox(sub { # collect per-inbox search results keyed by eidx_key
+ $nr++;
+ my ($ibx) = @_;
+ local $SIG{__WARN__} = sub {}; # FIXME support --reindex
+ my $mset = $ibx->isrch->mset('s:Testing');
+ $res->{$ibx->eidx_key} = $ibx->isrch->mset_to_smsg($ibx, $mset);
+ });
+ is($nr, 2, 'two inboxes');
+ my $exp = {};
+ for my $v (qw(v1 v2)) { # expected: article 1 of each inbox, and nothing else
+ my $ibx = $pi_cfg->lookup_newsgroup("$v.example");
+ my $smsg = $ibx->over->get_art(1);
+ $smsg->psgi_cull;
+ $exp->{"$v.example"} = [ $smsg ];
+ }
+ is_deeply($res, $exp, 'isearch limited results');
+ $pi_cfg = $res = $exp = undef;
+
+ open my $rmfh, '+>', undef or BAIL_OUT $!; # anonymous temp file
+ $rmfh->autoflush(1);
+ print $rmfh $eml2->as_string or BAIL_OUT $!;
+ seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
+ $opt->{0} = $rmfh; # the edited message becomes stdin of -learn rm
+ ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
+
+ ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
+ 'extindex after rm');
+ is($es->over->get_art(2), undef, 'doc #2 gone');
+ $mset = $es->mset('b:"BEST MSG"');
+ is($mset->size, 0, 'new message gone');
+}
+
+my $misc = $es->misc;
+my @it = $misc->mset('')->items; # empty query; the checks below expect one hit per inbox
+is(scalar(@it), 2, 'two inboxes');
+like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
+like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
+
+my $cfg = PublicInbox::Config->new;
+my $schema_version = PublicInbox::Search::SCHEMA_VERSION();
+my $f = "$home/extindex/ei$schema_version/over.sqlite3";
+my $oidx = PublicInbox::OverIdx->new($f); # direct handle on the extindex over.sqlite3, reused by later sections
+if ('inject w/o indexing') { # always-true label for this section
+ use PublicInbox::Import;
+ my $v1ibx = $cfg->lookup_name('v1test');
+ my $last_v1_commit = $v1ibx->mm->last_commit;
+ my $v2ibx = $cfg->lookup_name('v2test');
+ my $last_v2_commit = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ my $git0 = PublicInbox::Git->new("$v2ibx->{inboxdir}/git/0.git");
+ chomp(my $cmt = $git0->qx(qw(rev-parse HEAD^0)));
+ is($last_v2_commit, $cmt, 'v2 index up-to-date');
+
+ my $v2im = PublicInbox::Import->new($git0, undef, undef, $v2ibx);
+ $v2im->{lock_path} = undef; # NOTE(review): presumably disables import locking -- confirm
+ $v2im->{path_type} = 'v2';
+ $v2im->add(eml_load('t/mda-mime.eml')); # goes straight into 0.git; checks below expect it to stay unindexed
+ $v2im->done;
+ chomp(my $tip = $git0->qx(qw(rev-parse HEAD^0)));
+ isnt($tip, $cmt, '0.git v2 updated');
+
+ # inject a message w/o updating index
+ rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or
+ BAIL_OUT $!;
+ open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!;
+ run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or # NOTE(review): $env is not declared in this block; it must come from earlier in the file
+ BAIL_OUT '-mda';
+ rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or # put the index directory back
+ BAIL_OUT $!;
+
+ my ($in, $out, $err);
+ $in = $out = $err = '';
+ my $opt = { 0 => \$in, 1 => \$out, 2 => \$err }; # NOTE(review): $opt is never passed to run_script in this block
+ ok(run_script([qw(-extindex -v -v --all), "$home/extindex"],
+ undef, undef), 'extindex noop');
+ $es->{xdb}->reopen;
+ my $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+ is($mset->size, 0, 'did not attempt to index unindexed v1 message');
+ $mset = $es->mset('mid:multipart-html-sucks@11');
+ is($mset->size, 0, 'did not attempt to index unindexed v2 message');
+ ok(run_script([qw(-index --all)]), 'indexed v1 and v2 inboxes');
+
+ isnt($v1ibx->mm->last_commit, $last_v1_commit, '-index v1 worked');
+ isnt($v2ibx->mm->last_commit_xap($schema_version, 0),
+ $last_v2_commit, '-index v2 worked');
+ ok(run_script([qw(-extindex --all), "$home/extindex"]),
+ 'extindex updates');
+
+ $es->{xdb}->reopen; # pick up what the external -extindex process wrote
+ $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+ is($mset->size, 1, 'got v1 message');
+ $mset = $es->mset('mid:multipart-html-sucks@11');
+ is($mset->size, 1, 'got v2 message');
+}
+
+if ('reindex catches missed messages') { # always-true label for this section
+ my $v2ibx = $cfg->lookup_name('v2test');
+ my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+ my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ my $eml = eml_load('t/data/0001.patch');
+ $im->add($eml);
+ $im->done;
+ my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ isnt($cmt_a, $cmt_b, 'v2 0.git HEAD updated');
+ $oidx->dbh; # NOTE(review): presumably (re)opens the DB handle -- confirm
+ my $uv = $v2ibx->uidvalidity;
+ my $lc_key = "lc-v2:v2.example//$uv;0";
+ is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a, # stores $cmt_b; the check expects the previous value back
+ 'update lc-v2 meta, old is as expected');
+ my $max = $oidx->max;
+ $oidx->dbh_close;
+ ok(run_script([qw(-extindex), "$home/extindex", $v2ibx->{inboxdir}]),
+ '-extindex noop');
+ is($oidx->max, $max, '->max unchanged');
+ is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 unchanged');
+ $oidx->dbh_close;
+ my $opt = { 2 => \(my $err = '') }; # stderr of the following runs is captured in $err
+ ok(run_script([qw(-extindex --reindex), "$home/extindex",
+ $v2ibx->{inboxdir}], undef, $opt),
+ '--reindex for unseen');
+ is($oidx->max, $max + 1, '->max bumped');
+ is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 stays unchanged');
+ my @err = split(/^/, $err); # one element per line of captured stderr
+ is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+ like($err[0], qr/I: reindex_unseen/, 'got reindex_unseen message');
+ my $new = $oidx->get_art($max + 1);
+ is($new->{subject}, $eml->header('Subject'), 'new message added');
+
+ $es->{xdb}->reopen;
+ my $mset = $es->mset("mid:$new->{mid}");
+ is($mset->size, 1, 'previously unseen, now indexed in Xapian');
+
+ ok($im->remove($eml), 'remove new message from v2 inbox');
+ $im->done;
+ my $cmt_c = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ is($oidx->eidx_meta($lc_key, $cmt_c), $cmt_b,
+ 'bump lc-v2 meta again to skip v2 remove');
+ $err = ''; # reset the capture buffer before the second --reindex
+ $oidx->dbh_close;
+ ok(run_script([qw(-extindex --reindex), "$home/extindex",
+ $v2ibx->{inboxdir}], undef, $opt),
+ '--reindex for stale');
+ @err = split(/^/, $err);
+ is(scalar(@err), 1, 'only one warning') or diag "err=$err";
+ like($err[0], qr/\(#$new->{num}\): stale/, 'got stale message warning');
+ is($oidx->get_art($new->{num}), undef,
+ 'stale message gone from over');
+ is_deeply($oidx->get_xref3($new->{num}), [],
+ 'stale message has no xref3');
+ $es->{xdb}->reopen;
+ $mset = $es->mset("mid:$new->{mid}");
+ is($mset->size, 0, 'stale mid gone Xapian');
+}
+
+if ('reindex catches content bifurcation') { # always-true label for this section
+ use PublicInbox::MID qw(mids);
+ my $v2ibx = $cfg->lookup_name('v2test');
+ my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
+ my $eml = eml_load('t/data/message_embed.eml');
+ my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ $im->add($eml);
+ $im->done;
+ my $cmt_b = $v2ibx->mm->last_commit_xap($schema_version, 0);
+ my $uv = $v2ibx->uidvalidity;
+ my $lc_key = "lc-v2:v2.example//$uv;0";
+ $oidx->dbh;
+ is($oidx->eidx_meta($lc_key, $cmt_b), $cmt_a, # stores $cmt_b; the check expects the previous value back
+ 'update lc-v2 meta, old is as expected');
+ my $mid = mids($eml)->[0];
+ my $smsg = $v2ibx->over->next_by_mid($mid, \(my $id), \(my $prev));
+ my $oldmax = $oidx->max;
+ my $x3_orig = $oidx->get_xref3(3);
+ is(scalar(@$x3_orig), 1, '#3 has one xref');
+ $oidx->add_xref3(3, $smsg->{num}, $smsg->{blob}, 'v2.example'); # deliberately attach the new message's xref3 to article #3
+ my $x3 = $oidx->get_xref3(3);
+ is(scalar(@$x3), 2, 'injected xref3');
+ $oidx->commit_lazy;
+ my $opt = { 2 => \(my $err = '') }; # stderr of the following runs is captured in $err
+ ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
+ 'extindex --all is noop');
+ is($err, '', 'no warnings in index');
+ $oidx->dbh;
+ is($oidx->max, $oldmax, 'oidx->max unchanged');
+ $oidx->dbh_close;
+ ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
+ undef, $opt), 'extindex --reindex');
+ $oidx->dbh;
+ ok($oidx->max > $oldmax, 'oidx->max bumped');
+ like($err, qr/split into 2 due to deduplication change/,
+ 'bifurcation noted');
+ my $added = $oidx->get_art($oidx->max);
+ is($added->{blob}, $smsg->{blob}, 'new blob indexed');
+ is_deeply(["v2.example:$smsg->{num}:$smsg->{blob}"], # NOTE(review): expected value is passed first here (got/expected swapped)
+ $oidx->get_xref3($added->{num}),
+ 'xref3 corrected for bifurcated message');
+ is_deeply($oidx->get_xref3(3), $x3_orig, 'xref3 restored for #3');
+}
+
+if ('--reindex --rethread') { # always-true label for this section
+ my $before = $oidx->dbh->selectrow_array(<<'');
+SELECT MAX(tid) FROM over WHERE num > 0
+
+ my $opt = {};
+ ok(run_script([qw(-extindex --reindex --rethread --all),
+ "$home/extindex"], undef, $opt),
+ '--rethread');
+ my $after = $oidx->dbh->selectrow_array(<<'');
+SELECT MIN(tid) FROM over WHERE num > 0
+
+ # actual rethread logic is identical to v1/v2 and tested elsewhere
+ ok($after > $before, '--rethread updates MIN(tid)'); # passes only if even the smallest tid afterwards is above the largest tid before
+}
+
+if ('remove v1test and test gc') { # always-true label for this section
+ xsys([qw(git config --unset publicinbox.v1test.inboxdir)], # drop v1test from the config file named by $cfg_path
+ { GIT_CONFIG => $cfg_path });
+ my $opt = { 2 => \(my $err = '') }; # stderr of --gc is captured in $err
+ ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
+ 'extindex --gc');
+ like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
+ is(scalar(grep(!/^I:/, split(/^/m, $err))), 0, # every stderr line must start with "I:"
+ 'no non-informational messages');
+ $misc->{xdb}->reopen;
+ @it = $misc->mset('')->items;
+ is(scalar(@it), 1, 'only one inbox left');
+}
+
+done_testing;
{
# check initial feed
{
- my $feed = string_feed({ -inbox => $ibx });
+ my $feed = string_feed({ ibx => $ibx });
SKIP: {
skip 'XML::TreePP missing', 3 unless $have_xml_treepp;
my $t = XML::TreePP->new->parse($feed);
# check spam shows up
{
- my $spammy_feed = string_feed({ -inbox => $ibx });
+ my $spammy_feed = string_feed({ ibx => $ibx });
SKIP: {
skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
my $t = XML::TreePP->new->parse($spammy_feed);
# spam no longer shows up
{
- my $feed = string_feed({ -inbox => $ibx });
+ my $feed = string_feed({ ibx => $ibx });
SKIP: {
skip 'XML::TreePP missing', 2 unless $have_xml_treepp;
my $t = XML::TreePP->new->parse($feed);
];
my $ibx = PublicInbox::Inbox->new({ inboxdir => $git_dir,
altid => $altid });
- $f = PublicInbox::Filter::RubyLang->new(-inbox => $ibx);
+ $f = PublicInbox::Filter::RubyLang->new(ibx => $ibx);
$msg = <<'EOF';
X-Mail-Count: 12
Message-ID: <a@b>
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Fcntl qw(:seek);
+use IO::Handle ();
+use POSIX qw(_exit);
+use Cwd qw(abs_path);
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2';
+use PublicInbox::Import;
+my ($tmpdir, $for_destroy) = tmpdir();
+
+my $gcf2 = PublicInbox::Gcf2::new(); # called as a plain function, not as a ->new method
+is(ref($gcf2), 'PublicInbox::Gcf2', '::new works');
+my $COPYING = 'dba13ed2ddf783ee8118c6a581dbf75305f816a3'; # OID whose blob the checks below compare against ./COPYING
+open my $agpl, '<', 'COPYING' or BAIL_OUT "AGPL-3 missing: $!";
+$agpl = do { local $/; <$agpl> }; # slurp; $agpl now holds the text, not the handle
+
+PublicInbox::Import::init_bare($tmpdir);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; # passed to xsys as fd 0 of git fast-import
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr);
+is($?, 0, 'fast-import succeeded');
+$gcf2->add_alternate("$tmpdir/objects");
+
+{
+	# cat_oid writes to the given file descriptor; the checks below expect
+	# a "<oid> <type> <size>\n" header line, then <size> bytes of content
+	# and one trailing newline.
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
+	$gcf2->cat_oid(fileno($w), $tree);
+	close $w;
+	# got first, expected second, so a failure reports them the right
+	# way round (same order as the $ck_copying checks in this file)
+	is(<$r>, "$tree tree 30\n", 'tree header ok');
+	$r = do { local $/; <$r> }; # slurp the remainder; $r is now a string
+	is(chop($r), "\n", 'got trailing newline');
+	is(length($r), 30, 'tree length matches');
+}
+
+chomp(my $objdir = xqx([qw(git rev-parse --git-path objects)])); # ask git (run from the current directory) for its objects path
+if ($objdir =~ /\A--git-path\n/) { # git <2.5
+ chomp($objdir = xqx([qw(git rev-parse --git-dir)])); # fallback: build the path from --git-dir
+ $objdir .= '/objects';
+}
+if ($objdir && -d $objdir) {
+ $objdir = abs_path($objdir);
+ open my $alt, '>>', "$tmpdir/objects/info/alternates" or
+ BAIL_OUT $!;
+ print $alt $objdir, "\n" or BAIL_OUT $!; # register that object dir as an alternate of the temp repo
+ close $alt or BAIL_OUT $!;
+
+ # calling gcf2->add_alternate on an already-added path won't
+ # cause alternates to be reloaded, so we do
+ # $gcf2->add_alternate($objdir) later on instead of
+ # $gcf2->add_alternate("$tmpdir/objects");
+ # $objdir = "$tmpdir/objects";
+} else {
+ $objdir = undef # the SKIP block below keys off defined($objdir)
+}
+
+my $nr = $ENV{TEST_LEAK_NR};
+my $cat = $ENV{TEST_LEAK_CAT} // 10;
+diag "checking for leaks... (TEST_LEAK_NR=$nr TEST_LEAK_CAT=$cat)" if $nr;
+
+SKIP: { # everything here needs the object directory detected above
+ skip 'not in git worktree', 21 unless defined($objdir);
+ $gcf2->add_alternate($objdir);
+ eval { $gcf2->add_alternate($objdir) }; # second add of the same path must not die
+ ok(!$@, 'no error adding alternate redundantly');
+ if ($nr) {
+ diag "adding alternate $nr times redundantly";
+ $gcf2->add_alternate($objdir) for (1..$nr);
+ diag 'done adding redundant alternates';
+ }
+
+ open my $fh, '+>', undef or BAIL_OUT "open: $!"; # anonymous temp file as the cat_oid destination
+ $fh->autoflush(1);
+
+ ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails');
+ seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+ is(do { local $/; <$fh> }, '', 'nothing written');
+
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
+ ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails');
+ seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+ is(do { local $/; <$fh> }, '', 'nothing written for z40');
+
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
+ my $ck_copying = sub { # rewinds $fh and checks header line, trailing newline and content
+ my ($desc) = @_;
+ seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+ is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc");
+ my $buf = do { local $/; <$fh> };
+ is(chop($buf), "\n", 'got trailing \\n');
+ is($buf, $agpl, "AGPL matches ($desc)");
+ };
+ ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal');
+ $ck_copying->('regular file');
+
+ $gcf2 = PublicInbox::Gcf2::new(); # fresh object that only knows the temp repo
+ $gcf2->add_alternate("$tmpdir/objects");
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
+ ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate');
+ $ck_copying->('alternates after reopen');
+
+ $^O eq 'linux' or skip('pipe tests are Linux-only', 14);
+ for my $blk (1, 0) { # once with a blocking write end, once non-blocking
+ my ($r, $w);
+ pipe($r, $w) or BAIL_OUT $!;
+ fcntl($w, 1031, 4096) or # 1031 is F_SETPIPE_SZ per the skip message; 4096 is the requested pipe size
+ skip('Linux too old for F_SETPIPE_SZ', 14);
+ $w->blocking($blk);
+ seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+ truncate($fh, 0) or BAIL_OUT "truncate: $!";
+ defined(my $pid = fork) or BAIL_OUT "fork: $!";
+ if ($pid == 0) { # child: drain the pipe into $fh, report via exit status
+ close $w;
+ tick; # wait for parent to block on writev
+ my $buf = do { local $/; <$r> };
+ print $fh $buf or _exit(1);
+ _exit(0);
+ }
+ ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk");
+ close $w or BAIL_OUT "close: $!";
+ is(waitpid($pid, 0), $pid, 'child exited');
+ is($?, 0, 'no error in child');
+ $ck_copying->("pipe blocking($blk)");
+
+ pipe($r, $w) or BAIL_OUT $!;
+ fcntl($w, 1031, 4096) or BAIL_OUT $!;
+ $w->blocking($blk);
+ close $r; # no reader left: the write below is expected to fail
+ local $SIG{PIPE} = 'IGNORE'; # keep SIGPIPE from killing the test
+ eval { $gcf2->cat_oid(fileno($w), $COPYING) };
+ like($@, qr/writev error:/, 'got writev error');
+ }
+}
+
+if ($nr) {
+	# Optional stress loop, enabled by setting TEST_LEAK_NR: create $nr
+	# Gcf2 objects and run $cat rounds of successful, failing and
+	# broken-pipe cat_oid calls on each.  Nothing is asserted here;
+	# presumably memory use is watched from outside the test.
+	open my $null, '>', '/dev/null' or BAIL_OUT "open /dev/null: $!";
+	my $fd = fileno($null);
+	local $SIG{PIPE} = 'IGNORE';
+	my ($r, $w);
+	# checked like every other pipe() in this file, so a failure cannot
+	# leave fileno($w) undefined further down
+	pipe($r, $w) or BAIL_OUT "pipe: $!";
+	close $r; # no reader: writes to $w are expected to fail
+	my $broken = fileno($w);
+	for (1..$nr) {
+		my $obj = PublicInbox::Gcf2::new();
+		if (defined($objdir)) {
+			$obj->add_alternate($objdir);
+			for (1..$cat) {
+				$obj->cat_oid($fd, $COPYING);
+				eval { $obj->cat_oid($broken, $COPYING) };
+				$obj->cat_oid($fd, '0'x40);
+				$obj->cat_oid($fd, 'invalid');
+			}
+		}
+	}
+}
+done_testing;
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use PublicInbox::TestCommon;
+use Test::More;
+use Cwd qw(getcwd);
+use PublicInbox::Import;
+use PublicInbox::DS;
+
+require_mods('PublicInbox::Gcf2');
+use_ok 'PublicInbox::Gcf2Client';
+my ($tmpdir, $for_destroy) = tmpdir();
+my $git_a = "$tmpdir/a.git";
+my $git_b = "$tmpdir/b.git";
+PublicInbox::Import::init_bare($git_a);
+PublicInbox::Import::init_bare($git_b);
+my $fi_data = './t/git.fast-import-data';
+my $rdr = {};
+open $rdr->{0}, '<', $fi_data or BAIL_OUT $!;
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr);
+is($?, 0, 'fast-import succeeded');
+
+my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
+my $called = 0;
+my $err_f = "$tmpdir/err";
+{
+	# Exercises Gcf2Client::cat_async against two bare repos: a hit, a
+	# truncated (missing) OID, a miss in a repo without alternates, and
+	# a lookup after an alternates file has been written.  Each callback
+	# bumps $called so the total can be verified after the block.
+	PublicInbox::DS->Reset;
+	open my $err, '>>', $err_f or BAIL_OUT $!;
+	my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->cat_async("$tree $git_a", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'got expected OID');
+		is($size, 30, 'got expected length');
+		is($type, 'tree', 'got tree type');
+		is(length($$bref), 30, 'got a tree');
+		is($arg, 'hi', 'arg passed');
+		$called++;
+	}, 'hi');
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+
+	# re-open the stderr file for reading to inspect what was logged
+	open $err, '<', $err_f or BAIL_OUT $!;
+	my $estr = do { local $/; <$err> };
+	is($estr, '', 'nothing in stderr');
+
+	my $trunc = substr($tree, 0, 39); # one hex digit short of a full OID
+	$gcf2c->cat_async("$trunc $git_a", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		# got first, expected second, matching the other checks here
+		is($bref, undef, 'missing bref is undef');
+		is($oid, $trunc, 'truncated OID printed');
+		is($type, 'missing', 'type is "missing"');
+		is($size, undef, 'size is undef');
+		is($arg, 'bye', 'arg passed when missing');
+		$called++;
+	}, 'bye');
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry');
+
+	# try failed alternates lookup
+	PublicInbox::DS->Reset;
+	open $err, '>', $err_f or BAIL_OUT $!; # '>' truncates the old log
+	$gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->cat_async("$tree $git_b", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($bref, undef, 'missing bref from alt is undef');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry before alt update');
+
+	# now try successful alternates lookup
+	open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!;
+	print $alt "$git_a/objects\n" or BAIL_OUT $!;
+	close $alt or BAIL_OUT $!; # report the reason, like the open/print above
+	my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]);
+	$gcf2c->cat_async("$tree $git_a", sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'oid match on alternates retry');
+		is($$bref, $expect, 'tree content matched');
+		$called++;
+	});
+	$gcf2c->cat_async_step($gcf2c->{inflight});
+}
+is($called, 4, 'cat_async callbacks hit');
+done_testing;
is(length($$x), $size, 'read correct number of bytes');
my $ref = $gcf->qx(qw(cat-file blob), $buf);
+ is($?, 0, 'no error on scalar success');
my @ref = $gcf->qx(qw(cat-file blob), $buf);
+ is($?, 0, 'no error on wantarray success');
my $nl = scalar @ref;
ok($nl > 1, "qx returned array length of $nl");
+ is(join('', @ref), $ref, 'qx array and scalar context both work');
$gcf->qx(qw(repack -adq));
ok($gcf->packed_bytes > 0, 'packed size is positive');
+ $gcf->qx(qw(rev-parse --verify bogus));
+ isnt($?, 0, '$? set on failure'.$?);
}
SKIP: {
use_ok 'PublicInbox::IdxStack';
my $oid_a = '03c21563cf15c241687966b5b2a3f37cdc193316';
my $oid_b = '963caad026055ab9bcbe3ee9550247f9d8840feb';
+my $cmt_a = 'df8e4a0612545d53672036641e9f076efc94c2f6';
+my $cmt_b = '3ba7c9fa4a083c439e768882c571c2026a981ca5';
my $stk = PublicInbox::IdxStack->new;
is($stk->read_prepare, $stk, 'nothing');
is($stk->pop_rec, undef, 'undef on empty');
$stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
is($stk->read_prepare, $stk, 'read_prepare');
is($stk->num_records, 1, 'num_records');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop once');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop once');
is($stk->pop_rec, undef, 'undef on empty');
$stk = PublicInbox::IdxStack->new;
-$stk->push_rec('m', 1234, 5678, $oid_a);
-$stk->push_rec('d', 1234, 5678, $oid_b);
+$stk->push_rec('m', 1234, 5678, $oid_a, $cmt_a);
+$stk->push_rec('d', 1234, 5678, $oid_b, $cmt_b);
is($stk->read_prepare, $stk, 'read_prepare');
is($stk->num_records, 2, 'num_records');
-is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b], 'pop');
-is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop-pop');
+is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b, $cmt_b], 'pop');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a, $cmt_a], 'pop-pop');
is($stk->pop_rec, undef, 'empty');
SKIP: {
while (<$fh>) {
chomp;
my ($at, $ct, $H) = split(/\./);
- $stk //= PublicInbox::IdxStack->new($H);
+ $stk //= PublicInbox::IdxStack->new;
# not bothering to parse blobs here, just using commit OID
# as a blob OID since they're the same size + format
- $stk->push_rec('m', $at + 0, $ct + 0, $H);
- push(@expect, [ 'm', $at, $ct, $H ]);
+ $stk->push_rec('m', $at + 0, $ct + 0, $H, $H);
+ push(@expect, [ 'm', $at, $ct, $H, $H ]);
}
$stk or skip('nothing from git log', 3);
is($stk->read_prepare, $stk, 'read_prepare');
my $have_inotify = eval { require Linux::Inotify2; 1 };
-my $pi_config = PublicInbox::Config->new;
-$pi_config->each_inbox(sub {
+my $pi_cfg = PublicInbox::Config->new;
+$pi_cfg->each_inbox(sub {
my ($ibx) = @_;
my $env = { ORIGINAL_RECIPIENT => $ibx->{-primary_address} };
my $name = $ibx->{name};
$sidx->set_metadata_once;
$sidx->idx_release; # allow watching on lockfile
}
- my $pi_config = PublicInbox::Config->new(\<<EOF);
+ my $pi_cfg = PublicInbox::Config->new(\<<EOF);
publicinbox.inbox-idle.inboxdir=$inboxdir
publicinbox.inbox-idle.indexlevel=basic
publicinbox.inbox-idle.address=test\@example.com
EOF
my $ident = 'whatever';
- $pi_config->each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
- my $ii = PublicInbox::InboxIdle->new($pi_config);
+ $pi_cfg->each_inbox(sub { shift->subscribe_unlock($ident, $obj) });
+ my $ii = PublicInbox::InboxIdle->new($pi_cfg);
ok($ii, 'InboxIdle created');
SKIP: {
skip('inotify or kqueue missing', 1) unless $ii->{sock};
PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
$ii->event_step;
is(scalar @{$obj->{called}}, 1, 'called on unlock');
- $pi_config->each_inbox(sub { shift->unsubscribe_unlock($ident) });
+ $pi_cfg->each_inbox(sub { shift->unsubscribe_unlock($ident) });
ok($im->add(eml_load('t/data/0001.patch')), "$V added #2");
$im->done;
PublicInbox::SearchIdx->new($ibx)->index_sync if $V == 1;
EOF
ok(run_script(['-mda'], $env, $opt), 'message delivered');
}
- my $config = PublicInbox::Config->new;
- my $ibx = $config->lookup_name($v);
+ my $cfg = PublicInbox::Config->new;
+ my $ibx = $cfg->lookup_name($v);
# make sure all serials are searchable:
for my $i (1..2) {
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::InboxWritable;
+require_mods(qw(Search::Xapian DBD::SQLite));
+use_ok 'PublicInbox::MiscSearch';
+use_ok 'PublicInbox::MiscIdx';
+
+my ($tmp, $for_destroy) = tmpdir();
+my $eidx = { xpfx => "$tmp/eidx", -no_fsync => 1 }; # mock ExtSearchIdx
+{
+	# create the v1 inbox directory and give it a description file; the
+	# phrase searches further down are expected to match this text
+	my $v1_dir = "$tmp/v1";
+	mkdir($v1_dir) or BAIL_OUT "mkdir $!";
+	open(my $desc, '>', "$v1_dir/description") or BAIL_OUT "open: $!";
+	print {$desc} "Everything sucks this year\n" or BAIL_OUT "print $!";
+	close($desc) or BAIL_OUT "close $!";
+}
+{
+	# build a minimal v1 inbox and index it into the mock ExtSearchIdx
+	# inside a single transaction
+	my %ibx_opt = (
+		inboxdir => "$tmp/v1",
+		name => 'hope',
+		address => [ 'nope@example.com' ],
+		indexlevel => 'basic',
+		version => 1,
+	);
+	my $v1 = PublicInbox::InboxWritable->new(\%ibx_opt);
+	$v1->init_inbox;
+	my $misc_idx = PublicInbox::MiscIdx->new($eidx);
+	$misc_idx->begin_txn;
+	$misc_idx->index_ibx($v1);
+	$misc_idx->commit_txn;
+}
+
+my $ms = PublicInbox::MiscSearch->new("$tmp/eidx/misc"); # "misc" directory under the xpfx given to MiscIdx above
+my $mset = $ms->mset('"everything sucks today"'); # quoted phrase that differs from the description file
+is(scalar($mset->items), 0, 'no match on description phrase');
+
+$mset = $ms->mset('"everything sucks this year"');
+is(scalar($mset->items), 1, 'match phrase on description');
+
+$mset = $ms->mset('everything sucks');
+is(scalar($mset->items), 1, 'match words in description');
+
+$mset = $ms->mset('nope@example.com');
+is(scalar($mset->items), 1, 'match full address');
+
+$mset = $ms->mset('nope');
+is(scalar($mset->items), 1, 'match partial address');
+
+$mset = $ms->mset('hope');
+is(scalar($mset->items), 1, 'match name');
+my $mi = ($mset->items)[0]; # the single hit from the name search
+my $doc = $mi->get_document;
+is($doc->get_data, '{}', 'stored empty data');
+
+done_testing;
my %mid2num;
my %num2mid;
my @mids = qw(a@b c@d e@f g@h aa@bb aa@cc);
-is_deeply([$d->minmax], [undef,undef], "empty min max on new DB");
+is_deeply([$d->minmax], [0,0], "zero min max on new DB");
foreach my $mid (@mids) {
my $n = $d->mid_insert($mid);
require_mods(qw(DBD::SQLite Data::Dumper));
use_ok 'PublicInbox::NNTP';
use_ok 'PublicInbox::Inbox';
+use PublicInbox::Config;
{
sub quote_str {
{ # test setting NNTP headers in HEAD and ARTICLE requests
my $u = 'https://example.com/a/';
- my $ng = PublicInbox::Inbox->new({ name => 'test',
+ my $ibx = PublicInbox::Inbox->new({ name => 'test',
inboxdir => 'test.git',
address => 'a@example.com',
-primary_address => 'a@example.com',
newsgroup => 'test',
domain => 'example.com',
url => [ '//example.com/a' ]});
- is($ng->base_url, $u, 'URL expanded');
+ is($ibx->base_url, $u, 'URL expanded');
my $mid = 'a@b';
my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n");
my $hdr = $mime->header_obj;
my $mock_self = {
- nntpd => { grouplist => [], servername => 'example.com' },
- ng => $ng,
+ nntpd => {
+ servername => 'example.com',
+ pi_cfg => bless {}, 'PublicInbox::Config',
+ },
+ ibx => $ibx,
};
- my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };
+ my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ibx };
PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
- is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
- 'Archived-At: set');
- is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ],
- 'List-Archive: set');
- is_deeply([ $mime->header('List-Post') ], [ '<mailto:a@example.com>' ],
- 'List-Post: set');
is_deeply([ $mime->header('Newsgroups') ], [ 'test' ],
'Newsgroups: set');
is_deeply([ $mime->header('Xref') ], [ 'example.com test:1' ],
'Xref: set');
- $ng->{-base_url} = 'http://mirror.example.com/m/';
+ $ibx->{-base_url} = 'http://mirror.example.com/m/';
$smsg->{num} = 2;
PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
- is_deeply([ $mime->header('Archived-At') ],
- [ "<${u}a\@b/>", '<http://mirror.example.com/m/a@b/>' ],
- 'Archived-At: appended');
is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ],
'Old Xref: clobbered');
}
'WAL journal_mode not clobbered if manually set');
}
+# ext index additions
+$over->eidx_prep;
+{
+	# add_xref3/get_xref3/remove_xref3 round trip.  Going by the checks
+	# below, add_xref3 takes (docid, num, oid, key) and get_xref3 reports
+	# each entry as "key:num:oid"; remove_xref3 takes (docid, oid, key).
+	my @arg = qw(1349 2019 adeadba7cafe example.key);
+	ok($over->add_xref3(@arg), 'first add');
+	ok($over->add_xref3(@arg), 'add idempotent');
+	my $xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe' ], 'xref3 works');
+
+	@arg = qw(1349 2018 deadbeefcafe example.kee);
+	ok($over->add_xref3(@arg), 'add another xref3');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.key:2019:adeadba7cafe',
+			'example.kee:2018:deadbeefcafe' ],
+			'xref3 works for two');
+
+	@arg = qw(1349 adeadba7cafe example.key);
+	is($over->remove_xref3(@arg), 1, 'remove first');
+	$xref3 = $over->get_xref3(1349);
+	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
+		'confirm removal successful');
+	$over->rollback_lazy; # presumably discards the rows added above
+}
+
done_testing();
use PublicInbox::Import;
use PublicInbox::Git;
use PublicInbox::Config;
-my $config = PublicInbox::Config->new(\<<EOF);
+my $cfg = PublicInbox::Config->new(\<<EOF);
$cfgpfx.address=$addr
$cfgpfx.inboxdir=$maindir
EOF
$im->done;
}
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
my $app = builder(sub {
enable('Head');
mount('/a' => builder(sub { sub { $www->call(@_) } }));
$res = $cb->(GET('/a/test/blah%40example.com/raw'));
is($res->code, 200, 'OK with URLMap mount');
- like($res->content, qr!^List-Archive: <http://[^/]+/a/test/>!m,
- 'List-Archive set in /raw mboxrd');
like($res->content,
- qr!^Archived-At: <http://[^/]+/a/test/blah\@example\.com/>!m,
- 'Archived-At set in /raw mboxrd');
+ qr/^Message-Id: <blah\@example\.com>\n/sm,
+ 'headers appear in /raw');
# redirects
$res = $cb->(GET('/a/test/m/blah%40example.com.html'));
SKIP: {
require_mods(qw(DBD::SQLite Search::Xapian IO::Uncompress::Gunzip), 3);
- my $ibx = $config->lookup_name('test');
+ my $ibx = $cfg->lookup_name('test');
require_ok 'PublicInbox::SearchIdx';
PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
test_psgi($app, sub {
my $gz = $res->content;
my $raw;
IO::Uncompress::Gunzip::gunzip(\$gz => \$raw);
- like($raw, qr!^List-Archive: <http://[^/]+/a/test/>!m,
- 'List-Archive set in /t.mbox.gz mboxrd');
- like($raw,
- qr!^Archived-At:\x20
- <http://[^/]+/a/test/blah\@example\.com/>!mx,
- 'Archived-At set in /t.mbox.gz mboxrd');
+ like($raw, qr!^Message-Id:\x20<blah\@example\.com>\n!sm,
+ 'headers appear in /t.mbox.gz mboxrd');
});
}
PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
my $cfgpfx = "publicinbox.test";
-my $config = PublicInbox::Config->new(\<<EOF);
+my $cfg = PublicInbox::Config->new(\<<EOF);
$cfgpfx.address=git\@vger.kernel.org
$cfgpfx.inboxdir=$tmpdir
EOF
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
test_psgi(sub { $www->call(@_) }, sub {
my ($cb) = @_;
my $res;
$xdb->set_metadata('has_threadid', '0');
$sidx->idx_release;
}
- $config->each_inbox(sub { delete $_[0]->{search} });
+ $cfg->each_inbox(sub { delete $_[0]->{search} });
$res = $cb->(GET('/test/?q=s:test'));
is($res->code, 200, 'successful search w/o has_threadid');
unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
'"From_" line stored to test old bug workaround');
my $cfgpfx = "publicinbox.v2test";
-my $cfg = <<EOF;
+my $cfg = PublicInbox::Config->new(\<<EOF);
$cfgpfx.address=$ibx->{-primary_address}
$cfgpfx.inboxdir=$inboxdir
EOF
-my $config = PublicInbox::Config->new(\$cfg);
-my $www = PublicInbox::WWW->new($config);
+my $www = PublicInbox::WWW->new($cfg);
my ($res, $raw, @from_);
my $client0 = sub {
my ($cb) = @_;
like($raw, qr/^hello ghosts$/m, 'got third message');
@from_ = ($raw =~ m/^From /mg);
is(scalar(@from_), 3, 'three From_ lines');
- $config->each_inbox(sub { $_[0]->search->reopen });
+ $cfg->each_inbox(sub { $_[0]->search->reopen });
SKIP: {
eval { require IO::Uncompress::Gunzip };
$im->done;
my @h = $mime->header('Message-ID');
is_deeply($exp, \@h, 'reused existing Message-ID');
- $config->each_inbox(sub { $_[0]->search->reopen });
+ $cfg->each_inbox(sub { $_[0]->search->reopen });
}
my $client2 = sub {
ok($im->add($mime), "added attachment $body");
}
$im->done;
- $config->each_inbox(sub { $_[0]->search->reopen });
+ $cfg->each_inbox(sub { $_[0]->search->reopen });
}
my $client3 = sub {
like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears');
my $doc = $m->get_document;
my $col = PublicInbox::Search::BYTES();
- my $bytes = PublicInbox::Smsg::get_val($doc, $col);
+ my $bytes = PublicInbox::Search::int_val($doc, $col);
like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit');
ok($bytes > 0, '$bytes is > 0');
is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over');
$col = PublicInbox::Search::UID();
- my $uid = PublicInbox::Smsg::get_val($doc, $col);
+ my $uid = PublicInbox::Search::int_val($doc, $col);
is($uid, $smsg->{num}, 'UID column matches {num}');
is($uid, $m->get_docid, 'UID column matches docid');
}
});
done_testing();
-
-1;
$mime->header_set('Message-ID', "<$y>");
$mime->header_set('References', "<$x>");
ok($im->add($mime), 'add excessively long References');
- $im->barrier;
+ $im->done;
my $msgs = $ibx->over->get_thread('x'x244);
is(2, scalar(@$msgs), 'got both messages');
is($msgs->[0]->{mid}, 'x'x244, 'stored truncated mid');
is($msgs->[1]->{references}, '<'.('x'x244).'>', 'stored truncated ref');
is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)');
- $im->done;
}
my $tmp = {
$cfgpfx.altid=serial:alerts:file=msgmap.sqlite3
publicinboxwatch.watchspam=maildir:$spamdir
EOF
- my $config = PublicInbox::Config->new(\$orig);
- my $ibx = $config->lookup_name($v);
+ my $cfg = PublicInbox::Config->new(\$orig);
+ my $ibx = $cfg->lookup_name($v);
ok($ibx, 'found inbox by name');
- my $w = PublicInbox::Watch->new($config);
+ my $w = PublicInbox::Watch->new($cfg);
for my $i (1..2) {
$w->scan('full');
}
}
$w->scan('full');
- $config = PublicInbox::Config->new(\$orig);
- $ibx = $config->lookup_name($v);
+ $cfg = PublicInbox::Config->new(\$orig);
+ $ibx = $cfg->lookup_name($v);
is($ibx->search->reopen->mset('b:spam')->size, 0, 'spam removed');
is_deeply([], \@warn, 'no warnings');
{
my @w;
local $SIG{__WARN__} = sub { push @w, @_ };
- my $config = PublicInbox::Config->new(\<<EOF);
+ my $cfg = PublicInbox::Config->new(\<<EOF);
$cfgpfx.address=$addr
$cfgpfx.inboxdir=$git_dir
$cfgpfx.watch=maildir:$spamdir
publicinboxlearn.watchspam=maildir:$spamdir
EOF
- my $wm = PublicInbox::Watch->new($config);
+ my $wm = PublicInbox::Watch->new($cfg);
is(scalar grep(/is a spam folder/, @w), 1, 'got warning about spam');
is_deeply($wm->{mdmap}, { "$spamdir/cur" => 'watchspam' },
'only got the spam folder to watch');
close $fh or BAIL_OUT $!;
}
-my $config = PublicInbox::Config->new($cfg_path);
-PublicInbox::Watch->new($config)->scan('full');
+my $cfg = PublicInbox::Config->new($cfg_path);
+PublicInbox::Watch->new($cfg)->scan('full');
my $git = PublicInbox::Git->new($git_dir);
my @list = $git->qx(qw(rev-list refs/heads/master));
is(scalar @list, 1, 'one revision in rev-list');
};
$write_spam->();
is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
@list = $git->qx(qw(rev-list refs/heads/master));
is(scalar @list, 2, 'two revisions in rev-list');
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
the body of a message to majordomo\@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html\n);
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
is(scalar @list, 1, 'tree has one file');
my $mref = $git->cat_file('HEAD:'.$list[0]);
is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
$write_spam->();
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
is(scalar @list, 0, 'tree is empty');
@list = $git->qx(qw(rev-list refs/heads/master));
my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
local $ENV{PATH} = $fail_path;
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- $config->{'publicinboxwatch.spamcheck'} = 'spamc';
+ $cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
{
local $SIG{__WARN__} = sub {}; # quiet spam check warning
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
}
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
is(scalar @list, 0, 'tree has no files spamc checked');
my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
local $ENV{PATH} = $main_path;
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- $config->{'publicinboxwatch.spamcheck'} = 'spamc';
+ $cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
is(scalar @list, 1, 'tree has one file after spamc checked');
$delivered++;
};
PublicInbox::DS->Reset;
- my $ii = PublicInbox::InboxIdle->new($config);
+ my $ii = PublicInbox::InboxIdle->new($cfg);
my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup';
- $config->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
+ $cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
PublicInbox::DS->SetPostLoopCallback(sub { $delivered == 0 });
# wait for -watch to setup inotify watches
$cfgpfx.filter=PublicInbox::Filter::Vger
publicinboxlearn.watchspam=maildir:$spamdir
EOF
-my $config = PublicInbox::Config->new(\$orig);
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$orig);
+my $ibx = $cfg->lookup_name('test');
ok($ibx, 'found inbox by name');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
my $total = scalar @{$ibx->over->recent};
is($total, 1, 'got one revision');
};
$write_spam->();
is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::Watch->new($config)->scan('full');
+PublicInbox::Watch->new($cfg)->scan('full');
is_deeply($ibx->over->recent, [], 'deleted file');
is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
the body of a message to majordomo\@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html\n);
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
my $msgs = $ibx->over->recent;
is(scalar(@$msgs), 1, 'got one file back');
my $mref = $ibx->msg_by_smsg($msgs->[0]);
is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
$write_spam->();
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
$msgs = $ibx->over->recent;
is(scalar(@$msgs), 0, 'inbox is empty again');
is(unlink(glob("$spamdir/cur/*")), 1, 'unlinked trained spam');
my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
local $ENV{PATH} = $fail_path;
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- $config->{'publicinboxwatch.spamcheck'} = 'spamc';
+ $cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
{
local $SIG{__WARN__} = sub {}; # quiet spam check warning
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
}
my $msgs = $ibx->over->recent;
is(scalar(@$msgs), 0, 'inbox is still empty');
my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
local $ENV{PATH} = $main_path;
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- $config->{'publicinboxwatch.spamcheck'} = 'spamc';
- PublicInbox::Watch->new($config)->scan('full');
+ $cfg->{'publicinboxwatch.spamcheck'} = 'spamc';
+ PublicInbox::Watch->new($cfg)->scan('full');
my $msgs = $ibx->over->recent;
is(scalar(@$msgs), 1, 'inbox has one mail after spamc OK-ed a message');
my $mref = $ibx->msg_by_smsg($msgs->[0]);
like($$mref, qr/something\n\z/s, 'message scrubbed on import');
- delete $config->{'publicinboxwatch.spamcheck'};
+ delete $cfg->{'publicinboxwatch.spamcheck'};
}
{
open my $fh, '<', $patch or die "failed to open $patch: $!\n";
$msg = do { local $/; <$fh> };
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
my $post = $ibx->search->reopen->mset('dfpost:6e006fd7');
is($post->size, 1, 'diff postimage found');
my $pre = $ibx->search->mset('dfpre:090d998b6c2c');
my $v1pfx = "publicinbox.v1";
my $v1addr = 'v1-public@example.com';
PublicInbox::Import::init_bare($v1repo);
- my $cfg2 = <<EOF;
+ my $raw = <<EOF;
$orig$v1pfx.address=$v1addr
$v1pfx.inboxdir=$v1repo
$v1pfx.watch=maildir:$maildir
EOF
- my $config = PublicInbox::Config->new(\$cfg2);
+ my $cfg = PublicInbox::Config->new(\$raw);
my $both = <<EOF;
From: user\@example.com
To: $addr, $v1addr
both
EOF
PublicInbox::Emergency->new($maildir)->prepare(\$both);
- PublicInbox::Watch->new($config)->scan('full');
+ PublicInbox::Watch->new($cfg)->scan('full');
my $mset = $ibx->search->reopen->mset('m:both@b.com');
my $msgs = $ibx->search->mset_to_smsg($ibx, $mset);
- my $v1 = $config->lookup_name('v1');
+ my $v1 = $cfg->lookup_name('v1');
my $msg = $v1->git->cat_file($msgs->[0]->{blob});
is($both, $$msg, 'got original message back from v1');
$msg = $ibx->git->cat_file($msgs->[0]->{blob});
X-Mailing-List: no@example.com
Message-ID: <do.not.want@example.com>
EOF
- my $cfg = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
+ my $raw = $orig."$cfgpfx.listid=i.want.you.to.want.me\n";
PublicInbox::Emergency->new($maildir)->prepare(\$want);
PublicInbox::Emergency->new($maildir)->prepare(\$do_not_want);
- my $config = PublicInbox::Config->new(\$cfg);
- PublicInbox::Watch->new($config)->scan('full');
- $ibx = $config->lookup_name('test');
+ my $cfg = PublicInbox::Config->new(\$raw);
+ PublicInbox::Watch->new($cfg)->scan('full');
+ $ibx = $cfg->lookup_name('test');
my $num = $ibx->mm->num_for('do.want@example.com');
ok(defined $num, 'List-ID matched for watch');
$num = $ibx->mm->num_for('do.not.want@example.com');
is($num, undef, 'unaccepted List-ID matched for watch');
- $cfg = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
- $config = PublicInbox::Config->new(\$cfg);
- PublicInbox::Watch->new($config)->scan('full');
- $ibx = $config->lookup_name('test');
+ $raw = $orig."$cfgpfx.watchheader=X-Mailing-List:no\@example.com\n";
+ $cfg = PublicInbox::Config->new(\$raw);
+ PublicInbox::Watch->new($cfg)->scan('full');
+ $ibx = $cfg->lookup_name('test');
$num = $ibx->mm->num_for('do.not.want@example.com');
ok(defined $num, 'X-Mailing-List matched');
}
PublicInbox::Emergency->new($maildir)->prepare(\$msg_cc);
PublicInbox::Emergency->new($maildir)->prepare(\$msg_none);
-my $cfg = <<EOF;
+my $raw = <<EOF;
$cfgpfx.address=$addr
$cfgpfx.inboxdir=$inboxdir
$cfgpfx.watch=maildir:$maildir
$cfgpfx.watchheader=To:$addr
$cfgpfx.watchheader=Cc:$addr
EOF
-my $config = PublicInbox::Config->new(\$cfg);
-PublicInbox::Watch->new($config)->scan('full');
-my $ibx = $config->lookup_name('test');
+my $cfg = PublicInbox::Config->new(\$raw);
+PublicInbox::Watch->new($cfg)->scan('full');
+my $ibx = $cfg->lookup_name('test');
ok($ibx, 'found inbox by name');
my $num = $ibx->mm->num_for('to@a.com');
my ($tmpdir, $for_destroy) = tmpdir();
my $bare = PublicInbox::Git->new("$tmpdir/bare.git");
PublicInbox::Import::init_bare($bare->{git_dir});
-is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
- 'empty repo has no fingerprint');
+is($bare->manifest_entry, undef, 'empty repo has no manifest entry');
{
my $fi_data = './t/git.fast-import-data';
open my $fh, '<', $fi_data or die "open $fi_data: $!";
'fast-import');
}
-like(PublicInbox::ManifestJsGz::fingerprint($bare), qr/\A[a-f0-9]{40}\z/,
+like($bare->manifest_entry->{fingerprint}, qr/\A[a-f0-9]{40}\z/,
'got fingerprint with non-empty repo');
sub tiny_test {
select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
my $mime_ctx = {
env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
- -inbox => $ibx,
+ ibx => $ibx,
www => Plack::Util::inline_object(style => sub {''}),
obuf => \(my $mime_buf = ''),
mhref => '../',
--- /dev/null
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use File::Path qw(mkpath);
+use IO::Handle (); # autoflush
+use POSIX qw(_exit);
+use Cwd qw(getcwd abs_path);
+use File::Spec;
+# This test is opt-in: it only runs when TEST_MANY_ROOT names a directory
+# to populate, and that directory must lie outside the current directory.
+my $many_root = $ENV{TEST_MANY_ROOT} or
+	plan skip_all => 'TEST_MANY_ROOT not defined';
+my $cwd = getcwd();
+mkpath($many_root);
+-d $many_root or BAIL_OUT "$many_root: $!";
+$many_root = abs_path($many_root);
+# reject $cwd itself as well as anything below it
+$many_root =~ m!\A\Q$cwd\E(?:/|\z)! and
+	BAIL_OUT "$many_root must not be in $cwd";
+require_git 2.6;
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::V2Writable';
+# tunables, all overridable from the environment
+my $nr_inbox = $ENV{NR_INBOX} // 10;
+my $nproc = $ENV{NPROC} || PublicInbox::V2Writable::detect_nproc() || 2;
+my $indexlevel = $ENV{TEST_INDEXLEVEL} // 'basic';
+diag "NR_INBOX=$nr_inbox NPROC=$nproc TEST_INDEXLEVEL=$indexlevel";
+diag "TEST_MANY_ROOT=$many_root";
+# no indexlevel line is written to the config when the level is 'full'
+my $level_cfg = $indexlevel eq 'full' ? '' : "\tindexlevel = $indexlevel\n";
+my $pfx = "$many_root/$nr_inbox-$indexlevel";
+mkpath($pfx);
+# opened for append, so anything already in an existing config is kept
+open my $cfg_fh, '>>', "$pfx/config" or BAIL_OUT $!;
+# unbuffered, so every entry printed to this handle is written immediately
+$cfg_fh->autoflush(1);
+# Create the v2 inbox "test-$num" below $pfx: append its section to the
+# shared config file, initialize the inbox and store one message in it.
+my $v2_init_add = sub {
+	my ($num) = @_;
+	my $name = "test-$num";
+	my %opt = (
+		inboxdir => "$pfx/$name",
+		name => $name,
+		newsgroup => "inbox.comp.test.foo.$name",
+		address => [ "$name\@example.com" ],
+		url => [ "//example.com/$name" ],
+		version => 2,
+	);
+	my $ibx = PublicInbox::Inbox->new(\%opt);
+	unless ($level_cfg eq '') {
+		$ibx->{indexlevel} = $indexlevel;
+	}
+	my $section = <<EOF;
+[publicinbox "$ibx->{name}"]
+ address = $ibx->{-primary_address}
+ url = $ibx->{url}->[0]
+ newsgroup = $ibx->{newsgroup}
+ inboxdir = $ibx->{inboxdir}
+EOF
+	# $level_cfg is either empty or a complete "indexlevel = ..." line
+	print {$cfg_fh} $section.$level_cfg or die $!;
+	my $writer = PublicInbox::V2Writable->new($ibx, { nproc => 0 });
+	$writer->init_inbox(0);
+	my $raw = <<EOM;
+Date: Sat, 02 Oct 2010 00:00:00 +0000
+From: Lorelei <l\@example.com>
+To: test-$num\@example.com
+Message-ID: <20101002-000000-$num\@example.com>
+Subject: hello world $num
+
+hi
+EOM
+	$writer->add(PublicInbox::Eml->new($raw));
+	$writer->done;
+};
+
+# Spawn $nproc workers.  Each one gets its own pipe and creates one inbox
+# per line (an inbox number) read from that pipe, until EOF.
+my @children; # [ write end of the pipe, PID ] for every worker
+for my $i (1..$nproc) {
+	my ($r, $w);
+	pipe($r, $w) or BAIL_OUT $!;
+	my $pid = fork;
+	# Check for failure before anything else: "undef == 0" is true,
+	# so testing for the child first would make the parent run the
+	# worker loop and _exit(0) when fork fails.
+	defined $pid or BAIL_OUT "fork: $!";
+	if ($pid == 0) {
+		close $w;
+		# drop the write ends inherited from earlier workers, so
+		# each of them sees EOF once the parent closes its pipe
+		close($_->[0]) for @children;
+		while (my $n = <$r>) {
+			chomp $n;
+			$v2_init_add->($n);
+		}
+		# leave without running END blocks in the worker
+		_exit(0);
+	}
+	close $r or BAIL_OUT $!;
+	push @children, [ $w, $pid ];
+	$w->autoflush(1);
+}
+
+# Hand the inbox numbers out to the workers round-robin.
+# NOTE(review): 0..$nr_inbox is $nr_inbox + 1 inboxes (test-0 through
+# test-$nr_inbox) -- confirm the extra one is intended.
+for my $i (0..$nr_inbox) {
+	print { $children[$i % @children]->[0] } "$i\n" or BAIL_OUT $!;
+}
+
+# closing the write ends is what tells the workers to finish
+for my $c (@children) {
+	close $c->[0] or BAIL_OUT "close $!";
+}
+my $i = 0;
+for my $c (@children) {
+	# $? is only meaningful when waitpid actually reaped this worker
+	my $pid = waitpid($c->[1], 0);
+	$pid == $c->[1] or BAIL_OUT "waitpid($c->[1]): $!";
+	is($?, 0, ++$i.' exited ok');
+}
+ok(close($cfg_fh), 'config written');
+done_testing;
my $ctx = {
env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
- -inbox => $ibx,
+ ibx => $ibx,
www => Plack::Util::inline_object(style => sub {''}),
};
my ($mime, $res, $oid, $type);
diag "enquire: ".timestr($elapsed)." for $n";
$elapsed = timeit(1, sub {
- PublicInbox::View::thread_results({-inbox => $ibx}, $msgs);
+ PublicInbox::View::thread_results({ibx => $ibx}, $msgs);
});
diag "thread_results ".timestr($elapsed);