+++ /dev/null
-=head1 NAME
-
-public-inbox-gcf2 - internal libgit2-based blob retriever
-
-=head1 SYNOPSIS
-
- This is an internal command used by public-inbox.
- It may change unrecognizably or cease to exist at some point
-
-=head1 DESCRIPTION
-
-public-inbox-gcf2 is an optional internal process used by
-public-inbox daemons for read-only access to underlying git
-repositories.
-
-Users are NOT expected to run public-inbox-gcf2 on their own.
-It replaces multiple C<git cat-file --batch> processes by treating
-any git repos it knows about as alternates.
-
-None of its behaviors are stable and it is ALL subject to change
-at any time.
-
-Any lines written to its standard input prefixed with a C</>
-are interpreted as a git directory. That git directory
-will be suffixed with "/objects" and treated as an alternate.
-It writes nothing to stdout in this case.
-
-Otherwise it behaves like C<git cat-file --batch>, but only accepts
-unabbreviated hexadecimal object IDs in its standard input.
-Its output format is identical to C<git cat-file --batch>. It
-only works for L<public-inbox-v2-format(5)> inboxes and v1
-inboxes indexed by L<public-inbox-index(1)>.
-
-=head1 OPTIONS
-
-=head1 ENVIRONMENT
-
-=over 8
-
-=item PERL_INLINE_DIRECTORY
-
-This must be set unless C<~/.cache/public-inbox/inline-c>
-exists. C<public-inbox-gcf2> uses L<Inline::C> and libgit2
-and compiles a small shim on its first run.
-
-=back
-
-=head1 CONTACT
-
-Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org>
-
-The mail archives are hosted at L<https://public-inbox.org/meta/>
-and L<http://hjrcffqmbrq6wope.onion/meta/>
-
-=head1 COPYRIGHT
-
-Copyright 2020 all contributors L<mailto:meta@public-inbox.org>
-
-License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
-
-=head1 SEE ALSO
-
-L<git-cat-file(1)>
Documentation/public-inbox-convert.pod
Documentation/public-inbox-daemon.pod
Documentation/public-inbox-edit.pod
-Documentation/public-inbox-gcf2.pod
Documentation/public-inbox-httpd.pod
Documentation/public-inbox-imapd.pod
Documentation/public-inbox-index.pod
script/public-inbox-compact
script/public-inbox-convert
script/public-inbox-edit
-script/public-inbox-gcf2
script/public-inbox-httpd
script/public-inbox-imapd
script/public-inbox-index
$v->{rsync_docs} = [ @{$v->{gz_docs}}, @{$v->{docs}},
@{$v->{docs_html}}, qw(NEWS.atom NEWS.atom.gz)];
-# filter out public-inbox-gcf2 from the website, it's an internal command
-for my $var (qw(gz_docs rsync_docs)) {
- @{$v->{$var}} = grep(!/-gcf2/, @{$v->{$var}});
-}
-
# external manpages which we host ourselves, since some packages
# (currently just Xapian) don't host manpages themselves.
my @xman = qw(copydatabase.1 xapian-compact.1);
require PublicInbox::Listener;
use PublicInbox::EOFpipe;
use PublicInbox::Sigfd;
+use PublicInbox::GitAsyncCat;
my @CMD;
my ($set_user, $oldset);
my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize);
daemon_prepare($default);
my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef;
my $for_destroy = daemonize();
+
+ # this wastes a bit of memory for non-PublicInbox::WWW -httpd users
+ # oh well...
+ eval {
+ require PublicInbox::Gcf2;
+ require PublicInbox::Gcf2Client;
+ };
+ local $PublicInbox::GitAsyncCat::GCF2C =
+ PublicInbox::Gcf2Client::new() if !$@;
+
daemon_loop($refresh, $post_accept, $tlsd, $af_default);
PublicInbox::DS->Reset;
# ->DESTROY runs when $for_destroy goes out-of-scope
# Copyright (C) 2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# backend for public-inbox-gcf2(1) (git-cat-file based on libgit2,
-# other libgit2 stuff may go here, too)
+# backend for a git-cat-file-workalike based on libgit2,
+# other libgit2 stuff may go here, too.
package PublicInbox::Gcf2;
use strict;
use PublicInbox::Spawn qw(which popen_rd);
use Fcntl qw(LOCK_EX);
+use IO::Handle; # autoflush
my (%CFG, $c_src, $lockfh);
BEGIN {
# PublicInbox::Spawn will set PERL_INLINE_DIRECTORY
undef $c_src;
undef %CFG;
undef $lockfh;
+
+# Usage: $^X -MPublicInbox::Gcf2 -e 'PublicInbox::Gcf2::loop()'
+# (see lib/PublicInbox/Gcf2Client.pm); each line read from stdin is "OID GIT_DIR"
+sub loop {
+ my $gcf2 = new();
+ STDERR->autoflush(1);
+ STDOUT->autoflush(1);
+
+ while (<STDIN>) {
+ chomp;
+ my ($oid, $git_dir) = split(/ /, $_, 2); # limit of 2: any further spaces stay in $git_dir
+ $gcf2->add_alternate("$git_dir/objects"); # done for every request line
+ if (!$gcf2->cat_oid(1, $oid)) { # false => not found (presumably 1 is the stdout fd -- confirm)
+ # retry once if missing. We only get unabbreviated OIDs
+ # from SQLite or Xapian DBs, here, so malicious clients
+ # can't trigger excessive retries:
+ warn "I: $$ $oid missing, retrying in $git_dir\n";
+
+ $gcf2 = new(); # start over with a fresh object for the single retry
+ $gcf2->add_alternate("$git_dir/objects");
+
+ if ($gcf2->cat_oid(1, $oid)) {
+ warn "I: $$ $oid found after retry\n";
+ } else {
+ warn "W: $$ $oid missing after retry\n";
+ print "$oid missing\n"; # mimic git-cat-file
+ }
+ }
+ }
+}
+
1;
# Copyright (C) 2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# connects public-inbox processes to public-inbox-gcf2(1)
+# connects public-inbox processes to PublicInbox::Gcf2::loop()
package PublicInbox::Gcf2Client;
use strict;
-use parent 'PublicInbox::Git';
+use parent qw(PublicInbox::DS);
+use PublicInbox::Git;
use PublicInbox::Spawn qw(popen_rd);
use IO::Handle ();
+use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLOUT);
+# fields:
+# async_cat => GitAsyncCat ref (read-only pipe)
+# sock => writable pipe to Gcf2::loop
-sub new {
- my ($rdr) = @_;
- my $self = bless {}, __PACKAGE__;
+sub new { bless($_[0] // {}, __PACKAGE__) } # an optional hashref argument becomes the object itself
+
+sub gcf2c_begin ($) { # spawn the Gcf2::loop child and register our write pipe with DS
+ my ($self) = @_;
+ # ensure the child process has the same @INC we do:
+ my $env = { PERL5LIB => join(':', @INC) };
my ($out_r, $out_w);
- pipe($out_r, $out_w) or $self->fail("pipe failed: $!");
- $rdr //= {};
- $rdr->{0} = $out_r;
- @$self{qw(in pid)} = popen_rd(['public-inbox-gcf2'], undef, $rdr);
- $self->{inflight} = [];
- $self->{out} = $out_w;
+ pipe($out_r, $out_w) or die "pipe failed: $!";
+ my $rdr = { 0 => $out_r, 2 => $self->{2} }; # child stdin is our pipe; {2} comes from the hashref given to new
+ my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop()]];
+ @$self{qw(in pid)} = popen_rd($cmd, $env, $rdr);
fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ
$out_w->autoflush(1);
- $self;
+ $out_w->blocking(0);
+ $self->SUPER::new($out_w, 0); # EPOLL_CTL_ADD (a bit wasteful :x)
+ $self->{inflight} = []; # also the return value, which cat_async uses
+}
+
+sub fail { # close our DS side first, then let PublicInbox::Git::fail abort requests and croak
+ my $self = shift;
+ $self->close; # PublicInbox::DS::close
+ PublicInbox::Git::fail($self, @_); # remaining arguments pass through unchanged
+}
+
+sub cat_async ($$$;$) { # queue $req; $cb and $arg are stored alongside it in {inflight}
+ my ($self, $req, $cb, $arg) = @_;
+ my $inflight = $self->{inflight} // gcf2c_begin($self); # child is spawned lazily on first use
+
+ # rare, I hope: {wbuf} is set, so handle one pending response before writing more
+ cat_async_step($self, $inflight) if $self->{wbuf};
+
+ $self->write(\"$req\n") or $self->fail("gcf2c write: $!");
+ push @$inflight, $req, $cb, $arg; # three slots per request, spliced off in the same order
}
-# always false, since -gcf2 retries internally
+# ensure PublicInbox::Git::cat_async_step never calls cat_async_retry
sub alternates_changed {}
+no warnings 'once';
+
+# this is the write-only end of a pipe, DS->EventLoop will call this
+*event_step = \&PublicInbox::DS::flush_write;
+
+# used by GitAsyncCat
+*cat_async_step = \&PublicInbox::Git::cat_async_step;
+
1;
my $rbuf = delete($self->{cat_rbuf}) // \(my $new = '');
my ($bref, $oid, $type, $size);
my $head = my_readline($self->{in}, $rbuf);
+ # ->fail may be called via Gcf2Client.pm
if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) {
($oid, $type, $size) = ($1, $2, $3 + 0);
$bref = my_read($self->{in}, $rbuf, $size + 1) or
- fail($self, defined($bref) ? 'read EOF' : "read: $!");
- chop($$bref) eq "\n" or fail($self, 'LF missing after blob');
+ $self->fail(defined($bref) ? 'read EOF' : "read: $!");
+ chop($$bref) eq "\n" or $self->fail('LF missing after blob');
} elsif ($head =~ s/ missing\n//s) {
$oid = $head;
# ref($req) indicates it's already been retried
$type = 'missing';
$oid = ref($req) ? $$req : $req if $oid eq '';
} else {
- fail($self, "Unexpected result from async git cat-file: $head");
+ $self->fail("Unexpected result from async git cat-file: $head");
}
eval { $cb->($bref, $oid, $type, $size, $arg) };
$self->{cat_rbuf} = $rbuf if $$rbuf ne '';
sub _destroy {
my ($self, $rbuf, $in, $out, $pid, $err) = @_;
- my $p = delete $self->{$pid} or return;
delete @$self{($rbuf, $in, $out)};
delete $self->{$err} if $err; # `err_c'
+ # GitAsyncCat::event_step may delete {pid}
+ my $p = delete $self->{$pid} or return;
+
# PublicInbox::DS may not be loaded
eval { PublicInbox::DS::dwaitpid($p, undef, undef) };
waitpid($p, 0) if $@; # wait synchronously if not in event loop
+# Abort every in-flight request, then clean up.  Each pending callback is
+# invoked once with an undef blob ref, the request's OID and its $arg so
+# the caller can tell the request failed; an exception thrown by a
+# callback is reported via warn and does not stop the remaining aborts.
+# When {inflight} is absent only the cleanup is performed.
sub cat_async_abort ($) {
my ($self) = @_;
- my $inflight = delete $self->{inflight} or die 'BUG: not in async';
+ if (my $inflight = delete $self->{inflight}) {
+ while (@$inflight) {
+ my ($req, $cb, $arg) = splice(@$inflight, 0, 3);
+ # a request which was already retried is held as a
+ # scalar ref (see cat_async_step); give the callback
+ # and the warning the OID itself, not the reference
+ $req = $$req if ref($req);
+ $req =~ s/ .*//; # drop git_dir for Gcf2Client
+ eval { $cb->(undef, $req, undef, undef, $arg) };
+ warn "E: $req: $@ (in abort)\n" if $@;
+ }
+ }
cleanup($self);
}
sub fail { # abort in-flight requests, clean up, then croak with $msg
my ($self, $msg) = @_;
- $self->{inflight} ? cat_async_abort($self) : cleanup($self);
- croak("git $self->{git_dir}: $msg");
+ cat_async_abort($self); # copes with a missing {inflight} and always ends in cleanup
+ croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); # {git_dir} may be undefined, hence the // ''
}
sub popen {
!!($self->{pid} || $self->{pid_c});
}
+
# assuming a well-maintained repo, this should be a somewhat
# accurate estimation of its size
# TODO: show this in the WWW UI as a hint to potential cloners
sub cat_async_begin { # enter async mode: afterwards {inflight} is an empty array ref
my ($self) = @_;
cleanup($self) if $self->alternates_changed;
- batch_prepare($self);
+ $self->batch_prepare; # dispatched as a method rather than a plain function call
die 'BUG: already in async' if $self->{inflight};
$self->{inflight} = [];
}
push(@$inflight, $oid, $cb, $arg);
}
-# this is safe to call inside $cb, but not guaranteed to enqueue
-# returns true if successful, undef if not.
sub async_prefetch {
my ($self, $oid, $cb, $arg) = @_;
- if (defined($self->{async_cat}) && (my $inflight = $self->{inflight})) {
+ if (my $inflight = $self->{inflight}) {
# we could use MAX_INFLIGHT here w/o the halving,
# but let's not allow one client to monopolize a git process
if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) {
package PublicInbox::GitAsyncCat;
use strict;
use parent qw(PublicInbox::DS Exporter);
+use POSIX qw(WNOHANG);
use PublicInbox::Syscall qw(EPOLLIN EPOLLET);
-our @EXPORT = qw(git_async_cat);
+our @EXPORT = qw(git_async_cat git_async_prefetch);
+use PublicInbox::Git ();
+
+our $GCF2C; # singleton PublicInbox::Gcf2Client
+
+sub close { # abort whatever the git-ish object still has in flight, then close ourselves
+ my ($self) = @_;
+
+ if (my $gitish = delete $self->{gitish}) { # delete => the abort runs at most once
+ PublicInbox::Git::cat_async_abort($gitish);
+ }
+ $self->SUPER::close; # PublicInbox::DS::close
+}
sub event_step { # process pending git-ish output, or notice that the child is gone
my ($self) = @_;
- my $gitish = $self->{gitish};
+ my $gitish = $self->{gitish} or return; # {gitish} is deleted by ->close
return $self->close if ($gitish->{in} // 0) != ($self->{sock} // 1); # the // 0 and // 1 defaults keep two undefs from comparing equal
my $inflight = $gitish->{inflight};
if ($inflight && @$inflight) {
$gitish->cat_async_step($inflight);
- $self->requeue if @$inflight || exists $gitish->{cat_rbuf};
+
+ # child death? (same {in} vs {sock} check as above, repeated after the step)
+ if (($gitish->{in} // 0) != ($self->{sock} // 1)) {
+ $self->close;
+ } elsif (@$inflight || exists $gitish->{cat_rbuf}) {
+ # ok, more to do, requeue for fairness
+ $self->requeue;
+ }
+ } elsif ((my $pid = waitpid($gitish->{pid}, WNOHANG)) > 0) { # nothing in flight: non-blocking check for an exited child
+ # May happen if the child process is killed by a BOFH
+ # (or segfaults)
+ delete $gitish->{pid}; # already reaped here, so nothing is left to wait for later
+ warn "E: gitish $pid exited with \$?=$?\n";
+ $self->close;
}
}
sub git_async_cat ($$$$) {
my ($git, $oid, $cb, $arg) = @_;
- my $gitish = $git->{gcf2c}; # PublicInbox::Gcf2Client
+ my $gitish = $GCF2C;
if ($gitish) {
$oid .= " $git->{git_dir}";
} else {
};
}
+# this is safe to call inside $cb, but not guaranteed to enqueue
+# returns true if successful, undef if not.
+sub git_async_prefetch {
+ my ($git, $oid, $cb, $arg) = @_;
+ if ($GCF2C) { # the Gcf2Client singleton is in use
+ if ($GCF2C->{async_cat} && !$GCF2C->{wbuf}) { # only enqueue while {wbuf} is unset
+ $oid .= " $git->{git_dir}"; # Gcf2::loop reads "OID GIT_DIR" lines
+ return $GCF2C->cat_async($oid, $cb, $arg);
+ }
+ } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) {
+ # we could use MAX_INFLIGHT here w/o the halving,
+ # but let's not allow one client to monopolize a git process
+ if (@$inflight < int(PublicInbox::Git::MAX_INFLIGHT/2)) {
+ print { $git->{out} } $oid, "\n" or
+ $git->fail("write error: $!");
+ return push(@$inflight, $oid, $cb, $arg); # push returns the new element count, which is true
+ }
+ }
+ undef;
+}
+
1;
}
my $pre;
if (!$self->{wbuf} && (my $nxt = $msgs->[0])) {
- $pre = $self->{ibx}->git->async_prefetch($nxt->{blob},
+ $pre = git_async_prefetch($self->{ibx}->git, $nxt->{blob},
\&fetch_blob_cb, $fetch_arg);
}
fetch_run_ops($self, $smsg, $bref, $ops, $partial);
+++ /dev/null
-#!perl -w
-# Copyright (C) 2020 all contributors <meta@public-inbox.org>
-# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-eval { require PublicInbox::Gcf2 };
-die "libgit2 development package or Inline::C missing for $0: $@\n" if $@;
-my $gcf2 = PublicInbox::Gcf2::new();
-use IO::Handle; # autoflush
-STDERR->autoflush(1);
-STDOUT->autoflush(1);
-
-while (<STDIN>) {
- chomp;
- my ($oid, $git_dir) = split(/ /, $_, 2);
- $gcf2->add_alternate("$git_dir/objects");
- if (!$gcf2->cat_oid(1, $oid)) {
- # retry once if missing. We only get unabbreviated OIDs
- # from SQLite or Xapian DBs, here, so malicious clients
- # can't trigger excessive retries:
- warn "I: $$ $oid missing, retrying in $git_dir...\n";
-
- $gcf2 = PublicInbox::Gcf2::new();
- $gcf2->add_alternate("$git_dir/objects");
-
- if ($gcf2->cat_oid(1, $oid)) {
- warn "I: $$ $oid found after retry\n";
- } else {
- warn "W: $$ $oid missing after retry\n";
- print "$oid missing\n"; # mimic git-cat-file
- }
- }
-}
require PublicInbox::HTTP;
require PublicInbox::HTTPD;
}
+
my %httpds;
my $app;
my $refresh = sub {
use Test::More;
use Cwd qw(getcwd);
use PublicInbox::Import;
+use PublicInbox::DS;
require_mods('PublicInbox::Gcf2');
use_ok 'PublicInbox::Gcf2Client';
my $called = 0;
my $err_f = "$tmpdir/err";
{
- local $ENV{PATH} = getcwd()."/blib/script:$ENV{PATH}";
- open my $err, '>', $err_f or BAIL_OUT $!;
+ PublicInbox::DS->Reset;
+ open my $err, '>>', $err_f or BAIL_OUT $!;
my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
$gcf2c->cat_async("$tree $git_a", sub {
my ($bref, $oid, $type, $size, $arg) = @_;
is($arg, 'hi', 'arg passed');
$called++;
}, 'hi');
- $gcf2c->cat_async_wait;
+ $gcf2c->cat_async_step($gcf2c->{inflight});
open $err, '<', $err_f or BAIL_OUT $!;
my $estr = do { local $/; <$err> };
is($arg, 'bye', 'arg passed when missing');
$called++;
}, 'bye');
- $gcf2c->cat_async_wait;
+ $gcf2c->cat_async_step($gcf2c->{inflight});
open $err, '<', $err_f or BAIL_OUT $!;
$estr = do { local $/; <$err> };
like($estr, qr/retrying/, 'warned about retry');
# try failed alternates lookup
+ PublicInbox::DS->Reset;
open $err, '>', $err_f or BAIL_OUT $!;
$gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
$gcf2c->cat_async("$tree $git_b", sub {
is(undef, $bref, 'missing bref from alt is undef');
$called++;
});
- $gcf2c->cat_async_wait;
+ $gcf2c->cat_async_step($gcf2c->{inflight});
open $err, '<', $err_f or BAIL_OUT $!;
$estr = do { local $/; <$err> };
like($estr, qr/retrying/, 'warned about retry before alt update');
is($$bref, $expect, 'tree content matched');
$called++;
});
- $gcf2c->cat_async_wait;
+ $gcf2c->cat_async_step($gcf2c->{inflight});
}
is($called, 4, 'cat_async callbacks hit');
done_testing;