2 # Copyright (C) 2016-2019 all contributors <meta@public-inbox.org>
3 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
5 my $usage = "$0 [-j JOBS] [-s SLOW_THRESHOLD] URL_OF_INBOX\n";
8 use File::Temp qw(tempfile);
10 use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
13 use POSIX qw(:sys_wait_h);
14 use Time::HiRes qw(gettimeofday tv_interval);
20 '-j|jobs=i' => \$nproc,
21 '-s|slow-threshold=f' => \$slow,
23 GetOptions(%opts) or die "bad command-line args\n$usage";
24 my $root_url = shift or die $usage;
26 chomp(my $xmlstarlet = `which xmlstarlet 2>/dev/null`);
27 my $atom_check = eval {
29 my $cmd = [ qw(xmlstarlet val -e -) ];
31 my ($in, $out, $err) = @_;
32 IPC::Run::run($cmd, $in, $out, $err);
37 $SIG{TERM} = sub { exit 0 };
40 my $pid = waitpid(-1, WNOHANG);
41 return if !defined $pid || $pid <= 0;
42 my $p = delete $workers{$pid} || '(unknown)';
43 warn("$pid [$p] exited with $?\n") if $?;
47 my @todo = IO::Socket->socketpair(AF_UNIX, SOCK_SEQPACKET, 0);
48 die "socketpair failed: $!" unless $todo[1];
49 my @done = IO::Socket->socketpair(AF_UNIX, SOCK_SEQPACKET, 0);
50 die "socketpair failed: $!" unless $done[1];
53 foreach my $p (1..$nproc) {
55 die "fork failed: $!\n" unless defined $pid;
61 worker_loop($todo[0], $done[1]);
65 my ($fh, $tmp) = tempfile('www-check-XXXXXXXX',
66 SUFFIX => '.gdbm', UNLINK => 1, TMPDIR => 1);
67 my $gdbm = tie my %seen, 'GDBM_File', $tmp, &GDBM_WRCREAT, 0600;
68 defined $gdbm or die "gdbm open failed: $!\n";
73 $todo[1]->blocking(0);
74 $done[0]->blocking(0);
78 my @queue = ($root_url);
79 my $timeout = $slow * 4;
80 while (keys %workers) { # reacts to SIGCHLD
83 vec($rvec, fileno($done[0]), 1) = 1;
85 vec($wvec, fileno($todo[1]), 1) = 1;
86 } elsif ($ndone == $nsent) {
87 kill 'TERM', keys %workers;
90 if (!select($rvec, $wvec, undef, $timeout)) {
91 while (my ($k, $v) = each %seen) {
93 print "WAIT ($ndone/$nsent) <$k>\n";
96 while ($u = shift @queue) {
97 my $s = $todo[1]->send($u, MSG_EOR);
105 $r = $done[0]->recv($u, 65535, 0);
106 } while (!defined $r && $!{EINTR});
108 if ($u =~ s/\ADONE\t//) {
120 my ($todo_rd, $done_wr) = @_;
121 my $m = WWW::Mechanize->new(autocheck => 0);
122 my $cc = LWP::ConnCache->new;
125 $todo_rd->recv(my $u, 65535, 0);
128 my $t = [ gettimeofday ];
130 $t = tv_interval($t);
131 printf "SLOW %0.06f % 5d %s\n", $t, $$, $u if $t > $slow;
133 if ($r->is_success) {
135 (split('#', $_->URI->abs->as_string))[0] => 1;
137 $_->tag && $_->url !~ /:/
139 @links = keys %links;
140 } elsif ($r->code != 300) {
141 warn "W: ".$r->code . " $u\n"
146 foreach my $l (@links, "DONE\t$u") {
149 $s = $done_wr->send($l, MSG_EOR);
150 } while (!defined $s && $!{EINTR});
151 die "$$ send $!\n" unless defined $s;
153 die "$$ send truncated $s < $n\n" if $s != $n;
156 # make sure the HTML source doesn't screw up terminals
157 # when people curl the source (not remotely an expert
158 # on languages or encodings, here).
159 my $ct = $r->header('Content-Type');
160 if ($atom_check && $ct =~ m!\bapplication/atom\+xml\b!) {
161 my $raw = $r->decoded_content;
162 my ($out, $err) = ('', '');
163 $atom_check->(\$raw, \$out, \$err) and
164 warn "Atom ($?) - $u - <1:$out> <2:$err>\n";
167 next if $ct !~ m!\btext/html\b!;
168 my $dc = $r->decoded_content;
169 if ($dc =~ /([\x00-\x08\x0d-\x1f\x7f-\x{99999999}]+)/s) {