From: Eric Wong (Contractor, The Linux Foundation) Date: Wed, 18 Apr 2018 20:58:35 +0000 (+0000) Subject: Merge remote-tracking branch 'origin/master' into v2 X-Git-Tag: v1.1.0-pre1~23 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=cfb8d16578e7f2f2e300f9f436205e4a8fc7f322;hp=119463b3b8517e5ec149198bb83588999118ee1d Merge remote-tracking branch 'origin/master' into v2 * origin/master: nntp: allow and ignore empty commands mbox: do not barf on queries which return no results nntp: fix NEWNEWS command searchview: fix non-numeric comparison Allow specification of the number of search results to return githttpbackend: avoid infinite loop on generic PSGI servers http: fix modification of read-only value extmsg: use news.gmane.org for Message-ID lookups extmsg: rework partial MID matching to favor current inbox Update the installation instructions with Fedora package names nntp: do not drain rbuf if there is a command pending nntp: improve fairness during XOVER and similar commands searchidx: do not modify Xapian DB while iterating Don't use LIMIT in UPDATE statements --- diff --git a/AUTHORS b/AUTHORS index 201ed034..1ad02cd8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -4,3 +4,4 @@ See history in git (via `git clone https://public-inbox.org/public-inbox') for a full history of the project. * Eric Wong (BDFL) +* The Linux Foundation (v2 work) diff --git a/Documentation/public-inbox-compact.pod b/Documentation/public-inbox-compact.pod new file mode 100644 index 00000000..4a519ce9 --- /dev/null +++ b/Documentation/public-inbox-compact.pod @@ -0,0 +1,50 @@ +=head1 NAME + +public-inbox-compact - compact Xapian DBs + +=head1 SYNOPSIS + + public-inbox-compact INBOX_DIR + +=head1 DESCRIPTION + +public-inbox-compact is a wrapper for L +designed for "v2" inboxes. It combines multiple Xapian +partitions into one to reduce space overhead after an initial +mass import (using multiple partitions) is done. + +It locks the inbox and prevents other processes such as +L from writing while it operates. + +It also supports "v1" (ssoma) inboxes with limited +usefulness over L + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +The default config file, normally "~/.public-inbox/config". +See L + +=back + +=head1 UPGRADING + +=head1 CONTACT + +Feedback welcome via plain-text mail to L + +The mail archives are hosted at L +and L + +=head1 COPYRIGHT + +Copyright 2018 all contributors L + +License: AGPL-3.0+ L + +=head1 SEE ALSO + +L, L diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 8250b459..22ee9095 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -40,7 +40,7 @@ Default: none, required =item publicinbox..mainrepo -The absolute path to the git repository which hosts the +The absolute path to the directory which hosts the public-inbox. This must be specified once. Default: none, required diff --git a/Documentation/public-inbox-convert.pod b/Documentation/public-inbox-convert.pod new file mode 100644 index 00000000..1e16ea45 --- /dev/null +++ b/Documentation/public-inbox-convert.pod @@ -0,0 +1,45 @@ +=head1 NAME + +public-inbox-convert - convert v1 inboxes to v2 + +=head1 SYNOPSIS + + public-inbox-convert OLD_DIR NEW_DIR + +=head1 DESCRIPTION + +public-inbox-convert copies the contents of an old "v1" inbox +into a new "v2" inbox. It makes no changes to the old inbox +and users are expected to update the "mainrepo" path in +L to point to the path of NEW_DIR +once they are satisfied with the conversion. + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +The default config file, normally "~/.public-inbox/config". +See L + +=back + +=head1 UPGRADING + +=head1 CONTACT + +Feedback welcome via plain-text mail to L + +The mail archives are hosted at L +and L + +=head1 COPYRIGHT + +Copyright 2013-2018 all contributors L + +License: AGPL-3.0+ L + +=head1 SEE ALSO + +L, L diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 838a2069..acc90392 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -4,7 +4,7 @@ public-inbox-index - create and update search indices =head1 SYNOPSIS -public-inbox-index [OPTIONS] GIT_DIR +public-inbox-index [OPTIONS] REPO_DIR =head1 DESCRIPTION @@ -46,14 +46,14 @@ This does not touch the NNTP article number database. =head1 FILES All public-inbox-specific files are contained within the -C<$GIT_DIR/public-inbox/> directory. All files are expected to +C<$REPO_DIR/public-inbox/> directory. All files are expected to grow in size as more messages are archived, so using compaction commands (e.g. L) is not recommended unless the list is no longer active. =over -=item $GIT_DIR/public-inbox/msgmap.sqlite3 +=item $REPO_DIR/public-inbox/msgmap.sqlite3 The stable NNTP article number to Message-ID mapping is stored in an SQLite3 database. @@ -70,7 +70,7 @@ messages. This file is relatively small, and typically less than 5% of the space of the mail stored in a packed git repository. -=item $GIT_DIR/public-inbox/xapian* +=item $REPO_DIR/public-inbox/xapian* The database used by L. This directory name is followed by a number indicating the index schema version this diff --git a/INSTALL b/INSTALL index d57696e7..87aa6961 100644 --- a/INSTALL +++ b/INSTALL @@ -44,8 +44,6 @@ Numerous optional modules are likely to be useful as well: - Plack deb: libplack-perl rpm: perl-Plack, perl-Plack-Test, - perl-Plack-Middleware-ReverseProxy, - perl-Plack-Middleware-Deflater (for HTML/Atom generation) - URI::Escape deb: liburi-perl @@ -80,10 +78,31 @@ Numerous optional modules are likely to be useful as well: rpm: perl-Filesys-Notify-Simple (for public-inbox-watch) + - Inline::C[7] deb: libinline-c-perl + (speeds up spawning on Linux + (see public-inbox-daemon(8)) + + - Plack::Middleware::ReverseProxy + + deb: libplack-middleware-reverseproxy-perl + rpm: perl-Plack-Middleware-ReverseProxy + (ensures redirects are correct when running + behind nginx or Varnish) + + - Plack::Middleware::Deflater + + deb: libplack-middleware-deflater-perl + rpm: perl-Plack-Middleware-Deflater + (saves bandwidth on responses) + On Fedora systems, you'll probably also end up wanting perl-Test-HTTP-Server-Simple, perl-Devel-Peek, and perl-IPC-Run to run the -test suite. +test suite. On Debian systems, libxml-feed-perl and libipc-run-perl(*) +will aid in running the test suite (XML::Feed and IPC::Run respectively, +on CPAN). + +(*) we hope to drop this dependency someday standard MakeMaker installation (Perl) -------------------------------------- diff --git a/MANIFEST b/MANIFEST index 5074d8dc..8038ad48 100644 --- a/MANIFEST +++ b/MANIFEST @@ -7,7 +7,9 @@ Documentation/design_notes.txt Documentation/design_www.txt Documentation/hosted.txt Documentation/include.mk +Documentation/public-inbox-compact.pod Documentation/public-inbox-config.pod +Documentation/public-inbox-convert.pod Documentation/public-inbox-daemon.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-index.pod @@ -46,6 +48,7 @@ examples/varnish-4.vcl lib/PublicInbox/Address.pm lib/PublicInbox/AltId.pm lib/PublicInbox/Config.pm +lib/PublicInbox/ContentId.pm lib/PublicInbox/Daemon.pm lib/PublicInbox/Emergency.pm lib/PublicInbox/EvCleanup.pm @@ -65,17 +68,22 @@ lib/PublicInbox/HTTPD/Async.pm lib/PublicInbox/Hval.pm lib/PublicInbox/Import.pm lib/PublicInbox/Inbox.pm +lib/PublicInbox/InboxWritable.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm +lib/PublicInbox/Lock.pm lib/PublicInbox/MDA.pm lib/PublicInbox/MID.pm lib/PublicInbox/MIME.pm lib/PublicInbox/Mbox.pm lib/PublicInbox/MsgIter.pm +lib/PublicInbox/MsgTime.pm lib/PublicInbox/Msgmap.pm lib/PublicInbox/NNTP.pm lib/PublicInbox/NNTPD.pm lib/PublicInbox/NewsWWW.pm +lib/PublicInbox/Over.pm +lib/PublicInbox/OverIdx.pm lib/PublicInbox/ParentPipe.pm lib/PublicInbox/ProcessPipe.pm lib/PublicInbox/Qspawn.pm @@ -83,6 +91,7 @@ lib/PublicInbox/Reply.pm lib/PublicInbox/SaPlugin/ListMirror.pm lib/PublicInbox/Search.pm lib/PublicInbox/SearchIdx.pm +lib/PublicInbox/SearchIdxPart.pm lib/PublicInbox/SearchMsg.pm lib/PublicInbox/SearchThread.pm lib/PublicInbox/SearchView.pm @@ -90,6 +99,7 @@ lib/PublicInbox/Spamcheck/Spamc.pm lib/PublicInbox/Spawn.pm lib/PublicInbox/SpawnPP.pm lib/PublicInbox/Unsubscribe.pm +lib/PublicInbox/V2Writable.pm lib/PublicInbox/View.pm lib/PublicInbox/WWW.pm lib/PublicInbox/WWW.pod @@ -102,6 +112,8 @@ sa_config/Makefile sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre sa_config/user/.spamassassin/user_prefs +script/public-inbox-compact +script/public-inbox-convert script/public-inbox-httpd script/public-inbox-index script/public-inbox-init @@ -112,6 +124,7 @@ script/public-inbox-watch script/public-inbox.cgi scripts/dc-dlvr scripts/dc-dlvr.pre +scripts/dupe-finder scripts/edit-sa-prefs scripts/import_maildir scripts/import_slrnspool @@ -122,11 +135,14 @@ scripts/ssoma-replay scripts/xhdr-num2mid t/address.t t/altid.t +t/altid_v2.t t/cgi.t t/check-www-inbox.perl t/common.perl t/config.t t/config_limiter.t +t/content_id.t +t/convert-compact.t t/emergency.t t/fail-bin/spamc t/feed.t @@ -157,20 +173,31 @@ t/msg_iter.t t/msgmap.t t/nntp.t t/nntpd.t +t/over.t +t/perf-nntpd.t +t/perf-threading.t t/plack.t t/precheck.t t/psgi_attach.t t/psgi_mount.t t/psgi_search.t t/psgi_text.t +t/psgi_v2.t t/qspawn.t t/reply.t t/search-thr-index.t t/search.t t/spamcheck_spamc.t t/spawn.t -t/thread-all.t t/thread-cycle.t +t/time.t t/utf8.mbox +t/v1-add-remove-add.t +t/v2-add-remove-add.t +t/v2mda.t +t/v2mirror.t +t/v2reindex.t +t/v2writable.t t/view.t t/watch_maildir.t +t/watch_maildir_v2.t diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm index f334adea..548f417c 100644 --- a/lib/PublicInbox/Address.pm +++ b/lib/PublicInbox/Address.pm @@ -8,7 +8,8 @@ use warnings; # just enough to make thing sanely displayable and pass to git sub emails { - ($_[0] =~ /([\w\.\+=\-]+\@[\w\.\-]+)>?\s*(?:\(.*?\))?(?:,\s*|\z)/g) + ($_[0] =~ /([\w\.\+=\?"\(\)\-!#\$%&'\*\/\^\`\|\{\}~]+\@[\w\.\-\(\)]+) + (?:\s[^>]*)?>?\s*(?:\(.*?\))?(?:,\s*|\z)/gx) } sub names { diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index d1b2dc24..4a6ff97c 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -22,17 +22,31 @@ sub new { } split(/[&;]/, $query); my $f = $params{file} or die "file: required for $type spec $spec\n"; unless (index($f, '/') == 0) { - $f = "$inbox->{mainrepo}/public-inbox/$f"; + if (($inbox->{version} || 1) == 1) { + $f = "$inbox->{mainrepo}/public-inbox/$f"; + } else { + $f = "$inbox->{mainrepo}/$f"; + } } bless { - mm_alt => PublicInbox::Msgmap->new_file($f, $writable), + filename => $f, + writable => $writable, xprefix => 'X'.uc($prefix), }, $class; } +sub mm_alt { + my ($self) = @_; + $self->{mm_alt} ||= eval { + my $f = $self->{filename}; + my $writable = $self->{writable}; + PublicInbox::Msgmap->new_file($f, $writable); + }; +} + sub mid2alt { my ($self, $mid) = @_; - $self->{mm_alt}->num_for($mid); + $self->mm_alt->num_for($mid); } 1; diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm new file mode 100644 index 00000000..b1d27eb8 --- /dev/null +++ b/lib/PublicInbox/ContentId.pm @@ -0,0 +1,102 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +package PublicInbox::ContentId; +use strict; +use warnings; +use base qw/Exporter/; +our @EXPORT_OK = qw/content_id content_digest/; +use PublicInbox::MID qw(mids references); +use PublicInbox::MsgIter; + +# not sure if less-widely supported hash families are worth bothering with +use Digest::SHA; + +sub digest_addr ($$$) { + my ($dig, $h, $v) = @_; + $v =~ tr/"//d; + $v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge; + utf8::encode($v); + $dig->add("$h\0$v\0"); +} + +sub content_digest ($) { + my ($mime) = @_; + my $dig = Digest::SHA->new(256); + my $hdr = $mime->header_obj; + + # References: and In-Reply-To: get used interchangeably + # in some "duplicates" in LKML. We treat them the same + # in SearchIdx, so treat them the same for this: + my %seen; + foreach my $mid (@{mids($hdr)}) { + # do NOT consider the Message-ID as part of the content_id + # if we got here, we've already got Message-ID reuse + $seen{$mid} = 1; + } + foreach my $mid (@{references($hdr)}) { + next if $seen{$mid}; + $dig->add("ref\0$mid\0"); + } + + # Only use Sender: if From is not present + foreach my $h (qw(From Sender)) { + my @v = $hdr->header($h); + if (@v) { + digest_addr($dig, $h, $_) foreach @v; + } + } + foreach my $h (qw(Subject Date)) { + my @v = $hdr->header($h); + foreach my $v (@v) { + utf8::encode($v); + $dig->add("$h\0$v\0"); + } + } + # Some mail processors will add " to unquoted names that were + # not in the original message. For the purposes of deduplication, + # do not take it into account: + foreach my $h (qw(To Cc)) { + my @v = $hdr->header($h); + digest_addr($dig, $h, $_) foreach @v; + } + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + $dig->add("\0$depth:".join('.', @idx)."\0"); + my $fn = $part->filename; + if (defined $fn) { + utf8::encode($fn); + $dig->add("fn\0$fn\0"); + } + my @d = $part->header('Content-Description'); + foreach my $d (@d) { + utf8::encode($d); + $dig->add("d\0$d\0"); + } + $dig->add("b\0"); + my $ct = $part->content_type || 'text/plain'; + my $s = eval { $part->body_str }; + if ($@ && $ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + } + if (defined $s) { + $s =~ s/\r\n/\n/gs; + $s =~ s/\s*\z//s; + utf8::encode($s); + } else { + $s = $part->body; + } + $dig->add($s); + }); + $dig; +} + +sub content_id ($) { + content_digest($_[0])->digest; +} + +1; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index 0329bd34..4629aadb 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -460,6 +460,7 @@ sub daemon_loop ($$) { @listeners = map { PublicInbox::Listener->new($_, $post_accept) } @listeners; + PublicInbox::EvCleanup::enable(); Danga::Socket->EventLoop; $parent_pipe = undef; } diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm index 231b4197..66adc631 100644 --- a/lib/PublicInbox/Emergency.pm +++ b/lib/PublicInbox/Emergency.pm @@ -18,7 +18,7 @@ sub new { next if -d $d; -d $d or mkdir($d) or die "failed to mkdir($d): $!\n"; } - bless { dir => $dir, files => {}, t => 0, cnt => 0 }, $class; + bless { dir => $dir, files => {}, t => 0, cnt => 0, pid => $$ }, $class; } sub _fn_in { @@ -75,6 +75,7 @@ sub fh { sub commit { my ($self) = @_; + $$ == $self->{pid} or return; # no-op in forked child delete $self->{fh}; my $tmp = delete $self->{tmp} or return; diff --git a/lib/PublicInbox/EvCleanup.pm b/lib/PublicInbox/EvCleanup.pm index 559730e9..1a3a3d5e 100644 --- a/lib/PublicInbox/EvCleanup.pm +++ b/lib/PublicInbox/EvCleanup.pm @@ -7,6 +7,10 @@ use strict; use warnings; use base qw(Danga::Socket); use fields qw(rd); + +my $ENABLED; +sub enabled { $ENABLED } +sub enable { $ENABLED = 1 } my $singleton; my $asapq = [ [], undef ]; my $nextq = [ [], undef ]; @@ -77,8 +81,8 @@ sub later ($) { END { _run_asap(); - _run_next(); - _run_later(); + _run_all($nextq); + _run_all($laterq); } 1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 760614df..04cb4062 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -31,30 +31,19 @@ sub ext_msg { my $cur = $ctx->{-inbox}; my $mid = $ctx->{mid}; - eval { require PublicInbox::Search }; - my $have_xap = $@ ? 0 : 1; - my (@nox, @ibx, @found); + eval { require PublicInbox::Msgmap }; + my $have_mm = $@ ? 0 : 1; + my (@ibx, @found); $ctx->{www}->{pi_config}->each_inbox(sub { my ($other) = @_; return if $other->{name} eq $cur->{name} || !$other->base_url; - my $s = $other->search; - if (!$s) { - push @nox, $other; - return; - } - - # try to find the URL with Xapian to avoid forking - my $doc_id = eval { $s->find_unique_doc_id('mid', $mid) }; - if ($@) { - # xapian not configured properly for this repo - push @nox, $other; - return; - } + my $mm = $other->mm or return; - # maybe we found it! - if (defined $doc_id) { + # try to find the URL with Msgmap to avoid forking + my $num = $mm->num_for($mid); + if (defined $num) { push @found, $other; } else { # no point in trying the fork fallback if we @@ -66,20 +55,6 @@ sub ext_msg { return exact($ctx, \@found, $mid) if @found; - # Xapian not installed or configured for some repos, - # do a full MID check (this is expensive...): - if (@nox) { - my $path = mid2path($mid); - foreach my $other (@nox) { - my (undef, $type, undef) = $other->path_check($path); - - if ($type && $type eq 'blob') { - push @found, $other; - } - } - } - return exact($ctx, \@found, $mid) if @found; - # fall back to partial MID matching my $n_partial = 0; my @partial; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index c32e7bde..b373a1eb 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -8,19 +8,18 @@ use warnings; use PublicInbox::MIME; use PublicInbox::View; use PublicInbox::WwwAtomStream; +use PublicInbox::SearchMsg; # this loads w/o Search::Xapian # main function sub generate { my ($ctx) = @_; - my @paths; - each_recent_blob($ctx, sub { push @paths, $_[0] }); - return _no_thread() unless @paths; + my $msgs = recent_msgs($ctx); + return _no_thread() unless @$msgs; my $ibx = $ctx->{-inbox}; PublicInbox::WwwAtomStream->response($ctx, 200, sub { - while (my $path = shift @paths) { - my $mime = do_cat_mail($ibx, $path) or next; - return $mime; + while (my $smsg = shift @$msgs) { + $ibx->smsg_mime($smsg) and return $smsg; } }); } @@ -28,18 +27,16 @@ sub generate { sub generate_thread_atom { my ($ctx) = @_; my $mid = $ctx->{mid}; - my $res = $ctx->{srch}->get_thread($mid); - return _no_thread() unless $res->{total}; + my $msgs = $ctx->{srch}->get_thread($mid); + return _no_thread() unless @$msgs; my $ibx = $ctx->{-inbox}; my $html_url = $ibx->base_url($ctx->{env}); $html_url .= PublicInbox::Hval->new_msgid($mid)->{href}; $ctx->{-html_url} = $html_url; - my $msgs = $res->{msgs}; PublicInbox::WwwAtomStream->response($ctx, 200, sub { - while (my $msg = shift @$msgs) { - $msg = $ibx->msg_by_smsg($msg) and - return PublicInbox::MIME->new($msg); + while (my $smsg = shift @$msgs) { + $ibx->smsg_mime($smsg) and return $smsg; } }); } @@ -63,27 +60,22 @@ sub generate_html_index { sub new_html { my ($ctx) = @_; - my @paths; - my (undef, $last) = each_recent_blob($ctx, sub { - my ($path, $commit, $ts, $u, $subj) = @_; - $ctx->{first} ||= $commit; - push @paths, $path; - }); - if (!@paths) { + my $msgs = recent_msgs($ctx); + if (!@$msgs) { return [404, ['Content-Type', 'text/plain'], ["No messages, yet\n"] ]; } $ctx->{-html_tip} = '
';
 	$ctx->{-upfx} = '';
 	$ctx->{-hr} = 1;
+	my $ibx = $ctx->{-inbox};
 	PublicInbox::WwwStream->response($ctx, 200, sub {
-		while (my $path = shift @paths) {
-			my $m = do_cat_mail($ctx->{-inbox}, $path) or next;
-			my $more = scalar @paths;
-			my $s = PublicInbox::View::index_entry($m, $ctx, $more);
-			return $s;
+		while (my $smsg = shift @$msgs) {
+			my $m = $ibx->smsg_mime($smsg) or next;
+			my $more = scalar @$msgs;
+			return PublicInbox::View::index_entry($m, $ctx, $more);
 		}
-		new_html_footer($ctx, $last);
+		PublicInbox::View::pagination_footer($ctx, './new.html');
 	});
 }
 
@@ -93,30 +85,23 @@ sub _no_thread () {
 	[404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]];
 }
 
-sub new_html_footer {
-	my ($ctx, $last) = @_;
-	my $qp = delete $ctx->{qp} or return;
-	my $old_r = $qp->{r};
-	my $latest = '';
-	my $next = '    ';
-
-	if ($last) {
-		$next = qq!next!;
+sub recent_msgs {
+	my ($ctx) = @_;
+	my $ibx = $ctx->{-inbox};
+	my $max = $ibx->{feedmax};
+	my $qp = $ctx->{qp};
+	my $v = $ibx->{version} || 1;
+	if ($v > 2) {
+		die "BUG: unsupported inbox version: $v\n";
 	}
-	if ($old_r) {
-		$latest = qq! latest!;
+	if (my $srch = $ibx->search) {
+		return PublicInbox::View::paginate_recent($ctx, $max);
 	}
-	"
page: $next$latest
"; -} -sub each_recent_blob { - my ($ctx, $cb) = @_; - my $max = $ctx->{-inbox}->{feedmax}; my $hex = '[a-f0-9]'; - my $addmsg = qr!^:000000 100644 \S+ \S+ A\t(${hex}{2}/${hex}{38})$!; - my $delmsg = qr!^:100644 000000 \S+ \S+ D\t(${hex}{2}/${hex}{38})$!; + my $addmsg = qr!^:000000 100644 \S+ (\S+) A\t${hex}{2}/${hex}{38}$!; + my $delmsg = qr!^:100644 000000 (\S+) \S+ D\t(${hex}{2}/${hex}{38})$!; my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~\d+)?/; - my $qp = $ctx->{qp}; # revision ranges may be specified my $range = 'HEAD'; @@ -128,54 +113,41 @@ sub each_recent_blob { # get recent messages # we could use git log -z, but, we already know ssoma will not # leave us with filenames with spaces in them.. - my $log = $ctx->{-inbox}->git->popen(qw/log + my $log = $ibx->git->popen(qw/log --no-notes --no-color --raw -r - --abbrev=16 --abbrev-commit/, - "--format=%h%x00%ct%x00%an%x00%s%x00", - $range); + --no-abbrev --abbrev-commit/, + "--format=%H", $range); my %deleted; # only an optimization at this point my $last; - my $nr = 0; - my ($cur_commit, $first_commit, $last_commit); - my ($ts, $subj, $u); + my $last_commit; local $/ = "\n"; + my @oids; while (defined(my $line = <$log>)) { if ($line =~ /$addmsg/o) { my $add = $1; next if $deleted{$add}; # optimization-only - $cb->($add, $cur_commit, $ts, $u, $subj) and $nr++; - if ($nr >= $max) { + push @oids, $add; + if (scalar(@oids) >= $max) { $last = 1; last; } } elsif ($line =~ /$delmsg/o) { $deleted{$1} = 1; - } elsif ($line =~ /^${hex}{7,40}/o) { - ($cur_commit, $ts, $u, $subj) = split("\0", $line); - unless (defined $first_commit) { - $first_commit = $cur_commit; - } } } if ($last) { local $/ = "\n"; while (my $line = <$log>) { - if ($line =~ /^(${hex}{7,40})/o) { + if ($line =~ /^(${hex}{7,40})/) { $last_commit = $1; last; } } } - # for pagination - ($first_commit, $last_commit); -} - -sub do_cat_mail { - my ($ibx, $path) = @_; - my $mime = eval { $ibx->msg_by_path($path) } or return; - PublicInbox::MIME->new($mime); + $ctx->{next_page} = "r=$last_commit" if $last_commit; + [ map { bless {blob => $_ }, 'PublicInbox::SearchMsg' } @oids ]; } 1; diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index 63e8d422..cb69e38a 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -6,6 +6,7 @@ package PublicInbox::Filter::RubyLang; use base qw(PublicInbox::Filter::Base); use strict; use warnings; +use PublicInbox::MID qw(mids); my $l1 = qr/Unsubscribe:\s /x; @@ -44,16 +45,23 @@ sub scrub { my $altid = $self->{-altid}; if ($altid) { my $hdr = $mime->header_obj; - my $mid = $hdr->header_raw('Message-ID'); - unless (defined $mid) { - return $self->REJECT('Message-Id missing'); + my $mids = mids($hdr); + return $self->REJECT('Message-ID missing') unless (@$mids); + my @v = $hdr->header_raw('X-Mail-Count'); + my $n; + foreach (@v) { + /\A\s*(\d+)\s*\z/ or next; + $n = $1; + last; } - my $n = $hdr->header_raw('X-Mail-Count'); - if (!defined($n) || $n !~ /\A\s*\d+\s*\z/) { + unless (defined $n) { return $self->REJECT('X-Mail-Count not numeric'); } - $mid = PublicInbox::MID::mid_clean($mid); - $altid->{mm_alt}->mid_set($n, $mid); + foreach my $mid (@$mids) { + my $r = $altid->mm_alt->mid_set($n, $mid); + next if $r == 0; + last; + } } $self->ACCEPT($mime); } diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index ea2b814e..95df52ed 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -15,7 +15,19 @@ use PublicInbox::Spawn qw(spawn popen_rd); sub new { my ($class, $git_dir) = @_; - bless { git_dir => $git_dir }, $class + my @st; + $st[7] = $st[10] = 0; + bless { git_dir => $git_dir, st => \@st }, $class +} + +sub alternates_changed { + my ($self) = @_; + my $alt = "$self->{git_dir}/objects/info/alternates"; + my @st = stat($alt) or return 0; + my $old_st = $self->{st}; + # 10 - ctime, 7 - size + return 0 if ($st[10] == $old_st->[10] && $st[7] == $old_st->[7]); + $self->{st} = \@st; } sub _bidi_pipe { @@ -38,14 +50,23 @@ sub _bidi_pipe { sub cat_file { my ($self, $obj, $ref) = @_; + my ($retried, $in, $head); +again: batch_prepare($self); $self->{out}->print($obj, "\n") or fail($self, "write error: $!"); - my $in = $self->{in}; + $in = $self->{in}; local $/ = "\n"; - my $head = $in->getline; - $head =~ / missing$/ and return undef; + $head = $in->getline; + if ($head =~ / missing$/) { + if (!$retried && alternates_changed($self)) { + $retried = 1; + cleanup($self); + goto again; + } + return; + } $head =~ /^[0-9a-f]{40} \S+ (\d+)$/ or fail($self, "Unexpected result from git cat-file: $head"); @@ -139,6 +160,18 @@ sub cleanup { _destroy($self, qw(in_c out_c pid_c)); } +# assuming a well-maintained repo, this should be a somewhat +# accurate estimation of its size +# TODO: show this in the WWW UI as a hint to potential cloners +sub packed_bytes { + my ($self) = @_; + my $n = 0; + foreach my $p (glob("$self->{git_dir}/objects/pack/*.pack")) { + $n += -s $p; + } + $n +} + sub DESTROY { cleanup(@_) } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 8eec17eb..b25427ee 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -7,19 +7,32 @@ package PublicInbox::Import; use strict; use warnings; -use Fcntl qw(:flock :DEFAULT); +use base qw(PublicInbox::Lock); use PublicInbox::Spawn qw(spawn); -use PublicInbox::MID qw(mid_mime mid2path); +use PublicInbox::MID qw(mids mid_mime mid2path); use PublicInbox::Address; +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +use PublicInbox::ContentId qw(content_digest); +use PublicInbox::MDA; +use POSIX qw(strftime); sub new { - my ($class, $git, $name, $email, $inbox) = @_; + my ($class, $git, $name, $email, $ibx) = @_; + my $ref = 'refs/heads/master'; + if ($ibx) { + $ref = $ibx->{ref_head} || 'refs/heads/master'; + $name ||= $ibx->{name}; + $email ||= $ibx->{-primary_address}; + } bless { git => $git, ident => "$name <$email>", mark => 1, - ref => 'refs/heads/master', - inbox => $inbox, + ref => $ref, + inbox => $ibx, + path_type => '2/38', # or 'v2' + lock_path => "$git->{git_dir}/ssoma.lock", # v2 changes this + bytes_added => 0, }, $class } @@ -33,25 +46,28 @@ sub gfi_start { pipe($in_r, $in_w) or die "pipe failed: $!"; pipe($out_r, $out_w) or die "pipe failed: $!"; my $git = $self->{git}; - my $git_dir = $git->{git_dir}; - my $lockpath = "$git_dir/ssoma.lock"; - sysopen(my $lockfh, $lockpath, O_WRONLY|O_CREAT) or - die "failed to open lock $lockpath: $!"; - # wait for other processes to be done - flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + $self->lock_acquire; + local $/ = "\n"; - chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref})); + my $ref = $self->{ref}; + chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + if ($self->{path_type} ne '2/38' && $self->{tip}) { + local $/ = "\0"; + my @tree = $git->qx(qw(ls-tree -r -z --name-only), $ref); + chomp @tree; + $self->{-tree} = { map { $_ => 1 } @tree }; + } + my $git_dir = $git->{git_dir}; my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import - --quiet --done --date-format=rfc2822)); + --quiet --done --date-format=raw)); my $rdr = { 0 => fileno($out_r), 1 => fileno($in_w) }; my $pid = spawn(\@cmd, undef, $rdr); die "spawn fast-import failed: $!" unless defined $pid; $out_w->autoflush(1); $self->{in} = $in_r; $self->{out} = $out_w; - $self->{lockfh} = $lockfh; $self->{pid} = $pid; $self->{nchg} = 0; binmode $out_w, ':raw' or die "binmode :raw failed: $!"; @@ -61,14 +77,7 @@ sub gfi_start { sub wfail () { die "write to fast-import failed: $!" } -sub now2822 () { - my @t = gmtime(time); - my $day = qw(Sun Mon Tue Wed Thu Fri Sat)[$t[6]]; - my $mon = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)[$t[4]]; - - sprintf('%s, %2d %s %d %02d:%02d:%02d +0000', - $day, $t[3], $mon, $t[5] + 1900, $t[2], $t[1], $t[0]); -} +sub now_raw () { time . ' +0000' } sub norm_body ($) { my ($mime) = @_; @@ -77,6 +86,7 @@ sub norm_body ($) { $b } +# only used for v1 (ssoma) inboxes sub _check_path ($$$$) { my ($r, $w, $tip, $path) = @_; return if $tip eq ''; @@ -86,27 +96,13 @@ sub _check_path ($$$$) { $info =~ /\Amissing / ? undef : $info; } -# returns undef on non-existent -# ('MISMATCH', msg) on mismatch -# (:MARK, msg) on success -sub remove { - my ($self, $mime, $msg) = @_; # mime = Email::MIME - - my $mid = mid_mime($mime); - my $path = mid2path($mid); - - my ($r, $w) = $self->gfi_start; - my $tip = $self->{tip}; - my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); - $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info"; - my $blob = $1; - - print $w "cat-blob $blob\n" or wfail; +sub _cat_blob ($$$) { + my ($r, $w, $oid) = @_; + print $w "cat-blob $oid\n" or wfail; local $/ = "\n"; - $info = <$r>; + my $info = <$r>; defined $info or die "EOF from fast-import / cat-blob: $!"; - $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or - die "unexpected cat-blob response: $info"; + $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or return; my $left = $1; my $offset = 0; my $buf = ''; @@ -121,7 +117,26 @@ sub remove { $n = read($r, my $lf, 1); defined($n) or die "read final byte of cat-blob failed: $!"; die "bad read on final byte: <$lf>" if $lf ne "\n"; - my $cur = PublicInbox::MIME->new($buf); + + # fixup some bugginess in old versions: + $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + \$buf; +} + +sub cat_blob { + my ($self, $oid) = @_; + my ($r, $w) = $self->gfi_start; + _cat_blob($r, $w, $oid); +} + +sub check_remove_v1 { + my ($r, $w, $tip, $path, $mime) = @_; + + my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info"; + my $oid = $1; + my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; + my $cur = PublicInbox::MIME->new($msg); my $cur_s = $cur->header('Subject'); $cur_s = '' unless defined $cur_s; my $cur_m = $mime->header('Subject'); @@ -129,6 +144,103 @@ sub remove { if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { return ('MISMATCH', $cur); } + (undef, $cur); +} + +sub checkpoint { + my ($self) = @_; + return unless $self->{pid}; + print { $self->{out} } "checkpoint\n" or wfail; + undef; +} + +sub progress { + my ($self, $msg) = @_; + return unless $self->{pid}; + print { $self->{out} } "progress $msg\n" or wfail; + $self->{in}->getline eq "progress $msg\n" or die + "progress $msg not received\n"; + undef; +} + +sub _update_git_info ($$) { + my ($self, $do_gc) = @_; + # for compatibility with existing ssoma installations + # we can probably remove this entirely by 2020 + my $git_dir = $self->{git}->{git_dir}; + my @cmd = ('git', "--git-dir=$git_dir"); + my $index = "$git_dir/ssoma.index"; + if (-e $index && !$ENV{FAST}) { + my $env = { GIT_INDEX_FILE => $index }; + run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); + } + run_die([@cmd, 'update-server-info'], undef); + ($self->{path_type} eq '2/38') and eval { + require PublicInbox::SearchIdx; + my $inbox = $self->{inbox} || $git_dir; + my $s = PublicInbox::SearchIdx->new($inbox); + $s->index_sync({ ref => $self->{ref} }); + }; + eval { run_die([@cmd, qw(gc --auto)], undef) } if $do_gc; +} + +sub barrier { + my ($self) = @_; + + # For safety, we ensure git checkpoint is complete before because + # the data in git is still more important than what is in Xapian + # in v2. Performance may be gained by delaying the ->progress + # call but we lose safety + if ($self->{nchg}) { + $self->checkpoint; + $self->progress('checkpoint'); + _update_git_info($self, 0); + $self->{nchg} = 0; + } +} + +# used for v2 +sub get_mark { + my ($self, $mark) = @_; + die "not active\n" unless $self->{pid}; + my ($r, $w) = $self->gfi_start; + print $w "get-mark $mark\n" or wfail; + defined(my $oid = <$r>) or die "get-mark failed, need git 2.6.0+\n"; + chomp($oid); + $oid; +} + +# returns undef on non-existent +# ('MISMATCH', Email::MIME) on mismatch +# (:MARK, Email::MIME) on success +# +# v2 callers should check with Xapian before calling this as +# it is not idempotent. +sub remove { + my ($self, $mime, $msg) = @_; # mime = Email::MIME + + my $path_type = $self->{path_type}; + my ($path, $err, $cur, $blob); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + if ($path_type eq '2/38') { + $path = mid2path(v1_mid0($mime)); + ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); + return ($err, $cur) if $err; + } else { + my $sref; + if (ref($mime) eq 'SCALAR') { # optimization used by V2Writable + $sref = $mime; + } else { # XXX should not be necessary: + my $str = $mime->as_string; + $sref = \$str; + } + my $len = length($$sref); + $blob = $self->{mark}++; + print $w "blob\nmark :$blob\ndata $len\n", + $$sref, "\n" or wfail; + } my $ref = $self->{ref}; my $commit = $self->{mark}++; @@ -137,7 +249,7 @@ sub remove { print $w "reset $ref\n" or wfail; } my $ident = $self->{ident}; - my $now = now2822(); + my $now = now_raw(); $msg ||= 'rm'; my $len = length($msg) + 1; print $w "commit $ref\nmark :$commit\n", @@ -145,44 +257,146 @@ sub remove { "committer $ident $now\n", "data $len\n$msg\n\n", 'from ', ($parent ? $parent : $tip), "\n" or wfail; - print $w "D $path\n\n" or wfail; + if (defined $path) { + print $w "D $path\n\n" or wfail; + } else { + clean_tree_v2($self, $w, 'd'); + print $w "M 100644 :$blob d\n\n" or wfail; + } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); } -# returns undef on duplicate -sub add { - my ($self, $mime, $check_cb) = @_; # mime = Email::MIME +sub git_timestamp { + my ($ts, $zone) = @_; + $ts = 0 if $ts < 0; # git uses unsigned times + "$ts $zone"; +} +sub extract_author_info ($) { + my ($mime) = @_; + + my $sender = ''; my $from = $mime->header('From'); my ($email) = PublicInbox::Address::emails($from); my ($name) = PublicInbox::Address::names($from); + if (!defined($name) || !defined($email)) { + $sender = $mime->header('Sender'); + if (!defined($name)) { + ($name) = PublicInbox::Address::names($sender); + } + if (!defined($email)) { + ($email) = PublicInbox::Address::emails($sender); + } + } + if (defined $email) { + # quiet down wide character warnings with utf8::encode + utf8::encode($email); + } else { + $email = ''; + warn "no email in From: $from or Sender: $sender\n"; + } + # git gets confused with: # "'A U Thor ' via foo" # ref: # - $name =~ tr/<>//d; + if (defined $name) { + $name =~ tr/<>//d; + utf8::encode($name); + } else { + $name = ''; + warn "no name in From: $from or Sender: $sender\n"; + } + ($name, $email); +} + +# kill potentially confusing/misleading headers +sub drop_unwanted_headers ($) { + my ($mime) = @_; + + $mime->header_set($_) for qw(bytes lines content-length status); + $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS; +} + +# used by V2Writable, too +sub append_mid ($$) { + my ($hdr, $mid0) = @_; + # @cur is likely empty if we need to call this sub, but it could + # have random unparseable crap which we'll preserve, too. + my @cur = $hdr->header_raw('Message-ID'); + $hdr->header_set('Message-ID', @cur, "<$mid0>"); +} + +sub v1_mid0 ($) { + my ($mime) = @_; + my $hdr = $mime->header_obj; + my $mids = mids($hdr); + + if (!scalar(@$mids)) { # spam often has no Message-Id + my $mid0 = digest2mid(content_digest($mime), $hdr); + append_mid($hdr, $mid0); + return $mid0; + } + $mids->[0]; +} +sub clean_tree_v2 ($$$) { + my ($self, $w, $keep) = @_; + my $tree = $self->{-tree} or return; #v2 only + delete $tree->{$keep}; + foreach (keys %$tree) { + print $w "D $_\n" or wfail; + } + %$tree = ($keep => 1); +} - my $date = $mime->header('Date'); +# returns undef on duplicate +# returns the :MARK of the most recent commit +sub add { + my ($self, $mime, $check_cb) = @_; # mime = Email::MIME + + my ($name, $email) = extract_author_info($mime); + my $hdr = $mime->header_obj; + my @at = msg_datestamp($hdr); + my @ct = msg_timestamp($hdr); + my $author_time_raw = git_timestamp(@at); + my $commit_time_raw = git_timestamp(@ct); my $subject = $mime->header('Subject'); $subject = '(no subject)' unless defined $subject; - my $mid = mid_mime($mime); - my $path = mid2path($mid); + my $path_type = $self->{path_type}; + + my $path; + if ($path_type eq '2/38') { + $path = mid2path(v1_mid0($mime)); + } else { # v2 layout, one file: + $path = 'm'; + } my ($r, $w) = $self->gfi_start; my $tip = $self->{tip}; - _check_path($r, $w, $tip, $path) and return; + if ($path_type eq '2/38') { + _check_path($r, $w, $tip, $path) and return; + } - # kill potentially confusing/misleading headers - $mime->header_set($_) for qw(bytes lines content-length status); + drop_unwanted_headers($mime); + + # spam check: if ($check_cb) { $mime = $check_cb->($mime) or return; } - $mime = $mime->as_string; my $blob = $self->{mark}++; - print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail; - print $w $mime, "\n" or wfail; + my $str = $mime->as_string; + my $n = length($str); + $self->{bytes_added} += $n; + print $w "blob\nmark :$blob\ndata ", $n, "\n" or wfail; + print $w $str, "\n" or wfail; + + # v2: we need this for Xapian + if ($self->{want_object_info}) { + my $oid = $self->get_mark(":$blob"); + $self->{last_object} = [ $oid, $n, \$str ]; + } my $ref = $self->{ref}; my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; @@ -191,26 +405,24 @@ sub add { print $w "reset $ref\n" or wfail; } - utf8::encode($email); - utf8::encode($name); utf8::encode($subject); - # quiet down wide character warnings: print $w "commit $ref\nmark :$commit\n", - "author $name <$email> $date\n", - "committer $self->{ident} ", now2822(), "\n" or wfail; + "author $name <$email> $author_time_raw\n", + "committer $self->{ident} $commit_time_raw\n" or wfail; print $w "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; } + clean_tree_v2($self, $w, $path); print $w "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; } -sub run_die ($$) { - my ($cmd, $env) = @_; - my $pid = spawn($cmd, $env, undef); +sub run_die ($;$$) { + my ($cmd, $env, $rdr) = @_; + my $pid = spawn($cmd, $env, $rdr); defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; $? == 0 or die join(' ', @$cmd) . " failed: $?\n"; @@ -224,33 +436,138 @@ sub done { my $pid = delete $self->{pid} or die 'BUG: missing {pid} when done'; waitpid($pid, 0) == $pid or die 'fast-import did not finish'; $? == 0 or die "fast-import failed: $?"; - my $nchg = delete $self->{nchg}; - # for compatibility with existing ssoma installations - # we can probably remove this entirely by 2020 - my $git_dir = $self->{git}->{git_dir}; - # XXX: change the following scope to: if (-e $index) # in 2018 or so.. - my @cmd = ('git', "--git-dir=$git_dir"); - if ($nchg && !$ENV{FAST}) { - my $index = "$git_dir/ssoma.index"; - my $env = { GIT_INDEX_FILE => $index }; - run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); + _update_git_info($self, 1) if delete $self->{nchg}; + + $self->lock_release; +} + +sub atfork_child { + my ($self) = @_; + foreach my $f (qw(in out)) { + close $self->{$f} or die "failed to close import[$f]: $!\n"; } - if ($nchg) { - run_die([@cmd, 'update-server-info'], undef); - eval { - require PublicInbox::SearchIdx; - my $inbox = $self->{inbox} || $git_dir; - my $s = PublicInbox::SearchIdx->new($inbox); - $s->index_sync({ ref => $self->{ref} }); - }; - - eval { run_die([@cmd, qw(gc --auto)], undef) }; +} + +sub digest2mid ($$) { + my ($dig, $hdr) = @_; + my $b64 = $dig->clone->b64digest; + # Make our own URLs nicer: + # See "Base 64 Encoding with URL and Filename Safe Alphabet" in RFC4648 + $b64 =~ tr!+/=!-_!d; + + # Add a date prefix to prevent a leading '-' in case that trips + # up some tools (e.g. if a Message-ID were a expected as a + # command-line arg) + my $dt = msg_datestamp($hdr); + $dt = POSIX::strftime('%Y%m%d%H%M%S', gmtime($dt)); + "$dt.$b64" . '@z'; +} + +sub clean_purge_buffer { + my ($oids, $buf) = @_; + my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; + @$oids = (); + + foreach my $i (0..$#$buf) { + my $l = $buf->[$i]; + if ($l =~ /^author .* (\d+ [\+-]?\d+)$/) { + $buf->[$i] = "author <> $1\n"; + } elsif ($l =~ /^data (\d+)/) { + $buf->[$i++] = "data " . length($cmt_msg) . "\n"; + $buf->[$i] = $cmt_msg; + last; + } } +} - my $lockfh = delete $self->{lockfh} or die "BUG: not locked: $!"; - flock($lockfh, LOCK_UN) or die "unlock failed: $!"; - close $lockfh or die "close lock failed: $!"; +sub purge_oids { + my ($self, $purge) = @_; + my $tmp = "refs/heads/purge-".((keys %$purge)[0]); + my $old = $self->{'ref'}; + my $git = $self->{git}; + my @export = (qw(fast-export --no-data --use-done-feature), $old); + my ($rd, $pid) = $git->popen(@export); + my ($r, $w) = $self->gfi_start; + my @buf; + my $npurge = 0; + my @oids; + my ($done, $mark); + my $tree = $self->{-tree}; + while (<$rd>) { + if (/^reset (?:.+)/) { + push @buf, "reset $tmp\n"; + } elsif (/^commit (?:.+)/) { + if (@buf) { + $w->print(@buf) or wfail; + @buf = (); + } + push @buf, "commit $tmp\n"; + } elsif (/^data (\d+)/) { + # only commit message, so $len is small: + my $len = $1; # + 1 for trailing "\n" + push @buf, $_; + my $n = read($rd, my $buf, $len) or die "read: $!"; + $len == $n or die "short read ($n < $len)"; + push @buf, $buf; + } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { + my ($oid, $path) = ($1, $2); + if ($purge->{$oid}) { + push @oids, $oid; + delete $tree->{$path}; + } else { + $tree->{$path} = 1; + push @buf, $_; + } + } elsif (/^D (\w+)/) { + my $path = $1; + push @buf, $_ if $tree->{$path}; + } elsif ($_ eq "\n") { + if (@oids) { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + clean_purge_buffer(\@oids, \@buf); + $npurge++; + } + $w->print(@buf, "\n") or wfail; + @buf = (); + } elsif ($_ eq "done\n") { + $done = 1; + } elsif (/^mark :(\d+)$/) { + push @buf, $_; + $mark = $1; + } else { + push @buf, $_; + } + } + if (@buf) { + $w->print(@buf) or wfail; + } + die 'done\n not seen from fast-export' unless $done; + chomp(my $cmt = $self->get_mark(":$mark")) if $npurge; + $self->{nchg} = 0; # prevent _update_git_info until update-ref: + $self->done; + my @git = ('git', "--git-dir=$git->{git_dir}"); + + run_die([@git, qw(update-ref), $old, $tmp]) if $npurge; + + run_die([@git, qw(update-ref -d), $tmp]); + + return if $npurge == 0; + + run_die([@git, qw(-c gc.reflogExpire=now gc --prune=all)]); + my $err = 0; + foreach my $oid (keys %$purge) { + my @info = $git->check($oid); + if (@info) { + warn "$oid not purged\n"; + $err++; + } + } + _update_git_info($self, 0); + die "Failed to purge $err object(s)\n" if $err; + $cmt; } 1; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 2ec2be69..706089ca 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -8,6 +8,8 @@ use warnings; use PublicInbox::Git; use PublicInbox::MID qw(mid2path); use Devel::Peek qw(SvREFCNT); +use PublicInbox::MIME; +use POSIX qw(strftime); my $cleanup_timer; eval { @@ -29,6 +31,7 @@ sub cleanup_task () { sub _cleanup_later ($) { my ($self) = @_; + return unless PublicInbox::EvCleanup::enabled(); $cleanup_timer ||= PublicInbox::EvCleanup::later(*cleanup_task); $CLEANUP->{"$self"} = $self; } @@ -73,24 +76,71 @@ sub new { _set_limiter($opts, $pi_config, 'httpbackend'); _set_uint($opts, 'feedmax', 25); $opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'}; + my $dir = $opts->{mainrepo}; + if (defined $dir && -f "$dir/inbox.lock") { + $opts->{version} = 2; + } bless $opts, $class; } +sub git_part { + my ($self, $part) = @_; + ($self->{version} || 1) == 2 or return; + $self->{"$part.git"} ||= eval { + my $git_dir = "$self->{mainrepo}/git/$part.git"; + my $g = PublicInbox::Git->new($git_dir); + $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; + # no cleanup needed, we never cat-file off this, only clone + $g; + }; +} + sub git { my ($self) = @_; $self->{git} ||= eval { - my $g = PublicInbox::Git->new($self->{mainrepo}); + my $git_dir = $self->{mainrepo}; + $git_dir .= '/all.git' if (($self->{version} || 1) == 2); + my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; _cleanup_later($self); $g; }; } +sub max_git_part { + my ($self) = @_; + my $v = $self->{version}; + return unless defined($v) && $v == 2; + my $part = $self->{-max_git_part}; + my $changed = git($self)->alternates_changed; + if (!defined($part) || $changed) { + $self->git->cleanup if $changed; + my $gits = "$self->{mainrepo}/git"; + if (opendir my $dh, $gits) { + my $max = -1; + while (defined(my $git_dir = readdir($dh))) { + $git_dir =~ m!\A(\d+)\.git\z! or next; + $max = $1 if $1 > $max; + } + $part = $self->{-max_git_part} = $max if $max >= 0; + } else { + warn "opendir $gits failed: $!\n"; + } + } + $part; +} + sub mm { my ($self) = @_; $self->{mm} ||= eval { + require PublicInbox::Msgmap; _cleanup_later($self); - PublicInbox::Msgmap->new($self->{mainrepo}); + my $dir = $self->{mainrepo}; + if (($self->{version} || 1) >= 2) { + PublicInbox::Msgmap->new_file("$dir/msgmap.sqlite3"); + } else { + PublicInbox::Msgmap->new($dir); + } }; } @@ -98,7 +148,7 @@ sub search { my ($self) = @_; $self->{search} ||= eval { _cleanup_later($self); - PublicInbox::Search->new($self->{mainrepo}, $self->{altid}); + PublicInbox::Search->new($self, $self->{altid}); }; } @@ -120,7 +170,7 @@ sub description { local $/ = "\n"; chomp $desc; $desc =~ s/\s+/ /smg; - $desc = '($GIT_DIR/description missing)' if $desc eq ''; + $desc = '($REPO_DIR/description missing)' if $desc eq ''; $self->{description} = $desc; } @@ -222,26 +272,49 @@ sub msg_by_path ($$;$) { sub msg_by_smsg ($$;$) { my ($self, $smsg, $ref) = @_; - return unless defined $smsg; # ghost - - # backwards compat to fallback to msg_by_mid - # TODO: remove if we bump SCHEMA_VERSION in Search.pm: - defined(my $blob = $smsg->{blob}) or - return msg_by_mid($self, $smsg->mid); + # ghosts may have undef smsg (from SearchThread.node) or + # no {blob} field + return unless defined $smsg; + defined(my $blob = $smsg->{blob}) or return; my $str = git($self)->cat_file($blob, $ref); $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; $str; } -sub path_check { - my ($self, $path) = @_; - git($self)->check('HEAD:'.$path); +sub smsg_mime { + my ($self, $smsg, $ref) = @_; + if (my $s = msg_by_smsg($self, $smsg, $ref)) { + $smsg->{mime} = PublicInbox::MIME->new($s); + return $smsg; + } +} + +sub mid2num($$) { + my ($self, $mid) = @_; + my $mm = mm($self) or return; + $mm->num_for($mid); +} + +sub smsg_by_mid ($$) { + my ($self, $mid) = @_; + my $srch = search($self) or return; + # favor the Message-ID we used for the NNTP article number: + my $num = mid2num($self, $mid); + defined $num ? $srch->lookup_article($num) : undef; } sub msg_by_mid ($$;$) { my ($self, $mid, $ref) = @_; - msg_by_path($self, mid2path($mid), $ref); + my $srch = search($self) or + return msg_by_path($self, mid2path($mid), $ref); + my $smsg = smsg_by_mid($self, $mid); + $smsg ? msg_by_smsg($self, $smsg, $ref) : undef; +} + +sub recent { + my ($self, $opts, $after, $before) = @_; + search($self)->{over_ro}->recent($opts, $after, $before); } 1; diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm new file mode 100644 index 00000000..5c11a36c --- /dev/null +++ b/lib/PublicInbox/InboxWritable.pm @@ -0,0 +1,228 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# Extends read-only Inbox for writing +package PublicInbox::InboxWritable; +use strict; +use warnings; +use base qw(PublicInbox::Inbox); +use PublicInbox::Import; +use PublicInbox::Filter::Base; +*REJECT = *PublicInbox::Filter::Base::REJECT; + +use constant { + PERM_UMASK => 0, + OLD_PERM_GROUP => 1, + OLD_PERM_EVERYBODY => 2, + PERM_GROUP => 0660, + PERM_EVERYBODY => 0664, +}; + +sub new { + my ($class, $ibx) = @_; + bless $ibx, $class; +} + +sub importer { + my ($self, $parallel) = @_; + $self->{-importer} ||= eval { + my $v = $self->{version} || 1; + if ($v == 2) { + eval { require PublicInbox::V2Writable }; + die "v2 not supported: $@\n" if $@; + my $v2w = PublicInbox::V2Writable->new($self); + $v2w->{parallel} = $parallel; + $v2w; + } elsif ($v == 1) { + my $git = $self->git; + my $name = $self->{name}; + my $addr = $self->{-primary_address}; + PublicInbox::Import->new($git, $name, $addr, $self); + } else { + die "unsupported inbox version: $v\n"; + } + } +} + +sub filter { + my ($self) = @_; + my $f = $self->{filter}; + if ($f && $f =~ /::/) { + my @args = (-inbox => $self); + # basic line splitting, only + # Perhaps we can have proper quote splitting one day... + ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; + + eval "require $f"; + if ($@) { + warn $@; + } else { + # e.g: PublicInbox::Filter::Vger->new(@args) + return $f->new(@args); + } + } + undef; +} + +sub is_maildir_basename ($) { + my ($bn) = @_; + return 0 if $bn !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/; + if ($bn =~ /:2,([A-Z]+)\z/i) { + my $flags = $1; + return 0 if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail + } + 1; +} + +sub is_maildir_path ($) { + my ($path) = @_; + my @p = split(m!/+!, $path); + (is_maildir_basename($p[-1]) && -f $path) ? 1 : 0; +} + +sub maildir_path_load ($) { + my ($path) = @_; + if (open my $fh, '<', $path) { + local $/; + my $str = <$fh>; + $str or return; + return PublicInbox::MIME->new(\$str); + } elsif ($!{ENOENT}) { + # common with Maildir + return; + } else { + warn "failed to open $path: $!\n"; + return; + } +} + +sub import_maildir { + my ($self, $dir) = @_; + my $im = $self->importer(1); + my $filter = $self->filter; + foreach my $sub (qw(cur new tmp)) { + -d "$dir/$sub" or die "$dir is not a Maildir (missing $sub)\n"; + } + foreach my $sub (qw(cur new)) { + opendir my $dh, "$dir/$sub" or die "opendir $dir/$sub: $!\n"; + while (defined(my $fn = readdir($dh))) { + next unless is_maildir_basename($fn); + my $mime = maildir_file_load("$dir/$fn") or next; + if ($filter) { + my $ret = $filter->scrub($mime) or return; + return if $ret == REJECT(); + $mime = $ret; + } + $im->add($mime); + } + } + $im->done; +} + +# asctime: From example@example.com Fri Jun 23 02:56:55 2000 +my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; + +sub mb_add ($$$$) { + my ($im, $variant, $filter, $msg) = @_; + $$msg =~ s/(\r?\n)+\z/$1/s; + my $mime = PublicInbox::MIME->new($msg); + if ($variant eq 'mboxrd') { + $$msg =~ s/^>(>*From )/$1/sm; + } elsif ($variant eq 'mboxo') { + $$msg =~ s/^>From /From /sm; + } + if ($filter) { + my $ret = $filter->scrub($mime) or return; + return if $ret == REJECT(); + $mime = $ret; + } + $im->add($mime) +} + +sub import_mbox { + my ($self, $fh, $variant) = @_; + if ($variant !~ /\A(?:mboxrd|mboxo)\z/) { + die "variant must be 'mboxrd' or 'mboxo'\n"; + } + my $im = $self->importer(1); + my $prev = undef; + my $msg = ''; + my $filter = $self->filter; + while (defined(my $l = <$fh>)) { + if ($l =~ /$from_strict/o) { + if (!defined($prev) || $prev =~ /^\r?$/) { + mb_add($im, $variant, $filter, \$msg) if $msg; + $msg = ''; + $prev = $l; + next; + } + warn "W[$.] $l\n"; + } + $prev = $l; + $msg .= $l; + } + mb_add($im, $variant, $filter, \$msg) if $msg; + $im->done; +} + +sub _read_git_config_perm { + my ($self) = @_; + my @cmd = qw(config); + chomp(my $perm = $self->git->qx('config', 'core.sharedRepository')); + $perm; +} + +sub _git_config_perm { + my $self = shift; + my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); + return PERM_GROUP if (!defined($perm) || $perm eq ''); + return PERM_UMASK if ($perm eq 'umask'); + return PERM_GROUP if ($perm eq 'group'); + if ($perm =~ /\A(?:all|world|everybody)\z/) { + return PERM_EVERYBODY; + } + return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); + return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); + + my $i = oct($perm); + return PERM_UMASK if ($i == PERM_UMASK); + return PERM_GROUP if ($i == OLD_PERM_GROUP); + return PERM_EVERYBODY if ($i == OLD_PERM_EVERYBODY); + + if (($i & 0600) != 0600) { + die "core.sharedRepository mode invalid: ". + sprintf('%.3o', $i) . "\nOwner must have permissions\n"; + } + ($i & 0666); +} + +sub _umask_for { + my ($perm) = @_; # _git_config_perm return value + my $rv = $perm; + return umask if $rv == 0; + + # set +x bit if +r or +w were set + $rv |= 0100 if ($rv & 0600); + $rv |= 0010 if ($rv & 0060); + $rv |= 0001 if ($rv & 0006); + (~$rv & 0777); +} + +sub with_umask { + my ($self, $cb) = @_; + my $old = umask $self->{umask}; + my $rv = eval { $cb->() }; + my $err = $@; + umask $old; + die $err if $err; + $rv; +} + +sub umask_prepare { + my ($self) = @_; + my $perm = _git_config_perm($self); + my $umask = _umask_for($perm); + $self->{umask} = $umask; +} + +1; diff --git a/lib/PublicInbox/Lock.pm b/lib/PublicInbox/Lock.pm new file mode 100644 index 00000000..ca6b33f2 --- /dev/null +++ b/lib/PublicInbox/Lock.pm @@ -0,0 +1,31 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# Base class for per-inbox locking +package PublicInbox::Lock; +use strict; +use warnings; +use Fcntl qw(:flock :DEFAULT); +use Carp qw(croak); + +# we only acquire the flock if creating or reindexing; +# PublicInbox::Import already has the lock on its own. +sub lock_acquire { + my ($self) = @_; + croak 'already locked' if $self->{lockfh}; + my $lock_path = $self->{lock_path} or return; + sysopen(my $lockfh, $lock_path, O_WRONLY|O_CREAT) or + die "failed to open lock $lock_path: $!\n"; + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + $self->{lockfh} = $lockfh; +} + +sub lock_release { + my ($self) = @_; + return unless $self->{lock_path}; + my $lockfh = delete $self->{lockfh} or croak 'not locked'; + flock($lockfh, LOCK_UN) or die "unlock failed: $!\n"; + close $lockfh or die "close failed: $!\n"; +} + +1; diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index d5af8f94..637404eb 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -81,8 +81,6 @@ sub set_list_headers { $pa =~ tr/@/./; # RFC2919 $simple->header_set("List-Id", "<$pa>"); } - - $simple->header_set($_) foreach @BAD_HEADERS; } 1; diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 2c9822f4..c82e8401 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -6,10 +6,14 @@ package PublicInbox::MID; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC/; +our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC + mids references/; use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; -use constant MID_MAX => 40; # SHA-1 hex length +use constant { + MID_MAX => 40, # SHA-1 hex length # TODO: get rid of this + MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q') +}; sub mid_clean { my ($mid) = @_; @@ -46,7 +50,53 @@ sub mid2path { "$x2/$x38"; } -sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') } +# Only for v1 code paths: +sub mid_mime ($) { mids($_[0]->header_obj)->[0] } + +sub mids ($) { + my ($hdr) = @_; + my @mids; + my @v = $hdr->header_raw('Message-Id'); + foreach my $v (@v) { + my @cur = ($v =~ /<([^>]+)>/sg); + if (@cur) { + push(@mids, @cur); + } else { + push(@mids, $v); + } + } + uniq_mids(\@mids); +} + +# last References should be IRT, but some mail clients do things +# out of order, so trust IRT over References iff IRT exists +sub references ($) { + my ($hdr) = @_; + my @mids; + foreach my $f (qw(References In-Reply-To)) { + my @v = $hdr->header_raw($f); + foreach my $v (@v) { + push(@mids, ($v =~ /<([^>]+)>/sg)); + } + } + uniq_mids(\@mids); +} + +sub uniq_mids ($) { + my ($mids) = @_; + my @ret; + my %seen; + foreach my $mid (@$mids) { + if (length($mid) > MAX_MID_SIZE) { + warn "Message-ID: <$mid> too long, truncating\n"; + $mid = substr($mid, 0, MAX_MID_SIZE); + } + next if $seen{$mid}; + push @ret, $mid; + $seen{$mid} = 1; + } + \@ret; +} # RFC3986, section 3.3: sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' } diff --git a/lib/PublicInbox/MIME.pm b/lib/PublicInbox/MIME.pm index 54925a85..456eed64 100644 --- a/lib/PublicInbox/MIME.pm +++ b/lib/PublicInbox/MIME.pm @@ -23,6 +23,8 @@ package PublicInbox::MIME; use strict; use warnings; use base qw(Email::MIME); +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; if ($Email::MIME::VERSION <= 1.937) { sub parts_multipart { diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 04c86cc1..11b23022 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -26,12 +26,51 @@ sub subject_fn ($) { $fn eq '' ? 'no-subject' : $fn; } -sub emit1 { - my ($ctx, $msg) = @_; - $msg = Email::Simple->new($msg); - my $fn = subject_fn($msg); +sub mb_stream { + my ($more) = @_; + bless $more, 'PublicInbox::Mbox'; +} + +# called by PSGI server as body response +sub getline { + my ($more) = @_; # self + my ($ctx, $id, $prev, $next, $cur) = @$more; + if ($cur) { # first + pop @$more; + return msg_str($ctx, $cur); + } + $cur = $next or return; + my $ibx = $ctx->{-inbox}; + $next = $ibx->search->next_by_mid($ctx->{mid}, \$id, \$prev); + @$more = ($ctx, $id, $prev, $next); # $next may be undef, here + my $mref = $ibx->msg_by_smsg($cur) or return; + msg_str($ctx, Email::Simple->new($mref)); +} + +sub close {} # noop + +sub emit_raw { + my ($ctx) = @_; + my $mid = $ctx->{mid}; + my $ibx = $ctx->{-inbox}; + my $first; + my $more; + if (my $srch = $ibx->search) { + my ($id, $prev); + my $smsg = $srch->next_by_mid($mid, \$id, \$prev) or return; + my $mref = $ibx->msg_by_smsg($smsg) or return; + $first = Email::Simple->new($mref); + my $next = $srch->next_by_mid($mid, \$id, \$prev); + # $more is for ->getline + $more = [ $ctx, $id, $prev, $next, $first ] if $next; + } else { + my $mref = $ibx->msg_by_mid($mid) or return; + $first = Email::Simple->new($mref); + } + return unless defined $first; + my $fn = subject_fn($first); my @hdr = ('Content-Type'); - if ($ctx->{-inbox}->{obfuscate}) { + if ($ibx->{obfuscate}) { # obfuscation is stupid, but maybe scrapers are, too... push @hdr, 'application/mbox'; $fn .= '.mbox'; @@ -40,14 +79,11 @@ sub emit1 { $fn .= '.txt'; } push @hdr, 'Content-Disposition', "inline; filename=$fn"; - - # single message should be easily renderable in browsers, - # unless obfuscation is enabled :< - [ 200, \@hdr, [ msg_str($ctx, $msg) ] ] + [ 200, \@hdr, $more ? mb_stream($more) : [ msg_str($ctx, $first) ] ]; } sub msg_str { - my ($ctx, $simple) = @_; # Email::Simple object + my ($ctx, $simple, $mid) = @_; # Email::Simple object my $header_obj = $simple->header_obj; # drop potentially confusing headers, ssoma already should've dropped @@ -57,7 +93,7 @@ sub msg_str { } my $ibx = $ctx->{-inbox}; my $base = $ibx->base_url($ctx->{env}); - my $mid = mid_clean($header_obj->header('Message-ID')); + $mid = $ctx->{mid} unless defined $mid; $mid = mid_escape($mid); my @append = ( 'Archived-At', "<$base$mid/>", @@ -93,9 +129,24 @@ sub thread_mbox { my ($ctx, $srch, $sfx) = @_; eval { require IO::Compress::Gzip }; return sub { need_gzip(@_) } if $@; - - my $cb = sub { $srch->get_thread($ctx->{mid}, @_) }; - PublicInbox::MboxGz->response($ctx, $cb); + my $mid = $ctx->{mid}; + my $msgs = $srch->get_thread($mid, {}); + return [404, [qw(Content-Type text/plain)], []] if !@$msgs; + my $prev = $msgs->[-1]; + my $i = 0; + my $cb = sub { + while (1) { + if (my $smsg = $msgs->[$i++]) { + return $smsg; + } + # refill result set + $msgs = $srch->get_thread($mid, $prev); + return unless @$msgs; + $prev = $msgs->[-1]; + $i = 0; + } + }; + PublicInbox::MboxGz->response($ctx, $cb, $msgs->[0]->subject); } sub emit_range { @@ -110,12 +161,56 @@ sub emit_range { mbox_all($ctx, $query); } +sub mbox_all_ids { + my ($ctx) = @_; + my $prev = 0; + my $ids = $ctx->{-inbox}->mm->ids_after(\$prev) or return + [404, [qw(Content-Type text/plain)], ["No results found\n"]]; + my $i = 0; + my $over = $ctx->{srch}->{over_ro}; + my $cb = sub { + do { + while ((my $num = $ids->[$i++])) { + my $smsg = $over->get_art($num) or next; + return $smsg; + } + $ids = $ctx->{-inbox}->mm->ids_after(\$prev); + $i = 0; + } while (@$ids); + undef; + }; + return PublicInbox::MboxGz->response($ctx, $cb, 'all'); +} + sub mbox_all { my ($ctx, $query) = @_; eval { require IO::Compress::Gzip }; return sub { need_gzip(@_) } if $@; - my $cb = sub { $ctx->{srch}->query($query, @_) }; + return mbox_all_ids($ctx) if $query eq ''; + my $opts = { mset => 2 }; + my $srch = $ctx->{srch}; + my $mset = $srch->query($query, $opts); + $opts->{offset} = $mset->size or + return [404, [qw(Content-Type text/plain)], + ["No results found\n"]]; + my $i = 0; + my $cb = sub { # called by MboxGz->getline + while (1) { + while (my $mi = (($mset->items)[$i++])) { + my $doc = $mi->get_document; + my $smsg = $srch->retry_reopen(sub { + PublicInbox::SearchMsg->load_doc($doc); + }) or next; + return $smsg; + } + # refill result set + $mset = $srch->query($query, $opts); + my $size = $mset->size or return; + $opts->{offset} += $size; + $i = 0; + } + }; PublicInbox::MboxGz->response($ctx, $cb, 'results-'.$query); } @@ -146,8 +241,6 @@ sub new { gz => IO::Compress::Gzip->new(\$buf, Time => 0), cb => $cb, ctx => $ctx, - msgs => [], - opts => { offset => 0 }, }, $class; } @@ -155,63 +248,35 @@ sub response { my ($class, $ctx, $cb, $fn) = @_; my $body = $class->new($ctx, $cb); # http://www.iana.org/assignments/media-types/application/gzip - $body->{hdr} = [ 'Content-Type', 'application/gzip' ]; - $body->{fn} = $fn; - # fill in Content-Disposition filename - my $hdr = $body->getline; - if ($body->{hdr}) { - return [ 404, ['Content-Type','text/plain'], - [ "No results found\n" ] ]; + my @h = qw(Content-Type application/gzip); + if ($fn) { + $fn = to_filename($fn); + push @h, 'Content-Disposition', "inline; filename=$fn.mbox.gz"; } - [ 200, $hdr, $body ]; -} - -sub set_filename ($$) { - my ($fn, $msg) = @_; - return to_filename($fn) if defined($fn); - - PublicInbox::Mbox::subject_fn($msg); + [ 200, \@h, $body ]; } # called by Plack::Util::foreach or similar sub getline { my ($self) = @_; my $ctx = $self->{ctx} or return; - my $res; - my $ibx = $ctx->{-inbox}; - my $gz = $self->{gz}; - do { - # work on existing result set - while (defined(my $smsg = shift @{$self->{msgs}})) { - my $msg = eval { $ibx->msg_by_smsg($smsg) } or next; - $msg = Email::Simple->new($msg); - $gz->write(PublicInbox::Mbox::msg_str($ctx, $msg)); - - # use subject of first message as subject - if (my $hdr = delete $self->{hdr}) { - my $fn = set_filename($self->{fn}, $msg); - push @$hdr, 'Content-Disposition', - "inline; filename=$fn.mbox.gz"; - return $hdr; - } - my $bref = $self->{buf}; - if (length($$bref) >= 8192) { - my $ret = $$bref; # copy :< - ${$self->{buf}} = ''; - return $ret; - } - - # be fair to other clients on public-inbox-httpd: - return ''; + while (my $smsg = $self->{cb}->()) { + my $msg = $ctx->{-inbox}->msg_by_smsg($smsg) or next; + $msg = Email::Simple->new($msg); + $self->{gz}->write(PublicInbox::Mbox::msg_str($ctx, $msg, + $smsg->{mid})); + my $bref = $self->{buf}; + if (length($$bref) >= 8192) { + my $ret = $$bref; # copy :< + ${$self->{buf}} = ''; + return $ret; } - # refill result set - $res = $self->{cb}->($self->{opts}); - $self->{msgs} = $res->{msgs}; - $res = scalar @{$self->{msgs}}; - $self->{opts}->{offset} += $res; - } while ($res); - $gz->close; + # be fair to other clients on public-inbox-httpd: + return ''; + } + delete($self->{gz})->close; + # signal that we're done and can return undef next call: delete $self->{ctx}; ${delete $self->{buf}}; } diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm new file mode 100644 index 00000000..c67a41ff --- /dev/null +++ b/lib/PublicInbox/MsgTime.pm @@ -0,0 +1,82 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +package PublicInbox::MsgTime; +use strict; +use warnings; +use base qw(Exporter); +our @EXPORT_OK = qw(msg_timestamp msg_datestamp); +use Date::Parse qw(str2time); +use Time::Zone qw(tz_offset); + +sub zone_clamp ($) { + my ($zone) = @_; + $zone ||= '+0000'; + # "-1200" is the furthest westermost zone offset, + # but git fast-import is liberal so we use "-1400" + if ($zone >= 1400 || $zone <= -1400) { + warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; + $zone = '+0000'; + } + $zone; +} + +sub time_response ($) { + my ($ret) = @_; + wantarray ? @$ret : $ret->[0]; +} + +sub msg_received_at ($) { + my ($hdr) = @_; # Email::MIME::Header + my @recvd = $hdr->header_raw('Received'); + my ($ts, $zone); + foreach my $r (@recvd) { + $zone = undef; + $r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+ + \d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next; + $zone = $2; + $ts = eval { str2time($1) } and last; + my $mid = $hdr->header_raw('Message-ID'); + warn "no date in $mid Received: $r\n"; + } + defined $ts ? [ $ts, zone_clamp($zone) ] : undef; +} + +sub msg_date_only ($) { + my ($hdr) = @_; # Email::MIME::Header + my @date = $hdr->header_raw('Date'); + my ($ts, $zone); + foreach my $d (@date) { + $zone = undef; + # Y2K problems: 3-digit years + $d =~ s!([A-Za-z]{3}) (\d{3}) (\d\d:\d\d:\d\d)! + my $yyyy = $2 + 1900; "$1 $yyyy $3"!e; + $ts = eval { str2time($d) }; + if ($@) { + my $mid = $hdr->header_raw('Message-ID'); + warn "bad Date: $d in $mid: $@\n"; + } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) { + $zone = $1; + } + } + defined $ts ? [ $ts, zone_clamp($zone) ] : undef; +} + +# Favors Received header for sorting globally +sub msg_timestamp ($) { + my ($hdr) = @_; # Email::MIME::Header + my $ret; + $ret = msg_received_at($hdr) and return time_response($ret); + $ret = msg_date_only($hdr) and return time_response($ret); + wantarray ? (time, '+0000') : time; +} + +# Favors the Date: header for display and sorting within a thread +sub msg_datestamp ($) { + my ($hdr) = @_; # Email::MIME::Header + my $ret; + $ret = msg_date_only($hdr) and return time_response($ret); + $ret = msg_received_at($hdr) and return time_response($ret); + wantarray ? (time, '+0000') : time; +} + +1; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index 6b6d1c6e..ec3d4f9d 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -12,6 +12,7 @@ use strict; use warnings; use DBI; use DBD::SQLite; +use File::Temp qw(tempfile); sub new { my ($class, $git_dir, $writable) = @_; @@ -23,9 +24,8 @@ sub new { new_file($class, "$d/msgmap.sqlite3", $writable); } -sub new_file { - my ($class, $f, $writable) = @_; - +sub dbh_new { + my ($f, $writable) = @_; my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { AutoCommit => 1, RaiseError => 1, @@ -34,6 +34,14 @@ sub new_file { sqlite_use_immediate_transaction => 1, }); $dbh->do('PRAGMA case_sensitive_like = ON'); + $dbh; +} + +sub new_file { + my ($class, $f, $writable) = @_; + return if !$writable && !-r $f; + + my $dbh = dbh_new($f, $writable); my $self = bless { dbh => $dbh }, $class; if ($writable) { @@ -45,6 +53,19 @@ sub new_file { $self; } +# used to keep track of used numeric mappings for v2 reindex +sub tmp_clone { + my ($self) = @_; + my ($fh, $fn) = tempfile('msgmap-XXXXXXXX', EXLOCK => 0, TMPDIR => 1); + $self->{dbh}->sqlite_backup_to_file($fn); + my $tmp = ref($self)->new_file($fn, 1); + $tmp->{dbh}->do('PRAGMA synchronous = OFF'); + $tmp->{tmp_name} = $fn; # SQLite won't work if unlinked, apparently + $tmp->{pid} = $$; + close $fh or die "failed to close $fn: $!"; + $tmp; +} + # n.b. invoked directly by scripts/xhdr-num2mid sub meta_accessor { my ($self, $key, $value) = @_; @@ -71,6 +92,14 @@ sub last_commit { $self->meta_accessor('last_commit', $commit); } +# v2 uses this to keep track of how up-to-date Xapian is +# old versions may be automatically GC'ed away in the future, +# but it's a trivial amount of storage. +sub last_commit_xap { + my ($self, $version, $i, $commit) = @_; + $self->meta_accessor("last_xap$version-$i", $commit); +} + sub created_at { my ($self, $second) = @_; $self->meta_accessor('created_at', $second); @@ -79,10 +108,10 @@ sub created_at { sub mid_insert { my ($self, $mid) = @_; my $dbh = $self->{dbh}; - my $sql = 'INSERT OR IGNORE INTO msgmap (mid) VALUES (?)'; - my $sth = $self->{mid_insert} ||= $dbh->prepare($sql); - $sth->bind_param(1, $mid); - return if $sth->execute == 0; + my $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO msgmap (mid) VALUES (?) + + return if $sth->execute($mid) == 0; $dbh->last_insert_id(undef, undef, 'msgmap', 'num'); } @@ -109,10 +138,14 @@ sub num_for { sub minmax { my ($self) = @_; my $dbh = $self->{dbh}; - my $sth = $self->{num_minmax} ||= - $dbh->prepare('SELECT MIN(num),MAX(num) FROM msgmap'); + # breaking MIN and MAX into separate queries speeds up from 250ms + # to around 700us with 2.7million messages. + my $sth = $dbh->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1); $sth->execute; - $sth->fetchrow_array; + my $min = $sth->fetchrow_array; + $sth = $dbh->prepare_cached('SELECT MAX(num) FROM msgmap', undef, 1); + $sth->execute; + ($min, $sth->fetchrow_array); } sub mid_prefixes { @@ -140,6 +173,14 @@ sub mid_delete { $sth->execute; } +sub num_delete { + my ($self, $num) = @_; + my $dbh = $self->{dbh}; + my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?'); + $sth->bind_param(1, $num); + $sth->execute; +} + sub create_tables { my ($dbh) = @_; my $e; @@ -157,17 +198,26 @@ sub create_tables { } # used by NNTP.pm -sub id_batch { - my ($self, $num, $cb) = @_; +sub ids_after { + my ($self, $num) = @_; + my $ids = $self->{dbh}->selectcol_arrayref(<<'', undef, $$num); +SELECT num FROM msgmap WHERE num > ? +ORDER BY num ASC LIMIT 1000 + + $$num = $ids->[-1] if @$ids; + $ids; +} + +sub msg_range { + my ($self, $beg, $end) = @_; my $dbh = $self->{dbh}; - my $sth = $dbh->prepare('SELECT num FROM msgmap WHERE num > ? '. - 'ORDER BY num ASC LIMIT 1000'); - $sth->execute($num); - my $ary = $sth->fetchall_arrayref; - @$ary = map { $_->[0] } @$ary; - my $nr = scalar @$ary; - $cb->($ary) if $nr; - $nr; + my $attr = { Columns => [] }; + my $mids = $dbh->selectall_arrayref(<<'', $attr, $$beg, $end); +SELECT num,mid FROM msgmap WHERE num >= ? AND num <= ? +ORDER BY num ASC + + $$beg = $mids->[-1]->[0] + 1 if @$mids; + $mids } # only used for mapping external serial numbers (e.g. articles from gmane) @@ -181,4 +231,31 @@ sub mid_set { $sth->execute($num, $mid); } +sub DESTROY { + my ($self) = @_; + delete $self->{dbh}; + my $f = delete $self->{tmp_name}; + if (defined $f && $self->{pid} == $$) { + unlink $f or warn "failed to unlink $f: $!\n"; + } +} + +sub atfork_parent { + my ($self) = @_; + my $f = $self->{tmp_name} or die "not a temporary clone\n"; + delete $self->{dbh} and die "tmp_clone dbh not prepared for parent"; + my $dbh = $self->{dbh} = dbh_new($f, 1); + $dbh->do('PRAGMA synchronous = OFF'); +} + +sub atfork_prepare { + my ($self) = @_; + my $f = $self->{tmp_name} or die "not a temporary clone\n"; + $self->{pid} == $$ or + die "BUG: atfork_prepare not called from $self->{pid}\n"; + $self->{dbh} or die "temporary clone not open\n"; + # must clobber prepared statements + %$self = (tmp_name => $f, pid => $$); +} + 1; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index c574c9e6..cdbd8e98 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -34,7 +34,6 @@ my $LIST_HEADERS = join("\r\n", @OVERVIEW, qw(:bytes :lines Xref To Cc)) . "\r\n"; # disable commands with easy DoS potential: -# LISTGROUP could get pretty bad, too... my %DISABLED; # = map { $_ => 1 } qw(xover list_overview_fmt newnews xhdr); my $EXPMAP; # fd -> [ idle_time, $self ] @@ -226,15 +225,12 @@ sub cmd_listgroup ($;$) { } $self->{ng} or return '412 no newsgroup selected'; - long_response($self, 0, long_response_limit, sub { - my ($i) = @_; - my $nr = $self->{ng}->mm->id_batch($$i, sub { - my ($ary) = @_; - more($self, join("\r\n", @$ary)); - }); - - # -1 to adjust for implicit increment in long_response - $$i = $nr ? $$i + $nr - 1 : long_response_limit; + my $n = 0; + long_response($self, sub { + my $ary = $self->{ng}->mm->ids_after(\$n); + scalar @$ary or return; + more($self, join("\r\n", @$ary)); + 1; }); } @@ -332,24 +328,22 @@ sub cmd_newnews ($$$$;$$) { }; return '.' unless @srch; - $ts .= '..'; - my $opts = { asc => 1, limit => 1000, offset => 0 }; - long_response($self, 0, long_response_limit, sub { - my ($i) = @_; + my $prev = 0; + long_response($self, sub { my $srch = $srch[0]; - my $res = $srch->query_ts($ts, $opts); - my $msgs = $res->{msgs}; - if (my $nr = scalar @$msgs) { + my $msgs = $srch->query_ts($ts, $prev); + if (scalar @$msgs) { more($self, '<' . join(">\r\n<", map { $_->mid } @$msgs ). '>'); - $opts->{offset} += $nr; + $prev = $msgs->[-1]->{num}; } else { shift @srch; if (@srch) { # continue onto next newsgroup - $opts->{offset} = 0; + $prev = 0; + return 1; } else { # break out of the long response. - $$i = long_response_limit; + return; } } }); @@ -414,12 +408,30 @@ sub header_append ($$$) { $hdr->header_set($k, @v, $v); } -sub set_nntp_headers { - my ($hdr, $ng, $n, $mid) = @_; +sub xref ($$$$) { + my ($self, $ng, $n, $mid) = @_; + my $ret = "$ng->{domain} $ng->{newsgroup}:$n"; + + # num_for is pretty cheap and sometimes we'll lookup the existence + # of an article without getting even the OVER info. In other words, + # I'm not sure if its worth optimizing by scanning To:/Cc: and + # PublicInbox::ExtMsg on the PSGI end is just as expensive + foreach my $other (@{$self->{nntpd}->{grouplist}}) { + next if $ng eq $other; + my $num = eval { $other->mm->num_for($mid) } or next; + $ret .= " $other->{newsgroup}:$num"; + } + $ret; +} + +sub set_nntp_headers ($$$$$) { + my ($self, $hdr, $ng, $n, $mid) = @_; # clobber some - $hdr->header_set('Newsgroups', $ng->{newsgroup}); - $hdr->header_set('Xref', xref($ng, $n)); + my $xref = xref($self, $ng, $n, $mid); + $hdr->header_set('Xref', $xref); + $xref =~ s/:\d+//g; + $hdr->header_set('Newsgroups', (split(/ /, $xref, 2))[1]); header_append($hdr, 'List-Post', "{-primary_address}>"); if (my $url = $ng->base_url) { $mid = mid_escape($mid); @@ -464,18 +476,16 @@ find_mid: defined $mid or return $err; } found: - my $bytes; - my $s = eval { $ng->msg_by_mid($mid, \$bytes) } or return $err; - $s = Email::Simple->new($s); - my $lines; + my $smsg = $ng->search->{over_ro}->get_art($n) or return $err; + my $msg = $ng->msg_by_smsg($smsg) or return $err; + my $s = Email::Simple->new($msg); if ($set_headers) { - set_nntp_headers($s->header_obj, $ng, $n, $mid); - $lines = $s->body =~ tr!\n!\n!; + set_nntp_headers($self, $s->header_obj, $ng, $n, $mid); # must be last $s->body_set('') if ($set_headers == 2); } - [ $n, $mid, $s, $bytes, $lines, $ng ]; + [ $n, $mid, $s, $smsg->bytes, $smsg->lines, $ng ]; } sub simple_body_write ($$) { @@ -573,8 +583,8 @@ sub get_range ($$) { [ $beg, $end ]; } -sub long_response ($$$$) { - my ($self, $beg, $end, $cb) = @_; +sub long_response ($$) { + my ($self, $cb) = @_; die "BUG: nested long response" if $self->{long_res}; my $fd = $self->{fd}; @@ -585,24 +595,14 @@ sub long_response ($$$$) { $self->watch_read(0); my $t0 = now(); $self->{long_res} = sub { - # limit our own running time for fairness with other - # clients and to avoid buffering too much: - my $lim = 100; - - my $err; - do { - eval { $cb->(\$beg, \$lim) }; - } until (($err = $@) || $self->{closed} || - ++$beg > $end || --$lim < 0 || - $self->{write_buf_size}); - - if ($err || $self->{closed}) { + my $more = eval { $cb->() }; + if ($@ || $self->{closed}) { $self->{long_res} = undef; - if ($err) { + if ($@) { err($self, "%s during long response[$fd] - %0.6f", - $err, now() - $t0); + $@, now() - $t0); } if ($self->{closed}) { out($self, " deferred[$fd] aborted - %0.6f", @@ -611,7 +611,7 @@ sub long_response ($$$$) { update_idle_time($self); $self->watch_read(1); } - } elsif ($lim < 0 || $self->{write_buf_size}) { + } elsif ($more) { # $self->{write_buf_size}: # no recursion, schedule another call ASAP # but only after all pending writes are done update_idle_time($self); @@ -643,19 +643,17 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. my $mm = $self->{ng}->mm; my ($beg, $end) = @$r; more($self, $xhdr ? r221 : r225); - long_response($self, $beg, $end, sub { - my ($i) = @_; - my $mid = $mm->mid_for($$i); - more($self, "$$i <$mid>") if defined $mid; + long_response($self, sub { + my $r = $mm->msg_range(\$beg, $end); + @$r or return; + more($self, join("\r\n", map { + "$_->[0] <$_->[1]>" + } @$r)); + 1; }); } } -sub xref ($$) { - my ($ng, $n) = @_; - "$ng->{domain} $ng->{newsgroup}:$n" -} - sub mid_lookup ($$) { my ($self, $mid) = @_; my $self_ng = $self->{ng}; @@ -675,9 +673,11 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin my ($self, $xhdr, $range) = @_; if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID - my ($ng, $n) = mid_lookup($self, $1); + my $mid = $1; + my ($ng, $n) = mid_lookup($self, $mid); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, xref($ng, $n)); + hdr_mid_response($self, $xhdr, $ng, $n, $range, + xref($self, $ng, $n, $mid)); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); @@ -686,26 +686,31 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin my $mm = $ng->mm; my ($beg, $end) = @$r; more($self, $xhdr ? r221 : r225); - long_response($self, $beg, $end, sub { - my ($i) = @_; - my $mid = $mm->mid_for($$i); - more($self, "$$i ".xref($ng, $$i)) if defined $mid; + long_response($self, sub { + my $r = $mm->msg_range(\$beg, $end); + @$r or return; + more($self, join("\r\n", map { + my $num = $_->[0]; + "$num ".xref($self, $ng, $num, $_->[1]); + } @$r)); + 1; }); } } sub search_header_for { - my ($srch, $mid, $field) = @_; - my $smsg = $srch->lookup_mail($mid) or return; - $smsg->$field; + my ($srch, $num, $field) = @_; + my $smsg = $srch->{over_ro}->get_art($num) or return; + return PublicInbox::SearchMsg::date($smsg) if $field eq 'date'; + $smsg->{$field}; } sub hdr_searchmsg ($$$$) { my ($self, $xhdr, $field, $range) = @_; if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID my ($ng, $n) = mid_lookup($self, $1); - return r430 unless $n; - my $v = search_header_for($ng->search, $range, $field); + return r430 unless defined $n; + my $v = search_header_for($ng->search, $n, $field); hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); } else { # numeric range $range = $self->{article} unless defined $range; @@ -715,22 +720,17 @@ sub hdr_searchmsg ($$$$) { return $r unless ref $r; my ($beg, $end) = @$r; more($self, $xhdr ? r221 : r225); - my $off = 0; - long_response($self, $beg, $end, sub { - my ($i, $lim) = @_; - my $res = $srch->query_xover($beg, $end, $off); - my $msgs = $res->{msgs}; + my $cur = $beg; + long_response($self, sub { + my $msgs = $srch->query_xover($cur, $end); my $nr = scalar @$msgs or return; - $off += $nr; - $$lim -= $nr; my $tmp = ''; foreach my $s (@$msgs) { - $tmp .= $s->num . ' ' . $s->$field . "\r\n"; + $tmp .= $s->{num} . ' ' . $s->$field . "\r\n"; } utf8::encode($tmp); do_more($self, $tmp); - # -1 to adjust for implicit increment in long_response - $$i = $nr ? $$i + $nr - 1 : long_response_limit; + $cur = $msgs->[-1]->{num} + 1; }); } } @@ -804,11 +804,11 @@ sub cmd_xrover ($;$) { my $mm = $ng->mm; my $srch = $ng->search; more($self, '224 Overview information follows'); - long_response($self, $beg, $end, sub { - my ($i) = @_; - my $mid = $mm->mid_for($$i) or return; - my $h = search_header_for($srch, $mid, 'references'); - more($self, "$$i $h"); + + long_response($self, sub { + my $h = search_header_for($srch, $beg, 'references'); + more($self, "$beg $h") if defined($h); + $beg++ < $end; }); } @@ -820,10 +820,10 @@ sub over_line ($$) { $smsg->{subject}, $smsg->{from}, PublicInbox::SearchMsg::date($smsg), - '<'.PublicInbox::SearchMsg::mid($smsg).'>', + "<$smsg->{mid}>", $smsg->{references}, - PublicInbox::SearchMsg::bytes($smsg), - PublicInbox::SearchMsg::lines($smsg)); + $smsg->{bytes}, + $smsg->{lines}); utf8::encode($s); $s } @@ -832,8 +832,8 @@ sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ /\A<(.+)>\z/) { my ($ng, $n) = mid_lookup($self, $1); - my $smsg = $ng->search->lookup_mail($range) or - return '430 No article with that message-id'; + defined $n or return r430; + my $smsg = $ng->search->{over_ro}->get_art($n) or return r430; more($self, '224 Overview information follows (multi-line)'); # Only set article number column if it's the current group @@ -854,22 +854,16 @@ sub cmd_xover ($;$) { my ($beg, $end) = @$r; more($self, "224 Overview information follows for $beg to $end"); my $srch = $self->{ng}->search; - my $off = 0; - long_response($self, $beg, $end, sub { - my ($i, $lim) = @_; - my $res = $srch->query_xover($beg, $end, $off); - my $msgs = $res->{msgs}; + my $cur = $beg; + long_response($self, sub { + my $msgs = $srch->query_xover($cur, $end); my $nr = scalar @$msgs or return; - $off += $nr; - $$lim -= $nr; # OVERVIEW.FMT more($self, join("\r\n", map { - over_line(PublicInbox::SearchMsg::num($_), $_); + over_line($_->{num}, $_); } @$msgs)); - - # -1 to adjust for implicit increment in long_response - $$i = $nr ? $$i + $nr - 1 : long_response_limit; + $cur = $msgs->[-1]->{num} + 1; }); } diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm new file mode 100644 index 00000000..07e54b64 --- /dev/null +++ b/lib/PublicInbox/Over.pm @@ -0,0 +1,183 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI +# Unlike Msgmap, this is an _UNSTABLE_ database which can be +# tweaked/updated over time and rebuilt. +package PublicInbox::Over; +use strict; +use warnings; +use DBI; +use DBD::SQLite; +use PublicInbox::SearchMsg; +use Compress::Zlib qw(uncompress); + +sub dbh_new { + my ($self) = @_; + my $ro = ref($self) eq 'PublicInbox::Over'; + my $dbh = DBI->connect("dbi:SQLite:dbname=$self->{filename}",'','', { + AutoCommit => 1, + RaiseError => 1, + PrintError => 0, + ReadOnly => $ro, + sqlite_use_immediate_transaction => 1, + }); + $dbh->{sqlite_unicode} = 1; + $dbh; +} + +sub new { + my ($class, $f) = @_; + bless { filename => $f }, $class; +} + +sub disconnect { $_[0]->{dbh} = undef } + +sub connect { $_[0]->{dbh} ||= $_[0]->dbh_new } + +sub load_from_row { + my ($smsg) = @_; + bless $smsg, 'PublicInbox::SearchMsg'; + if (defined(my $data = delete $smsg->{ddd})) { + $data = uncompress($data); + utf8::decode($data); + $smsg->load_from_data($data); + } + $smsg +} + +sub do_get { + my ($self, $sql, $opts, @args) = @_; + my $dbh = $self->connect; + my $lim = (($opts->{limit} || 0) + 0) || 1000; + $sql .= "LIMIT $lim"; + my $msgs = $dbh->selectall_arrayref($sql, { Slice => {} }, @args); + load_from_row($_) for @$msgs; + $msgs +} + +sub query_xover { + my ($self, $beg, $end) = @_; + do_get($self, <<'', {}, $beg, $end); +SELECT num,ts,ds,ddd FROM over WHERE num >= ? AND num <= ? +ORDER BY num ASC + +} + +sub query_ts { + my ($self, $ts, $prev) = @_; + do_get($self, <<'', {}, $ts, $prev); +SELECT num,ddd FROM over WHERE ts >= ? AND num > ? +ORDER BY num ASC + +} + +sub nothing () { wantarray ? (0, []) : [] }; + +sub get_thread { + my ($self, $mid, $prev) = @_; + my $dbh = $self->connect; + + my $id = $dbh->selectrow_array(<<'', undef, $mid); +SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + defined $id or return nothing; + + my $num = $dbh->selectrow_array(<<'', undef, $id); +SELECT num FROM id2num WHERE id = ? AND num > 0 +ORDER BY num ASC LIMIT 1 + + defined $num or return nothing; + + my ($tid, $sid) = $dbh->selectrow_array(<<'', undef, $num); +SELECT tid,sid FROM over WHERE num = ? LIMIT 1 + + defined $tid or return nothing; # $sid may be undef + my $sort_col = 'ds'; + $num = 0; + if ($prev) { + $num = $prev->{num} || 0; + $sort_col = 'num'; + } + my $cond = '(tid = ? OR sid = ?) AND num > ?'; + my $msgs = do_get($self, <<"", {}, $tid, $sid, $num); +SELECT num,ts,ds,ddd FROM over WHERE $cond ORDER BY $sort_col ASC + + return $msgs unless wantarray; + + my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num); +SELECT COUNT(num) FROM over WHERE $cond + + ($nr, $msgs); +} + +sub recent { + my ($self, $opts, $after, $before) = @_; + my ($s, @v); + if (defined($before)) { + if (defined($after)) { + $s = 'num > 0 AND ts >= ? AND ts <= ? ORDER BY ts DESC'; + @v = ($after, $before); + } else { + $s = 'num > 0 AND ts <= ? ORDER BY ts DESC'; + @v = ($before); + } + } else { + if (defined($after)) { + $s = 'num > 0 AND ts >= ? ORDER BY ts ASC'; + @v = ($after); + } else { + $s = 'num > 0 ORDER BY ts DESC'; + } + } + my $msgs = do_get($self, <<"", $opts, @v); +SELECT ts,ds,ddd FROM over WHERE $s + + return $msgs unless wantarray; + + my $nr = $self->{dbh}->selectrow_array(<<''); +SELECT COUNT(num) FROM over WHERE num > 0 + + ($nr, $msgs); +} + +sub get_art { + my ($self, $num) = @_; + my $dbh = $self->connect; + my $smsg = $dbh->selectrow_hashref(<<'', undef, $num); +SELECT num,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 + + return load_from_row($smsg) if $smsg; + undef; +} + +sub next_by_mid { + my ($self, $mid, $id, $prev) = @_; + my $dbh = $self->connect; + + unless (defined $$id) { + my $sth = $dbh->prepare_cached(<<'', undef, 1); + SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + $$id = $sth->fetchrow_array; + defined $$id or return; + } + my $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT num FROM id2num WHERE id = ? AND num > ? +ORDER BY num ASC LIMIT 1 + + $$prev ||= 0; + $sth->execute($$id, $$prev); + my $num = $sth->fetchrow_array or return; + $$prev = $num; + + $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT num,ts,ds,ddd FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + my $smsg = $sth->fetchrow_hashref or return; + load_from_row($smsg); +} + +1; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm new file mode 100644 index 00000000..62fec0da --- /dev/null +++ b/lib/PublicInbox/OverIdx.pm @@ -0,0 +1,418 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI +# Unlike Msgmap, this is an _UNSTABLE_ cache which can be +# tweaked/updated over time and rebuilt. +# +# Ghost messages (messages which are only referenced in References/In-Reply-To) +# are denoted by a negative NNTP article number. +package PublicInbox::OverIdx; +use strict; +use warnings; +use base qw(PublicInbox::Over); +use IO::Handle; +use DBI qw(:sql_types); # SQL_BLOB +use PublicInbox::MID qw/id_compress mids references/; +use PublicInbox::SearchMsg; +use Compress::Zlib qw(compress); +use PublicInbox::Search; + +sub dbh_new { + my ($self) = @_; + my $dbh = $self->SUPER::dbh_new; + $dbh->do('PRAGMA journal_mode = TRUNCATE'); + $dbh->do('PRAGMA cache_size = 80000'); + create_tables($dbh); + $dbh; +} + +sub get_counter ($$) { + my ($dbh, $key) = @_; + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT val FROM counter WHERE key = ? LIMIT 1 + + $sth->execute($key); + $sth->fetchrow_array; +} + +sub adj_counter ($$$) { + my ($self, $key, $op) = @_; + my $dbh = $self->{dbh}; + my $sth = $dbh->prepare_cached(<<""); +UPDATE counter SET val = val $op 1 WHERE key = ? + + $sth->execute($key); + + get_counter($dbh, $key); +} + +sub next_tid { adj_counter($_[0], 'thread', '+') } +sub next_ghost_num { adj_counter($_[0], 'ghost', '-') } + +sub id_for ($$$$$) { + my ($self, $tbl, $id_col, $val_col, $val) = @_; + my $dbh = $self->{dbh}; + my $in = $dbh->prepare_cached(<<"")->execute($val); +INSERT OR IGNORE INTO $tbl ($val_col) VALUES (?) + + if ($in == 0) { + my $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 + + $sth->execute($val); + $sth->fetchrow_array; + } else { + $dbh->last_insert_id(undef, undef, $tbl, $id_col); + } +} + +sub sid { + my ($self, $path) = @_; + return unless defined $path && $path ne ''; + id_for($self, 'subject', 'sid', 'path' => $path); +} + +sub mid2id { + my ($self, $mid) = @_; + id_for($self, 'msgid', 'id', 'mid' => $mid); +} + +sub delete_by_num { + my ($self, $num) = @_; + my $dbh = $self->{dbh}; + foreach (qw(over id2num)) { + $dbh->prepare_cached(<<"")->execute($num); +DELETE FROM $_ WHERE num = ? + + } +} + +# this includes ghosts +sub each_by_mid { + my ($self, $mid, $cols, $cb) = @_; + my $dbh = $self->{dbh}; + +=over + I originally wanted to stuff everything into a single query: + + SELECT over.* FROM over + LEFT JOIN id2num ON over.num = id2num.num + LEFT JOIN msgid ON msgid.id = id2num.id + WHERE msgid.mid = ? AND over.num >= ? + ORDER BY over.num ASC + LIMIT 1000 + + But it's faster broken out (and we're always in a + transaction for subroutines in this file) +=cut + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + my $id = $sth->fetchrow_array; + defined $id or return; + + push(@$cols, 'num'); + $cols = join(',', map { $_ } @$cols); + my $lim = 10; + my $prev = get_counter($dbh, 'ghost'); + while (1) { + $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT num FROM id2num WHERE id = ? AND num >= ? +ORDER BY num ASC +LIMIT $lim + + $sth->execute($id, $prev); + my $nums = $sth->fetchall_arrayref; + my $nr = scalar(@$nums) or return; + $prev = $nums->[-1]->[0]; + + $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT $cols FROM over WHERE over.num = ? LIMIT 1 + + foreach (@$nums) { + $sth->execute($_->[0]); + my $smsg = $sth->fetchrow_hashref; + $cb->(PublicInbox::Over::load_from_row($smsg)) or + return; + } + return if $nr != $lim; + } +} + +# this will create a ghost as necessary +sub resolve_mid_to_tid { + my ($self, $mid) = @_; + my $tid; + each_by_mid($self, $mid, ['tid'], sub { + my ($smsg) = @_; + my $cur_tid = $smsg->{tid}; + if (defined $tid) { + merge_threads($self, $tid, $cur_tid); + } else { + $tid = $cur_tid; + } + 1; + }); + defined $tid ? $tid : create_ghost($self, $mid); +} + +sub create_ghost { + my ($self, $mid) = @_; + my $id = $self->mid2id($mid); + my $num = $self->next_ghost_num; + $num < 0 or die "ghost num is non-negative: $num\n"; + my $tid = $self->next_tid; + my $dbh = $self->{dbh}; + $dbh->prepare_cached(<<'')->execute($num, $tid); +INSERT INTO over (num, tid) VALUES (?,?) + + $dbh->prepare_cached(<<'')->execute($id, $num); +INSERT INTO id2num (id, num) VALUES (?,?) + + $tid; +} + +sub merge_threads { + my ($self, $winner_tid, $loser_tid) = @_; + return if $winner_tid == $loser_tid; + my $dbh = $self->{dbh}; + $dbh->prepare_cached(<<'')->execute($winner_tid, $loser_tid); +UPDATE over SET tid = ? WHERE tid = ? + +} + +sub link_refs { + my ($self, $refs, $old_tid) = @_; + my $tid; + + if (@$refs) { + # first ref *should* be the thread root, + # but we can never trust clients to do the right thing + my $ref = $refs->[0]; + $tid = resolve_mid_to_tid($self, $ref); + merge_threads($self, $tid, $old_tid) if defined $old_tid; + + # the rest of the refs should point to this tid: + foreach my $i (1..$#$refs) { + $ref = $refs->[$i]; + my $ptid = resolve_mid_to_tid($self, $ref); + merge_threads($self, $tid, $ptid); + } + } else { + $tid = defined $old_tid ? $old_tid : $self->next_tid; + } + $tid; +} + +sub parse_references ($$$) { + my ($smsg, $mid0, $mids) = @_; + my $mime = $smsg->{mime}; + my $hdr = $mime->header_obj; + my $refs = references($hdr); + push(@$refs, @$mids) if scalar(@$mids) > 1; + return $refs if scalar(@$refs) == 0; + + # prevent circular references here: + my %seen = ( $mid0 => 1 ); + my @keep; + foreach my $ref (@$refs) { + if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { + warn "References: <$ref> too long, ignoring\n"; + next; + } + next if $seen{$ref}++; + push @keep, $ref; + } + $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; + \@keep; +} + +sub add_overview { + my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; + my $lines = $mime->body_raw =~ tr!\n!\n!; + my $smsg = bless { + mime => $mime, + mid => $mid0, + bytes => $bytes, + lines => $lines, + blob => $oid, + }, 'PublicInbox::SearchMsg'; + my $mids = mids($mime->header_obj); + my $refs = parse_references($smsg, $mid0, $mids); + my $subj = $smsg->subject; + my $xpath; + if ($subj ne '') { + $xpath = PublicInbox::Search::subject_path($subj); + $xpath = id_compress($xpath); + } + my $dd = $smsg->to_doc_data($oid, $mid0); + utf8::encode($dd); + $dd = compress($dd); + my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ]; + add_over($self, $values); +} + +sub add_over { + my ($self, $values) = @_; + my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; + my $old_tid; + my $vivified = 0; + + $self->begin_lazy; + $self->delete_by_num($num); + foreach my $mid (@$mids) { + my $v = 0; + each_by_mid($self, $mid, ['tid'], sub { + my ($cur) = @_; + my $cur_tid = $cur->{tid}; + my $n = $cur->{num}; + die "num must not be zero for $mid" if !$n; + $old_tid = $cur_tid unless defined $old_tid; + if ($n > 0) { # regular mail + merge_threads($self, $old_tid, $cur_tid); + } elsif ($n < 0) { # ghost + link_refs($self, $refs, $old_tid); + $self->delete_by_num($n); + $v++; + } + 1; + }); + $v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n"; + $vivified += $v; + } + my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); + my $sid = $self->sid($xpath); + my $dbh = $self->{dbh}; + my $sth = $dbh->prepare_cached(<<''); +INSERT INTO over (num, tid, sid, ts, ds, ddd) +VALUES (?,?,?,?,?,?) + + my $n = 0; + my @v = ($num, $tid, $sid, $ts, $ds); + foreach (@v) { $sth->bind_param(++$n, $_) } + $sth->bind_param(++$n, $ddd, SQL_BLOB); + $sth->execute; + $sth = $dbh->prepare_cached(<<''); +INSERT INTO id2num (id, num) VALUES (?,?) + + foreach my $mid (@$mids) { + my $id = $self->mid2id($mid); + $sth->execute($id, $num); + } +} + +sub delete_articles { + my ($self, $nums) = @_; + my $dbh = $self->connect; + $self->delete_by_num($_) foreach @$nums; +} + +sub remove_oid { + my ($self, $oid, $mid) = @_; + $self->begin_lazy; + each_by_mid($self, $mid, ['ddd'], sub { + my ($smsg) = @_; + $self->delete_by_num($smsg->{num}) if $smsg->{blob} eq $oid; + 1; + }); +} + +sub create_tables { + my ($dbh) = @_; + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS over ( + num INTEGER NOT NULL, + tid INTEGER NOT NULL, + sid INTEGER, + ts INTEGER, + ds INTEGER, + ddd VARBINARY, /* doc-data-deflated */ + UNIQUE (num) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); + $dbh->do('CREATE INDEX IF NOT EXISTS idx_sid ON over (sid)'); + $dbh->do('CREATE INDEX IF NOT EXISTS idx_ts ON over (ts)'); + $dbh->do('CREATE INDEX IF NOT EXISTS idx_ds ON over (ds)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS counter ( + key VARCHAR(8) PRIMARY KEY NOT NULL, + val INTEGER DEFAULT 0, + UNIQUE (key) +) + + $dbh->do("INSERT OR IGNORE INTO counter (key) VALUES ('thread')"); + $dbh->do("INSERT OR IGNORE INTO counter (key) VALUES ('ghost')"); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS subject ( + sid INTEGER PRIMARY KEY AUTOINCREMENT, + path VARCHAR(40) NOT NULL, + UNIQUE (path) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS id2num ( + id INTEGER NOT NULL, + num INTEGER NOT NULL, + UNIQUE (id, num) +) + + # performance critical: + $dbh->do('CREATE INDEX IF NOT EXISTS idx_inum ON id2num (num)'); + $dbh->do('CREATE INDEX IF NOT EXISTS idx_id ON id2num (id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS msgid ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + mid VARCHAR(244) NOT NULL, + UNIQUE (mid) +) + +} + +sub commit_lazy { + my ($self) = @_; + delete $self->{txn} or return; + $self->{dbh}->commit; +} + +sub begin_lazy { + my ($self) = @_; + return if $self->{txn}; + my $dbh = $self->connect or return; + $dbh->begin_work; + # $dbh->{Profile} = 2; + $self->{txn} = 1; +} + +sub rollback_lazy { + my ($self) = @_; + delete $self->{txn} or return; + $self->{dbh}->rollback; +} + +sub disconnect { + my ($self) = @_; + die "in transaction" if $self->{txn}; + $self->{dbh} = undef; +} + +sub create { + my ($self) = @_; + unless (-r $self->{filename}) { + require File::Path; + require File::Basename; + File::Path::mkpath(File::Basename::dirname($self->{filename})); + } + # create the DB: + PublicInbox::Over::connect($self); + $self->disconnect; +} + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index df02e0b5..7175ddc5 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -8,16 +8,15 @@ use strict; use warnings; # values for searching -use constant TS => 0; # timestamp -use constant NUM => 1; # NNTP article number -use constant BYTES => 2; # :bytes as defined in RFC 3977 -use constant LINES => 3; # :lines as defined in RFC 3977 -use constant YYYYMMDD => 4; # for searching in the WWW UI +use constant TS => 0; # Received: header in Unix time +use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI +use constant DT => 2; # Date: YYYYMMDDHHMMSS use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; use PublicInbox::MIME; -use PublicInbox::MID qw/mid_clean id_compress/; +use PublicInbox::MID qw/id_compress/; +use PublicInbox::Over; # This is English-only, everything else is non-standard and may be confused as # a prefix common in patch emails @@ -42,39 +41,34 @@ use constant { # 13 - fix threading for empty References/In-Reply-To # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0) # 14 - fix ghost root vivification - SCHEMA_VERSION => 14, + SCHEMA_VERSION => 15, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; -# setup prefixes -my %bool_pfx_internal = ( - type => 'T', # "mail" or "ghost" - thread => 'G', # newsGroup (or similar entity - e.g. a web forum name) -); - my %bool_pfx_external = ( - mid => 'Q', # uniQue id (Message-ID) + mid => 'Q', # Message-ID (full/exact), this is mostly uniQue ); +my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST'; my %prob_prefix = ( # for mairix compatibility s => 'S', - m => 'XMID', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial f => 'A', t => 'XTO', tc => 'XTO XCC', c => 'XCC', tcf => 'XTO XCC A', a => 'XTO XCC A', - b => 'XNQ XQUOT', - bs => 'XNQ XQUOT S', + b => $non_quoted_body . ' XQUOT', + bs => $non_quoted_body . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => 'XNQ', + nq => $non_quoted_body, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', @@ -85,7 +79,7 @@ my %prob_prefix = ( dfblob => 'XDFPRE XDFPOST', # default: - '' => 'XMID S A XNQ XQUOT XFN', + '' => 'XM S A XQUOT XFN ' . $non_quoted_body, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian @@ -95,6 +89,9 @@ our @HELP = ( date range as YYYYMMDD e.g. d:19931002..20101002 Open-ended ranges such as d:19931002.. and d:..20101002 are also supported +EOF + 'dt:' => < 'match within message body, including text attachments', 'nq:' => 'match non-quoted text within message body', @@ -117,72 +114,102 @@ EOF ); chomp @HELP; -my $mail_query = Search::Xapian::Query->new('T' . 'mail'); - sub xdir { - my (undef, $git_dir) = @_; - "$git_dir/public-inbox/xapian" . SCHEMA_VERSION; + my ($self) = @_; + if ($self->{version} == 1) { + "$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION; + } else { + my $dir = "$self->{mainrepo}/xap" . SCHEMA_VERSION; + my $part = $self->{partition}; + defined $part or die "partition not given"; + $dir .= "/$part"; + } } sub new { - my ($class, $git_dir, $altid) = @_; - my $dir = $class->xdir($git_dir); - my $db = Search::Xapian::Database->new($dir); - bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class; + my ($class, $mainrepo, $altid) = @_; + my $version = 1; + my $ibx = $mainrepo; + if (ref $ibx) { + $version = $ibx->{version} || 1; + $mainrepo = $ibx->{mainrepo}; + } + my $self = bless { + mainrepo => $mainrepo, + altid => $altid, + version => $version, + }, $class; + my $dir; + if ($version >= 2) { + $dir = "$self->{mainrepo}/xap" . SCHEMA_VERSION; + my $xdb; + my $parts = 0; + foreach my $part (<$dir/*>) { + -d $part && $part =~ m!/\d+\z! or next; + $parts++; + my $sub = Search::Xapian::Database->new($part); + if ($xdb) { + $xdb->add_database($sub); + } else { + $xdb = $sub; + } + } + $self->{xdb} = $xdb; + } else { + $dir = $self->xdir; + $self->{xdb} = Search::Xapian::Database->new($dir); + } + $self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3"); + $self; } -sub reopen { $_[0]->{xdb}->reopen } +sub reopen { + my ($self) = @_; + $self->{xdb}->reopen; + $self; # make chaining easier +} # read-only sub query { my ($self, $query_string, $opts) = @_; - my $query; - $opts ||= {}; - unless ($query_string eq '') { - $query = $self->qp->parse_query($query_string, QP_FLAGS); + if ($query_string eq '' && !$opts->{mset}) { + $self->{over_ro}->recent($opts); + } else { + my $query = $self->qp->parse_query($query_string, QP_FLAGS); $opts->{relevance} = 1 unless exists $opts->{relevance}; + _do_enquire($self, $query, $opts); } - - _do_enquire($self, $query, $opts); } sub get_thread { - my ($self, $mid, $opts) = @_; - my $smsg = eval { $self->lookup_message($mid) }; - - return { total => 0, msgs => [] } unless $smsg; - my $qtid = Search::Xapian::Query->new('G' . $smsg->thread_id); - my $path = $smsg->path; - if (defined $path && $path ne '') { - my $path = id_compress($smsg->path); - my $qsub = Search::Xapian::Query->new('XPATH' . $path); - $qtid = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); - } - $opts ||= {}; - $opts->{limit} ||= 1000; - - # always sort threads by timestamp, this makes life easier - # for the threading algorithm (in SearchThread.pm) - $opts->{asc} = 1; - - _do_enquire($self, $qtid, $opts); + my ($self, $mid, $prev) = @_; + $self->{over_ro}->get_thread($mid, $prev); } sub retry_reopen { my ($self, $cb) = @_; - my $ret; - for (1..10) { - eval { $ret = $cb->() }; - return $ret unless $@; + for my $i (1..10) { + if (wantarray) { + my @ret; + eval { @ret = $cb->() }; + return @ret unless $@; + } else { + my $ret; + eval { $ret = $cb->() }; + return $ret unless $@; + } # Exception: The revision being read has been discarded - # you should call Xapian::Database::reopen() if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { + warn "reopen try #$i on $@\n"; reopen($self); } else { + warn "ref: ", ref($@), "\n"; die; } } + die "Too many Xapian database modifications in progress\n"; } sub _do_enquire { @@ -192,19 +219,16 @@ sub _do_enquire { sub _enquire_once { my ($self, $query, $opts) = @_; - my $enquire = $self->enquire; - if (defined $query) { - $query = Search::Xapian::Query->new(OP_AND,$query,$mail_query); - } else { - $query = $mail_query; - } + my $enquire = enquire($self); $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; - if ($opts->{relevance}) { + if (($opts->{mset} || 0) == 2) { + $enquire->set_docid_order(Search::Xapian::ENQ_ASCENDING()); + $enquire->set_weighting_scheme(Search::Xapian::BoolWeight->new); + delete $self->{enquire}; + } elsif ($opts->{relevance}) { $enquire->set_sort_by_relevance_then_value(TS, $desc); - } elsif ($opts->{num}) { - $enquire->set_sort_by_value(NUM, 0); } else { $enquire->set_sort_by_value_then_relevance(TS, $desc); } @@ -215,8 +239,9 @@ sub _enquire_once { my @msgs = map { PublicInbox::SearchMsg->load_doc($_->get_document); } $mset->items; + return \@msgs unless wantarray; - { total => $mset->get_matches_estimated, msgs => \@msgs } + ($mset->get_matches_estimated, \@msgs) } # read-write @@ -237,6 +262,8 @@ sub qp { $qp->set_stemming_strategy(STEM_SOME); $qp->add_valuerangeprocessor( Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); + $qp->add_valuerangeprocessor( + Search::Xapian::NumberValueRangeProcessor->new(DT, 'dt:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $prefix); @@ -266,78 +293,25 @@ EOF $self->{query_parser} = $qp; } -sub num_range_processor { - $_[0]->{nrp} ||= Search::Xapian::NumberValueRangeProcessor->new(NUM); -} - # only used for NNTP server sub query_xover { my ($self, $beg, $end, $offset) = @_; - my $qp = Search::Xapian::QueryParser->new; - $qp->set_database($self->{xdb}); - $qp->add_valuerangeprocessor($self->num_range_processor); - my $query = $qp->parse_query("$beg..$end", QP_FLAGS); - - _do_enquire($self, $query, {num => 1, limit => 200, offset => $offset}); + $self->{over_ro}->query_xover($beg, $end, $offset); } sub query_ts { - my ($self, $ts, $opts) = @_; - my $qp = $self->{qp_ts} ||= eval { - my $q = Search::Xapian::QueryParser->new; - $q->set_database($self->{xdb}); - $q->add_valuerangeprocessor( - Search::Xapian::NumberValueRangeProcessor->new(TS)); - $q - }; - my $query = $qp->parse_query($ts, QP_FLAGS); - _do_enquire($self, $query, $opts); + my ($self, $ts, $prev) = @_; + $self->{over_ro}->query_ts($ts, $prev); } -sub lookup_message { - my ($self, $mid) = @_; - $mid = mid_clean($mid); - - my $doc_id = $self->find_unique_doc_id('Q' . $mid); - my $smsg; - if (defined $doc_id) { - # raises on error: - my $doc = $self->{xdb}->get_document($doc_id); - $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $smsg->{doc_id} = $doc_id; - } - $smsg; +sub lookup_article { + my ($self, $num) = @_; + $self->{over_ro}->get_art($num); } -sub lookup_mail { # no ghosts! - my ($self, $mid) = @_; - retry_reopen($self, sub { - my $smsg = lookup_message($self, $mid) or return; - $smsg->load_expand; - }); -} - -sub find_unique_doc_id { - my ($self, $termval) = @_; - - my ($begin, $end) = $self->find_doc_ids($termval); - - return undef if $begin->equal($end); # not found - - my $rv = $begin->get_docid; - - # sanity check - $begin->inc; - $begin->equal($end) or die "Term '$termval' is not unique\n"; - $rv; -} - -# returns begin and end PostingIterator -sub find_doc_ids { - my ($self, $termval) = @_; - my $db = $self->{xdb}; - - ($db->postlist_begin($termval), $db->postlist_end($termval)); +sub next_by_mid { + my ($self, $mid, $id, $prev) = @_; + $self->{over_ro}->next_by_mid($mid, $id, $prev); } # normalize subjects so they are suitable as pathnames for URLs diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5559b39d..7026fc4c 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -9,24 +9,19 @@ package PublicInbox::SearchIdx; use strict; use warnings; -use Fcntl qw(:flock :DEFAULT); +use base qw(PublicInbox::Search PublicInbox::Lock); use PublicInbox::MIME; -use Email::MIME::ContentType; -$Email::MIME::ContentType::STRICT_PARAMS = 0; -use base qw(PublicInbox::Search); -use PublicInbox::MID qw/mid_clean id_compress mid_mime/; +use PublicInbox::InboxWritable; +use PublicInbox::MID qw/mid_clean id_compress mid_mime mids/; use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); +use PublicInbox::OverIdx; +use PublicInbox::Spawn qw(spawn); require PublicInbox::Git; +use Compress::Zlib qw(compress); use constant { - MAX_MID_SIZE => 244, # max term size - 1 in Xapian - PERM_UMASK => 0, - OLD_PERM_GROUP => 1, - OLD_PERM_EVERYBODY => 2, - PERM_GROUP => 0660, - PERM_EVERYBODY => 0664, BATCH_BYTES => 1_000_000, DEBUG => !!$ENV{DEBUG}, }; @@ -51,26 +46,46 @@ sub git_unquote ($) { } sub new { - my ($class, $inbox, $creat) = @_; - my $git_dir = $inbox; - my $altid; - if (ref $inbox) { - $git_dir = $inbox->{mainrepo}; - $altid = $inbox->{altid}; + my ($class, $ibx, $creat, $part) = @_; + my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config + my $git_dir = $mainrepo; + my ($altid, $git); + my $version = 1; + if (ref $ibx) { + $mainrepo = $ibx->{mainrepo}; + $altid = $ibx->{altid}; + $version = $ibx->{version} || 1; if ($altid) { require PublicInbox::AltId; $altid = [ map { - PublicInbox::AltId->new($inbox, $_); + PublicInbox::AltId->new($ibx, $_); } @$altid ]; } + } else { # v1 + $ibx = { mainrepo => $git_dir, version => 1 }; } + $ibx = PublicInbox::InboxWritable->new($ibx); require Search::Xapian::WritableDatabase; - my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; - my $perm = $self->_git_config_perm; - my $umask = _umask_for($perm); - $self->{umask} = $umask; - $self->{lock_path} = "$git_dir/ssoma.lock"; - $self->{git} = PublicInbox::Git->new($git_dir); + my $self = bless { + mainrepo => $mainrepo, + -inbox => $ibx, + git => $ibx->git, + -altid => $altid, + version => $version, + }, $class; + $ibx->umask_prepare; + if ($version == 1) { + $self->{lock_path} = "$mainrepo/ssoma.lock"; + my $dir = $self->xdir; + $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); + } elsif ($version == 2) { + defined $part or die "partition is required for v2\n"; + # partition is a number + $self->{partition} = $part; + $self->{lock_path} = undef; + } else { + die "unsupported inbox version=$version\n"; + } $self->{creat} = ($creat || 0) == 1; $self; } @@ -79,66 +94,30 @@ sub _xdb_release { my ($self) = @_; my $xdb = delete $self->{xdb} or croak 'not acquired'; $xdb->close; - _lock_release($self) if $self->{creat}; + $self->lock_release if $self->{creat}; undef; } sub _xdb_acquire { my ($self) = @_; croak 'already acquired' if $self->{xdb}; - my $dir = PublicInbox::Search->xdir($self->{git_dir}); + my $dir = $self->xdir; my $flag = Search::Xapian::DB_OPEN; if ($self->{creat}) { require File::Path; - _lock_acquire($self); + $self->lock_acquire; File::Path::mkpath($dir); $flag = Search::Xapian::DB_CREATE_OR_OPEN; } $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); } -# we only acquire the flock if creating or reindexing; -# PublicInbox::Import already has the lock on its own. -sub _lock_acquire { - my ($self) = @_; - croak 'already locked' if $self->{lockfh}; - sysopen(my $lockfh, $self->{lock_path}, O_WRONLY|O_CREAT) or - die "failed to open lock $self->{lock_path}: $!\n"; - flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; - $self->{lockfh} = $lockfh; -} - -sub _lock_release { - my ($self) = @_; - my $lockfh = delete $self->{lockfh} or croak 'not locked'; - flock($lockfh, LOCK_UN) or die "unlock failed: $!\n"; - close $lockfh or die "close failed: $!\n"; -} - sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } -sub add_values ($$$) { - my ($smsg, $bytes, $num) = @_; - - my $ts = $smsg->ts; - my $doc = $smsg->{doc}; - add_val($doc, &PublicInbox::Search::TS, $ts); - - defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $smsg->{mime}->body_raw =~ tr!\n!\n!); - - my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); - add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); -} - sub index_users ($$) { my ($tg, $smsg) = @_; @@ -154,14 +133,19 @@ sub index_users ($$) { $tg->increase_termpos; } -sub index_text_inc ($$$) { - my ($tg, $text, $pfx) = @_; +sub index_diff_inc ($$$$) { + my ($tg, $text, $pfx, $xnq) = @_; + if (@$xnq) { + $tg->index_text(join("\n", @$xnq), 1, 'XNQ'); + $tg->increase_termpos; + @$xnq = (); + } $tg->index_text($text, 1, $pfx); $tg->increase_termpos; } sub index_old_diff_fn { - my ($tg, $seen, $fa, $fb) = @_; + my ($tg, $seen, $fa, $fb, $xnq) = @_; # no renames or space support for traditional diffs, # find the number of leading common paths to strip: @@ -171,7 +155,9 @@ sub index_old_diff_fn { $fa = join('/', @fa); $fb = join('/', @fb); if ($fa eq $fb) { - index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++; + unless ($seen->{$fa}++) { + index_diff_inc($tg, $fa, 'XDFN', $xnq); + } return 1; } shift @fa; @@ -184,40 +170,46 @@ sub index_diff ($$$) { my ($tg, $lines, $doc) = @_; my %seen; my $in_diff; + my @xnq; + my $xnq = \@xnq; foreach (@$lines) { if ($in_diff && s/^ //) { # diff context - index_text_inc($tg, $_, 'XDFCTX'); + index_diff_inc($tg, $_, 'XDFCTX', $xnq); } elsif (/^-- $/) { # email signature begins $in_diff = undef; } elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) { my ($fa, $fb) = ($1, $2); my $fn = (split('/', git_unquote($fa), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $fn = (split('/', git_unquote($fb), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; # traditional diff: } elsif (m/^diff -(.+) (\S+) (\S+)$/) { my ($opt, $fa, $fb) = ($1, $2, $3); + push @xnq, $_; # only support unified: next unless $opt =~ /[uU]/; - $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb); + $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb, + $xnq); } elsif (m!^--- ("?a/.+)!) { my $fn = (split('/', git_unquote($1), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (m!^\+\+\+ ("?b/.+)!) { my $fn = (split('/', git_unquote($1), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (/^--- (\S+)/) { $in_diff = $1; + push @xnq, $_; } elsif (defined $in_diff && /^\+\+\+ (\S+)/) { - $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1); + $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1, + $xnq); } elsif ($in_diff && s/^\+//) { # diff added - index_text_inc($tg, $_, 'XDFB'); + index_diff_inc($tg, $_, 'XDFB', $xnq); } elsif ($in_diff && s/^-//) { # diff removed - index_text_inc($tg, $_, 'XDFA'); + index_diff_inc($tg, $_, 'XDFA', $xnq); } elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) { my ($ba, $bb) = ($1, $2); index_git_blob_id($doc, 'XDFPRE', $ba); @@ -227,64 +219,67 @@ sub index_diff ($$$) { # traditional diff w/o -p } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) { # hunk header context - index_text_inc($tg, $1, 'XDFHH'); + index_diff_inc($tg, $1, 'XDFHH', $xnq); # ignore the following lines: - } elsif (/^(?:dis)similarity index/) { - } elsif (/^(?:old|new) mode/) { - } elsif (/^(?:deleted|new) file mode/) { - } elsif (/^(?:copy|rename) (?:from|to) /) { - } elsif (/^(?:dis)?similarity index /) { - } elsif (/^\\ No newline at end of file/) { - } elsif (/^Binary files .* differ/) { + } elsif (/^(?:dis)similarity index/ || + /^(?:old|new) mode/ || + /^(?:deleted|new) file mode/ || + /^(?:copy|rename) (?:from|to) / || + /^(?:dis)?similarity index / || + /^\\ No newline at end of file/ || + /^Binary files .* differ/) { + push @xnq, $_; } elsif ($_ eq '') { $in_diff = undef; } else { + push @xnq, $_; warn "non-diff line: $_\n" if DEBUG && $_ ne ''; $in_diff = undef; } } + + $tg->index_text(join("\n", @xnq), 1, 'XNQ'); + $tg->increase_termpos; } sub index_body ($$$) { my ($tg, $lines, $doc) = @_; my $txt = join("\n", @$lines); - $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT'); - $tg->increase_termpos; - # does it look like a diff? - if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) { - $txt = undef; - index_diff($tg, $lines, $doc); + if ($doc) { + # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + $txt = undef; + index_diff($tg, $lines, $doc); + } else { + $tg->index_text($txt, 1, 'XNQ'); + } + } else { + $tg->index_text($txt, 0, 'XQUOT'); } + $tg->increase_termpos; @$lines = (); } sub add_message { - my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object - my $db = $self->{xdb}; - - my ($doc_id, $old_tid); - my $mid = mid_clean(mid_mime($mime)); - + # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; + my $doc_id; + my $mids = mids($mime->header_obj); + $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility + unless (defined $num) { # v1 + $self->_msgmap_init; + $num = index_mm($self, $mime); + } eval { - die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; - my $smsg = $self->lookup_message($mid); - if ($smsg) { - # convert a ghost to a regular message - # it will also clobber any existing regular message - $doc_id = $smsg->{doc_id}; - $old_tid = $smsg->thread_id; - } - $smsg = PublicInbox::SearchMsg->new($mime); + my $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; - $doc->add_term('Q' . $mid); - my $subj = $smsg->subject; - if ($subj ne '') { - my $path = $self->subject_path($subj); - $doc->add_term('XPATH' . id_compress($path)); - } - - add_values($smsg, $bytes, $num); + add_val($doc, PublicInbox::Search::TS(), $smsg->ts); + my @ds = gmtime($smsg->ds); + my $yyyymmdd = strftime('%Y%m%d', @ds); + add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); + my $dt = strftime('%Y%m%d%H%M%S', @ds); + add_val($doc, PublicInbox::Search::DT(), $dt); my $tg = $self->term_generator; @@ -300,6 +295,7 @@ sub add_message { my $fn = $part->filename; if (defined $fn && $fn ne '') { $tg->index_text($fn, 1, 'XFN'); + $tg->increase_termpos; } return if $ct =~ m!\btext/x?html\b!i; @@ -333,52 +329,110 @@ sub add_message { index_body($tg, \@orig, $doc) if @orig; }); - link_message($self, $smsg, $old_tid); - $tg->index_text($mid, 1, 'XMID'); - $doc->set_data($smsg->to_doc_data($blob)); - + foreach my $mid (@$mids) { + $tg->index_text($mid, 1, 'XM'); + $tg->increase_termpos; + } + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids); + my $data = $smsg->to_doc_data($oid, $mid0); + $doc->set_data($data); if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_term($alt->{xprefix} . $id); + my $pfx = $alt->{xprefix}; + foreach my $mid (@$mids) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_boolean_term($pfx . $id); + } } } - if (defined $doc_id) { - $db->replace_document($doc_id, $doc); - } else { - $doc_id = $db->add_document($doc); + + if (my $over = $self->{over}) { + $over->add_overview($mime, $bytes, $num, $oid, $mid0); } + $doc->add_boolean_term('Q' . $_) foreach @$mids; + $self->{xdb}->replace_document($doc_id = $num, $doc); }; if ($@) { - warn "failed to index message <$mid>: $@\n"; + warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } $doc_id; } -# returns deleted doc_id on success, undef on missing +# returns begin and end PostingIterator +sub find_doc_ids { + my ($self, $termval) = @_; + my $db = $self->{xdb}; + + ($db->postlist_begin($termval), $db->postlist_end($termval)); +} + +sub batch_do { + my ($self, $termval, $cb) = @_; + my $batch_size = 1000; # don't let @ids grow too large to avoid OOM + while (1) { + my ($head, $tail) = $self->find_doc_ids($termval); + return if $head == $tail; + my @ids; + for (; $head != $tail && @ids < $batch_size; $head->inc) { + push @ids, $head->get_docid; + } + $cb->(\@ids); + } +} + sub remove_message { my ($self, $mid) = @_; my $db = $self->{xdb}; - my $doc_id; + my $called; $mid = mid_clean($mid); + my $over = $self->{over}; eval { - $doc_id = $self->find_unique_doc_id('Q' . $mid); - if (defined $doc_id) { - $db->delete_document($doc_id); - } else { - warn "cannot remove non-existent <$mid>\n"; - } + batch_do($self, 'Q' . $mid, sub { + my ($ids) = @_; + $db->delete_document($_) for @$ids; + $over->delete_articles($ids) if $over; + $called = 1; + }); }; - if ($@) { warn "failed to remove message <$mid>: $@\n"; - return undef; + } elsif (!$called) { + warn "cannot remove non-existent <$mid>\n"; } - $doc_id; +} + +# MID is a hint in V2 +sub remove_by_oid { + my ($self, $oid, $mid) = @_; + my $db = $self->{xdb}; + + $self->{over}->remove_oid($oid, $mid) if $self->{over}; + + # XXX careful, we cannot use batch_do here since we conditionally + # delete documents based on other factors, so we cannot call + # find_doc_ids twice. + my ($head, $tail) = $self->find_doc_ids('Q' . $mid); + return if $head == $tail; + + # there is only ONE element in @delete unless we + # have bugs in our v2writable deduplication check + my @delete; + for (; $head != $tail; $head->inc) { + my $docid = $head->get_docid; + my $doc = $db->get_document($docid); + my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $smsg->load_expand; + if ($smsg->{blob} eq $oid) { + push(@delete, $docid); + } + } + $db->delete_document($_) foreach @delete; + scalar(@delete); } sub term_generator { # write-only @@ -393,72 +447,6 @@ sub term_generator { # write-only $self->{term_generator} = $tg; } -# increments last_thread_id counter -# returns a 64-bit integer represented as a hex string -sub next_thread_id { - my ($self) = @_; - my $db = $self->{xdb}; - my $last_thread_id = int($db->get_metadata('last_thread_id') || 0); - - $db->set_metadata('last_thread_id', ++$last_thread_id); - - $last_thread_id; -} - -sub link_message { - my ($self, $smsg, $old_tid) = @_; - my $doc = $smsg->{doc}; - my $mid = $smsg->mid; - my $mime = $smsg->{mime}; - my $hdr = $mime->header_obj; - - # last References should be IRT, but some mail clients do things - # out of order, so trust IRT over References iff IRT exists - my @refs = (($hdr->header_raw('References') || '') =~ /<([^>]+)>/g); - push(@refs, (($hdr->header_raw('In-Reply-To') || '') =~ /<([^>]+)>/g)); - - my $tid; - if (@refs) { - my %uniq = ($mid => 1); - my @orig_refs = @refs; - @refs = (); - - # prevent circular references via References: here: - foreach my $ref (@orig_refs) { - if (length($ref) > MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - } - next if $uniq{$ref}; - $uniq{$ref} = 1; - push @refs, $ref; - } - } - - if (@refs) { - $smsg->{references} = '<'.join('> <', @refs).'>'; - - # first ref *should* be the thread root, - # but we can never trust clients to do the right thing - my $ref = shift @refs; - $tid = $self->_resolve_mid_to_tid($ref); - $self->merge_threads($tid, $old_tid) if defined $old_tid; - - # the rest of the refs should point to this tid: - foreach $ref (@refs) { - my $ptid = $self->_resolve_mid_to_tid($ref); - merge_threads($self, $tid, $ptid); - } - } else { - $tid = defined $old_tid ? $old_tid : $self->next_thread_id; - } - $doc->add_term('G' . $tid); -} - -sub index_blob { - my ($self, $mime, $bytes, $num, $blob) = @_; - $self->add_message($mime, $bytes, $num, $blob); -} - sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; @@ -479,33 +467,42 @@ sub index_mm { my ($self, $mime) = @_; my $mid = mid_clean(mid_mime($mime)); my $mm = $self->{mm}; - my $num = $mm->mid_insert($mid); + my $num; - # fallback to num_for since filters like RubyLang set the number - defined $num ? $num : $mm->num_for($mid); -} + if (defined $self->{regen_down}) { + $num = $mm->num_for($mid) and return $num; -sub unindex_mm { - my ($self, $mime) = @_; - $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); -} + while (($num = $self->{regen_down}--) > 0) { + if ($mm->mid_set($num, $mid) != 0) { + return $num; + } + } + } elsif (defined $self->{regen_up}) { + $num = $mm->num_for($mid) and return $num; -sub index_mm2 { - my ($self, $mime, $bytes, $blob) = @_; - my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $mime, $bytes, $num, $blob); + # this is to fixup old bugs due to add-remove-add + while (($num = ++$self->{regen_up})) { + if ($mm->mid_set($num, $mid) != 0) { + return $num; + } + } + } + + $num = $mm->mid_insert($mid) and return $num; + + # fallback to num_for since filters like RubyLang set the number + $mm->num_for($mid); } -sub unindex_mm2 { +sub unindex_mm { my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); - unindex_blob($self, $mime); } sub index_both { my ($self, $mime, $bytes, $blob) = @_; my $num = index_mm($self, $mime); - index_blob($self, $mime, $bytes, $num, $blob); + add_message($self, $mime, $bytes, $num, $blob); } sub unindex_both { @@ -527,7 +524,7 @@ sub do_cat_mail { sub index_sync { my ($self, $opts) = @_; - with_umask($self, sub { $self->_index_sync($opts) }); + $self->{-inbox}->with_umask(sub { $self->_index_sync($opts) }) } sub batch_adjust ($$$$) { @@ -535,11 +532,12 @@ sub batch_adjust ($$$$) { $$max -= $bytes; if ($$max <= 0) { $$max = BATCH_BYTES; - $batch_cb->($latest, 1); + $batch_cb->($latest); } } -sub rlog { +# only for v1 +sub read_log { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; @@ -550,48 +548,108 @@ sub rlog { my $bytes; my $max = BATCH_BYTES; local $/ = "\n"; + my %D; my $line; + my $newest; + my $mid = '20170114215743.5igbjup6qpsh3jfg@genre.crustytoothpaste.net'; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; + delete $D{$blob} and next; my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mids = mids($mime->header_obj); + foreach (@$mids) { + warn "ADD $mid\n" if ($_ eq $mid); + } batch_adjust(\$max, $bytes, $batch_cb, $latest); $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - my $mime = do_cat_mail($git, $blob, \$bytes) or next; - batch_adjust(\$max, $bytes, $batch_cb, $latest); - $del_cb->($self, $mime); + $D{$blob} = 1; } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; + $newest ||= $latest; } } - $batch_cb->($latest, 0); + # get the leftovers + foreach my $blob (keys %D) { + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mids = mids($mime->header_obj); + foreach (@$mids) { + warn "DEL $mid\n" if ($_ eq $mid); + } + $del_cb->($self, $mime); + } + $batch_cb->($latest, $newest); } sub _msgmap_init { my ($self) = @_; - $self->{mm} = eval { + die "BUG: _msgmap_init is only for v1\n" if $self->{version} != 1; + $self->{mm} ||= eval { require PublicInbox::Msgmap; - PublicInbox::Msgmap->new($self->{git_dir}, 1); + PublicInbox::Msgmap->new($self->{mainrepo}, 1); }; } sub _git_log { my ($self, $range) = @_; - $self->{git}->popen(qw/log --reverse --no-notes --no-color + my $git = $self->{git}; + + if (index($range, '..') < 0) { + my $regen_max = 0; + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $range); + ++$regen_max while <$fh>; + my (undef, $max) = $self->{mm}->minmax; + + if ($max && $max == $regen_max) { + # fix up old bugs in full indexes which caused messages to + # not appear in Msgmap + $self->{regen_up} = $max; + } else { + # normal regen is for for fresh data + $self->{regen_down} = $regen_max; + } + } + + $git->popen(qw/log --no-notes --no-color --no-renames --raw -r --no-abbrev/, $range); } -# indexes all unindexed messages +sub is_ancestor ($$$) { + my ($git, $cur, $tip) = @_; + return 0 unless $git->check($cur); + my $cmd = [ 'git', "--git-dir=$git->{git_dir}", + qw(merge-base --is-ancestor), $cur, $tip ]; + my $pid = spawn($cmd); + defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; + waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; + $? == 0; +} + +sub need_update ($$$) { + my ($self, $cur, $new) = @_; + my $git = $self->{git}; + return 1 if $cur && !is_ancestor($git, $cur, $new); + my $range = $cur eq '' ? $new : "$cur..$new"; + chomp(my $n = $git->qx(qw(rev-list --count), $range)); + ($n eq '' || $n > 0); +} + +# indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; my $reindex = $opts->{reindex}; my ($mkey, $last_commit, $lx, $xlog); - $self->{git}->batch_prepare; - my $xdb = _xdb_acquire($self); - $xdb->begin_transaction; + my $git = $self->{git}; + $git->batch_prepare; + + my $xdb = $self->begin_txn_lazy; + my $mm = _msgmap_init($self); do { $xlog = undef; $mkey = 'last_commit'; @@ -601,180 +659,118 @@ sub _index_sync { $lx = ''; $mkey = undef if $last_commit ne ''; } + + # use last_commit from msgmap if it is older or unset + my $lm = $mm->last_commit || ''; + if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) { + $lx = $lm; + } + + $self->{over}->rollback_lazy; + $self->{over}->disconnect; + delete $self->{txn}; $xdb->cancel_transaction; $xdb = _xdb_release($self); - # ensure we leak no FDs to "git log" + # ensure we leak no FDs to "git log" with Xapian <= 1.2 my $range = $lx eq '' ? $tip : "$lx..$tip"; $xlog = _git_log($self, $range); - $xdb = _xdb_acquire($self); - $xdb->begin_transaction; + $xdb = $self->begin_txn_lazy; } while ($xdb->get_metadata('last_commit') ne $last_commit); - my $mm = _msgmap_init($self); my $dbh = $mm->{dbh} if $mm; - my $mm_only; my $cb = sub { - my ($commit, $more) = @_; + my ($commit, $newest) = @_; if ($dbh) { - $mm->last_commit($commit) if $commit; + if ($newest) { + my $cur = $mm->last_commit || ''; + if (need_update($self, $cur, $newest)) { + $mm->last_commit($newest); + } + } $dbh->commit; } - if (!$mm_only) { - $xdb->set_metadata($mkey, $commit) if $mkey && $commit; - $xdb->commit_transaction; - $xdb = _xdb_release($self); + if ($mkey && $newest) { + my $cur = $xdb->get_metadata($mkey); + if (need_update($self, $cur, $newest)) { + $xdb->set_metadata($mkey, $newest); + } } + $self->commit_txn_lazy; # let another process do some work... < - if ($more) { - if (!$mm_only) { - $xdb = _xdb_acquire($self); - $xdb->begin_transaction; - } + if (!$newest) { + $xdb = $self->begin_txn_lazy; $dbh->begin_work if $dbh; } }; - if ($mm) { - $dbh->begin_work; - my $lm = $mm->last_commit || ''; - if ($lm eq $lx) { - # Common case is the indexes are synced, - # we only need to run git-log once: - rlog($self, $xlog, *index_both, *unindex_both, $cb); - } else { - # Uncommon case, msgmap and xapian are out-of-sync - # do not care for performance (but git is fast :>) - # This happens if we have to reindex Xapian since - # msgmap is a frozen format and our Xapian format - # is evolving. - my $r = $lm eq '' ? $tip : "$lm..$tip"; - - # first, ensure msgmap is up-to-date: - my $mkey_prev = $mkey; - $mkey = undef; # ignore xapian, for now - my $mlog = _git_log($self, $r); - $mm_only = 1; - rlog($self, $mlog, *index_mm, *unindex_mm, $cb); - $mm_only = $mlog = undef; - - # now deal with Xapian - $mkey = $mkey_prev; - $dbh = undef; - rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); - } - } else { - # user didn't install DBD::SQLite and DBI - rlog($self, $xlog, *index_blob, *unindex_blob, $cb); - } + $dbh->begin_work; + read_log($self, $xlog, *index_both, *unindex_both, $cb); } -# this will create a ghost as necessary -sub _resolve_mid_to_tid { - my ($self, $mid) = @_; - - my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid); - $smsg->thread_id; -} - -sub create_ghost { - my ($self, $mid) = @_; - - my $tid = $self->next_thread_id; - my $doc = Search::Xapian::Document->new; - $doc->add_term('Q' . $mid); - $doc->add_term('G' . $tid); - $doc->add_term('T' . 'ghost'); - - my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $self->{xdb}->add_document($doc); - - $smsg; +sub DESTROY { + # order matters for unlocking + $_[0]->{xdb} = undef; + $_[0]->{lockfh} = undef; } -sub merge_threads { - my ($self, $winner_tid, $loser_tid) = @_; - return if $winner_tid == $loser_tid; - my $db = $self->{xdb}; - - my $batch_size = 1000; # don't let @ids grow too large to avoid OOM - while (1) { - my ($head, $tail) = $self->find_doc_ids('G' . $loser_tid); - return if $head == $tail; - my @ids; - for (; $head != $tail && @ids < $batch_size; $head->inc) { - push @ids, $head->get_docid; - } - foreach my $docid (@ids) { - my $doc = $db->get_document($docid); - $doc->remove_term('G' . $loser_tid); - $doc->add_term('G' . $winner_tid); - $db->replace_document($docid, $doc); - } +# remote_* subs are only used by SearchIdxPart +sub remote_commit { + my ($self) = @_; + if (my $w = $self->{w}) { + print $w "commit\n" or die "failed to write commit: $!"; + } else { + $self->commit_txn_lazy; } } -sub _read_git_config_perm { +sub remote_close { my ($self) = @_; - my @cmd = qw(config core.sharedRepository); - my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd); - local $/ = "\n"; - my $perm = <$fh>; - chomp $perm if defined $perm; - $perm; -} - -sub _git_config_perm { - my $self = shift; - my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); - return PERM_GROUP if (!defined($perm) || $perm eq ''); - return PERM_UMASK if ($perm eq 'umask'); - return PERM_GROUP if ($perm eq 'group'); - if ($perm =~ /\A(?:all|world|everybody)\z/) { - return PERM_EVERYBODY; + if (my $w = delete $self->{w}) { + my $pid = delete $self->{pid} or die "no process to wait on\n"; + print $w "close\n" or die "failed to write to pid:$pid: $!\n"; + close $w or die "failed to close pipe for pid:$pid: $!\n"; + waitpid($pid, 0) == $pid or die "remote process did not finish"; + $? == 0 or die ref($self)." pid:$pid exited with: $?"; + } else { + die "transaction in progress $self\n" if $self->{txn}; + $self->_xdb_release if $self->{xdb}; } - return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); - return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); - - my $i = oct($perm); - return PERM_UMASK if ($i == PERM_UMASK); - return PERM_GROUP if ($i == OLD_PERM_GROUP); - return PERM_EVERYBODY if ($i == OLD_PERM_EVERYBODY); +} - if (($i & 0600) != 0600) { - die "core.sharedRepository mode invalid: ". - sprintf('%.3o', $i) . "\nOwner must have permissions\n"; +sub remote_remove { + my ($self, $oid, $mid) = @_; + if (my $w = $self->{w}) { + # triggers remove_by_oid in a partition + print $w "D $oid $mid\n" or die "failed to write remove $!"; + } else { + $self->begin_txn_lazy; + $self->remove_by_oid($oid, $mid); } - ($i & 0666); } -sub _umask_for { - my ($perm) = @_; # _git_config_perm return value - my $rv = $perm; - return umask if $rv == 0; - - # set +x bit if +r or +w were set - $rv |= 0100 if ($rv & 0600); - $rv |= 0010 if ($rv & 0060); - $rv |= 0001 if ($rv & 0006); - (~$rv & 0777); +sub begin_txn_lazy { + my ($self) = @_; + return if $self->{txn}; + my $xdb = $self->{xdb} || $self->_xdb_acquire; + $self->{over}->begin_lazy if $self->{over}; + $xdb->begin_transaction; + $self->{txn} = 1; + $xdb; } -sub with_umask { - my ($self, $cb) = @_; - my $old = umask $self->{umask}; - my $rv = eval { $cb->() }; - my $err = $@; - umask $old; - die $err if $err; - $rv; +sub commit_txn_lazy { + my ($self) = @_; + delete $self->{txn} or return; + $self->{xdb}->commit_transaction; + $self->{over}->commit_lazy if $self->{over}; } -sub DESTROY { - # order matters for unlocking - $_[0]->{xdb} = undef; - $_[0]->{lockfh} = undef; +sub worker_done { + my ($self) = @_; + die "$$ $0 xdb not released\n" if $self->{xdb}; + die "$$ $0 still in transaction\n" if $self->{txn}; } 1; diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm new file mode 100644 index 00000000..078d2df1 --- /dev/null +++ b/lib/PublicInbox/SearchIdxPart.pm @@ -0,0 +1,106 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +package PublicInbox::SearchIdxPart; +use strict; +use warnings; +use base qw(PublicInbox::SearchIdx); + +sub new { + my ($class, $v2writable, $part) = @_; + my $self = $class->SUPER::new($v2writable->{-inbox}, 1, $part); + # create the DB before forking: + $self->_xdb_acquire; + $self->_xdb_release; + $self->spawn_worker($v2writable, $part) if $v2writable->{parallel}; + $self; +} + +sub spawn_worker { + my ($self, $v2writable, $part) = @_; + my ($r, $w); + pipe($r, $w) or die "pipe failed: $!\n"; + binmode $r, ':raw'; + binmode $w, ':raw'; + my $pid = fork; + defined $pid or die "fork failed: $!\n"; + if ($pid == 0) { + my $bnote = $v2writable->atfork_child; + $v2writable = undef; + close $w or die "failed to close: $!"; + + # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size here + # speeds V2Writable batch imports across 8 cores by nearly 20% + fcntl($r, 1031, 1048576) if $^O eq 'linux'; + + eval { partition_worker_loop($self, $r, $part, $bnote) }; + die "worker $part died: $@\n" if $@; + die "unexpected MM $self->{mm}" if $self->{mm}; + exit; + } + $self->{pid} = $pid; + $self->{w} = $w; + close $r or die "failed to close: $!"; +} + +sub partition_worker_loop ($$$$) { + my ($self, $r, $part, $bnote) = @_; + $0 = "pi-v2-partition[$part]"; + $self->begin_txn_lazy; + while (my $line = $r->getline) { + if ($line eq "commit\n") { + $self->commit_txn_lazy; + } elsif ($line eq "close\n") { + $self->_xdb_release; + } elsif ($line eq "barrier\n") { + $self->commit_txn_lazy; + # no need to lock < 512 bytes is atomic under POSIX + print $bnote "barrier $part\n" or + die "write failed for barrier $!\n"; + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $self->begin_txn_lazy; + $self->remove_by_oid($oid, $mid); + } else { + chomp $line; + my ($len, $artnum, $oid, $mid0) = split(/ /, $line); + $self->begin_txn_lazy; + my $n = read($r, my $msg, $len) or die "read: $!\n"; + $n == $len or die "short read: $n != $len\n"; + my $mime = PublicInbox::MIME->new(\$msg); + $artnum = int($artnum); + $self->add_message($mime, $n, $artnum, $oid, $mid0); + } + } + $self->worker_done; +} + +# called by V2Writable +sub index_raw { + my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; + if (my $w = $self->{w}) { + print $w "$bytes $artnum $oid $mid0\n", $$msgref or die + "failed to write partition $!\n"; + $w->flush or die "failed to flush: $!\n"; + } else { + $$msgref = undef; + $self->begin_txn_lazy; + $self->add_message($mime, $bytes, $artnum, $oid, $mid0); + } +} + +sub atfork_child { + close $_[0]->{w} or die "failed to close write pipe: $!\n"; +} + +# called by V2Writable: +sub remote_barrier { + my ($self) = @_; + if (my $w = $self->{w}) { + print $w "barrier\n" or die "failed to print: $!"; + $w->flush or die "failed to flush: $!"; + } else { + $self->commit_txn_lazy; + } +} + +1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index afba8b1a..ab971e00 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -6,17 +6,15 @@ package PublicInbox::SearchMsg; use strict; use warnings; -use Search::Xapian; -use Date::Parse qw/str2time/; -use PublicInbox::MID qw/mid_clean/; +use PublicInbox::MID qw/mid_clean mid_mime/; use PublicInbox::Address; +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +use Time::Local qw(timegm); sub new { my ($class, $mime) = @_; my $doc = Search::Xapian::Document->new; - $doc->add_term('T' . 'mail'); - - bless { type => 'mail', doc => $doc, mime => $mime }, $class; + bless { doc => $doc, mime => $mime }, $class; } sub wrap { @@ -24,49 +22,75 @@ sub wrap { bless { doc => $doc, mime => undef, mid => $mid }, $class; } +sub get { + my ($class, $head, $db, $mid) = @_; + my $doc_id = $head->get_docid; + my $doc = $db->get_document($doc_id); + load_expand(wrap($class, $doc, $mid)) +} + sub get_val ($$) { my ($doc, $col) = @_; Search::Xapian::sortable_unserialise($doc->get_value($col)); } +sub to_doc_data { + my ($self, $oid, $mid0) = @_; + $oid = '' unless defined $oid; + join("\n", + $self->subject, + $self->from, + $self->references, + $self->to, + $self->cc, + $oid, + $mid0, + $self->{bytes} || '', + $self->{lines} || '' + ); +} + +sub load_from_data ($$) { + my ($self) = $_[0]; # data = $_[1] + ( + $self->{subject}, + $self->{from}, + $self->{references}, + + # To: and Cc: are stored to optimize HDR/XHDR in NNTP since + # some NNTP clients will use that for message displays. + $self->{to}, + $self->{cc}, + + $self->{blob}, + $self->{mid}, + $self->{bytes}, + $self->{lines} + ) = split(/\n/, $_[1]); +} + sub load_expand { my ($self) = @_; my $doc = $self->{doc}; my $data = $doc->get_data or return; - $self->{ts} = get_val($doc, &PublicInbox::Search::TS); + $self->{ts} = get_val($doc, PublicInbox::Search::TS()); + my $dt = get_val($doc, PublicInbox::Search::DT()); + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); + $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); utf8::decode($data); - my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data); - $self->{subject} = $subj; - $self->{from} = $from; - $self->{references} = $refs; - $self->{to} = $to; - $self->{cc} = $cc; - $self->{blob} = $blob; + load_from_data($self, $data); $self; } sub load_doc { my ($class, $doc) = @_; - my $data = $doc->get_data or return; - my $ts = get_val($doc, &PublicInbox::Search::TS); - utf8::decode($data); - my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data); - bless { - doc => $doc, - subject => $subj, - ts => $ts, - from => $from, - references => $refs, - to => $to, - cc => $cc, - blob => $blob, - }, $class; + my $self = bless { doc => $doc }, $class; + $self->load_expand; } # :bytes and :lines metadata in RFC 3977 -sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) } -sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) } -sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) } +sub bytes ($) { $_[0]->{bytes} } +sub lines ($) { $_[0]->{lines} } sub __hdr ($$) { my ($self, $field) = @_; @@ -91,9 +115,9 @@ my @MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); sub date ($) { my ($self) = @_; - my $ts = $self->{ts}; - return unless defined $ts; - my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ts); + my $ds = $self->{ds}; + return unless defined $ds; + my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ds); "$DoW[$wday], " . sprintf("%02d $MoY[$mon] %04d %02d:%02d:%02d +0000", $mday, $year+1900, $hour, $min, $sec); @@ -119,15 +143,12 @@ sub from_name { sub ts { my ($self) = @_; - $self->{ts} ||= eval { str2time($self->{mime}->header('Date')) } || 0; + $self->{ts} ||= eval { msg_timestamp($self->{mime}->header_obj) } || 0; } -sub to_doc_data { - my ($self, $blob) = @_; - my @rows = ($self->subject, $self->from, $self->references, - $self->to, $self->cc); - push @rows, $blob if defined $blob; - join("\n", @rows); +sub ds { + my ($self) = @_; + $self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0; } sub references { @@ -156,27 +177,13 @@ sub mid ($;$) { $self->{mid} = $mid; } elsif (my $rv = $self->{mid}) { $rv; + } elsif ($self->{doc}) { + $self->{mid} = _get_term_val($self, 'Q', qr/\AQ/); } else { - $self->{mid} = _get_term_val($self, 'Q', qr/\AQ/) || - $self->_extract_mid; + $self->_extract_mid; # v1 w/o Xapian } } sub _extract_mid { mid_clean(mid_mime($_[0]->{mime})) } -sub thread_id { - my ($self) = @_; - my $tid = $self->{thread}; - return $tid if defined $tid; - $self->{thread} = _get_term_val($self, 'G', qr/\AG/); # *G*roup -} - -# XXX: consider removing this, we can phrase match subject -sub path { - my ($self) = @_; - my $path = $self->{path}; - return $path if defined $path; - $self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path -} - 1; diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 6fbce15c..1d250b46 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -22,15 +22,15 @@ use strict; use warnings; sub thread { - my ($messages, $ordersub, $srch) = @_; + my ($messages, $ordersub, $ibx) = @_; my $id_table = {}; _add_message($id_table, $_) foreach @$messages; my $rootset = [ grep { - !delete($_->{parent}) && $_->visible($srch) + !delete($_->{parent}) && $_->visible($ibx) } values %$id_table ]; $id_table = undef; $rootset = $ordersub->($rootset); - $_->order_children($ordersub, $srch) for @$rootset; + $_->order_children($ordersub, $ibx) for @$rootset; $rootset; } @@ -131,20 +131,20 @@ sub has_descendent { # a ghost Message-ID is the result of a long header line # being folded/mangled by a MUA, and not a missing message. sub visible ($$) { - my ($self, $srch) = @_; - ($self->{smsg} ||= eval { $srch->lookup_mail($self->{id}) }) || + my ($self, $ibx) = @_; + ($self->{smsg} ||= eval { $ibx->smsg_by_mid($self->{id}) }) || (scalar values %{$self->{children}}); } sub order_children { - my ($cur, $ordersub, $srch) = @_; + my ($cur, $ordersub, $ibx) = @_; my %seen = ($cur => 1); # self-referential loop prevention my @q = ($cur); while (defined($cur = shift @q)) { my $c = $cur->{children}; # The hashref here... - $c = [ grep { !$seen{$_}++ && visible($_, $srch) } values %$c ]; + $c = [ grep { !$seen{$_}++ && visible($_, $ibx) } values %$c ]; $c = $ordersub->($c) if scalar @$c > 1; $cur->{children} = $c; # ...becomes an arrayref push @q, @$c; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 1c4442e4..5d500c1b 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -10,7 +10,7 @@ use PublicInbox::SearchMsg; use PublicInbox::Hval qw/ascii_html obfuscate_addrs/; use PublicInbox::View; use PublicInbox::WwwAtomStream; -use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape MID_ESC); +use PublicInbox::MID qw(MID_ESC); use PublicInbox::MIME; require PublicInbox::Git; require PublicInbox::SearchThread; @@ -118,11 +118,11 @@ sub mset_summary { obfuscate_addrs($obfs_ibx, $s); obfuscate_addrs($obfs_ibx, $f); } - my $ts = PublicInbox::View::fmt_ts($smsg->ts); + my $date = PublicInbox::View::fmt_ts($smsg->ds); my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href}; $$res .= qq{$rank. }. $s . "\n"; - $$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n"; + $$res .= "$pfx - by $f @ $date UTC [$pct%]\n\n"; } $$res .= search_nav_bot($mset, $q); *noop; @@ -181,10 +181,9 @@ sub search_nav_top { sub search_nav_bot { my ($mset, $q) = @_; my $total = $mset->get_matches_estimated; - my $nr = scalar $mset->items; my $o = $q->{o}; my $l = $q->{l}; - my $end = $o + $nr; + my $end = $o + $mset->size; my $beg = $o + 1; my $rv = '

';
 	if ($beg <= $end) {
@@ -229,8 +228,8 @@ sub mset_thread {
 	} ($mset->items) ]});
 	my $r = $q->{r};
 	my $rootset = PublicInbox::SearchThread::thread($msgs,
-		$r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ts,
-		$srch);
+		$r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ds,
+		$ctx);
 	my $skel = search_nav_bot($mset, $q). "
";
 	my $inbox = $ctx->{-inbox};
 	$ctx->{-upfx} = '';
@@ -250,15 +249,14 @@ sub mset_thread {
 		*PublicInbox::View::pre_thread);
 
 	@$msgs = reverse @$msgs if $r;
-	my $mime;
 	sub {
 		return unless $msgs;
-		while ($mime = pop @$msgs) {
-			$mime = $inbox->msg_by_smsg($mime) and last;
+		my $smsg;
+		while (my $m = pop @$msgs) {
+			$smsg = $inbox->smsg_mime($m) and last;
 		}
-		if ($mime) {
-			$mime = PublicInbox::MIME->new($mime);
-			return PublicInbox::View::index_entry($mime, $ctx,
+		if ($smsg) {
+			return PublicInbox::View::index_entry($smsg, $ctx,
 				scalar @$msgs);
 		}
 		$msgs = undef;
@@ -292,8 +290,7 @@ sub adump {
 	PublicInbox::WwwAtomStream->response($ctx, 200, sub {
 		while (my $x = shift @items) {
 			$x = load_doc_retry($srch, $x);
-			$x = $ibx->msg_by_smsg($x) and
-					return PublicInbox::MIME->new($x);
+			$x = $ibx->smsg_mime($x) and return $x;
 		}
 		return undef;
 	});
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
new file mode 100644
index 00000000..1316d628
--- /dev/null
+++ b/lib/PublicInbox/V2Writable.pm
@@ -0,0 +1,908 @@
+# Copyright (C) 2018 all contributors 
+# License: AGPL-3.0+ 
+
+# This interface wraps and mimics PublicInbox::Import
+package PublicInbox::V2Writable;
+use strict;
+use warnings;
+use base qw(PublicInbox::Lock);
+use PublicInbox::SearchIdxPart;
+use PublicInbox::MIME;
+use PublicInbox::Git;
+use PublicInbox::Import;
+use PublicInbox::MID qw(mids);
+use PublicInbox::ContentId qw(content_id content_digest);
+use PublicInbox::Inbox;
+use PublicInbox::OverIdx;
+use PublicInbox::Msgmap;
+use PublicInbox::Spawn qw(spawn);
+use PublicInbox::SearchIdx;
+use IO::Handle;
+
+# an estimate of the post-packed size to the raw uncompressed size
+my $PACKING_FACTOR = 0.4;
+
+# assume 2 cores if GNU nproc(1) is not available
+sub nproc_parts () {
+	my $n = int($ENV{NPROC} || `nproc 2>/dev/null` || 2);
+	# subtract for the main process and git-fast-import
+	$n -= 1;
+	$n < 1 ? 1 : $n;
+}
+
+sub count_partitions ($) {
+	my ($self) = @_;
+	my $nparts = 0;
+	my $xpfx = $self->{xpfx};
+
+	# always load existing partitions in case core count changes:
+	# Also, partition count may change while -watch is running
+	# due to -compact
+	if (-d $xpfx) {
+		foreach my $part (<$xpfx/*>) {
+			-d $part && $part =~ m!/\d+\z! or next;
+			eval {
+				Search::Xapian::Database->new($part)->close;
+				$nparts++;
+			};
+		}
+	}
+	$nparts;
+}
+
+sub new {
+	my ($class, $v2ibx, $creat) = @_;
+	my $dir = $v2ibx->{mainrepo} or die "no mainrepo in inbox\n";
+	unless (-d $dir) {
+		if ($creat) {
+			require File::Path;
+			File::Path::mkpath($dir);
+		} else {
+			die "$dir does not exist\n";
+		}
+	}
+
+	$v2ibx = PublicInbox::InboxWritable->new($v2ibx);
+
+	my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION;
+	my $self = {
+		-inbox => $v2ibx,
+		im => undef, #  PublicInbox::Import
+		parallel => 1,
+		transact_bytes => 0,
+		xpfx => $xpfx,
+		over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3", 1),
+		lock_path => "$dir/inbox.lock",
+		# limit each git repo (epoch) to 1GB or so
+		rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
+		last_commit => [], # git repo -> commit
+	};
+	$self->{partitions} = count_partitions($self) || nproc_parts();
+	bless $self, $class;
+}
+
+sub init_inbox {
+	my ($self, $parallel) = @_;
+	$self->{parallel} = $parallel;
+	$self->idx_init;
+	my $epoch_max = -1;
+	git_dir_latest($self, \$epoch_max);
+	$self->git_init($epoch_max >= 0 ? $epoch_max : 0);
+	$self->done;
+}
+
+# returns undef on duplicate or spam
+# mimics Import::add and wraps it for v2
+sub add {
+	my ($self, $mime, $check_cb) = @_;
+
+	# spam check:
+	if ($check_cb) {
+		$mime = $check_cb->($mime) or return;
+	}
+
+	# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
+	# as does SQLite 3.4.1+ (released in 2007-07-20), and
+	# Xapian 1.3.2+ (released 2015-03-15).
+	# For the most part, we can spawn git-fast-import without
+	# leaking FDs to it...
+	$self->idx_init;
+
+	my $mid0;
+	my $num = num_for($self, $mime, \$mid0);
+	defined $num or return; # duplicate
+	defined $mid0 or die "BUG: $mid0 undefined\n";
+	my $im = $self->importer;
+	my $cmt = $im->add($mime);
+	$cmt = $im->get_mark($cmt);
+	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
+
+	my ($oid, $len, $msgref) = @{$im->{last_object}};
+	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+	my $nparts = $self->{partitions};
+	my $part = $num % $nparts;
+	my $idx = $self->idx_part($part);
+	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+	my $n = $self->{transact_bytes} += $len;
+	if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+		$self->checkpoint;
+	}
+
+	$cmt;
+}
+
+sub num_for {
+	my ($self, $mime, $mid0) = @_;
+	my $mids = mids($mime->header_obj);
+	if (@$mids) {
+		my $mid = $mids->[0];
+		my $num = $self->{mm}->mid_insert($mid);
+		if (defined $num) { # common case
+			$$mid0 = $mid;
+			return $num;
+		};
+
+		# crap, Message-ID is already known, hope somebody just resent:
+		foreach my $m (@$mids) {
+			# read-only lookup now safe to do after above barrier
+			my $existing = $self->lookup_content($mime, $m);
+			# easy, don't store duplicates
+			# note: do not add more diagnostic info here since
+			# it gets noisy on public-inbox-watch restarts
+			return if $existing;
+		}
+
+		# very unlikely:
+		warn "<$mid> reused for mismatched content\n";
+
+		# try the rest of the mids
+		for(my $i = $#$mids; $i >= 1; $i--) {
+			my $m = $mids->[$i];
+			$num = $self->{mm}->mid_insert($m);
+			if (defined $num) {
+				warn "alternative <$m> for <$mid> found\n";
+				$$mid0 = $m;
+				return $num;
+			}
+		}
+	}
+	# none of the existing Message-IDs are good, generate a new one:
+	num_for_harder($self, $mime, $mid0);
+}
+
+sub num_for_harder {
+	my ($self, $mime, $mid0) = @_;
+
+	my $hdr = $mime->header_obj;
+	my $dig = content_digest($mime);
+	$$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
+	my $num = $self->{mm}->mid_insert($$mid0);
+	unless (defined $num) {
+		# it's hard to spoof the last Received: header
+		my @recvd = $hdr->header_raw('Received');
+		$dig->add("Received: $_") foreach (@recvd);
+		$$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
+		$num = $self->{mm}->mid_insert($$mid0);
+
+		# fall back to a random Message-ID and give up determinism:
+		until (defined($num)) {
+			$dig->add(rand);
+			$$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
+			warn "using random Message-ID <$$mid0> as fallback\n";
+			$num = $self->{mm}->mid_insert($$mid0);
+		}
+	}
+	PublicInbox::Import::append_mid($hdr, $$mid0);
+	$num;
+}
+
+sub idx_part {
+	my ($self, $part) = @_;
+	$self->{idx_parts}->[$part];
+}
+
+# idempotent
+sub idx_init {
+	my ($self) = @_;
+	return if $self->{idx_parts};
+	my $ibx = $self->{-inbox};
+
+	# do not leak read-only FDs to child processes, we only have these
+	# FDs for duplicate detection so they should not be
+	# frequently activated.
+	delete $ibx->{$_} foreach (qw(git mm search));
+
+       if ($self->{parallel}) {
+               pipe(my ($r, $w)) or die "pipe failed: $!";
+               $self->{bnote} = [ $r, $w ];
+               $w->autoflush(1);
+       }
+
+	my $over = $self->{over};
+	$ibx->umask_prepare;
+	$ibx->with_umask(sub {
+		$self->lock_acquire;
+		$over->create;
+
+		# -compact can change partition count while -watch is idle
+		my $nparts = count_partitions($self);
+		if ($nparts && $nparts != $self->{partitions}) {
+			$self->{partitions} = $nparts;
+		}
+
+		# need to create all parts before initializing msgmap FD
+		my $max = $self->{partitions} - 1;
+
+		# idx_parts must be visible to all forked processes
+		my $idx = $self->{idx_parts} = [];
+		for my $i (0..$max) {
+			push @$idx, PublicInbox::SearchIdxPart->new($self, $i);
+		}
+
+		# Now that all subprocesses are up, we can open the FDs
+		# for SQLite:
+		my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
+			"$self->{-inbox}->{mainrepo}/msgmap.sqlite3", 1);
+		$mm->{dbh}->begin_work;
+	});
+}
+
+sub purge_oids {
+	my ($self, $purge) = @_; # $purge = { $object_id => 1, ... }
+	$self->done;
+	my $pfx = "$self->{-inbox}->{mainrepo}/git";
+	my $purges = [];
+	foreach my $i (0..$self->{epoch_max}) {
+		my $git = PublicInbox::Git->new("$pfx/$i.git");
+		my $im = $self->import_init($git, 0, 1);
+		$purges->[$i] = $im->purge_oids($purge);
+	}
+	$purges;
+}
+
+sub content_ids ($) {
+	my ($mime) = @_;
+	my @cids = ( content_id($mime) );
+
+	# Email::MIME->as_string doesn't always round-trip, so we may
+	# use a second content_id
+	my $rt = content_id(PublicInbox::MIME->new(\($mime->as_string)));
+	push @cids, $rt if $cids[0] ne $rt;
+	\@cids;
+}
+
+sub content_matches ($$) {
+	my ($cids, $existing) = @_;
+	my $cid = content_id($existing);
+	foreach (@$cids) {
+		return 1 if $_ eq $cid
+	}
+	0
+}
+
+sub remove_internal {
+	my ($self, $mime, $cmt_msg, $purge) = @_;
+	$self->idx_init;
+	my $im = $self->importer unless $purge;
+	my $over = $self->{over};
+	my $cids = content_ids($mime);
+	my $parts = $self->{idx_parts};
+	my $mm = $self->{mm};
+	my $removed;
+	my $mids = mids($mime->header_obj);
+
+	# We avoid introducing new blobs into git since the raw content
+	# can be slightly different, so we do not need the user-supplied
+	# message now that we have the mids and content_id
+	$mime = undef;
+	my $mark;
+
+	foreach my $mid (@$mids) {
+		my %gone;
+		my ($id, $prev);
+		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+			my $msg = get_blob($self, $smsg);
+			if (!defined($msg)) {
+				warn "broken smsg for $mid\n";
+				next; # continue
+			}
+			my $orig = $$msg;
+			my $cur = PublicInbox::MIME->new($msg);
+			if (content_matches($cids, $cur)) {
+				$smsg->{mime} = $cur;
+				$gone{$smsg->{num}} = [ $smsg, \$orig ];
+			}
+		}
+		my $n = scalar keys %gone;
+		next unless $n;
+		if ($n > 1) {
+			warn "BUG: multiple articles linked to <$mid>\n",
+				join(',', sort keys %gone), "\n";
+		}
+		foreach my $num (keys %gone) {
+			my ($smsg, $orig) = @{$gone{$num}};
+			$mm->num_delete($num);
+			# $removed should only be set once assuming
+			# no bugs in our deduplication code:
+			$removed = $smsg;
+			my $oid = $smsg->{blob};
+			if ($purge) {
+				$purge->{$oid} = 1;
+			} else {
+				($mark, undef) = $im->remove($orig, $cmt_msg);
+			}
+			$orig = undef;
+			$self->unindex_oid_remote($oid, $mid);
+		}
+	}
+
+	if (defined $mark) {
+		my $cmt = $im->get_mark($mark);
+		$self->{last_commit}->[$self->{epoch_max}] = $cmt;
+	}
+	if ($purge && scalar keys %$purge) {
+		return purge_oids($self, $purge);
+	}
+	$removed;
+}
+
+sub remove {
+	my ($self, $mime, $cmt_msg) = @_;
+	remove_internal($self, $mime, $cmt_msg);
+}
+
+sub purge {
+	my ($self, $mime) = @_;
+	my $purges = remove_internal($self, $mime, undef, {});
+	$self->idx_init if @$purges; # ->done is called on purges
+	for my $i (0..$#$purges) {
+		defined(my $cmt = $purges->[$i]) or next;
+		$self->{last_commit}->[$i] = $cmt;
+	}
+	$purges;
+}
+
+sub last_commit_part ($$;$) {
+	my ($self, $i, $cmt) = @_;
+	my $v = PublicInbox::Search::SCHEMA_VERSION();
+	$self->{mm}->last_commit_xap($v, $i, $cmt);
+}
+
+sub set_last_commits ($) {
+	my ($self) = @_;
+	defined(my $epoch_max = $self->{epoch_max}) or return;
+	my $last_commit = $self->{last_commit};
+	foreach my $i (0..$epoch_max) {
+		defined(my $cmt = $last_commit->[$i]) or next;
+		$last_commit->[$i] = undef;
+		last_commit_part($self, $i, $cmt);
+	}
+}
+
+sub barrier_init {
+	my ($self, $n) = @_;
+	$self->{bnote} or return;
+	--$n;
+	my $barrier = { map { $_ => 1 } (0..$n) };
+}
+
+sub barrier_wait {
+	my ($self, $barrier) = @_;
+	my $bnote = $self->{bnote} or return;
+	my $r = $bnote->[0];
+	while (scalar keys %$barrier) {
+		defined(my $l = $r->getline) or die "EOF on barrier_wait: $!";
+		$l =~ /\Abarrier (\d+)/ or die "bad line on barrier_wait: $l";
+		delete $barrier->{$1} or die "bad part[$1] on barrier wait";
+	}
+}
+
+sub checkpoint ($;$) {
+	my ($self, $wait) = @_;
+
+	if (my $im = $self->{im}) {
+		if ($wait) {
+			$im->barrier;
+		} else {
+			$im->checkpoint;
+		}
+	}
+	my $parts = $self->{idx_parts};
+	if ($parts) {
+		my $dbh = $self->{mm}->{dbh};
+
+		# SQLite msgmap data is second in importance
+		$dbh->commit;
+
+		# SQLite overview is third
+		$self->{over}->commit_lazy;
+
+		# Now deal with Xapian
+		if ($wait) {
+			my $barrier = $self->barrier_init(scalar @$parts);
+
+			# each partition needs to issue a barrier command
+			$_->remote_barrier for @$parts;
+
+			# wait for each Xapian partition
+			$self->barrier_wait($barrier);
+		} else {
+			$_->remote_commit for @$parts;
+		}
+
+		# last_commit is special, don't commit these until
+		# remote partitions are done:
+		$dbh->begin_work;
+		set_last_commits($self);
+		$dbh->commit;
+
+		$dbh->begin_work;
+	}
+	$self->{transact_bytes} = 0;
+}
+
+# issue a write barrier to ensure all data is visible to other processes
+# and read-only ops.  Order of data importance is: git > SQLite > Xapian
+sub barrier { checkpoint($_[0], 1) };
+
+sub done {
+	my ($self) = @_;
+	my $im = delete $self->{im};
+	$im->done if $im; # PublicInbox::Import::done
+	checkpoint($self);
+	my $mm = delete $self->{mm};
+	$mm->{dbh}->commit if $mm;
+	my $parts = delete $self->{idx_parts};
+	if ($parts) {
+		$_->remote_close for @$parts;
+	}
+	$self->{over}->disconnect;
+	delete $self->{bnote};
+	$self->{transact_bytes} = 0;
+	$self->lock_release if $parts;
+}
+
+sub git_init {
+	my ($self, $epoch) = @_;
+	my $pfx = "$self->{-inbox}->{mainrepo}/git";
+	my $git_dir = "$pfx/$epoch.git";
+	my @cmd = (qw(git init --bare -q), $git_dir);
+	PublicInbox::Import::run_die(\@cmd);
+
+	my $all = "$self->{-inbox}->{mainrepo}/all.git";
+	unless (-d $all) {
+		@cmd = (qw(git init --bare -q), $all);
+		PublicInbox::Import::run_die(\@cmd);
+		@cmd = (qw/git config/, "--file=$all/config",
+				'repack.writeBitmaps', 'true');
+		PublicInbox::Import::run_die(\@cmd);
+	}
+
+	@cmd = (qw/git config/, "--file=$git_dir/config",
+			'include.path', '../../all.git/config');
+	PublicInbox::Import::run_die(\@cmd);
+
+	my $alt = "$all/objects/info/alternates";
+	my $new_obj_dir = "../../git/$epoch.git/objects";
+	my %alts;
+	if (-e $alt) {
+		open(my $fh, '<', $alt) or die "open < $alt: $!\n";
+		%alts = map { chomp; $_ => 1 } (<$fh>);
+	}
+	return $git_dir if $alts{$new_obj_dir};
+	open my $fh, '>>', $alt or die "open >> $alt: $!\n";
+	print $fh "$new_obj_dir\n" or die "print >> $alt: $!\n";
+	close $fh or die "close $alt: $!\n";
+	$git_dir
+}
+
+sub git_dir_latest {
+	my ($self, $max) = @_;
+	$$max = -1;
+	my $pfx = "$self->{-inbox}->{mainrepo}/git";
+	return unless -d $pfx;
+	my $latest;
+	opendir my $dh, $pfx or die "opendir $pfx: $!\n";
+	while (defined(my $git_dir = readdir($dh))) {
+		$git_dir =~ m!\A(\d+)\.git\z! or next;
+		if ($1 > $$max) {
+			$$max = $1;
+			$latest = "$pfx/$git_dir";
+		}
+	}
+	$latest;
+}
+
+sub importer {
+	my ($self) = @_;
+	my $im = $self->{im};
+	if ($im) {
+		if ($im->{bytes_added} < $self->{rotate_bytes}) {
+			return $im;
+		} else {
+			$self->{im} = undef;
+			$im->done;
+			$im = undef;
+			$self->checkpoint;
+			my $git_dir = $self->git_init(++$self->{epoch_max});
+			my $git = PublicInbox::Git->new($git_dir);
+			return $self->import_init($git, 0);
+		}
+	}
+	my $epoch = 0;
+	my $max;
+	my $latest = git_dir_latest($self, \$max);
+	if (defined $latest) {
+		my $git = PublicInbox::Git->new($latest);
+		my $packed_bytes = $git->packed_bytes;
+		if ($packed_bytes >= $self->{rotate_bytes}) {
+			$epoch = $max + 1;
+		} else {
+			$self->{epoch_max} = $max;
+			return $self->import_init($git, $packed_bytes);
+		}
+	}
+	$self->{epoch_max} = $epoch;
+	$latest = $self->git_init($epoch);
+	$self->import_init(PublicInbox::Git->new($latest), 0);
+}
+
+sub import_init {
+	my ($self, $git, $packed_bytes, $tmp) = @_;
+	my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
+	$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
+	$im->{want_object_info} = 1;
+	$im->{lock_path} = undef;
+	$im->{path_type} = 'v2';
+	$self->{im} = $im unless $tmp;
+	$im;
+}
+
+# XXX experimental
+sub diff ($$$) {
+	my ($mid, $cur, $new) = @_;
+	use File::Temp qw(tempfile);
+
+	my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
+	print $ah $cur->as_string or die "print: $!";
+	close $ah or die "close: $!";
+	my ($bh, $bn) = tempfile('email-new-XXXXXXXX', TMPDIR => 1);
+	PublicInbox::Import::drop_unwanted_headers($new);
+	print $bh $new->as_string or die "print: $!";
+	close $bh or die "close: $!";
+	my $cmd = [ qw(diff -u), $an, $bn ];
+	print STDERR "# MID conflict <$mid>\n";
+	my $pid = spawn($cmd, undef, { 1 => 2 });
+	defined $pid or die "diff failed to spawn $!";
+	waitpid($pid, 0) == $pid or die "diff did not finish";
+	unlink($an, $bn);
+}
+
+sub get_blob ($$) {
+	my ($self, $smsg) = @_;
+	if (my $im = $self->{im}) {
+		my $msg = $im->cat_blob($smsg->{blob});
+		return $msg if $msg;
+	}
+	# older message, should be in alternates
+	my $ibx = $self->{-inbox};
+	$ibx->msg_by_smsg($smsg);
+}
+
+sub lookup_content {
+	my ($self, $mime, $mid) = @_;
+	my $over = $self->{over};
+	my $cids = content_ids($mime);
+	my ($id, $prev);
+	while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+		my $msg = get_blob($self, $smsg);
+		if (!defined($msg)) {
+			warn "broken smsg for $mid\n";
+			next;
+		}
+		my $cur = PublicInbox::MIME->new($msg);
+		if (content_matches($cids, $cur)) {
+			$smsg->{mime} = $cur;
+			return $smsg;
+		}
+
+
+		# XXX DEBUG_DIFF is experimental and may be removed
+		diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
+	}
+	undef;
+}
+
+sub atfork_child {
+	my ($self) = @_;
+	my $fh = delete $self->{reindex_pipe};
+	close $fh if $fh;
+	if (my $parts = $self->{idx_parts}) {
+		$_->atfork_child foreach @$parts;
+	}
+	if (my $im = $self->{im}) {
+		$im->atfork_child;
+	}
+	die "unexpected mm" if $self->{mm};
+	close $self->{bnote}->[0] or die "close bnote[0]: $!\n";
+	$self->{bnote}->[1];
+}
+
+sub mark_deleted {
+	my ($self, $D, $git, $oid) = @_;
+	my $msgref = $git->cat_file($oid);
+	my $mime = PublicInbox::MIME->new($$msgref);
+	my $mids = mids($mime->header_obj);
+	my $cid = content_id($mime);
+	foreach my $mid (@$mids) {
+		$D->{"$mid\0$cid"} = 1;
+	}
+}
+
+sub reindex_oid {
+	my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_;
+	my $len;
+	my $msgref = $git->cat_file($oid, \$len);
+	my $mime = PublicInbox::MIME->new($$msgref);
+	my $mids = mids($mime->header_obj);
+	my $cid = content_id($mime);
+
+	# get the NNTP article number we used before, highest number wins
+	# and gets deleted from mm_tmp;
+	my $mid0;
+	my $num = -1;
+	my $del = 0;
+	foreach my $mid (@$mids) {
+		$del += (delete $D->{"$mid\0$cid"} || 0);
+		my $n = $mm_tmp->num_for($mid);
+		if (defined $n && $n > $num) {
+			$mid0 = $mid;
+			$num = $n;
+		}
+	}
+	if (!defined($mid0) && $regen && !$del) {
+		$num = $$regen--;
+		die "BUG: ran out of article numbers\n" if $num <= 0;
+		my $mm = $self->{mm};
+		foreach my $mid (reverse @$mids) {
+			if ($mm->mid_set($num, $mid) == 1) {
+				$mid0 = $mid;
+				last;
+			}
+		}
+		if (!defined($mid0)) {
+			my $id = '<' . join('> <', @$mids) . '>';
+			warn "Message-ID $id unusable for $num\n";
+			foreach my $mid (@$mids) {
+				defined(my $n = $mm->num_for($mid)) or next;
+				warn "#$n previously mapped for <$mid>\n";
+			}
+		}
+	}
+
+	if (!defined($mid0) || $del) {
+		if (!defined($mid0) && $del) { # expected for deletes
+			$$regen--;
+			return
+		}
+
+		my $id = '<' . join('> <', @$mids) . '>';
+		defined($mid0) or
+			warn "Skipping $id, no article number found\n";
+		if ($del && defined($mid0)) {
+			warn "$id was deleted $del " .
+				"time(s) but mapped to article #$num\n";
+		}
+		return;
+
+	}
+	$mm_tmp->mid_delete($mid0) or
+		die "failed to delete <$mid0> for article #$num\n";
+
+	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+	my $nparts = $self->{partitions};
+	my $part = $num % $nparts;
+	my $idx = $self->idx_part($part);
+	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+	my $n = $self->{transact_bytes} += $len;
+	if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+		$git->cleanup;
+		$mm_tmp->atfork_prepare;
+		$self->done; # release lock
+		# allow -watch or -mda to write...
+		$self->idx_init; # reacquire lock
+		$mm_tmp->atfork_parent;
+	}
+}
+
+# only update last_commit for $i on reindex iff newer than current
+sub update_last_commit {
+	my ($self, $git, $i, $cmt) = @_;
+	my $last = last_commit_part($self, $i);
+	if (defined $last && is_ancestor($git, $last, $cmt)) {
+		my @cmd = (qw(rev-list --count), "$last..$cmt");
+		chomp(my $n = $git->qx(@cmd));
+		return if $n ne '' && $n == 0;
+	}
+	last_commit_part($self, $i, $cmt);
+}
+
+sub git_dir_n ($$) { "$_[0]->{-inbox}->{mainrepo}/git/$_[1].git" }
+
+sub last_commits {
+	my ($self, $epoch_max) = @_;
+	my $heads = [];
+	for (my $i = $epoch_max; $i >= 0; $i--) {
+		$heads->[$i] = last_commit_part($self, $i);
+	}
+	$heads;
+}
+
+*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
+
+sub index_prepare {
+	my ($self, $opts, $epoch_max, $ranges) = @_;
+	my $regen_max = 0;
+	my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
+	for (my $i = $epoch_max; $i >= 0; $i--) {
+		die "already indexing!\n" if $self->{index_pipe};
+		my $git_dir = git_dir_n($self, $i);
+		-d $git_dir or next; # missing parts are fine
+		my $git = PublicInbox::Git->new($git_dir);
+		chomp(my $tip = $git->qx('rev-parse', $head));
+		my $range;
+		if (defined(my $cur = $ranges->[$i])) {
+			$range = "$cur..$tip";
+			if (is_ancestor($git, $cur, $tip)) { # common case
+				my $n = $git->qx(qw(rev-list --count), $range);
+				chomp($n);
+				if ($n == 0) {
+					$ranges->[$i] = undef;
+					next;
+				}
+			} else {
+				warn <<"";
+discontiguous range: $range
+Rewritten history? (in $git_dir)
+
+				my $base = $git->qx('merge-base', $tip, $cur);
+				chomp $base;
+				if ($base) {
+					$range = "$base..$tip";
+					warn "found merge-base: $base\n"
+				} else {
+					$range = $tip;
+					warn <<"";
+discarding history at $cur
+
+				}
+				warn <<"";
+reindexing $git_dir starting at
+$range
+
+				$self->{"unindex-range.$i"} = "$base..$cur";
+			}
+		} else {
+			$range = $tip; # all of it
+		}
+		$ranges->[$i] = $range;
+
+		# can't use 'rev-list --count' if we use --diff-filter
+		my $fh = $git->popen(qw(log --pretty=tformat:%H
+				--no-notes --no-color --no-renames
+				--diff-filter=AM), $range, '--', 'm');
+		++$regen_max while <$fh>;
+	}
+	\$regen_max;
+}
+
+sub unindex_oid_remote {
+	my ($self, $oid, $mid) = @_;
+	$_->remote_remove($oid, $mid) foreach @{$self->{idx_parts}};
+	$self->{over}->remove_oid($oid, $mid);
+}
+
+sub unindex_oid {
+	my ($self, $git, $oid) = @_;
+	my $msgref = $git->cat_file($oid);
+	my $mime = PublicInbox::MIME->new($msgref);
+	my $mids = mids($mime->header_obj);
+	$mime = $msgref = undef;
+	my $over = $self->{over};
+	foreach my $mid (@$mids) {
+		my %gone;
+		my ($id, $prev);
+		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+			$gone{$smsg->{num}} = 1 if $oid eq $smsg->{blob};
+			1; # continue
+		}
+		my $n = scalar keys %gone;
+		next unless $n;
+		if ($n > 1) {
+			warn "BUG: multiple articles linked to $oid\n",
+				join(',',sort keys %gone), "\n";
+		}
+		$self->{unindexed}->{$_}++ foreach keys %gone;
+		$self->unindex_oid_remote($oid, $mid);
+	}
+}
+
+my $x40 = qr/[a-f0-9]{40}/;
+sub unindex {
+	my ($self, $opts, $git, $unindex_range) = @_;
+	my $un = $self->{unindexed} ||= {}; # num => removal count
+	my $before = scalar keys %$un;
+	my @cmd = qw(log --raw -r
+			--no-notes --no-color --no-abbrev --no-renames);
+	my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $unindex_range);
+	while (<$fh>) {
+		/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o or next;
+		$self->unindex_oid($git, $1);
+	}
+	delete $self->{reindex_pipe};
+	$fh = undef;
+
+	return unless $opts->{prune};
+	my $after = scalar keys %$un;
+	return if $before == $after;
+
+	# ensure any blob can not longer be accessed via dumb HTTP
+	PublicInbox::Import::run_die(['git', "--git-dir=$git->{git_dir}",
+		qw(-c gc.reflogExpire=now gc --prune=all)]);
+}
+
+sub index_sync {
+	my ($self, $opts) = @_;
+	$opts ||= {};
+	my $epoch_max;
+	my $latest = git_dir_latest($self, \$epoch_max);
+	return unless defined $latest;
+	$self->idx_init; # acquire lock
+	my $mm_tmp = $self->{mm}->tmp_clone;
+	my $ranges = $opts->{reindex} ? [] : $self->last_commits($epoch_max);
+
+	my ($min, $max) = $mm_tmp->minmax;
+	my $regen = $self->index_prepare($opts, $epoch_max, $ranges);
+	$$regen += $max if $max;
+	my $D = {};
+	my @cmd = qw(log --raw -r --pretty=tformat:%H
+			--no-notes --no-color --no-abbrev --no-renames);
+
+	# work backwards through history
+	my $last_commit = [];
+	for (my $i = $epoch_max; $i >= 0; $i--) {
+		my $git_dir = git_dir_n($self, $i);
+		die "already reindexing!\n" if delete $self->{reindex_pipe};
+		-d $git_dir or next; # missing parts are fine
+		my $git = PublicInbox::Git->new($git_dir);
+		my $unindex = delete $self->{"unindex-range.$i"};
+		$self->unindex($opts, $git, $unindex) if $unindex;
+		defined(my $range = $ranges->[$i]) or next;
+		my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
+		my $cmt;
+		while (<$fh>) {
+			if (/\A$x40$/o && !defined($cmt)) {
+				chomp($cmt = $_);
+			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+				$self->reindex_oid($mm_tmp, $D, $git, $1,
+						$regen);
+			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
+				$self->mark_deleted($D, $git, $1);
+			}
+		}
+		$fh = undef;
+		delete $self->{reindex_pipe};
+		$self->update_last_commit($git, $i, $cmt) if defined $cmt;
+	}
+	my @d = sort keys %$D;
+	if (@d) {
+		warn "BUG: ", scalar(@d)," unseen deleted messages marked\n";
+		foreach (@d) {
+			my ($mid, undef) = split(/\0/, $_, 2);
+			warn "<$mid>\n";
+		}
+	}
+	$self->done;
+}
+
+1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 590a76a0..af287b96 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -6,33 +6,41 @@
 package PublicInbox::View;
 use strict;
 use warnings;
-use Date::Parse qw/str2time/;
+use PublicInbox::MsgTime qw(msg_datestamp);
 use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
 use PublicInbox::Linkify;
-use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
+use PublicInbox::MID qw/id_compress mid_escape mids references/;
 use PublicInbox::MsgIter;
 use PublicInbox::Address;
 use PublicInbox::WwwStream;
 use PublicInbox::Reply;
 require POSIX;
+use Time::Local qw(timegm);
 
 use constant INDENT => '  ';
 use constant TCHILD => '` ';
 sub th_pfx ($) { $_[0] == 0 ? '' : TCHILD };
 
 # public functions: (unstable)
+
 sub msg_html {
-	my ($ctx, $mime) = @_;
+	my ($ctx, $mime, $more, $smsg) = @_;
 	my $hdr = $mime->header_obj;
 	my $ibx = $ctx->{-inbox};
-	my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
-	my $tip = _msg_html_prepare($hdr, $ctx, $obfs_ibx);
+	my $obfs_ibx = $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
+	my $tip = _msg_html_prepare($hdr, $ctx, $more, 0);
+	my $end = 2;
 	PublicInbox::WwwStream->response($ctx, 200, sub {
 		my ($nr, undef) = @_;
 		if ($nr == 1) {
-			$tip . multipart_text_as_html($mime, '', $obfs_ibx) .
+			# $more cannot be true w/o $smsg being defined:
+			my $upfx = $more ? '../'.mid_escape($smsg->mid).'/' : '';
+			$tip . multipart_text_as_html($mime, $upfx, $obfs_ibx) .
 				'

' - } elsif ($nr == 2) { + } elsif ($more && @$more) { + ++$end; + msg_html_more($ctx, $more, $nr); + } elsif ($nr == $end) { # fake an EOF if generating the footer fails; # we want to at least show the message if something # here crashes: @@ -46,6 +54,53 @@ sub msg_html { }); } +sub msg_page { + my ($ctx) = @_; + my $mid = $ctx->{mid}; + my $ibx = $ctx->{-inbox}; + my ($first, $more); + my $smsg; + if (my $srch = $ibx->search) { + my ($id, $prev); + $smsg = $srch->next_by_mid($mid, \$id, \$prev); + $first = $ibx->msg_by_smsg($smsg) if $smsg; + if ($first) { + my $next = $srch->next_by_mid($mid, \$id, \$prev); + $more = [ $id, $prev, $next ] if $next; + } + return unless $first; + } else { + $first = $ibx->msg_by_mid($mid) or return; + } + msg_html($ctx, PublicInbox::MIME->new($first), $more, $smsg); +} + +sub msg_html_more { + my ($ctx, $more, $nr) = @_; + my $str = eval { + my ($id, $prev, $smsg) = @$more; + my $mid = $ctx->{mid}; + $smsg = $ctx->{-inbox}->smsg_mime($smsg); + my $next = $ctx->{srch}->next_by_mid($mid, \$id, \$prev); + @$more = $next ? ($id, $prev, $next) : (); + if ($smsg) { + my $mime = $smsg->{mime}; + my $upfx = '../' . mid_escape($smsg->mid) . '/'; + _msg_html_prepare($mime->header_obj, $ctx, $more, $nr) . + multipart_text_as_html($mime, $upfx, + $ctx->{-obfs_ibx}) . + '

' + } else { + ''; + } + }; + if ($@) { + warn "Error lookup up additional messages: $@\n"; + $str = '
Error looking up additional messages
'; + } + $str; +} + # /$INBOX/$MESSAGE_ID/#R sub msg_reply { my ($ctx, $hdr) = @_; @@ -105,14 +160,8 @@ EOF sub in_reply_to { my ($hdr) = @_; - my %mid = map { $_ => 1 } $hdr->header_raw('Message-ID'); - my @refs = (($hdr->header_raw('References') || '') =~ /<([^>]+)>/g); - push(@refs, (($hdr->header_raw('In-Reply-To') || '') =~ /<([^>]+)>/g)); - while (defined(my $irt = pop @refs)) { - next if $mid{"<$irt>"}; - return $irt; - } - undef; + my $refs = references($hdr); + $refs->[-1]; } sub _hdr_names_html ($$) { @@ -129,12 +178,10 @@ sub nr_to_s ($$$) { # this is already inside a
 sub index_entry {
-	my ($mime, $ctx, $more) = @_;
+	my ($smsg, $ctx, $more) = @_;
 	my $srch = $ctx->{srch};
-	my $hdr = $mime->header_obj;
-	my $subj = $hdr->header('Subject');
-
-	my $mid_raw = mid_clean(mid_mime($mime));
+	my $subj = $smsg->subject;
+	my $mid_raw = $smsg->mid;
 	my $id = id_compress($mid_raw, 1);
 	my $id_m = 'm'.$id;
 
@@ -149,6 +196,8 @@ sub index_entry {
 	$rv .= $subj . "\n";
 	$rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx);
 	my @tocc;
+	my $mime = $smsg->{mime};
+	my $hdr = $mime->header_obj;
 	foreach my $f (qw(To Cc)) {
 		my $dst = _hdr_names_html($hdr, $f);
 		if ($dst ne '') {
@@ -158,7 +207,7 @@ sub index_entry {
 	}
 	my $from = _hdr_names_html($hdr, 'From');
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
-	$rv .= "From: $from @ "._msg_date($hdr)." UTC";
+	$rv .= "From: $from @ ".fmt_ts($smsg->ds)." UTC";
 	my $upfx = $ctx->{-upfx};
 	my $mhref = $upfx . mid_escape($mid_raw) . '/';
 	$rv .= qq{ (permalink / };
@@ -301,30 +350,30 @@ sub pre_thread  {
 }
 
 sub thread_index_entry {
-	my ($ctx, $level, $mime) = @_;
+	my ($ctx, $level, $smsg) = @_;
 	my ($beg, $end) = thread_adj_level($ctx, $level);
-	$beg . '
' . index_entry($mime, $ctx, 0) . '
' . $end; + $beg . '
' . index_entry($smsg, $ctx, 0) . '
' . $end; } sub stream_thread ($$) { my ($rootset, $ctx) = @_; my $inbox = $ctx->{-inbox}; - my $mime; my @q = map { (0, $_) } @$rootset; my $level; + my $smsg; while (@q) { $level = shift @q; my $node = shift @q or next; my $cl = $level + 1; unshift @q, map { ($cl, $_) } @{$node->{children}}; - $mime = $inbox->msg_by_smsg($node->{smsg}) and last; + $smsg = $inbox->smsg_mime($node->{smsg}) and last; } - return missing_thread($ctx) unless $mime; + return missing_thread($ctx) unless $smsg; $ctx->{-obfs_ibx} = $inbox->{obfuscate} ? $inbox : undef; - $mime = PublicInbox::MIME->new($mime); - $ctx->{-title_html} = ascii_html($mime->header('Subject')); - $ctx->{-html_tip} = thread_index_entry($ctx, $level, $mime); + $ctx->{-title_html} = ascii_html($smsg->subject); + $ctx->{-html_tip} = thread_index_entry($ctx, $level, $smsg); + $smsg = undef; PublicInbox::WwwStream->response($ctx, 200, sub { return unless $ctx; while (@q) { @@ -332,10 +381,8 @@ sub stream_thread ($$) { my $node = shift @q or next; my $cl = $level + 1; unshift @q, map { ($cl, $_) } @{$node->{children}}; - my $mid = $node->{id}; - if ($mime = $inbox->msg_by_smsg($node->{smsg})) { - $mime = PublicInbox::MIME->new($mime); - return thread_index_entry($ctx, $level, $mime); + if ($smsg = $inbox->smsg_mime($node->{smsg})) { + return thread_index_entry($ctx, $level, $smsg); } else { return ghost_index_entry($ctx, $level, $node); } @@ -351,9 +398,7 @@ sub thread_html { my ($ctx) = @_; my $mid = $ctx->{mid}; my $srch = $ctx->{srch}; - my $sres = $srch->get_thread($mid); - my $msgs = load_results($srch, $sres); - my $nr = $sres->{total}; + my ($nr, $msgs) = $srch->get_thread($mid); return missing_thread($ctx) if $nr == 0; my $skel = '
';
 	$skel .= $nr == 1 ? 'only message in thread' : 'end of thread';
@@ -372,7 +417,7 @@ sub thread_html {
 	$ctx->{mapping} = {};
 	$ctx->{s_nr} = "$nr+ messages in thread";
 
-	my $rootset = thread_results($msgs, $srch);
+	my $rootset = thread_results($ctx, $msgs);
 
 	# reduce hash lookups in pre_thread->skel_dump
 	my $inbox = $ctx->{-inbox};
@@ -383,24 +428,21 @@ sub thread_html {
 	return stream_thread($rootset, $ctx) unless $ctx->{flat};
 
 	# flat display: lazy load the full message from smsg
-	my $mime;
-	while ($mime = shift @$msgs) {
-		$mime = $inbox->msg_by_smsg($mime) and last;
+	my $smsg;
+	while (my $m = shift @$msgs) {
+		$smsg = $inbox->smsg_mime($m) and last;
 	}
-	return missing_thread($ctx) unless $mime;
-	$mime = PublicInbox::MIME->new($mime);
-	$ctx->{-title_html} = ascii_html($mime->header('Subject'));
-	$ctx->{-html_tip} = '
'.index_entry($mime, $ctx, scalar @$msgs);
-	$mime = undef;
+	return missing_thread($ctx) unless $smsg;
+	$ctx->{-title_html} = ascii_html($smsg->subject);
+	$ctx->{-html_tip} = '
'.index_entry($smsg, $ctx, scalar @$msgs);
+	$smsg = undef;
 	PublicInbox::WwwStream->response($ctx, 200, sub {
 		return unless $msgs;
-		while ($mime = shift @$msgs) {
-			$mime = $inbox->msg_by_smsg($mime) and last;
-		}
-		if ($mime) {
-			$mime = PublicInbox::MIME->new($mime);
-			return index_entry($mime, $ctx, scalar @$msgs);
+		$smsg = undef;
+		while (my $m = shift @$msgs) {
+			$smsg = $inbox->smsg_mime($m) and last;
 		}
+		return index_entry($smsg, $ctx, scalar @$msgs) if $smsg;
 		$msgs = undef;
 		$skel;
 	});
@@ -529,17 +571,26 @@ sub add_text_body {
 }
 
 sub _msg_html_prepare {
-	my ($hdr, $ctx, $obfs_ibx) = @_;
+	my ($hdr, $ctx, $more, $nr) = @_;
 	my $srch = $ctx->{srch} if $ctx;
 	my $atom = '';
-	my $rv = ""; # anchor for body start
-
+	my $obfs_ibx = $ctx->{-obfs_ibx};
+	my $rv = '';
+	my $mids = mids($hdr);
+	my $multiple = scalar(@$mids) > 1; # zero, one, infinity
+	if ($nr == 0) {
+		if ($more) {
+			$rv .=
+"
WARNING: multiple messages refer to this Message-ID\n
"; + } + $rv .= ""; # anchor for body start + } else { + $rv .= '
';
+	}
 	if ($srch) {
 		$ctx->{-upfx} = '../';
 	}
 	my @title;
-	my $mid = mid_clean($hdr->header_raw('Message-ID'));
-	$mid = PublicInbox::Hval->new_msgid($mid);
 	foreach my $h (qw(From To Cc Subject Date)) {
 		my $v = $hdr->header($h);
 		defined($v) && ($v ne '') or next;
@@ -564,8 +615,20 @@ sub _msg_html_prepare {
 	}
 	$title[0] ||= '(no subject)';
 	$ctx->{-title_html} = join(' - ', @title);
-	$rv .= 'Message-ID: <' . $mid->as_html . '> ';
-	$rv .= "(raw)\n";
+	foreach (@$mids) {
+		my $mid = PublicInbox::Hval->new_msgid($_) ;
+		my $mhtml = $mid->as_html;
+		if ($multiple) {
+			my $href = $mid->{href};
+			$rv .= "Message-ID: ";
+			$rv .= "";
+			$rv .= "<$mhtml> ";
+			$rv .= "(raw)\n";
+		} else {
+			$rv .= "Message-ID: <$mhtml> ";
+			$rv .= "(raw)\n";
+		}
+	}
 	$rv .= _parent_headers($hdr, $srch);
 	$rv .= "\n";
 }
@@ -573,9 +636,8 @@ sub _msg_html_prepare {
 sub thread_skel {
 	my ($dst, $ctx, $hdr, $tpfx) = @_;
 	my $srch = $ctx->{srch};
-	my $mid = mid_clean($hdr->header_raw('Message-ID'));
-	my $sres = $srch->get_thread($mid);
-	my $nr = $sres->{total};
+	my $mid = mids($hdr)->[0];
+	my ($nr, $msgs) = $srch->get_thread($mid);
 	my $expand = qq(expand[flat) .
 	                qq(|nested]  ) .
 			qq(mbox.gz  ) .
@@ -605,12 +667,11 @@ sub thread_skel {
 	$ctx->{prev_attr} = '';
 	$ctx->{prev_level} = 0;
 	$ctx->{dst} = $dst;
-	$sres = load_results($srch, $sres);
 
 	# reduce hash lookups in skel_dump
 	my $ibx = $ctx->{-inbox};
 	$ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
-	walk_thread(thread_results($sres, $srch), $ctx, *skel_dump);
+	walk_thread(thread_results($ctx, $msgs), $ctx, *skel_dump);
 
 	$ctx->{parent_msg} = $parent;
 }
@@ -726,22 +787,10 @@ sub indent_for {
 	$level ? INDENT x ($level - 1) : '';
 }
 
-sub load_results {
-	my ($srch, $sres) = @_;
-	my $msgs = delete $sres->{msgs};
-	$srch->retry_reopen(sub { [ map { $_->mid; $_ } @$msgs ] });
-}
-
-sub msg_timestamp {
-	my ($hdr) = @_;
-	my $ts = eval { str2time($hdr->header('Date')) };
-	defined($ts) ? $ts : 0;
-}
-
 sub thread_results {
-	my ($msgs, $srch) = @_;
+	my ($ctx, $msgs) = @_;
 	require PublicInbox::SearchThread;
-	PublicInbox::SearchThread::thread($msgs, *sort_ts, $srch);
+	PublicInbox::SearchThread::thread($msgs, *sort_ds, $ctx->{-inbox});
 }
 
 sub missing_thread {
@@ -752,8 +801,7 @@ sub missing_thread {
 
 sub _msg_date {
 	my ($hdr) = @_;
-	my $ts = $hdr->header('X-PI-TS') || msg_timestamp($hdr);
-	fmt_ts($ts);
+	fmt_ts(msg_datestamp($hdr));
 }
 
 sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
@@ -789,7 +837,7 @@ sub skel_dump {
 	my $obfs_ibx = $ctx->{-obfs_ibx};
 	obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx;
 
-	my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level);
+	my $d = fmt_ts($smsg->{ds}) . ' ' . indent_for($level) . th_pfx($level);
 	my $attr = $f;
 	$ctx->{first_level} ||= $level;
 
@@ -870,10 +918,10 @@ sub _skel_ghost {
 	$$dst .= $d;
 }
 
-sub sort_ts {
+sub sort_ds {
 	[ sort {
-		(eval { $a->topmost->{smsg}->ts } || 0) <=>
-		(eval { $b->topmost->{smsg}->ts } || 0)
+		(eval { $a->topmost->{smsg}->ds } || 0) <=>
+		(eval { $b->topmost->{smsg}->ds } || 0)
 	} @{$_[0]} ];
 }
 
@@ -883,22 +931,22 @@ sub acc_topic {
 	my ($ctx, $level, $node) = @_;
 	my $srch = $ctx->{srch};
 	my $mid = $node->{id};
-	my $x = $node->{smsg} || $srch->lookup_mail($mid);
-	my ($subj, $ts);
+	my $x = $node->{smsg} || $ctx->{-inbox}->smsg_by_mid($mid);
+	my ($subj, $ds);
 	my $topic;
 	if ($x) {
 		$subj = $x->subject;
 		$subj = $srch->subject_normalized($subj);
-		$ts = $x->ts;
+		$ds = $x->ds;
 		if ($level == 0) {
-			$topic = [ $ts, 1, { $subj => $mid }, $subj ];
+			$topic = [ $ds, 1, { $subj => $mid }, $subj ];
 			$ctx->{-cur_topic} = $topic;
 			push @{$ctx->{order}}, $topic;
 			return;
 		}
 
 		$topic = $ctx->{-cur_topic}; # should never be undef
-		$topic->[0] = $ts if $ts > $topic->[0];
+		$topic->[0] = $ds if $ds > $topic->[0];
 		$topic->[1]++;
 		my $seen = $topic->[2];
 		if (scalar(@$topic) == 3) { # parent was a ghost
@@ -917,7 +965,7 @@ sub acc_topic {
 
 sub dump_topics {
 	my ($ctx) = @_;
-	my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ]
+	my $order = delete $ctx->{order}; # [ ds, subj1, subj2, subj3, ... ]
 	if (!@$order) {
 		$ctx->{-html_tip} = '
[No topics in range]
'; return 404; @@ -930,14 +978,14 @@ sub dump_topics { # sort by recency, this allows new posts to "bump" old topics... foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { - my ($ts, $n, $seen, $top, @ex) = @$topic; + my ($ds, $n, $seen, $top, @ex) = @$topic; @$topic = (); next unless defined $top; # ghost topic my $mid = delete $seen->{$top}; my $href = mid_escape($mid); my $prev_subj = [ split(/ /, $top) ]; $top = PublicInbox::Hval->new($top)->as_html; - $ts = fmt_ts($ts); + $ds = fmt_ts($ds); # $n isn't the total number of posts on the topic, # just the number of posts in the current results window @@ -953,7 +1001,7 @@ sub dump_topics { my $mbox = qq(mbox.gz); my $atom = qq(Atom); my $s = "$top\n" . - " $ts UTC $n - $mbox / $atom\n"; + " $ds UTC $n - $mbox / $atom\n"; for (my $i = 0; $i < scalar(@ex); $i += 2) { my $level = $ex[$i]; my $subj = $ex[$i + 1]; @@ -974,45 +1022,81 @@ sub dump_topics { 200; } +sub ts2str ($) { + my ($ts) = @_; + POSIX::strftime('%Y%m%d%H%M%S', gmtime($ts)); +} + +sub str2ts ($) { + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $_[0]); + timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); +} + +sub pagination_footer ($$) { + my ($ctx, $latest) = @_; + delete $ctx->{qp} or return; + my $next = $ctx->{next_page} || ''; + my $prev = $ctx->{prev_page} || ''; + if ($prev) { + $next = $next ? "$next " : ' '; + $prev .= qq! latest!; + } + "
page: $next$prev
"; +} + sub index_nav { # callback for WwwStream my (undef, $ctx) = @_; - delete $ctx->{qp} or return; - my ($next, $prev); - $next = $prev = ' '; - my $latest = ''; + pagination_footer($ctx, '.') +} + +sub paginate_recent ($$) { + my ($ctx, $lim) = @_; + my $t = $ctx->{qp}->{t} || ''; + my $opts = { limit => $lim }; + my ($after, $before); + + # Xapian uses '..' but '-' is perhaps friendier to URL linkifiers + # if only $after exists "YYYYMMDD.." because "." could be skipped + # if interpreted as an end-of-sentence + $t =~ s/\A(\d{8,14})-// and $after = str2ts($1); + $t =~ /\A(\d{8,14})\z/ and $before = str2ts($1); - my $next_o = $ctx->{-next_o}; - if ($next_o) { - $next = qq!next!; + my $ibx = $ctx->{-inbox}; + my $msgs = $ibx->recent($opts, $after, $before); + my $nr = scalar @$msgs; + if ($nr < $lim && defined($after)) { + $after = $before = undef; + $msgs = $ibx->recent($opts); + $nr = scalar @$msgs; } - if (my $cur_o = $ctx->{-cur_o}) { - $latest = qq! latest!; - - my $o = $cur_o - ($next_o - $cur_o); - if ($o > 0) { - $prev = qq!prev!; - } elsif ($o == 0) { - $prev = qq!prev!; + my $more = $nr == $lim; + my ($newest, $oldest); + if ($nr) { + $newest = $msgs->[0]->{ts}; + $oldest = $msgs->[-1]->{ts}; + # if we only had $after, our SQL query in ->recent ordered + if ($newest < $oldest) { + ($oldest, $newest) = ($newest, $oldest); + $more = 0 if defined($after) && $after < $oldest; } } - "
page: $next $prev$latest
"; + if (defined($oldest) && $more) { + my $s = ts2str($oldest); + $ctx->{next_page} = qq!next!; + } + if (defined($newest) && (defined($before) || defined($after))) { + my $s = ts2str($newest); + $ctx->{prev_page} = qq!prev!; + } + $msgs; } sub index_topics { my ($ctx) = @_; - my ($off) = (($ctx->{qp}->{o} || '0') =~ /(\d+)/); - my $opts = { offset => $off, limit => 200 }; - - $ctx->{order} = []; - my $srch = $ctx->{srch}; - my $sres = $srch->query('', $opts); - my $nr = scalar @{$sres->{msgs}}; - if ($nr) { - $sres = load_results($srch, $sres); - walk_thread(thread_results($sres, $srch), $ctx, *acc_topic); + my $msgs = paginate_recent($ctx, 200); # 200 is our window + if (@$msgs) { + walk_thread(thread_results($ctx, $msgs), $ctx, *acc_topic); } - $ctx->{-next_o} = $off+ $nr; - $ctx->{-cur_o} = $off; PublicInbox::WwwStream->response($ctx, dump_topics($ctx), *index_nav); } diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 4ddc187b..24e24f1e 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -54,10 +54,10 @@ sub call { my $method = $env->{REQUEST_METHOD}; if ($method eq 'POST') { - if ($path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) { - my $path = $2; + if ($path_info =~ m!$INBOX_RE/(?:(\d+)/)?(git-upload-pack)\z!) { + my ($part, $path) = ($2, $3); return invalid_inbox($ctx, $1) || - serve_git($ctx, $path); + serve_git($ctx, $part, $path); } elsif ($path_info =~ m!$INBOX_RE/!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } @@ -77,10 +77,10 @@ sub call { invalid_inbox($ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { invalid_inbox($ctx, $1) || get_new($ctx); - } elsif ($path_info =~ m!$INBOX_RE/ + } elsif ($path_info =~ m!$INBOX_RE/(?:(\d+)/)? ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { - my $path = $2; - invalid_inbox($ctx, $1) || serve_git($ctx, $path); + my ($part, $path) = ($2, $3); + invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); } elsif ($path_info =~ m!$INBOX_RE/([\w-]+).mbox\.gz\z!o) { serve_mbox_range($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$END_RE\z!o) { @@ -150,10 +150,8 @@ sub invalid_inbox ($$) { my $www = $ctx->{www}; my $obj = $www->{pi_config}->lookup_name($inbox); if (defined $obj) { - $ctx->{git_dir} = $obj->{mainrepo}; $ctx->{git} = $obj->git; $ctx->{-inbox} = $obj; - $ctx->{inbox} = $inbox; return; } @@ -171,14 +169,15 @@ sub invalid_inbox_mid { return $ret if $ret; $ctx->{mid} = $mid; - if ($mid =~ /\A[a-f0-9]{40}\z/) { - # this is horiffically wasteful for legacy URLs: - if ($mid = mid2blob($ctx)) { - require Email::Simple; - use PublicInbox::MID qw/mid_clean/; - my $s = Email::Simple->new($mid); - $ctx->{mid} = mid_clean($s->header('Message-ID')); - } + my $ibx = $ctx->{-inbox}; + if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) { + my ($x2, $x38) = ($1, $2); + # this is horrifically wasteful for legacy URLs: + my $str = $ctx->{-inbox}->msg_by_path("$x2/$x38") or return; + require Email::Simple; + my $s = Email::Simple->new($str); + $mid = PublicInbox::MID::mid_clean($s->header('Message-ID')); + return r301($ctx, $inbox, $mid); } undef; } @@ -210,30 +209,19 @@ sub get_index { } } -# just returns a string ref for the blob in the current ctx -sub mid2blob { - my ($ctx) = @_; - $ctx->{-inbox}->msg_by_mid($ctx->{mid}); -} - # /$INBOX/$MESSAGE_ID/raw -> raw mbox sub get_mid_txt { my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); require PublicInbox::Mbox; - PublicInbox::Mbox::emit1($ctx, $x); + PublicInbox::Mbox::emit_raw($ctx) || r404($ctx); } # /$INBOX/$MESSAGE_ID/ -> HTML content (short quotes) sub get_mid_html { my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); - require PublicInbox::View; - require PublicInbox::MIME; - my $mime = PublicInbox::MIME->new($x); searcher($ctx); - PublicInbox::View::msg_html($ctx, $mime); + PublicInbox::View::msg_page($ctx) || r404($ctx); } # /$INBOX/$MESSAGE_ID/t/ @@ -400,8 +388,11 @@ sub msg_page { } sub serve_git { - my ($ctx, $path) = @_; - PublicInbox::GitHTTPBackend::serve($ctx->{env}, $ctx->{git}, $path); + my ($ctx, $part, $path) = @_; + my $env = $ctx->{env}; + my $ibx = $ctx->{-inbox}; + my $git = defined $part ? $ibx->git_part($part) : $ibx->git; + $git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404(); } sub mbox_results { diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index a3fab428..7ee29da5 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -7,13 +7,14 @@ package PublicInbox::WatchMaildir; use strict; use warnings; use PublicInbox::MIME; -use Email::MIME::ContentType; -$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect use PublicInbox::Git; use PublicInbox::Import; use PublicInbox::MDA; use PublicInbox::Spawn qw(spawn); +use PublicInbox::InboxWritable; use File::Temp qw//; +use PublicInbox::Filter::Base; +*REJECT = *PublicInbox::Filter::Base::REJECT; sub new { my ($class, $config) = @_; @@ -52,6 +53,10 @@ sub new { $spamcheck = undef; } } + + # need to make all inboxes writable for spam removal: + $config->each_inbox(sub { PublicInbox::InboxWritable->new($_[0]) }); + foreach $k (keys %$config) { $k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next; my $name = $1; @@ -93,18 +98,6 @@ sub _done_for_now { my ($self) = @_; my $importers = $self->{importers}; foreach my $im (values %$importers) { - $im->done if $im->{nchg}; - } - - my $opendirs = $self->{opendirs}; - - # spamdir scanning means every importer remains open - my $spamdir = $self->{spamdir}; - return if defined($spamdir) && $opendirs->{$spamdir}; - - foreach my $im (values %$importers) { - # not done if we're scanning - next if $opendirs->{$im->{git}->{git_dir}}; $im->done; } } @@ -127,15 +120,14 @@ sub _remove_spam { # path must be marked as (S)een $path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return; my $mime = _path_to_mime($path) or return; - _force_mid($mime); $self->{config}->each_inbox(sub { my ($ibx) = @_; eval { my $im = _importer_for($self, $ibx); $im->remove($mime, 'spam'); - if (my $scrub = _scrubber_for($ibx)) { + if (my $scrub = $ibx->filter) { my $scrubbed = $scrub->scrub($mime) or return; - $scrubbed == 100 and return; + $scrubbed == REJECT() and return; $im->remove($scrubbed, 'spam'); } }; @@ -146,36 +138,9 @@ sub _remove_spam { }) } -# used to hash the relevant portions of a message when there are conflicts -sub _hash_mime2 { - my ($mime) = @_; - require Digest::SHA; - my $dig = Digest::SHA->new('SHA-1'); - $dig->add($mime->header_obj->header_raw('Subject')); - $dig->add($mime->body_raw); - $dig->hexdigest; -} - -sub _force_mid { - my ($mime) = @_; - # probably a bad idea, but we inject a Message-Id if - # one is missing, here.. - my $mid = $mime->header_obj->header_raw('Message-Id'); - if (!defined $mid || $mid =~ /\A\s*\z/) { - $mid = '<' . _hash_mime2($mime) . '@generated>'; - $mime->header_set('Message-Id', $mid); - } -} - sub _try_path { my ($self, $path) = @_; - my @p = split(m!/+!, $path); - return if $p[-1] !~ /\A[a-zA-Z0-9][\w:,=\.]+\z/; - if ($p[-1] =~ /:2,([A-Z]+)\z/i) { - my $flags = $1; - return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail - } - return unless -f $path; + return unless PublicInbox::InboxWritable::is_maildir_path($path); if ($path !~ $self->{mdre}) { warn "unrecognized path: $path\n"; return; @@ -190,19 +155,17 @@ sub _try_path { } my $im = _importer_for($self, $inbox); my $mime = _path_to_mime($path) or return; - $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS; my $wm = $inbox->{-watchheader}; if ($wm) { my $v = $mime->header_obj->header_raw($wm->[0]); return unless ($v && $v =~ $wm->[1]); } - if (my $scrub = _scrubber_for($inbox)) { + if (my $scrub = $inbox->filter) { my $ret = $scrub->scrub($mime) or return; - $ret == 100 and return; + $ret == REJECT() and return; $mime = $ret; } - _force_mid($mime); $im->add($mime, $self->{spamcheck}); } @@ -290,41 +253,15 @@ sub _path_to_mime { } sub _importer_for { - my ($self, $inbox) = @_; - my $im = $inbox->{-import} ||= eval { - my $git = $inbox->git; - my $name = $inbox->{name}; - my $addr = $inbox->{-primary_address}; - PublicInbox::Import->new($git, $name, $addr, $inbox); - }; - + my ($self, $ibx) = @_; my $importers = $self->{importers}; + my $im = $importers->{"$ibx"} ||= $ibx->importer(0); if (scalar(keys(%$importers)) > 2) { - delete $importers->{"$im"}; + delete $importers->{"$ibx"}; _done_for_now($self); } - $importers->{"$im"} = $im; -} - -sub _scrubber_for { - my ($inbox) = @_; - my $f = $inbox->{filter}; - if ($f && $f =~ /::/) { - my @args = (-inbox => $inbox); - # basic line splitting, only - # Perhaps we can have proper quote splitting one day... - ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; - - eval "require $f"; - if ($@) { - warn $@; - } else { - # e.g: PublicInbox::Filter::Vger->new(@args) - return $f->new(@args); - } - } - undef; + $importers->{"$ibx"} = $im; } sub _spamcheck_cb { diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm index b69de856..38eba2a0 100644 --- a/lib/PublicInbox/WwwAtomStream.pm +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -7,11 +7,11 @@ use strict; use warnings; use POSIX qw(strftime); -use Date::Parse qw(str2time); use Digest::SHA qw(sha1_hex); use PublicInbox::Address; use PublicInbox::Hval qw(ascii_html); use PublicInbox::MID qw/mid_clean mid_escape/; +use PublicInbox::MsgTime qw(msg_timestamp); # called by PSGI server after getline: sub close {} @@ -33,8 +33,8 @@ sub response { sub getline { my ($self) = @_; if (my $middle = $self->{cb}) { - my $mime = $middle->(); - return feed_entry($self, $mime) if $mime; + my $smsg = $middle->(); + return feed_entry($self, $smsg) if $smsg; } delete $self->{cb} ? '' : undef; } @@ -92,10 +92,11 @@ sub mid2uuid ($) { # returns undef or string sub feed_entry { - my ($self, $mime) = @_; + my ($self, $smsg) = @_; my $ctx = $self->{ctx}; + my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; - my $mid = mid_clean($hdr->header_raw('Message-ID')); + my $mid = $smsg->mid; my $irt = PublicInbox::View::in_reply_to($hdr); my $uuid = mid2uuid($mid); my $base = $ctx->{feed_base_url}; @@ -108,8 +109,7 @@ sub feed_entry { $irt = ''; } my $href = $base . mid_escape($mid) . '/'; - my $date = $hdr->header('Date'); - my $t = eval { str2time($date) } if defined $date; + my $t = msg_timestamp($hdr); my @t = gmtime(defined $t ? $t : time); my $updated = feed_updated(@t); diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm index 98cf9f70..b1504f52 100644 --- a/lib/PublicInbox/WwwAttach.pm +++ b/lib/PublicInbox/WwwAttach.pm @@ -5,9 +5,8 @@ package PublicInbox::WwwAttach; # internal package use strict; use warnings; -use PublicInbox::MIME; use Email::MIME::ContentType qw(parse_content_type); -$Email::MIME::ContentType::STRICT_PARAMS = 0; +use PublicInbox::MIME; use PublicInbox::MsgIter; # /$LISTNAME/$MESSAGE_ID/$IDX-$FILENAME diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 05519984..ec75f16c 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -72,22 +72,52 @@ sub _html_end { my $obj = $ctx->{-inbox}; my $desc = ascii_html($obj->description); + my (%seen, @urls); my $http = $obj->base_url($ctx->{env}); - chop $http; - my %seen = ( $http => 1 ); - my @urls = ($http); + chop $http; # no trailing slash for clone + my $part = $obj->max_git_part; + my $dir = (split(m!/!, $http))[-1]; + if (defined($part)) { # v2 + $seen{$http} = 1; + for my $i (0..$part) { + # old parts my be deleted: + -d "$obj->{mainrepo}/git/$i.git" or next; + my $url = "$http/$i"; + $seen{$url} = 1; + push @urls, "$url $dir/git/$i.git"; + } + } else { # v1 + $seen{$http} = 1; + push @urls, $http; + } + + # FIXME: partitioning in can be different in other repositories, + # use the "cloneurl" file as-is for now: foreach my $u (@{$obj->cloneurl}) { next if $seen{$u}; $seen{$u} = 1; push @urls, $u =~ /\Ahttps?:/ ? qq($u) : $u; } + if (scalar(@urls) == 1) { - $urls .= " git clone --mirror $http"; + $urls .= " git clone --mirror $urls[0]"; } else { $urls .= "\n" . join("\n", map { "\tgit clone --mirror $_" } @urls); } + if (defined $part) { + my $addrs = $obj->{address}; + $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY'; + $urls .= <{name} $dir/ $http \\ + $addrs + public-inbox-index $dir +EOF + } my @nntp = map { qq($_) } @{$obj->nntp_url}; if (@nntp) { $urls .= "\n\n"; diff --git a/script/public-inbox-compact b/script/public-inbox-compact new file mode 100755 index 00000000..5f18497e --- /dev/null +++ b/script/public-inbox-compact @@ -0,0 +1,97 @@ +#!/usr/bin/perl -w +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +use PublicInbox::Search; +use PublicInbox::Config; +use PublicInbox::InboxWritable; +use Cwd 'abs_path'; +use File::Temp qw(tempdir); +use File::Path qw(remove_tree); +use PublicInbox::Spawn qw(spawn); +my $usage = "Usage: public-inbox-compact REPO_DIR\n"; +my $dir = shift or die $usage; +my $config = PublicInbox::Config->new; +my $ibx; +$dir = abs_path($dir); +$config->each_inbox(sub { + $ibx = $_[0] if abs_path($_[0]->{mainrepo}) eq $dir +}); +unless ($ibx) { + warn "W: $dir not configured in ". + PublicInbox::Config::default_file() . "\n"; + $ibx = { + mainrepo => $dir, + name => 'ignored', + address => [ 'old@example.com' ], + }; + $ibx = PublicInbox::Inbox->new($ibx); +} +my $v = ($ibx->{version} || 1); +$ibx = PublicInbox::InboxWritable->new($ibx); +$ibx->umask_prepare; + +sub commit_changes ($$$) { + my ($im, $old, $new) = @_; + my @st = stat($old) or die "failed to stat($old): $!\n"; + + my $over = "$old/over.sqlite3"; + if (-f $over) { + require PublicInbox::Over; + $over = PublicInbox::Over->new($over); + $over->connect->sqlite_backup_to_file("$new/over.sqlite3"); + } + rename($old, "$new/old") or die "rename $old => $new/old: $!\n"; + chmod($st[2] & 07777, $new) or die "chmod $old: $!\n"; + rename($new, $old) or die "rename $new => $old: $!\n"; + $im->lock_release; + remove_tree("$old/old") or die "failed to remove $old/old: $!\n"; +} +my @compact = qw(xapian-compact --no-renumber); +if ($v == 2) { + require PublicInbox::V2Writable; + my $v2w = PublicInbox::V2Writable->new($ibx); + my $xap_v = 'xap'.PublicInbox::Search::SCHEMA_VERSION; + my $old = "$dir/$xap_v"; + opendir my $dh, $old or die "Failed to opendir $old: $!\n"; + my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $dir); + $ibx->with_umask(sub { + $v2w->lock_acquire; + my %pids; + while (defined(my $dn = readdir($dh))) { + if ($dn =~ /\A\d+\z/) { + my $cmd = [ @compact, "$old/$dn", "$new/$dn" ]; + $pids{spawn($cmd)} = join(' ', @$cmd); + } elsif ($dn eq '.' || $dn eq '..') { + } elsif ($dn =~ /\Aover\.sqlite3/) { + } else { + warn "W: skipping unknown Xapian DB: $old/$dn\n" + } + } + close $dh; + die "No Xapian parts found in $old\n" unless keys %pids; + while (scalar keys %pids) { + my $pid = waitpid(-1, 0); + my $desc = delete $pids{$pid}; + die "$desc failed: $?\n" if $?; + } + commit_changes($v2w, $old, $new); + }); +} elsif ($v == 1) { + require PublicInbox::Import; + my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx); + my $xap_v = 'xapian'.PublicInbox::Search::SCHEMA_VERSION; + my $v1_root = "$dir/public-inbox"; + my $old = "$v1_root/$xap_v"; + -d $old or die "$old does not exist\n"; + my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $v1_root); + $ibx->with_umask(sub { + $im->lock_acquire; + PublicInbox::Import::run_die([@compact, $old, $new]); + commit_changes($im, $old, $new); + }); +} else { + die "Unsupported inbox version: $v\n"; +} diff --git a/script/public-inbox-convert b/script/public-inbox-convert new file mode 100755 index 00000000..2742be79 --- /dev/null +++ b/script/public-inbox-convert @@ -0,0 +1,139 @@ +#!/usr/bin/perl -w +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +use PublicInbox::MIME; +use PublicInbox::InboxWritable; +use PublicInbox::Config; +use PublicInbox::V2Writable; +use PublicInbox::Import; +use PublicInbox::Spawn qw(spawn); +use Cwd 'abs_path'; +my $usage = "Usage: public-inbox-convert OLD NEW\n"; +my $jobs; +my $index = 1; +my %opts = ( + '--jobs|j=i' => \$jobs, + '--index!' => \$index, +); +GetOptions(%opts) or die "bad command-line args\n$usage"; +GetOptions(%opts) or die "bad command-line args\n$usage"; +my $old_dir = shift or die $usage; +my $new_dir = shift or die $usage; +die "$new_dir exists\n" if -d $new_dir; +die "$old_dir not a directory\n" unless -d $old_dir; +my $config = PublicInbox::Config->new; +$old_dir = abs_path($old_dir); +my $old; +$config->each_inbox(sub { + $old = $_[0] if abs_path($_[0]->{mainrepo}) eq $old_dir; +}); +unless ($old) { + warn "W: $old_dir not configured in " . + PublicInbox::Config::default_file() . "\n"; + $old = { + mainrepo => $old_dir, + name => 'ignored', + address => [ 'old@example.com' ], + }; + $old = PublicInbox::Inbox->new($old); +} +$old = PublicInbox::InboxWritable->new($old); +if (($old->{version} || 1) >= 2) { + die "Only conversion from v1 inboxes is supported\n"; +} +my $new = { %$old }; +$new->{mainrepo} = abs_path($new_dir); +$new->{version} = 2; +$new = PublicInbox::InboxWritable->new($new); +my $v2w; +$old->umask_prepare; +$old->with_umask(sub { + local $ENV{GIT_CONFIG} = "$old->{mainrepo}/config"; + $v2w = PublicInbox::V2Writable->new($new, 1); + $v2w->init_inbox($jobs); + chomp(my $sr = $old->git->qx('config', 'core.sharedRepository')); + if ($sr ne '') { + PublicInbox::Import::run_die(['git', 'config', + "--file=$new->{mainrepo}/all.git/config", + 'core.sharedRepository', $sr]); + } + if (my $alt = $new->{altid}) { + require PublicInbox::AltId; + foreach my $i (0..$#$alt) { + my $src = PublicInbox::AltId->new($old, $alt->[$i], 0); + $src->mm_alt or next; + my $dst = PublicInbox::AltId->new($new, $alt->[$i], 1); + $dst = $dst->{filename}; + $src->mm_alt->{dbh}->sqlite_backup_to_file($dst); + } + } +}); +my $state = ''; +my ($prev, $from); +my $head = $old->{ref_head} || 'HEAD'; +my ($rd, $pid) = $old->git->popen(qw(fast-export --use-done-feature), $head); +$v2w->idx_init; +my $im = $v2w->importer; +my ($r, $w) = $im->gfi_start; +my $h = '[0-9a-f]'; +my %D; +my $last; +while (<$rd>) { + if ($_ eq "blob\n") { + $state = 'blob'; + } elsif (/^commit /) { + $state = 'commit'; + } elsif (/^data (\d+)/) { + my $len = $1; + $w->print($_) or $im->wfail; + while ($len) { + my $n = read($rd, my $tmp, $len) or die "read: $!"; + warn "$n != $len\n" if $n != $len; + $len -= $n; + $w->print($tmp) or $im->wfail; + } + next; + } elsif ($state eq 'commit') { + if (m{^M 100644 :(\d+) (${h}{2}/${h}{38})}o) { + my ($mark, $path) = ($1, $2); + $D{$path} = $mark; + if ($last && $last ne 'm') { + $w->print("D $last\n") or $im->wfail; + } + $w->print("M 100644 :$mark m\n") or $im->wfail; + $last = 'm'; + next; + } + if (m{^D (${h}{2}/${h}{38})}o) { + my $mark = delete $D{$1}; + defined $mark or die "undeleted path: $1\n"; + if ($last && $last ne 'd') { + $w->print("D $last\n") or $im->wfail; + } + $w->print("M 100644 :$mark d\n") or $im->wfail; + $last = 'd'; + next; + } + if (m{^from (:\d+)}) { + $prev = $from; + $from = $1; + # no next + } + } + last if $_ eq "done\n"; + $w->print($_) or $im->wfail; +} +$w = $r = undef; +close $rd or die "close fast-export: $!\n"; +waitpid($pid, 0) or die "waitpid failed: $!\n"; +$? == 0 or die "fast-export failed: $?\n"; +my $mm = $old->mm; +$mm->{dbh}->sqlite_backup_to_file("$new_dir/msgmap.sqlite3") if $mm; +$v2w->done; +if ($index) { + $v2w->index_sync; + $v2w->done; +} diff --git a/script/public-inbox-index b/script/public-inbox-index index 594a3d9c..db7ebbab 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -4,13 +4,13 @@ # Basic tool to create a Xapian search index for a git repository # configured for public-inbox. # Usage with libeatmydata -# highly recommended: eatmydata public-inbox-index GIT_DIR +# highly recommended: eatmydata public-inbox-index REPO_DIR use strict; use warnings; use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); use Cwd 'abs_path'; -my $usage = "public-inbox-index GIT_DIR"; +my $usage = "public-inbox-index REPO_DIR"; use PublicInbox::Config; my $config = eval { PublicInbox::Config->new } || eval { warn "public-inbox unconfigured for serving, indexing anyways...\n"; @@ -23,13 +23,25 @@ if ($@) { } my $reindex; -my %opts = ( '--reindex' => \$reindex ); +my $prune; +my $jobs = undef; +my %opts = ( + '--reindex' => \$reindex, + '--jobs|j=i' => \$jobs, + '--prune' => \$prune, +); GetOptions(%opts) or die "bad command-line args\n$usage"; +die "--jobs must be positive\n" if defined $jobs && $jobs < 0; my @dirs; -sub resolve_git_dir { +sub resolve_repo_dir { my ($cd) = @_; + my $prefix = defined $cd ? $cd : './'; + if (-d $prefix && -f "$prefix/inbox.lock") { # v2 + return abs_path($prefix); + } + my @cmd = qw(git rev-parse --git-dir); my $cmd = join(' ', @cmd); my $pid = open my $fh, '-|'; @@ -53,9 +65,9 @@ sub resolve_git_dir { } if (@ARGV) { - @dirs = map { resolve_git_dir($_) } @ARGV; + @dirs = map { resolve_repo_dir($_) } @ARGV; } else { - @dirs = (resolve_git_dir()); + @dirs = (resolve_repo_dir()); } sub usage { print STDERR "Usage: $usage\n"; exit 1 } @@ -73,14 +85,39 @@ foreach my $k (keys %$config) { } foreach my $dir (@dirs) { + if (!ref($dir) && -f "$dir/inbox.lock") { # v2 + my $ibx = { mainrepo => $dir, name => 'unnamed' }; + $dir = PublicInbox::Inbox->new($ibx); + } index_dir($dir); } sub index_dir { - my ($git_dir) = @_; - if (!ref $git_dir && ! -d $git_dir) { - die "$git_dir does not appear to be a git repository\n"; + my ($repo) = @_; + if (!ref $repo && ! -d $repo) { + die "$repo does not appear to be an inbox repository\n"; + } + if (ref($repo) && ($repo->{version} || 1) == 2) { + eval { require PublicInbox::V2Writable }; + die "v2 requirements not met: $@\n" if $@; + my $v2w = eval { + local $ENV{NPROC} = $jobs if $jobs; + PublicInbox::V2Writable->new($repo); + }; + if (defined $jobs) { + if ($jobs == 0) { + $v2w->{parallel} = 0; + } else { + my $n = $v2w->{partitions}; + if ($jobs != $n) { + warn +"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n"; + } + } + } + $v2w->index_sync({ reindex => $reindex, prune => $prune }); + } else { + my $s = PublicInbox::SearchIdx->new($repo, 1); + $s->index_sync({ reindex => $reindex }); } - my $s = PublicInbox::SearchIdx->new($git_dir, 1); - $s->index_sync({ reindex => $reindex }); } diff --git a/script/public-inbox-init b/script/public-inbox-init index 2f33c9ef..3ef6c3bd 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -5,7 +5,8 @@ # Initializes a public-inbox, basically a wrapper for git-init(1) use strict; use warnings; -my $usage = "public-inbox-init NAME GIT_DIR HTTP_URL ADDRESS [ADDRESS..]"; +my $usage = "public-inbox-init NAME REPO_DIR HTTP_URL ADDRESS [ADDRESS..]"; +use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use PublicInbox::Config; use File::Temp qw/tempfile/; use File::Basename qw/dirname/; @@ -14,9 +15,11 @@ use Cwd qw/abs_path/; sub x { system(@_) and die join(' ', @_). " failed: $?\n" } sub usage { print STDERR "Usage: $usage\n"; exit 1 } - +my $version = undef; +my %opts = ( 'V|version=i' => \$version ); +GetOptions(%opts) or usage(); my $name = shift @ARGV or usage(); -my $git_dir = shift @ARGV or usage(); +my $mainrepo = shift @ARGV or usage(); my $http_url = shift @ARGV or usage(); my (@address) = @ARGV; @address or usage(); @@ -25,7 +28,7 @@ my %seen; my $pi_config = PublicInbox::Config->default_file; my $dir = dirname($pi_config); mkpath($dir); # will croak on fatal errors -my ($fh, $filename) = tempfile('pi-init-XXXXXXXX', DIR => $dir); +my ($fh, $pi_config_tmp) = tempfile('pi-init-XXXXXXXX', DIR => $dir); if (-e $pi_config) { open(my $oh, '<', $pi_config) or die "unable to read $pi_config: $!\n"; my @st = stat($oh); @@ -62,22 +65,55 @@ if (-e $pi_config) { exit(1) if $conflict; } -close $fh or die "failed to close $filename: $!\n"; +close $fh or die "failed to close $pi_config_tmp: $!\n"; my $pfx = "publicinbox.$name"; -my @x = (qw/git config/, "--file=$filename"); -$git_dir = abs_path($git_dir); -x(qw(git init -q --bare), $git_dir); +my @x = (qw/git config/, "--file=$pi_config_tmp"); + +$mainrepo = abs_path($mainrepo); +if (-f "$mainrepo/inbox.lock") { + if (!defined $version) { + $version = 2; + } elsif ($version != 2) { + die "$mainrepo is a -V2 repo, -V$version specified\n" + } +} elsif (-d "$mainrepo/objects") { + if (!defined $version) { + $version = 1; + } elsif ($version != 1) { + die "$mainrepo is a -V1 repo, -V$version specified\n" + } +} + +$version = 1 unless defined $version; -# set a reasonable default: -x(qw/git config/, "--file=$git_dir/config", 'repack.writeBitmaps', 'true'); +if ($version >= 2) { + require PublicInbox::V2Writable; + require PublicInbox::Inbox; + my $ibx = { + mainrepo => $mainrepo, + name => $name, + version => $version, + -primary_address => $address[0], + }; + $ibx = PublicInbox::Inbox->new($ibx); + PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0); +} elsif ($version == 1) { + x(qw(git init -q --bare), $mainrepo); + + # set a reasonable default: + x(qw/git config/, "--file=$mainrepo/config", + 'repack.writeBitmaps', 'true'); +} else { + die "Unsupported -V/--version: $version\n"; +} foreach my $addr (@address) { next if $seen{lc($addr)}; x(@x, "--add", "$pfx.address", $addr); } x(@x, "$pfx.url", $http_url); -x(@x, "$pfx.mainrepo", $git_dir); +x(@x, "$pfx.mainrepo", $mainrepo); -rename $filename, $pi_config or - die "failed to rename `$filename' to `$pi_config': $!\n"; +rename $pi_config_tmp, $pi_config or + die "failed to rename `$pi_config_tmp' to `$pi_config': $!\n"; diff --git a/script/public-inbox-learn b/script/public-inbox-learn index bdc72e08..c51f9585 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -11,8 +11,6 @@ use PublicInbox::Config; use PublicInbox::Git; use PublicInbox::Import; use PublicInbox::MIME; -use Email::MIME::ContentType; -$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect use PublicInbox::Address; use PublicInbox::Spamcheck::Spamc; my $train = shift or die "usage: $usage\n"; diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 8cf44195..766d58a6 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -15,9 +15,7 @@ sub do_exit { } use Email::Simple; -use Email::MIME; -use Email::MIME::ContentType; -$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::MIME; use PublicInbox::MDA; use PublicInbox::Config; use PublicInbox::Import; @@ -80,8 +78,18 @@ if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message } # else { accept PublicInbox::MDA->set_list_headers($mime, $dst); -my $git = PublicInbox::Git->new($main_repo); -my $im = PublicInbox::Import->new($git, $dst->{name}, $recipient); +my $v = $dst->{version} || 1; +my $im; +if ($v == 2) { + require PublicInbox::V2Writable; + $im = PublicInbox::V2Writable->new($dst); + $im->{parallel} = 0; # pointless to be parallel for a single message +} elsif ($v == 1) { + my $git = $dst->git; + $im = PublicInbox::Import->new($git, $dst->{name}, $recipient, $dst); +} else { + die "Unsupported inbox version: $v\n"; +} if (defined $im->add($mime)) { $emm = $emm->abort; } else { diff --git a/scripts/dupe-finder b/scripts/dupe-finder new file mode 100644 index 00000000..14022379 --- /dev/null +++ b/scripts/dupe-finder @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +# +# ad-hoc tool for finding duplicates, unstable! +use strict; +use warnings; +use PublicInbox::Inbox; +use PublicInbox::Over; +use PublicInbox::Search; +use PublicInbox::Config; +my $repo = shift; +my $ibx; +if (index($repo, '@') > 0) { + $ibx = PublicInbox::Config->new->lookup($repo); +} elsif (-d $repo) { + $ibx = { mainrepo => $repo, address => 'unnamed@example.com' }; + $ibx = PublicInbox::Inbox->new($ibx); +} else { + $ibx = PublicInbox::Config->new->lookup_name($repo); +} +$ibx or die "No inbox"; +$ibx->search or die "search not available for inbox"; +my $dbh = $ibx->search->{over_ro}->connect; +my $over = PublicInbox::Over->new($dbh->sqlite_db_filename); + +sub emit ($) { + my ($nums) = @_; + foreach my $n (@$nums) { + my $smsg = $over->get_art($n) or next; + print STDERR "$n $smsg->{blob} $smsg->{mid}\n"; + my $msg = $ibx->msg_by_smsg($smsg) or next; + print "From $smsg->{blob}\@$n Thu Jan 1 00:00:00 1970\n"; + $$msg =~ s/^(>*From )/>$1/gm; + print $$msg, "\n"; + } +} + +my $sth = $dbh->prepare(<<''); +SELECT id,num FROM id2num WHERE num > 0 ORDER BY id + +$sth->execute; +my $prev_id = -1; +my ($id, $num, @nums); +while (1) { + ($id, $num) = $sth->fetchrow_array; + defined $id or last; + if ($prev_id != $id) { + emit(\@nums) if scalar(@nums) > 1; + @nums = (); + } + $prev_id = $id; + push @nums, $num; +} diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool index 5158460b..7b6c9ab0 100755 --- a/scripts/import_slrnspool +++ b/scripts/import_slrnspool @@ -11,7 +11,7 @@ use strict; use warnings; use PublicInbox::Config; -use Email::MIME; +use PublicInbox::MIME; use PublicInbox::Import; use PublicInbox::Git; sub usage { "Usage:\n".join('',grep(/\t/, `head -n 10 $0`)) } @@ -58,7 +58,7 @@ for (; $exit == 0 && $n < $max; $n++) { open(my $fh, '<', $fn) or next; $max = $n + $max_gap; - my $mime = Email::MIME->new(eval { local $/; <$fh> }); + my $mime = PublicInbox::MIME->new(eval { local $/; <$fh> }); my $hdr = $mime->header_obj; # gmane rewrites Received headers, which increases spamminess diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox index 44055ffd..ca5a408d 100644 --- a/scripts/import_vger_from_mbox +++ b/scripts/import_vger_from_mbox @@ -3,45 +3,43 @@ # License: AGPL-3.0+ use strict; use warnings; -use Email::MIME; -$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect -use PublicInbox::Git; +use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; +use PublicInbox::MIME; +use PublicInbox::InboxWritable; use PublicInbox::Import; -my $usage = "usage: $0 NAME EMAIL new($git_dir); +use PublicInbox::MDA; +my $usage = "usage: $0 NAME EMAIL DIR \$dry_run, + 'V|version=i' => \$version, + 'F|format=s' => \$variant, +); +GetOptions(%opts) or die $usage; +if ($variant ne 'mboxrd' && $variant ne 'mboxo') { + die "Unsupported mbox variant: $variant\n"; +} my $name = shift or die $usage; # git my $email = shift or die $usage; # git@vger.kernel.org -my $im = PublicInbox::Import->new($git, $name, $email); -binmode STDIN; -my $msg = ''; -use PublicInbox::Filter::Vger; -my $vger = PublicInbox::Filter::Vger->new; -sub do_add ($$) { - my ($im, $msg) = @_; - $$msg =~ s/(\r?\n)+\z/$1/s; - $msg = Email::MIME->new($$msg); - $msg = $vger->scrub($msg); - $im->add($msg) or - warn "duplicate: ", - $msg->header_obj->header_raw('Message-ID'), "\n"; -} - -# asctime: From example@example.com Fri Jun 23 02:56:55 2000 -my $from_strict = qr/^From \S+ \S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; -my $prev = undef; -while (defined(my $l = )) { - if ($l =~ /$from_strict/o) { - if (!defined($prev) || $prev =~ /^\r?$/) { - do_add($im, \$msg) if $msg; - $msg = ''; - $prev = $l; - next; - } - warn "W[$.] $l\n"; +my $mainrepo = shift or die $usage; # /path/to/v2/repo +my $ibx = { + mainrepo => $mainrepo, + name => $name, + version => $version, + address => [ $email ], + filter => 'PublicInbox::Filter::Vger', +}; +$ibx = PublicInbox::Inbox->new($ibx); +unless ($dry_run) { + if ($version >= 2) { + require PublicInbox::V2Writable; + PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0); + } else { + system(qw(git init --bare -q), $mainrepo) == 0 or die; } - $prev = $l; - $msg .= $l; } -do_add($im, \$msg) if $msg; -$im->done; +$ibx = PublicInbox::InboxWritable->new($ibx); +binmode STDIN; +$ibx->import_mbox(\*STDIN, $variant); diff --git a/t/address.t b/t/address.t index e35e4f8b..eced5c46 100644 --- a/t/address.t +++ b/t/address.t @@ -9,8 +9,9 @@ is_deeply([qw(e@example.com e@example.org)], [PublicInbox::Address::emails('User , e@example.org')], 'address extraction works as expected'); -is_deeply([PublicInbox::Address::emails('"ex@example.com" ')], - [qw(ex@example.com)]); +is_deeply(['user@example.com'], + [PublicInbox::Address::emails('')], + 'comment after domain accepted before >'); my @names = PublicInbox::Address::names( 'User , e@e, "John A. Doe" , '); diff --git a/t/altid.t b/t/altid.t index 7759bd6b..d4f6152e 100644 --- a/t/altid.t +++ b/t/altid.t @@ -20,7 +20,9 @@ my $altid = [ "serial:gmane:file=$alt_file" ]; { my $mm = PublicInbox::Msgmap->new_file($alt_file, 1); - $mm->mid_set(1234, 'a@example.com'); + is($mm->mid_set(1234, 'a@example.com'), 1, 'mid_set once OK'); + ok(0 == $mm->mid_set(1234, 'a@example.com'), 'mid_set not idempotent'); + ok(0 == $mm->mid_set(1, 'a@example.com'), 'mid_set fails with dup MID'); } { @@ -48,14 +50,20 @@ my $altid = [ "serial:gmane:file=$alt_file" ]; { my $ro = PublicInbox::Search->new($git_dir, $altid); - my $res = $ro->query("gmane:1234"); - is($res->{total}, 1, 'got one match'); - is($res->{msgs}->[0]->mid, 'a@example.com'); + my $msgs = $ro->query("gmane:1234"); + is_deeply([map { $_->mid } @$msgs], ['a@example.com'], 'got one match'); - $res = $ro->query("gmane:666"); - is($res->{total}, 0, 'body did NOT match'); + $msgs = $ro->query("gmane:666"); + is_deeply([], $msgs, 'body did NOT match'); }; +{ + my $mm = PublicInbox::Msgmap->new_file($alt_file, 1); + my ($min, $max) = $mm->minmax; + my $num = $mm->mid_insert('b@example.com'); + ok($num > $max, 'auto-increment goes beyond mid_set'); +} + done_testing(); 1; diff --git a/t/altid_v2.t b/t/altid_v2.t new file mode 100644 index 00000000..87f1452b --- /dev/null +++ b/t/altid_v2.t @@ -0,0 +1,55 @@ +# Copyright (C) 2016-2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for altid_v2.t" if $@; +} + +use_ok 'PublicInbox::V2Writable'; +use_ok 'PublicInbox::Inbox'; +my $tmpdir = tempdir('pi-altidv2-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $mainrepo = "$tmpdir/inbox"; +my $full = "$tmpdir/inbox/another-nntp.sqlite3"; +my $altid = [ 'serial:gmane:file=another-nntp.sqlite3' ]; + +{ + ok(mkdir($mainrepo), 'created repo for msgmap'); + my $mm = PublicInbox::Msgmap->new_file($full, 1); + is($mm->mid_set(1234, 'a@example.com'), 1, 'mid_set once OK'); + ok(0 == $mm->mid_set(1234, 'a@example.com'), 'mid_set not idempotent'); + ok(0 == $mm->mid_set(1, 'a@example.com'), 'mid_set fails with dup MID'); +} + +my $ibx = { + mainrepo => $mainrepo, + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', + altid => $altid, +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $v2w = PublicInbox::V2Writable->new($ibx, 1); +$v2w->add(Email::MIME->create( + header => [ + From => 'a@example.com', + To => 'b@example.com', + 'Content-Type' => 'text/plain', + Subject => 'boo!', + 'Message-ID' => '', + ], + body => "hello world gmane:666\n", + )); +$v2w->done; + +my $msgs = $ibx->search->reopen->query("gmane:1234"); +is_deeply([map { $_->mid } @$msgs], ['a@example.com'], 'got one match'); +$msgs = $ibx->search->query("gmane:666"); +is_deeply([], $msgs, 'body did NOT match'); + +done_testing(); + +1; diff --git a/t/content_id.t b/t/content_id.t new file mode 100644 index 00000000..01ce65e5 --- /dev/null +++ b/t/content_id.t @@ -0,0 +1,35 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::ContentId qw(content_id); +use Email::MIME; + +my $mime = Email::MIME->create( + header => [ + From => 'a@example.com', + To => 'b@example.com', + 'Content-Type' => 'text/plain', + Subject => 'this is a subject', + 'Message-ID' => '', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); + +my $orig = content_id($mime); +my $reload = content_id(Email::MIME->new($mime->as_string)); +is($orig, $reload, 'content_id matches after serialization'); + +foreach my $h (qw(From To Cc)) { + my $n = '"Quoted N\'Ame" '; + $mime->header_str_set($h, "$n"); + my $q = content_id($mime); + is($n, $mime->header($h), "content_id does not mutate $h:"); + $mime->header_str_set($h, 'Quoted N\'Ame '); + my $nq = content_id($mime); + is($nq, $q, "quotes ignored in $h:"); +} + +done_testing(); diff --git a/t/convert-compact.t b/t/convert-compact.t new file mode 100644 index 00000000..ced45415 --- /dev/null +++ b/t/convert-compact.t @@ -0,0 +1,104 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +use PublicInbox::MIME; +my @mods = qw(DBD::SQLite Search::Xapian); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for convert-compact.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +use PublicInbox::Import; +my $tmpdir = tempdir('convert-compact-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => "$tmpdir/v1", + name => 'test-v1', + -primary_address => 'test@example.com', +}; + +ok(PublicInbox::Import::run_die([qw(git init --bare -q), $ibx->{mainrepo}]), + 'initialized v1 repo'); +ok(umask(077), 'set restrictive umask'); +ok(PublicInbox::Import::run_die([qw(git) , "--git-dir=$ibx->{mainrepo}", + qw(config core.sharedRepository 0644)]), 'set sharedRepository'); +$ibx = PublicInbox::Inbox->new($ibx); +my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + 'Message-ID' => '', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); +ok($im->add($mime), 'added one message'); +ok($im->remove($mime), 'remove message'); +ok($im->add($mime), 'added message again'); +$im->done; +PublicInbox::SearchIdx->new($ibx, 1)->index_sync; + +is(((stat("$ibx->{mainrepo}/public-inbox"))[2]) & 07777, 0755, + 'sharedRepository respected for v1'); +is(((stat("$ibx->{mainrepo}/public-inbox/msgmap.sqlite3"))[2]) & 07777, 0644, + 'sharedRepository respected for v1 msgmap'); +my @xdir = glob("$ibx->{mainrepo}/public-inbox/xap*/*"); +foreach (@xdir) { + my @st = stat($_); + is($st[2] & 07777, -f _ ? 0644 : 0755, + 'sharedRepository respected on file after convert'); +} + +local $ENV{PATH} = "blib/script:$ENV{PATH}"; +open my $err, '>>', "$tmpdir/err.log" or die "open: err.log $!\n"; +open my $out, '>>', "$tmpdir/out.log" or die "open: out.log $!\n"; +my $rdr = { 1 => fileno($out), 2 => fileno($err) }; + +my $cmd = [ 'public-inbox-compact', $ibx->{mainrepo} ]; +ok(PublicInbox::Import::run_die($cmd, undef, $rdr), 'v1 compact works'); + +@xdir = glob("$ibx->{mainrepo}/public-inbox/xap*"); +is(scalar(@xdir), 1, 'got one xapian directory after compact'); +is(((stat($xdir[0]))[2]) & 07777, 0755, + 'sharedRepository respected on v1 compact'); + +$cmd = [ 'public-inbox-convert', $ibx->{mainrepo}, "$tmpdir/v2" ]; +ok(PublicInbox::Import::run_die($cmd, undef, $rdr), 'convert works'); +@xdir = glob("$tmpdir/v2/xap*/*"); +foreach (@xdir) { + my @st = stat($_); + is($st[2] & 07777, -f _ ? 0644 : 0755, + 'sharedRepository respected after convert'); +} + +$cmd = [ 'public-inbox-compact', "$tmpdir/v2" ]; +my $env = { NPROC => 2 }; +ok(PublicInbox::Import::run_die($cmd, $env, $rdr), 'v2 compact works'); +$ibx->{mainrepo} = "$tmpdir/v2"; +$ibx->{version} = 2; + +@xdir = glob("$tmpdir/v2/xap*/*"); +foreach (@xdir) { + my @st = stat($_); + is($st[2] & 07777, -f _ ? 0644 : 0755, + 'sharedRepository respected after v2 compact'); +} +is(((stat("$tmpdir/v2/msgmap.sqlite3"))[2]) & 07777, 0644, + 'sharedRepository respected for v2 msgmap'); + +@xdir = (glob("$tmpdir/v2/git/*.git/objects/*/*"), + glob("$tmpdir/v2/git/*.git/objects/pack/*")); +foreach (@xdir) { + my @st = stat($_); + is($st[2] & 07777, -f _ ? 0444 : 0755, + 'sharedRepository respected after v2 compact'); +} +my $msgs = $ibx->recent({limit => 1000}); +is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @$msgs, 1, 'only one message in history'); + +done_testing(); diff --git a/t/git.t b/t/git.t index 5efc18ab..7f96293f 100644 --- a/t/git.t +++ b/t/git.t @@ -6,6 +6,7 @@ use Test::More; use File::Temp qw/tempdir/; my $dir = tempdir('pi-git-XXXXXX', TMPDIR => 1, CLEANUP => 1); use Cwd qw/getcwd/; +use PublicInbox::Spawn qw(popen_rd); use_ok 'PublicInbox::Git'; { @@ -137,6 +138,29 @@ if (1) { is($all, join('', @ref), 'qx returned array when wanted'); my $nl = scalar @ref; ok($nl > 1, "qx returned array length of $nl"); + + $gcf->qx(qw(repack -adbq)); + ok($gcf->packed_bytes > 0, 'packed size is positive'); +} + +if ('alternates reloaded') { + my $alt = tempdir('pi-git-XXXXXX', TMPDIR => 1, CLEANUP => 1); + my @cmd = ('git', "--git-dir=$alt", qw(hash-object -w --stdin)); + is(system(qw(git init -q --bare), $alt), 0, 'create alt directory'); + open my $fh, '<', "$alt/config" or die "open failed: $!\n"; + my $rd = popen_rd(\@cmd, {}, { 0 => fileno($fh) } ); + close $fh or die "close failed: $!"; + chomp(my $remote = <$rd>); + my $gcf = PublicInbox::Git->new($dir); + is($gcf->cat_file($remote), undef, "remote file not found"); + open $fh, '>>', "$dir/objects/info/alternates" or + die "open failed: $!\n"; + print $fh "$alt/objects" or die "print failed: $!\n"; + close $fh or die "close failed: $!"; + my $found = $gcf->cat_file($remote); + open $fh, '<', "$alt/config" or die "open failed: $!\n"; + my $config = eval { local $/; <$fh> }; + is($$found, $config, 'alternates reloaded'); } done_testing(); diff --git a/t/import.t b/t/import.t index fb6238e7..eee47447 100644 --- a/t/import.t +++ b/t/import.t @@ -6,7 +6,10 @@ use Test::More; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; -use File::Temp qw/tempdir/; +use PublicInbox::Spawn qw(spawn); +use IO::File; +use Fcntl qw(:DEFAULT); +use File::Temp qw/tempdir tempfile/; my $dir = tempdir('pi-import-XXXXXX', TMPDIR => 1, CLEANUP => 1); is(system(qw(git init -q --bare), $dir), 0, 'git init successful'); @@ -20,10 +23,33 @@ my $mime = PublicInbox::MIME->create( 'Content-Type' => 'text/plain', Subject => 'this is a subject', 'Message-ID' => '', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', ], body => "hello world\n", ); + +$im->{want_object_info} = 1 if 'v2'; like($im->add($mime), qr/\A:\d+\z/, 'added one message'); + +if ('v2') { + my $info = $im->{last_object}; + like($info->[0], qr/\A[a-f0-9]{40}\z/, 'got last object_id'); + is($mime->as_string, ${$info->[2]}, 'string matches'); + is($info->[1], length(${$info->[2]}), 'length matches'); + my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); + my $in = tempfile(); + print $in $mime->as_string or die "write failed: $!"; + $in->flush or die "flush failed: $!"; + $in->seek(0, SEEK_SET); + my $out = tempfile(); + my $pid = spawn(\@cmd, {}, { 0 => fileno($in), 1 => fileno($out)}); + is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object'); + is($?, 0, 'hash-object'); + $out->seek(0, SEEK_SET); + chomp(my $hashed_obj = <$out>); + is($hashed_obj, $info->[0], "last object_id matches exp"); +} + $im->done; my @revs = $git->qx(qw(rev-list HEAD)); is(scalar @revs, 1, 'one revision created'); @@ -64,6 +90,8 @@ isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch'); $mime->header_set('Message-Id', ''); is($im->add($mime, sub { undef }), undef, 'check callback fails'); is($im->remove($mime), undef, 'message not added, so not removed'); - +is(undef, $im->checkpoint, 'checkpoint works before ->done'); $im->done; +is(undef, $im->checkpoint, 'checkpoint works after ->done'); +$im->checkpoint; done_testing(); diff --git a/t/init.t b/t/init.t index 864f1ab5..59f54813 100644 --- a/t/init.t +++ b/t/init.t @@ -7,6 +7,23 @@ use PublicInbox::Config; use File::Temp qw/tempdir/; my $tmpdir = tempdir('pi-init-XXXXXX', TMPDIR => 1, CLEANUP => 1); use constant pi_init => 'blib/script/public-inbox-init'; +use PublicInbox::Import; +use File::Basename; +open my $null, '>>', '/dev/null'; +my $rdr = { 2 => fileno($null) }; +sub quiet_fail { + my ($cmd, $msg) = @_; + # run_die doesn't take absolute paths: + my $path = $ENV{PATH}; + if (index($cmd->[0], '/') >= 0) { + my ($dir, $base) = ($cmd->[0] =~ m!\A(.+)/([^/]+)\z!); + $path = "$dir:$path"; + $cmd->[0] = $base; + } + local $ENV{PATH} = $path; + eval { PublicInbox::Import::run_die($cmd, undef, $rdr) }; + isnt($@, '', $msg); +} { local $ENV{PI_DIR} = "$tmpdir/.public-inbox/"; @@ -23,6 +40,34 @@ use constant pi_init => 'blib/script/public-inbox-init'; qw(http://example.com/clist clist@example.com)); is(system(@cmd), 0, 'public-inbox-init clist OK'); is((stat($cfgfile))[2] & 07777, 0666, "permissions preserved"); + + @cmd = (pi_init, 'clist', '-V2', "$tmpdir/clist", + qw(http://example.com/clist clist@example.com)); + quiet_fail(\@cmd, 'attempting to init V2 from V1 fails'); +} + +SKIP: { + foreach my $mod (qw(DBD::SQLite Search::Xapian::WritableDatabase)) { + eval "require $mod"; + skip "$mod missing for v2", 2 if $@; + } + local $ENV{PI_DIR} = "$tmpdir/.public-inbox/"; + my $cfgfile = "$ENV{PI_DIR}/config"; + my @cmd = (pi_init, '-V2', 'v2list', "$tmpdir/v2list", + qw(http://example.com/v2list v2list@example.com)); + is(system(@cmd), 0, 'public-inbox-init -V2 OK'); + ok(-d "$tmpdir/v2list", 'v2list directory exists'); + ok(-f "$tmpdir/v2list/msgmap.sqlite3", 'msgmap exists'); + ok(-d "$tmpdir/v2list/all.git", 'catch-all.git directory exists'); + @cmd = (pi_init, 'v2list', "$tmpdir/v2list", + qw(http://example.com/v2list v2list@example.com)); + is(system(@cmd), 0, 'public-inbox-init is idempotent'); + ok(! -d "$tmpdir/public-inbox" && !-d "$tmpdir/objects", + 'idempotent invocation w/o -V2 does not make inbox v1'); + + @cmd = (pi_init, 'v2list', "-V1", "$tmpdir/v2list", + qw(http://example.com/v2list v2list@example.com)); + quiet_fail(\@cmd, 'initializing V2 as V1 fails'); } done_testing(); diff --git a/t/mid.t b/t/mid.t index 0bf33318..223be798 100644 --- a/t/mid.t +++ b/t/mid.t @@ -1,11 +1,31 @@ # Copyright (C) 2016-2018 all contributors # License: AGPL-3.0+ use Test::More; -use PublicInbox::MID qw(mid_escape); +use PublicInbox::MID qw(mid_escape mids references); is(mid_escape('foo!@(bar)'), 'foo!@(bar)'); is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)'); is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)'); +{ + use Email::MIME; + my $mime = Email::MIME->create; + $mime->header_set('Message-Id', ''); + is_deeply(['mid-1@a'], mids($mime->header_obj), 'mids in common case'); + $mime->header_set('Message-Id', '', ''); + is_deeply(['mid-1@a', 'mid-2@b'], mids($mime->header_obj), '2 mids'); + $mime->header_set('Message-Id', '', ''); + is_deeply(['mid-1@a'], mids($mime->header_obj), 'dup mids'); + $mime->header_set('Message-Id', ' comment'); + is_deeply(['mid-1@a'], mids($mime->header_obj), 'comment ignored'); + $mime->header_set('Message-Id', 'bare-mid'); + is_deeply(['bare-mid'], mids($mime->header_obj), 'bare mid OK'); + + $mime->header_set('References', ' '); + $mime->header_set('In-Reply-To', ''); + is_deeply(['hello', 'world', 'weld'], references($mime->header_obj), + 'references combines with In-Reply-To'); +} + done_testing(); 1; diff --git a/t/msgmap.t b/t/msgmap.t index bc22137d..dce98f46 100644 --- a/t/msgmap.t +++ b/t/msgmap.t @@ -65,4 +65,8 @@ my $orig = $d->mid_insert('spam@1'); $d->mid_delete('spam@1'); is($d->mid_insert('spam@2'), 1 + $orig, "last number not recycled"); +my $tmp = $d->tmp_clone; +is_deeply([$d->minmax], [$tmp->minmax], 'Cloned temporary DB matches'); +ok($tmp->mid_delete('spam@2'), 'temporary DB is writable'); + done_testing(); diff --git a/t/nntp.t b/t/nntp.t index 03c7f083..57fef48b 100644 --- a/t/nntp.t +++ b/t/nntp.t @@ -109,7 +109,9 @@ use_ok 'PublicInbox::Inbox'; is($ng->base_url, $u, 'URL expanded'); my $mid = 'a@b'; my $mime = Email::MIME->new("Message-ID: <$mid>\r\n\r\n"); - PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 1, $mid); + my $hdr = $mime->header_obj; + my $mock_self = { nntpd => { grouplist => [] } }; + PublicInbox::NNTP::set_nntp_headers($mock_self, $hdr, $ng, 1, $mid); is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ], 'Message-ID unchanged'); is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ], @@ -124,7 +126,7 @@ use_ok 'PublicInbox::Inbox'; 'Xref: set'); $ng->{-base_url} = 'http://mirror.example.com/m/'; - PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 2, $mid); + PublicInbox::NNTP::set_nntp_headers($mock_self, $hdr, $ng, 2, $mid); is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ], 'Message-ID unchanged'); is_deeply([ $mime->header('Archived-At') ], diff --git a/t/nntpd.t b/t/nntpd.t index 20191cb6..3698f98b 100644 --- a/t/nntpd.t +++ b/t/nntpd.t @@ -21,14 +21,18 @@ my $tmpdir = tempdir('pi-nntpd-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $home = "$tmpdir/pi-home"; my $err = "$tmpdir/stderr.log"; my $out = "$tmpdir/stdout.log"; -my $maindir = "$tmpdir/main.git"; +my $mainrepo = "$tmpdir/main.git"; my $group = 'test-nntpd'; my $addr = $group . '@example.com'; my $nntpd = 'blib/script/public-inbox-nntpd'; my $init = 'blib/script/public-inbox-init'; use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Inbox'; use_ok 'PublicInbox::Git'; +use_ok 'PublicInbox::V2Writable'; +# XXX FIXME: make it easier to test both versions +my $version = int($ENV{PI_VERSION} || 1); my %opts = ( LocalAddr => '127.0.0.1', ReuseAddr => 1, @@ -40,14 +44,34 @@ my $sock = IO::Socket::INET->new(%opts); my $pid; my $len; END { kill 'TERM', $pid if defined $pid }; + +my $ibx = { + mainrepo => $mainrepo, + name => $group, + version => $version, + -primary_address => $addr, +}; +$ibx = PublicInbox::Inbox->new($ibx); { local $ENV{HOME} = $home; - system($init, $group, $maindir, 'http://example.com/', $addr); + my @cmd = ($init, $group, $mainrepo, 'http://example.com/', $addr); + push @cmd, "-V$version"; + is(system(@cmd), 0, 'init OK'); is(system(qw(git config), "--file=$home/.public-inbox/config", "publicinbox.$group.newsgroup", $group), 0, 'enabled newsgroup'); my $len; + my $im; + if ($version == 2) { + $im = PublicInbox::V2Writable->new($ibx); + } elsif ($version == 1) { + my $git = PublicInbox::Git->new($mainrepo); + $im = PublicInbox::Import->new($git, 'test', $addr); + } else { + die "unsupported version: $version"; + } + # ensure successful message delivery { my $mime = Email::MIME->new(<header_set('List-Id', "<$list_id>"); $len = length($mime->as_string); - my $git = PublicInbox::Git->new($maindir); - my $im = PublicInbox::Import->new($git, 'test', $addr); $im->add($mime); $im->done; - my $s = PublicInbox::SearchIdx->new($maindir, 1); - $s->index_sync; + if ($version == 1) { + my $s = PublicInbox::SearchIdx->new($mainrepo, 1); + $s->index_sync; + } } ok($sock, 'sock created'); @@ -99,6 +123,7 @@ EOF my $list = $n->list; is_deeply($list, { $group => [ qw(1 1 n) ] }, 'LIST works'); is_deeply([$n->group($group)], [ qw(0 1 1), $group ], 'GROUP works'); + is_deeply($n->listgroup($group), [1], 'listgroup OK'); %opts = ( PeerAddr => $host_port, diff --git a/t/over.t b/t/over.t new file mode 100644 index 00000000..c0d9d5e5 --- /dev/null +++ b/t/over.t @@ -0,0 +1,63 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +use Compress::Zlib qw(compress); +foreach my $mod (qw(DBD::SQLite)) { + eval "require $mod"; + plan skip_all => "$mod missing for over.t" if $@; +} + +use_ok 'PublicInbox::OverIdx'; +my $tmpdir = tempdir('pi-over-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $over = PublicInbox::OverIdx->new("$tmpdir/over.sqlite3"); +$over->connect; +my $x = $over->next_tid; +is(int($x), $x, 'integer tid'); +my $y = $over->next_tid; +is($y, $x+1, 'tid increases'); + +$x = $over->sid('hello-world'); +is(int($x), $x, 'integer sid'); +$y = $over->sid('hello-WORLD'); +is($y, $x+1, 'sid ncreases'); +is($over->sid('hello-world'), $x, 'idempotent'); +$over->disconnect; + +$over = PublicInbox::OverIdx->new("$tmpdir/over.sqlite3"); +$over->connect; +is($over->sid('hello-world'), $x, 'idempotent across reopen'); +$over->each_by_mid('never', sub { fail('should not be called') }); + +$x = $over->create_ghost('never'); +is(int($x), $x, 'integer tid for ghost'); +$y = $over->create_ghost('NEVAR'); +is($y, $x + 1, 'integer tid for ghost increases'); + +my $ddd = compress(''); +foreach my $s ('', undef) { + $over->add_over([0, 0, 98, [ 'a' ], [], $s, $ddd]); + $over->add_over([0, 0, 99, [ 'b' ], [], $s, $ddd]); + my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; + is_deeply([98], $msgs, + 'messages not linked by empty subject'); +} + +$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); +$over->add_over([0, 0, 99, [ 'b' ], [], 's', $ddd]); +foreach my $mid (qw(a b)) { + my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; + is_deeply([98, 99], $msgs, 'linked messages by subject'); +} +$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); +$over->add_over([0, 0, 99, [ 'b' ], ['a'], 'diff', $ddd]); +foreach my $mid (qw(a b)) { + my $msgs = [ map { $_->{num} } @{$over->get_thread($mid)} ]; + is_deeply([98, 99], $msgs, "linked messages by Message-ID: <$mid>"); +} + +$over->rollback_lazy; + +done_testing(); diff --git a/t/perf-nntpd.t b/t/perf-nntpd.t new file mode 100644 index 00000000..e5021532 --- /dev/null +++ b/t/perf-nntpd.t @@ -0,0 +1,135 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use Benchmark qw(:all :hireswallclock); +use PublicInbox::Inbox; +use File::Temp qw/tempdir/; +use POSIX qw(dup2); +use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); +use Net::NNTP; +my $pi_dir = $ENV{GIANT_PI_DIR}; +plan skip_all => "GIANT_PI_DIR not defined for $0" unless $pi_dir; +eval { require PublicInbox::Search }; +my ($host_port, $group, %opts, $s, $pid); +END { + if ($s) { + $s->print("QUIT\r\n"); + $s->getline; + $s = undef; + } + kill 'TERM', $pid if defined $pid; +}; + +if (($ENV{NNTP_TEST_URL} || '') =~ m!\Anntp://([^/]+)/([^/]+)\z!) { + ($host_port, $group) = ($1, $2); + $host_port .= ":119" unless index($host_port, ':') > 0; +} else { + $group = 'inbox.test.perf.nntpd'; + my $ibx = { mainrepo => $pi_dir, newsgroup => $group }; + $ibx = PublicInbox::Inbox->new($ibx); + my $nntpd = 'blib/script/public-inbox-nntpd'; + my $tmpdir = tempdir('perf-nntpd-XXXXXX', TMPDIR => 1, CLEANUP => 1); + + my $pi_config = "$tmpdir/config"; + { + open my $fh, '>', $pi_config or die "open($pi_config): $!"; + print $fh <<"" or die "print $pi_config: $!"; +[publicinbox "test"] + newsgroup = $group + mainrepo = $pi_dir + address = test\@example.com + + close $fh or die "close($pi_config): $!"; + } + + %opts = ( + LocalAddr => '127.0.0.1', + ReuseAddr => 1, + Proto => 'tcp', + Listen => 1024, + ); + my $sock = IO::Socket::INET->new(%opts); + + ok($sock, 'sock created'); + $! = 0; + $pid = fork; + if ($pid == 0) { + # pretend to be systemd + my $fl = fcntl($sock, F_GETFD, 0); + dup2(fileno($sock), 3) or die "dup2 failed: $!\n"; + dup2(1, 2) or die "dup2 failed: $!\n"; + fcntl($sock, F_SETFD, $fl &= ~FD_CLOEXEC); + $ENV{LISTEN_PID} = $$; + $ENV{LISTEN_FDS} = 1; + $ENV{PI_CONFIG} = $pi_config; + exec $nntpd, '-W0'; + die "FAIL: $!\n"; + } + ok(defined $pid, 'forked nntpd process successfully'); + $host_port = $sock->sockhost . ':' . $sock->sockport; +} +%opts = ( + PeerAddr => $host_port, + Proto => 'tcp', + Timeout => 1, +); +$s = IO::Socket::INET->new(%opts); +$s->autoflush(1); +my $buf = $s->getline; +is($buf, "201 server ready - post via email\r\n", 'got greeting'); + +my $t = timeit(10, sub { + ok($s->print("GROUP $group\r\n"), 'changed group'); + $buf = $s->getline; +}); +diag 'GROUP took: ' . timestr($t); + +my ($tot, $min, $max) = ($buf =~ /\A211 (\d+) (\d+) (\d+) /); +ok($tot && $min && $max, 'got GROUP response'); +my $nr = $max - $min; +my $nmax = 50000; +my $nmin = $max - $nmax; +$nmin = $min if $nmin < $min; +my $res; +my $spec = "$nmin-$max"; +my $n; + +sub read_until_dot ($) { + my $n = 0; + do { + $buf = $s->getline; + ++$n + } until $buf eq ".\r\n"; + $n; +} + +$t = timeit(1, sub { + $s->print("XOVER $spec\r\n"); + $n = read_until_dot($s); +}); +diag 'xover took: ' . timestr($t) . " for $n"; + +$t = timeit(1, sub { + $s->print("HDR From $spec\r\n"); + $n = read_until_dot($s); + +}); +diag "XHDR From ". timestr($t) . " for $n"; + +my $date = $ENV{NEWNEWS_DATE}; +unless ($date) { + my (undef, undef, undef, $d, $m, $y) = gmtime(time - 30 * 86400); + $date = sprintf('%04u%02u%02u', $y + 1900, $m, $d); + diag "NEWNEWS_DATE undefined, using $date"; +} +$t = timeit(1, sub { + $s->print("NEWNEWS * $date 000000 GMT\r\n"); + $n = read_until_dot($s); +}); +diag 'newnews took: ' . timestr($t) . " for $n"; + +done_testing(); + +1; diff --git a/t/perf-threading.t b/t/perf-threading.t new file mode 100644 index 00000000..15779c93 --- /dev/null +++ b/t/perf-threading.t @@ -0,0 +1,32 @@ +# Copyright (C) 2016-2018 all contributors +# License: AGPL-3.0+ +# +# real-world testing of search threading +use strict; +use warnings; +use Test::More; +use Benchmark qw(:all); +use PublicInbox::Inbox; +my $pi_dir = $ENV{GIANT_PI_DIR}; +plan skip_all => "GIANT_PI_DIR not defined for $0" unless $pi_dir; +my $ibx = PublicInbox::Inbox->new({ mainrepo => $pi_dir }); +eval { require PublicInbox::Search }; +my $srch = $ibx->search; +plan skip_all => "$pi_dir not configured for search $0 $@" unless $srch; + +require PublicInbox::View; + +my $msgs; +my $elapsed = timeit(1, sub { + $msgs = $srch->{over_ro}->recent({limit => 200000}); +}); +my $n = scalar(@$msgs); +ok($n, 'got some messages'); +diag "enquire: ".timestr($elapsed)." for $n"; + +$elapsed = timeit(1, sub { + PublicInbox::View::thread_results({-inbox => $ibx}, $msgs); +}); +diag "thread_results ".timestr($elapsed); + +done_testing(); diff --git a/t/plack.t b/t/plack.t index 26b03660..7eb7d7f2 100644 --- a/t/plack.t +++ b/t/plack.t @@ -18,6 +18,7 @@ foreach my $mod (@mods) { } use_ok 'PublicInbox::Import'; use_ok 'PublicInbox::Git'; +my @ls; foreach my $mod (@mods) { use_ok $mod; } { @@ -55,6 +56,8 @@ EOF $im->done; my $rev = `git --git-dir="$maindir" rev-list HEAD`; like($rev, qr/\A[a-f0-9]{40}/, "good revision committed"); + @ls = `git --git-dir="$maindir" ls-tree -r --name-only HEAD`; + chomp @ls; } my $app = eval { local $ENV{PI_CONFIG} = $pi_config; @@ -198,6 +201,21 @@ EOF "$sfx redirected to /mbox.gz"); }); } + test_psgi($app, sub { + my ($cb) = @_; + # for a while, we used to support /$INBOX/$X40/ + # when we "compressed" long Message-IDs to SHA-1 + # Now we're stuck supporting them forever :< + foreach my $path (@ls) { + $path =~ tr!/!!d; + my $from = "http://example.com/test/$path/"; + my $res = $cb->(GET($from)); + is(301, $res->code, 'is permanent redirect'); + like($res->header('Location'), + qr!/test/blah\@example\.com/!, + 'redirect from x40 MIDs works'); + } + }); } done_testing(); diff --git a/t/psgi_search.t b/t/psgi_search.t index cf5a7e91..2f033016 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -30,8 +30,7 @@ EOF my $num = 0; # nb. using internal API, fragile! -my $xdb = $rw->_xdb_acquire; -$xdb->begin_transaction; +$rw->begin_txn_lazy; foreach (reverse split(/\n\n/, $data)) { $_ .= "\n"; @@ -42,8 +41,7 @@ foreach (reverse split(/\n\n/, $data)) { ok($doc_id, 'message added: '. $mid); } -$xdb->commit_transaction; -$rw = undef; +$rw->commit_txn_lazy; my $cfgpfx = "publicinbox.test"; my $config = PublicInbox::Config->new({ diff --git a/t/psgi_v2.t b/t/psgi_v2.t new file mode 100644 index 00000000..65448dc4 --- /dev/null +++ b/t/psgi_v2.t @@ -0,0 +1,245 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +use PublicInbox::MIME; +use PublicInbox::Config; +use PublicInbox::WWW; +use PublicInbox::MID qw(mids); +my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test + URI::Escape Plack::Builder); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for psgi_v2_dupes.t" if $@; +} +use_ok($_) for @mods; +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-v2_dupes-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $new_mid; + +my $im = PublicInbox::V2Writable->new($ibx, 1); +$im->{parallel} = 0; + +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + 'Message-ID' => '', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); +ok($im->add($mime), 'added one message'); +$mime->body_set("hello world!\n"); + +my @warn; +local $SIG{__WARN__} = sub { push @warn, @_ }; +$mime->header_set(Date => 'Fri, 02 Oct 1993 00:01:00 +0000'); +ok($im->add($mime), 'added duplicate-but-different message'); +is(scalar(@warn), 1, 'got one warning'); +my $mids = mids($mime->header_obj); +$new_mid = $mids->[1]; +$im->done; + +my $cfgpfx = "publicinbox.v2test"; +my $cfg = { + "$cfgpfx.address" => $ibx->{-primary_address}, + "$cfgpfx.mainrepo" => $mainrepo, +}; +my $config = PublicInbox::Config->new($cfg); +my $www = PublicInbox::WWW->new($config); +my ($res, $raw, @from_); +test_psgi(sub { $www->call(@_) }, sub { + my ($cb) = @_; + $res = $cb->(GET('/v2test/a-mid@b/raw')); + $raw = $res->content; + like($raw, qr/^hello world$/m, 'got first message'); + like($raw, qr/^hello world!$/m, 'got second message'); + @from_ = ($raw =~ m/^From /mg); + is(scalar(@from_), 2, 'two From_ lines'); + + $res = $cb->(GET("/v2test/$new_mid/raw")); + $raw = $res->content; + like($raw, qr/^hello world!$/m, 'second message with new Message-Id'); + @from_ = ($raw =~ m/^From /mg); + is(scalar(@from_), 1, 'only one From_ line'); + + # Atom feed should sort by Date: (if Received is missing) + $res = $cb->(GET('/v2test/new.atom')); + my @bodies = ($res->content =~ />(hello [^<]+)(GET('/v2test/new.html')); + @bodies = ($res->content =~ /^(hello [^<]+)$/mg); + is_deeply(\@bodies, [ "hello world!\n", "hello world\n" ], + 'new.html ordering is chronological'); +}); + +$mime->header_set('Message-Id', 'a-mid@b'); +$mime->body_set("hello ghosts\n"); +ok($im->add($mime), 'added 3rd duplicate-but-different message'); +is(scalar(@warn), 2, 'got another warning'); +like($warn[0], qr/mismatched/, 'warned about mismatched messages'); +is($warn[0], $warn[1], 'both warnings are the same'); + +$mids = mids($mime->header_obj); +my $third = $mids->[-1]; +$im->done; + +test_psgi(sub { $www->call(@_) }, sub { + my ($cb) = @_; + $res = $cb->(GET("/v2test/$third/raw")); + $raw = $res->content; + like($raw, qr/^hello ghosts$/m, 'got third message'); + @from_ = ($raw =~ m/^From /mg); + is(scalar(@from_), 1, 'one From_ line'); + + $res = $cb->(GET('/v2test/a-mid@b/raw')); + $raw = $res->content; + like($raw, qr/^hello world$/m, 'got first message'); + like($raw, qr/^hello world!$/m, 'got second message'); + like($raw, qr/^hello ghosts$/m, 'got third message'); + @from_ = ($raw =~ m/^From /mg); + is(scalar(@from_), 3, 'three From_ lines'); + $config->each_inbox(sub { $_[0]->search->reopen }); + + SKIP: { + eval { require IO::Uncompress::Gunzip }; + skip 'IO::Uncompress::Gunzip missing', 4 if $@; + + $res = $cb->(GET('/v2test/a-mid@b/t.mbox.gz')); + my $out; + my $in = $res->content; + my $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + like($out, qr/^hello world$/m, 'got first in t.mbox.gz'); + like($out, qr/^hello world!$/m, 'got second in t.mbox.gz'); + like($out, qr/^hello ghosts$/m, 'got third in t.mbox.gz'); + @from_ = ($out =~ m/^From /mg); + is(scalar(@from_), 3, 'three From_ lines in t.mbox.gz'); + + # search interface + $res = $cb->(POST('/v2test/?q=m:a-mid@b&x=m')); + $in = $res->content; + $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + like($out, qr/^hello world$/m, 'got first in mbox POST'); + like($out, qr/^hello world!$/m, 'got second in mbox POST'); + like($out, qr/^hello ghosts$/m, 'got third in mbox POST'); + @from_ = ($out =~ m/^From /mg); + is(scalar(@from_), 3, 'three From_ lines in mbox POST'); + + # all.mbox.gz interface + $res = $cb->(GET('/v2test/all.mbox.gz')); + $in = $res->content; + $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + like($out, qr/^hello world$/m, 'got first in all.mbox'); + like($out, qr/^hello world!$/m, 'got second in all.mbox'); + like($out, qr/^hello ghosts$/m, 'got third in all.mbox'); + @from_ = ($out =~ m/^From /mg); + is(scalar(@from_), 3, 'three From_ lines in all.mbox'); + }; + + $res = $cb->(GET('/v2test/?q=m:a-mid@b&x=t')); + is($res->code, 200, 'success with threaded search'); + my $raw = $res->content; + ok($raw =~ s/\A.*>Results 1-3 of 3(GET('/v2test/a-mid@b/')); + $raw = $res->content; + like($raw, qr/^hello world$/m, 'got first message'); + like($raw, qr/^hello world!$/m, 'got second message'); + like($raw, qr/^hello ghosts$/m, 'got third message'); + @from_ = ($raw =~ m/>From: /mg); + is(scalar(@from_), 3, 'three From: lines'); + foreach my $mid ('a-mid@b', $new_mid, $third) { + like($raw, qr/<\Q$mid\E>/s, "Message-ID $mid shown"); + } + like($raw, qr/\b3\+ messages\b/, 'thread overview shown'); + + my $exp = [ qw( ) ]; + $mime->header_set('Message-Id', @$exp); + $mime->header_set('Subject', '4th dupe'); + local $SIG{__WARN__} = sub {}; + ok($im->add($mime), 'added one message'); + $im->done; + my @h = $mime->header('Message-ID'); + is_deeply($exp, \@h, 'reused existing Message-ID'); + + $config->each_inbox(sub { $_[0]->search->reopen }); + + $res = $cb->(GET('/v2test/new.atom')); + my @ids = ($res->content =~ m!urn:uuid:([^<]+)!sg); + my %ids; + $ids{$_}++ for @ids; + is_deeply([qw(1 1 1 1)], [values %ids], 'feed ids unique'); + + $res = $cb->(GET('/v2test/reuse@mid/T/')); + $raw = $res->content; + like($raw, qr/\b4\+ messages\b/, 'thread overview shown with /T/'); + @over = ($raw =~ m/^\d{4}-\d+-\d+\s+\d+:\d+ (.+)$/gm); + is_deeply(\@over, [ '(GET('/v2test/reuse@mid/t/')); + $raw = $res->content; + like($raw, qr/\b4\+ messages\b/, 'thread overview shown with /t/'); + + $res = $cb->(GET('/v2test/0/info/refs')); + is($res->code, 200, 'got info refs for dumb clones'); + $res = $cb->(GET('/v2test/info/refs')); + is($res->code, 404, 'unpartitioned git URL fails'); + + # ensure conflicted attachments can be resolved + foreach my $body (qw(old new)) { + my $parts = [ + PublicInbox::MIME->create( + attributes => { content_type => 'text/plain' }, + body => 'blah', + ), + PublicInbox::MIME->create( + attributes => { + filename => 'attach.txt', + content_type => 'text/plain', + }, + body => $body + ) + ]; + $mime = PublicInbox::MIME->create( + parts => $parts, + header_str => [ From => 'root@z', + 'Message-ID' => '', + Subject => 'hi'] + ); + ok($im->add($mime), "added attachment $body"); + } + $im->done; + $config->each_inbox(sub { $_[0]->search->reopen }); + $res = $cb->(GET('/v2test/a@dup/')); + my @links = ($res->content =~ m!"\.\./([^/]+/2-attach\.txt)\"!g); + is(scalar(@links), 2, 'both attachment links exist'); + isnt($links[0], $links[1], 'attachment links are different'); + { + my $old = $cb->(GET('/v2test/' . $links[0])); + my $new = $cb->(GET('/v2test/' . $links[1])); + is($old->content, 'old', 'got expected old content'); + is($new->content, 'new', 'got expected new content'); + } +}); + +done_testing(); + +1; diff --git a/t/search-thr-index.t b/t/search-thr-index.t index c3534f6b..2aa97bff 100644 --- a/t/search-thr-index.t +++ b/t/search-thr-index.t @@ -4,6 +4,7 @@ use strict; use warnings; use Test::More; use File::Temp qw/tempdir/; +use PublicInbox::MID qw(mids); use Email::MIME; eval { require PublicInbox::SearchIdx; }; plan skip_all => "Xapian missing for search" if $@; @@ -31,8 +32,7 @@ EOF my $num = 0; # nb. using internal API, fragile! -my $xdb = $rw->_xdb_acquire; -$xdb->begin_transaction; +my $xdb = $rw->begin_txn_lazy; my @mids; foreach (reverse split(/\n\n/, $data)) { @@ -41,18 +41,20 @@ foreach (reverse split(/\n\n/, $data)) { $mime->header_set('From' => 'bw@g'); $mime->header_set('To' => 'git@vger.kernel.org'); my $bytes = bytes::length($mime->as_string); - my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored'); - my $mid = $mime->header('Message-Id'); + my $mid = mids($mime->header_obj)->[0]; + my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid); push @mids, $mid; ok($doc_id, 'message added: '. $mid); } my $prev; foreach my $mid (@mids) { - my $res = $rw->get_thread($mid); - is(3, $res->{total}, "got all messages from $mid"); + my $msgs = $rw->{over}->get_thread($mid); + is(3, scalar(@$msgs), "got all messages from $mid"); } +$rw->commit_txn_lazy; + done_testing(); 1; diff --git a/t/search.t b/t/search.t index 6b1aa2a3..48c2511c 100644 --- a/t/search.t +++ b/t/search.t @@ -18,36 +18,36 @@ ok($@, "exception raised on non-existent DB"); my $rw = PublicInbox::SearchIdx->new($git_dir, 1); $rw->_xdb_acquire; $rw->_xdb_release; +my $ibx = $rw->{-inbox}; $rw = undef; my $ro = PublicInbox::Search->new($git_dir); my $rw_commit = sub { - $rw->{xdb}->commit_transaction if $rw && $rw->{xdb}; + $rw->commit_txn_lazy if $rw; $rw = PublicInbox::SearchIdx->new($git_dir, 1); - $rw->_xdb_acquire->begin_transaction; + $rw->begin_txn_lazy; }; { # git repository perms - is(PublicInbox::SearchIdx->_git_config_perm(undef), - &PublicInbox::SearchIdx::PERM_GROUP, + is($ibx->_git_config_perm(), &PublicInbox::InboxWritable::PERM_GROUP, "undefined permission is group"); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('0644')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('0644')), 0022, "644 => umask(0022)"); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('0600')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('0600')), 0077, "600 => umask(0077)"); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('0640')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('0640')), 0027, "640 => umask(0027)"); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('group')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('group')), 0007, 'group => umask(0007)'); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('everybody')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('everybody')), 0002, 'everybody => umask(0002)'); - is(PublicInbox::SearchIdx::_umask_for( - PublicInbox::SearchIdx->_git_config_perm('umask')), + is(PublicInbox::InboxWritable::_umask_for( + PublicInbox::InboxWritable->_git_config_perm('umask')), umask, 'umask => existing umask'); } @@ -82,18 +82,16 @@ my $rw_commit = sub { } sub filter_mids { - my ($res) = @_; - sort(map { $_->mid } @{$res->{msgs}}); + my ($msgs) = @_; + sort(map { $_->mid } @$msgs); } { $rw_commit->(); $ro->reopen; - my $found = $ro->lookup_message(''); - ok($found, "message found"); - is($root_id, $found->{doc_id}, 'doc_id set correctly'); - is($found->mid, 'root@s', 'mid set correctly'); - ok(int($found->thread_id) > 0, 'thread_id is an integer'); + my $found = $ro->query('m:root@s'); + is(scalar(@$found), 1, "message found"); + is($found->[0]->mid, 'root@s', 'mid set correctly'); my ($res, @res); my @exp = sort qw(root@s last@s); @@ -107,12 +105,12 @@ sub filter_mids { is_deeply(\@res, \@exp, 'got expected results for s:"" match'); $res = $ro->query('s:"Hello world"', {limit => 1}); - is(scalar @{$res->{msgs}}, 1, "limit works"); - my $first = $res->{msgs}->[0]; + is(scalar @$res, 1, "limit works"); + my $first = $res->[0]; $res = $ro->query('s:"Hello world"', {offset => 1}); - is(scalar @{$res->{msgs}}, 1, "offset works"); - my $second = $res->{msgs}->[0]; + is(scalar @$res, 1, "offset works"); + my $second = $res->[0]; isnt($first, $second, "offset returned different result from limit"); } @@ -148,7 +146,13 @@ sub filter_mids { my $ghost_id = $rw->add_message($was_ghost); is($ghost_id, int($ghost_id), "ghost_id is an integer: $ghost_id"); - ok($ghost_id < $reply_id, "ghost vivified from earlier message"); + my $msgs = $rw->{over}->get_thread('ghost-message@s'); + is(scalar(@$msgs), 2, 'got both messages in ghost thread'); + foreach (qw(sid tid)) { + is($msgs->[0]->{$_}, $msgs->[1]->{$_}, "{$_} match"); + } + isnt($msgs->[0]->{num}, $msgs->[1]->{num}, "num do not match"); + ok($_->{num} > 0, 'positive art num') foreach @$msgs } # search thread on ghost @@ -164,7 +168,14 @@ sub filter_mids { # body $res = $ro->query('goodbye'); - is($res->{msgs}->[0]->mid, 'last@s', 'got goodbye message body'); + is($res->[0]->mid, 'last@s', 'got goodbye message body'); + + # datestamp + $res = $ro->query('dt:20101002000001..20101002000001'); + @res = filter_mids($res); + is_deeply(\@res, ['ghost-message@s'], 'exact Date: match works'); + $res = $ro->query('dt:20101002000002..20101002000002'); + is_deeply($res, [], 'exact Date: match down to the second'); } # long message-id @@ -210,7 +221,7 @@ sub filter_mids { $rw_commit->(); $ro->reopen; my $t = $ro->get_thread('root@s'); - is($t->{total}, 4, "got all 4 mesages in thread"); + is(scalar(@$t), 4, "got all 4 mesages in thread"); my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_mid); @res = filter_mids($t); is_deeply(\@res, \@exp, "get_thread works"); @@ -239,13 +250,13 @@ sub filter_mids { ], body => "theatre\nfade\n")); my $res = $rw->query("theatre"); - is($res->{total}, 2, "got both matches"); - is($res->{msgs}->[0]->mid, 'nquote@a', "non-quoted scores higher"); - is($res->{msgs}->[1]->mid, 'quote@a', "quoted result still returned"); + is(scalar(@$res), 2, "got both matches"); + is($res->[0]->mid, 'nquote@a', "non-quoted scores higher"); + is($res->[1]->mid, 'quote@a', "quoted result still returned"); $res = $rw->query("illusions"); - is($res->{total}, 1, "got a match for quoted text"); - is($res->{msgs}->[0]->mid, 'quote@a', + is(scalar(@$res), 1, "got a match for quoted text"); + is($res->[0]->mid, 'quote@a', "quoted result returned if nothing else"); } @@ -264,10 +275,9 @@ sub filter_mids { ], body => "LOOP!\n")); ok($doc_id > 0, "doc_id defined with circular reference"); - my $smsg = $rw->lookup_message('circle@a'); + my $smsg = $rw->query('m:circle@a', {limit=>1})->[0]; is($smsg->references, '', "no references created"); - my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); - is($s, $msg->subject, 'long subject not rewritten'); + is($s, $smsg->subject, 'long subject not rewritten'); } { @@ -281,51 +291,51 @@ sub filter_mids { my $mime = Email::MIME->new($str); my $doc_id = $rw->add_message($mime); ok($doc_id > 0, 'message indexed doc_id with UTF-8'); - my $smsg = $rw->lookup_message('testmessage@example.com'); - my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); - + my $msg = $rw->query('m:testmessage@example.com', {limit => 1})->[0]; is($mime->header('Subject'), $msg->subject, 'UTF-8 subject preserved'); } { - my $res = $ro->query('d:19931002..20101002'); - ok(scalar @{$res->{msgs}} > 0, 'got results within range'); - $res = $ro->query('d:20101003..'); - is(scalar @{$res->{msgs}}, 0, 'nothing after 20101003'); - $res = $ro->query('d:..19931001'); - is(scalar @{$res->{msgs}}, 0, 'nothing before 19931001'); + my $msgs = $ro->query('d:19931002..20101002'); + ok(scalar(@$msgs) > 0, 'got results within range'); + $msgs = $ro->query('d:20101003..'); + is(scalar(@$msgs), 0, 'nothing after 20101003'); + $msgs = $ro->query('d:..19931001'); + is(scalar(@$msgs), 0, 'nothing before 19931001'); } # names and addresses { - my $res = $ro->query('t:list@example.com'); - is(scalar @{$res->{msgs}}, 6, 'searched To: successfully'); - foreach my $smsg (@{$res->{msgs}}) { + my $mset = $ro->query('t:list@example.com', {mset => 1}); + is($mset->size, 6, 'searched To: successfully'); + foreach my $m ($mset->items) { + my $smsg = $ro->lookup_article($m->get_docid); like($smsg->to, qr/\blist\@example\.com\b/, 'to appears'); } - $res = $ro->query('tc:list@example.com'); - is(scalar @{$res->{msgs}}, 6, 'searched To+Cc: successfully'); - foreach my $smsg (@{$res->{msgs}}) { + $mset = $ro->query('tc:list@example.com', {mset => 1}); + is($mset->size, 6, 'searched To+Cc: successfully'); + foreach my $m ($mset->items) { + my $smsg = $ro->lookup_article($m->get_docid); my $tocc = join("\n", $smsg->to, $smsg->cc); like($tocc, qr/\blist\@example\.com\b/, 'tocc appears'); } foreach my $pfx ('tcf:', 'c:') { - $res = $ro->query($pfx . 'foo@example.com'); - is(scalar @{$res->{msgs}}, 1, - "searched $pfx successfully for Cc:"); - foreach my $smsg (@{$res->{msgs}}) { + my $mset = $ro->query($pfx . 'foo@example.com', { mset => 1 }); + is($mset->items, 1, "searched $pfx successfully for Cc:"); + foreach my $m ($mset->items) { + my $smsg = $ro->lookup_article($m->get_docid); like($smsg->cc, qr/\bfoo\@example\.com\b/, 'cc appears'); } } foreach my $pfx ('', 'tcf:', 'f:') { - $res = $ro->query($pfx . 'Laggy'); - is(scalar @{$res->{msgs}}, 1, + my $res = $ro->query($pfx . 'Laggy'); + is(scalar(@$res), 1, "searched $pfx successfully for From:"); - foreach my $smsg (@{$res->{msgs}}) { + foreach my $smsg (@$res) { like($smsg->from, qr/Laggy Sender/, "From appears with $pfx"); } @@ -336,23 +346,23 @@ sub filter_mids { $rw_commit->(); $ro->reopen; my $res = $ro->query('b:hello'); - is(scalar @{$res->{msgs}}, 0, 'no match on body search only'); + is(scalar(@$res), 0, 'no match on body search only'); $res = $ro->query('bs:smith'); - is(scalar @{$res->{msgs}}, 0, + is(scalar(@$res), 0, 'no match on body+subject search for From'); $res = $ro->query('q:theatre'); - is(scalar @{$res->{msgs}}, 1, 'only one quoted body'); - like($res->{msgs}->[0]->from, qr/\AQuoter/, 'got quoted body'); + is(scalar(@$res), 1, 'only one quoted body'); + like($res->[0]->from, qr/\AQuoter/, 'got quoted body'); $res = $ro->query('nq:theatre'); - is(scalar @{$res->{msgs}}, 1, 'only one non-quoted body'); - like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body'); + is(scalar @$res, 1, 'only one non-quoted body'); + like($res->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body'); foreach my $pfx (qw(b: bs:)) { $res = $ro->query($pfx . 'theatre'); - is(scalar @{$res->{msgs}}, 2, "searched both bodies for $pfx"); - like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, + is(scalar @$res, 2, "searched both bodies for $pfx"); + like($res->[0]->from, qr/\ANon-Quoter/, "non-quoter first for $pfx"); } } @@ -391,14 +401,22 @@ sub filter_mids { $rw_commit->(); $ro->reopen; my $n = $ro->query('n:attached_fart.txt'); - is(scalar @{$n->{msgs}}, 1, 'got result for n:'); + is(scalar @$n, 1, 'got result for n:'); my $res = $ro->query('part_deux.txt'); - is(scalar @{$res->{msgs}}, 1, 'got result without n:'); - is($n->{msgs}->[0]->mid, $res->{msgs}->[0]->mid, + is(scalar @$res, 1, 'got result without n:'); + is($n->[0]->mid, $res->[0]->mid, 'same result with and without'); my $txt = $ro->query('"inside another"'); - is($txt->{msgs}->[0]->mid, $res->{msgs}->[0]->mid, + is($txt->[0]->mid, $res->[0]->mid, 'search inside text attachments works'); + + my $mid = $n->[0]->mid; + my ($id, $prev); + my $art = $ro->next_by_mid($mid, \$id, \$prev); + ok($art, 'article exists in OVER DB'); + $rw->unindex_blob($amsg); + $rw->commit_txn_lazy; + is($ro->lookup_article($art->{num}), undef, 'gone from OVER DB'); } done_testing(); diff --git a/t/thread-all.t b/t/thread-all.t deleted file mode 100644 index d4e8c1fc..00000000 --- a/t/thread-all.t +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2016-2018 all contributors -# License: AGPL-3.0+ -# -# real-world testing of search threading -use strict; -use warnings; -use Test::More; -use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); -my $pi_dir = $ENV{GIANT_PI_DIR}; -plan skip_all => "GIANT_PI_DIR not defined for $0" unless $pi_dir; -eval { require PublicInbox::Search; }; -plan skip_all => "Xapian missing for $0" if $@; -my $srch = eval { PublicInbox::Search->new($pi_dir) }; -plan skip_all => "$pi_dir not initialized for $0" if $@; - -require PublicInbox::View; -require PublicInbox::SearchThread; - -my $pfx = PublicInbox::Search::xpfx('thread'); -my $opts = { limit => 1000000, asc => 1 }; -my $t0 = clock_gettime(CLOCK_MONOTONIC); -my $elapsed; - -my $sres = $srch->_do_enquire(undef, $opts); -$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; -diag "enquire: $elapsed"; - -$t0 = clock_gettime(CLOCK_MONOTONIC); -my $msgs = PublicInbox::View::load_results($srch, $sres); -$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; -diag "load_results $elapsed"; - -$t0 = clock_gettime(CLOCK_MONOTONIC); -PublicInbox::View::thread_results($msgs); -$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; -diag "thread_results $elapsed"; - -done_testing(); diff --git a/t/time.t b/t/time.t new file mode 100644 index 00000000..370a0bd3 --- /dev/null +++ b/t/time.t @@ -0,0 +1,28 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use_ok 'PublicInbox::MIME'; +use PublicInbox::MsgTime qw(msg_datestamp); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + 'Message-ID' => '', + Date => 'Fri, 02 Oct 93 00:00:00 +0000', + ], + body => "hello world\n", +); + +my $ts = msg_datestamp($mime->header_obj); +use POSIX qw(strftime); +is(strftime('%Y-%m-%d %H:%M:%S', gmtime($ts)), '1993-10-02 00:00:00', + 'got expected date with 2 digit year'); +$mime->header_set(Date => 'Fri, 02 Oct 101 01:02:03 +0000'); +$ts = msg_datestamp($mime->header_obj); +is(strftime('%Y-%m-%d %H:%M:%S', gmtime($ts)), '2001-10-02 01:02:03', + 'got expected date with 3 digit year'); + +done_testing(); diff --git a/t/v1-add-remove-add.t b/t/v1-add-remove-add.t new file mode 100644 index 00000000..cd6e2811 --- /dev/null +++ b/t/v1-add-remove-add.t @@ -0,0 +1,45 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::Import; +use PublicInbox::SearchIdx; +use File::Temp qw/tempdir/; + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v1-add-remove-add.t" if $@; +} +my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1); +is(system(qw(git init --bare), $mainrepo), 0); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-add-remove-add', + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + 'Message-ID' => '', + ], + body => "hello world\n", +); +my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx); +ok($im->add($mime), 'message added'); +ok($im->remove($mime), 'message added'); +ok($im->add($mime), 'message added again'); +$im->done; +my $rw = PublicInbox::SearchIdx->new($ibx, 1); +$rw->index_sync; +my $msgs = $ibx->recent({limit => 10}); +is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @$msgs, 1, 'only one message in history'); +is($ibx->mm->num_for('a-mid@b'), 2, 'exists with second article number'); + +done_testing(); diff --git a/t/v2-add-remove-add.t b/t/v2-add-remove-add.t new file mode 100644 index 00000000..c8d12d34 --- /dev/null +++ b/t/v2-add-remove-add.t @@ -0,0 +1,42 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use File::Temp qw/tempdir/; + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2-add-remove-add.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => "$mainrepo/v2", + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + 'Message-ID' => '', + ], + body => "hello world\n", +); +my $im = PublicInbox::V2Writable->new($ibx, 1); +$im->{parallel} = 0; +ok($im->add($mime), 'message added'); +ok($im->remove($mime), 'message added'); +ok($im->add($mime), 'message added again'); +$im->done; +my $msgs = $ibx->recent({limit => 1000}); +is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @$msgs, 1, 'only one message in history'); + +done_testing(); diff --git a/t/v2mda.t b/t/v2mda.t new file mode 100644 index 00000000..ca1bb09c --- /dev/null +++ b/t/v2mda.t @@ -0,0 +1,59 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use File::Temp qw/tempdir/; +use Fcntl qw(SEEK_SET); +use Cwd; + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2mda.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $tmpdir = tempdir('pi-v2mda-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => "$tmpdir/inbox", + name => 'test-v2writable', + address => [ 'test@example.com' ], +}; +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + 'Message-ID' => '', + 'List-ID' => '', + ], + body => "hello world\n", +); + +my $mda = "blib/script/public-inbox-mda"; +ok(-f "blib/script/public-inbox-mda", '-mda exists'); +my $main_bin = getcwd()."/t/main-bin"; +local $ENV{PI_DIR} = "$tmpdir/foo"; +local $ENV{PATH} = "$main_bin:blib/script:$ENV{PATH}"; +my @cmd = (qw(public-inbox-init -V2), $ibx->{name}, + $ibx->{mainrepo}, 'http://localhost/test', + $ibx->{address}->[0]); +ok(PublicInbox::Import::run_die(\@cmd), 'initialized v2 inbox'); + +open my $tmp, '+>', undef or die "failed to open anonymous tempfile: $!"; +ok($tmp->print($mime->as_string), 'wrote to temporary file'); +ok($tmp->flush, 'flushed temporary file'); +ok($tmp->sysseek(0, SEEK_SET), 'seeked'); + +my $rdr = { 0 => fileno($tmp) }; +local $ENV{ORIGINAL_RECIPIENT} = 'test@example.com'; +ok(PublicInbox::Import::run_die(['public-inbox-mda'], undef, $rdr), + 'mda delivered a message'); + +$ibx = PublicInbox::Inbox->new($ibx); +my $msgs = $ibx->search->query(''); +my $saved = $ibx->smsg_mime($msgs->[0]); +is($saved->{mime}->as_string, $mime->as_string, 'injected message'); + +done_testing(); diff --git a/t/v2mirror.t b/t/v2mirror.t new file mode 100644 index 00000000..9e0c9e1d --- /dev/null +++ b/t/v2mirror.t @@ -0,0 +1,176 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; + +# Integration tests for HTTP cloning + mirroring +foreach my $mod (qw(Plack::Util Plack::Builder Danga::Socket + HTTP::Date HTTP::Status Search::Xapian DBD::SQLite)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2mirror.t" if $@; +} +use File::Temp qw/tempdir/; +use IO::Socket; +use POSIX qw(dup2); +use_ok 'PublicInbox::V2Writable'; +use PublicInbox::MIME; +use PublicInbox::Config; +use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); +# FIXME: too much setup +my $tmpdir = tempdir('pi-v2mirror-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $script = 'blib/script/public-inbox'; +my $pi_config = "$tmpdir/config"; +{ + open my $fh, '>', $pi_config or die "open($pi_config): $!"; + print $fh <<"" or die "print $pi_config: $!"; +[publicinbox "v2"] + mainrepo = $tmpdir/in + address = test\@example.com + + close $fh or die "close($pi_config): $!"; +} +local $ENV{PI_CONFIG} = $pi_config; + +my $cfg = PublicInbox::Config->new($pi_config); +my $ibx = $cfg->lookup('test@example.com'); +ok($ibx, 'inbox found'); +$ibx->{version} = 2; +my $v2w = PublicInbox::V2Writable->new($ibx, 1); +ok $v2w, 'v2w loaded'; +$v2w->{parallel} = 0; +my $mime = PublicInbox::MIME->new(<<''); +From: Me +To: You +Subject: a +Date: Thu, 01 Jan 1970 00:00:00 +0000 + +for my $i (1..9) { + $mime->header_set('Message-ID', "<$i\@example.com>"); + $mime->header_set('Subject', "subject = $i"); + ok($v2w->add($mime), "add msg $i OK"); +} +$v2w->barrier; + +my %opts = ( + LocalAddr => '127.0.0.1', + ReuseAddr => 1, + Proto => 'tcp', + Listen => 1024, +); +my ($sock, $pid); +END { kill 'TERM', $pid if defined $pid }; + +$! = 0; +$sock = IO::Socket::INET->new(%opts); +ok($sock, 'sock created'); +my $fl = fcntl($sock, F_GETFD, 0); +$pid = fork; +if ($pid == 0) { + # pretend to be systemd + fcntl($sock, F_SETFD, $fl &= ~FD_CLOEXEC); + dup2(fileno($sock), 3) or die "dup2 failed: $!\n"; + $ENV{LISTEN_PID} = $$; + $ENV{LISTEN_FDS} = 1; + exec "$script-httpd", "--stdout=$tmpdir/out", "--stderr=$tmpdir/err"; + die "FAIL: $!\n"; +} +ok(defined $pid, 'forked httpd process successfully'); +my ($host, $port) = ($sock->sockhost, $sock->sockport); +$sock = undef; + +my @cmd = (qw(git clone --mirror -q), "http://$host:$port/v2/0", + "$tmpdir/m/git/0.git"); + +is(system(@cmd), 0, 'cloned OK'); +ok(-d "$tmpdir/m/git/0.git", 'mirror OK');; + +@cmd = ("$script-init", '-V2', 'm', "$tmpdir/m", 'http://example.com/m', + 'alt@example.com'); +is(system(@cmd), 0, 'initialized public-inbox -V2'); +is(system("$script-index", "$tmpdir/m"), 0, 'indexed'); + +my $mibx = { mainrepo => "$tmpdir/m", address => 'alt@example.com' }; +$mibx = PublicInbox::Inbox->new($mibx); +is_deeply([$mibx->mm->minmax], [$ibx->mm->minmax], 'index synched minmax'); + +for my $i (10..15) { + $mime->header_set('Message-ID', "<$i\@example.com>"); + $mime->header_set('Subject', "subject = $i"); + ok($v2w->add($mime), "add msg $i OK"); +} +$v2w->barrier; +is(system('git', "--git-dir=$tmpdir/m/git/0.git", 'fetch', '-q'), 0, + 'fetch successful'); + +my $mset = $mibx->search->reopen->query('m:15@example.com', {mset => 1}); +is(scalar($mset->items), 0, 'new message not found in mirror, yet'); +is(system("$script-index", "$tmpdir/m"), 0, 'index updated'); +is_deeply([$mibx->mm->minmax], [$ibx->mm->minmax], 'index synched minmax'); +$mset = $mibx->search->reopen->query('m:15@example.com', {mset => 1}); +is(scalar($mset->items), 1, 'found message in mirror'); + +# purge: +$mime->header_set('Message-ID', '<10@example.com>'); +$mime->header_set('Subject', 'subject = 10'); +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + ok($v2w->purge($mime), 'purge a message'); + my $warn = join('', @warn); + like($warn, qr/purge rewriting/); + my @subj = ($warn =~ m/^# subject .*$/mg); + is_deeply(\@subj, ["# subject = 10"], "only rewrote one"); +} + +$v2w->barrier; + +my $msgs = $mibx->search->{over_ro}->get_thread('10@example.com'); +my $to_purge = $msgs->[0]->{blob}; +like($to_purge, qr/\A[a-f0-9]{40,}\z/, 'read blob to be purged'); +$mset = $ibx->search->reopen->query('m:10@example.com', {mset => 1}); +is(scalar($mset->items), 0, 'purged message gone from origin'); + +is(system('git', "--git-dir=$tmpdir/m/git/0.git", 'fetch', '-q'), 0, + 'fetch successful'); +{ + open my $err, '+>', "$tmpdir/index-err" or die "open: $!"; + my $ipid = fork; + if ($ipid == 0) { + dup2(fileno($err), 2) or die "dup2 failed: $!"; + exec("$script-index", '--prune', "$tmpdir/m"); + die "exec fail: $!"; + } + ok($ipid, 'running index..'); + is(waitpid($ipid, 0), $ipid, 'index --prune done'); + is($?, 0, 'no error from index'); + ok(seek($err, 0, 0), 'rewound stderr'); + $err = eval { local $/; <$err> }; + like($err, qr/discontiguous range/, 'warned about discontiguous range'); + unlike($err, qr/fatal/, 'no scary fatal error shown'); +} + +$mset = $mibx->search->reopen->query('m:10@example.com', {mset => 1}); +is(scalar($mset->items), 0, 'purged message not found in mirror'); +is_deeply([$mibx->mm->minmax], [$ibx->mm->minmax], 'minmax still synced'); +for my $i ((1..9),(11..15)) { + $mset = $mibx->search->query("m:$i\@example.com", {mset => 1}); + is(scalar($mset->items), 1, "$i\@example.com remains visible"); +} +is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror'); + +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + $v2w->index_sync; + is_deeply(\@warn, [], 'no warnings from index_sync after purge'); +} + +$v2w->done; +ok(kill('TERM', $pid), 'killed httpd'); +$pid = undef; +waitpid(-1, 0); + +done_testing(); + +1; diff --git a/t/v2reindex.t b/t/v2reindex.t new file mode 100644 index 00000000..9bc271fc --- /dev/null +++ b/t/v2reindex.t @@ -0,0 +1,97 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::ContentId qw(content_digest); +use File::Temp qw/tempdir/; +use File::Path qw(remove_tree); + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2reindex.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-v2reindex-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); +local $ENV{NPROC} = 2; +my $im = PublicInbox::V2Writable->new($ibx, 1); +foreach my $i (1..10) { + $mime->header_set('Message-Id', "<$i\@example.com>"); + ok($im->add($mime), "message $i added"); + if ($i == 4) { + $im->remove($mime); + } +} + +if ('test remove later') { + $mime->header_set('Message-Id', "<5\@example.com>"); + $im->remove($mime); +} + +$im->done; +my $minmax = [ $ibx->mm->minmax ]; +ok(defined $minmax->[0] && defined $minmax->[1], 'minmax defined'); + +eval { $im->index_sync({reindex => 1}) }; +is($@, '', 'no error from reindexing'); +$im->done; + +my $xap = "$mainrepo/xap".PublicInbox::Search::SCHEMA_VERSION(); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed'); +eval { $im->index_sync({reindex => 1}) }; +is($@, '', 'no error from reindexing'); +$im->done; +ok(-d $xap, 'Xapian directories recreated'); + +delete $ibx->{mm}; +is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); + +ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap'); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed again'); +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + eval { $im->index_sync({reindex => 1}) }; + is($@, '', 'no error from reindexing without msgmap'); + is(scalar(@warn), 0, 'no warnings from reindexing'); + $im->done; + ok(-d $xap, 'Xapian directories recreated'); + delete $ibx->{mm}; + is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); +} + +ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap'); +remove_tree($xap); +ok(!-d $xap, 'Xapian directories removed again'); +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + eval { $im->index_sync({reindex => 1}) }; + is($@, '', 'no error from reindexing without msgmap'); + is_deeply(\@warn, [], 'no warnings'); + $im->done; + ok(-d $xap, 'Xapian directories recreated'); + delete $ibx->{mm}; + is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged'); +} + +done_testing(); diff --git a/t/v2writable.t b/t/v2writable.t new file mode 100644 index 00000000..d37fb06e --- /dev/null +++ b/t/v2writable.t @@ -0,0 +1,280 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::ContentId qw(content_digest); +use File::Temp qw/tempdir/; +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for nntpd.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-v2writable-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => $mainrepo, + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + 'Message-ID' => '', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); + +my $im = eval { + local $ENV{NPROC} = '1'; + PublicInbox::V2Writable->new($ibx, 1); +}; +is($im->{partitions}, 1, 'one partition when forced'); +ok($im->add($mime), 'ordinary message added'); +my $git0; + +if ('ensure git configs are correct') { + my @cmd = (qw(git config), "--file=$mainrepo/all.git/config", + qw(core.sharedRepository 0644)); + is(system(@cmd), 0, "set sharedRepository in all.git"); + $git0 = PublicInbox::Git->new("$mainrepo/git/0.git"); + chomp(my $v = $git0->qx(qw(config core.sharedRepository))); + is($v, '0644', 'child repo inherited core.sharedRepository'); + chomp($v = $git0->qx(qw(config --bool repack.writeBitmaps))); + is($v, 'true', 'child repo inherited repack.writeBitmaps'); +} + +{ + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + is($im->add($mime), undef, 'obvious duplicate rejected'); + is(scalar(@warn), 0, 'no warning about resent message'); + + @warn = (); + $mime->header_set('Message-Id', '', ''); + is($im->add($mime), undef, 'secondary MID ignored if first matches'); + my $sec = PublicInbox::MIME->new($mime->as_string); + $sec->header_set('Date'); + $sec->header_set('Message-Id', '', ''); + ok($im->add($sec), 'secondary MID used if data is different'); + like(join(' ', @warn), qr/mismatched/, 'warned about mismatch'); + like(join(' ', @warn), qr/alternative/, 'warned about alternative'); + is_deeply([ '', '' ], + [ $sec->header_obj->header_raw('Message-Id') ], + 'no new Message-Id added'); + + my $sane_mid = qr/\A<[\w\-\.]+\@\w+>\z/; + @warn = (); + $mime->header_set('Message-Id', ''); + $mime->body_set('different'); + ok($im->add($mime), 'reused mid ok'); + like(join(' ', @warn), qr/reused/, 'warned about reused MID'); + my @mids = $mime->header_obj->header_raw('Message-Id'); + is($mids[0], '', 'original mid not changed'); + like($mids[1], $sane_mid, 'new MID added'); + is(scalar(@mids), 2, 'only one new MID added'); + + @warn = (); + $mime->header_set('Message-Id', ''); + $mime->body_set('this one needs a random mid'); + my $hdr = $mime->header_obj; + my $gen = PublicInbox::Import::digest2mid(content_digest($mime), $hdr); + unlike($gen, qr![\+/=]!, 'no URL-unfriendly chars in Message-Id'); + my $fake = PublicInbox::MIME->new($mime->as_string); + $fake->header_set('Message-Id', "<$gen>"); + ok($im->add($fake), 'fake added easily'); + is_deeply(\@warn, [], 'no warnings from a faker'); + ok($im->add($mime), 'random MID made'); + like(join(' ', @warn), qr/using random/, 'warned about using random'); + @mids = $mime->header_obj->header_raw('Message-Id'); + is($mids[0], '', 'original mid not changed'); + like($mids[1], $sane_mid, 'new MID added'); + is(scalar(@mids), 2, 'only one new MID added'); + + @warn = (); + $mime->header_set('Message-Id'); + ok($im->add($mime), 'random MID made for MID free message'); + @mids = $mime->header_obj->header_raw('Message-Id'); + like($mids[0], $sane_mid, 'mid was generated'); + is(scalar(@mids), 1, 'new generated'); +} + +{ + $mime->header_set('Message-Id', '', ''); + $mime->header_set('References', ''); + ok($im->add($mime), 'message with multiple Message-ID'); + $im->done; + my $srch = $ibx->search; + my $mset1 = $srch->reopen->query('m:abcde@1', { mset => 1 }); + is($mset1->size, 1, 'message found by first MID'); + my $mset2 = $srch->reopen->query('m:abcde@2', { mset => 1 }); + is($mset2->size, 1, 'message found by second MID'); + is((($mset1->items)[0])->get_docid, (($mset2->items)[0])->get_docid, + 'same document'); +} + +SKIP: { + use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); + use Net::NNTP; + use IO::Socket; + use Socket qw(SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); + eval { require Danga::Socket }; + skip "Danga::Socket missing $@", 2 if $@; + my $err = "$mainrepo/stderr.log"; + my $out = "$mainrepo/stdout.log"; + my %opts = ( + LocalAddr => '127.0.0.1', + ReuseAddr => 1, + Proto => 'tcp', + Type => SOCK_STREAM, + Listen => 1024, + ); + my $group = 'inbox.comp.test.v2writable'; + my $pi_config = "$mainrepo/pi_config"; + open my $fh, '>', $pi_config or die "open: $!\n"; + print $fh <new(%opts); + ok($sock, 'sock created'); + my $pid; + my $len; + END { kill 'TERM', $pid if defined $pid }; + $! = 0; + my $fl = fcntl($sock, F_GETFD, 0); + ok(! $!, 'no error from fcntl(F_GETFD)'); + is($fl, FD_CLOEXEC, 'cloexec set by default (Perl behavior)'); + $pid = fork; + if ($pid == 0) { + use POSIX qw(dup2); + $ENV{PI_CONFIG} = $pi_config; + # pretend to be systemd + fcntl($sock, F_SETFD, $fl &= ~FD_CLOEXEC); + dup2(fileno($sock), 3) or die "dup2 failed: $!\n"; + $ENV{LISTEN_PID} = $$; + $ENV{LISTEN_FDS} = 1; + my $nntpd = 'blib/script/public-inbox-nntpd'; + exec $nntpd, "--stdout=$out", "--stderr=$err"; + die "FAIL: $!\n"; + } + ok(defined $pid, 'forked nntpd process successfully'); + $! = 0; + fcntl($sock, F_SETFD, $fl |= FD_CLOEXEC); + ok(! $!, 'no error from fcntl(F_SETFD)'); + my $host_port = $sock->sockhost . ':' . $sock->sockport; + my $n = Net::NNTP->new($host_port); + $n->group($group); + my $x = $n->xover('1-'); + my %uniq; + foreach my $num (sort { $a <=> $b } keys %$x) { + my $mid = $x->{$num}->[3]; + is($uniq{$mid}++, 0, "MID for $num is unique in XOVER"); + is_deeply($n->xhdr('Message-ID', $num), + { $num => $mid }, "XHDR lookup OK on num $num"); + is_deeply($n->xhdr('Message-ID', $mid), + { $mid => $mid }, "XHDR lookup OK on MID $num"); + } + my %nn; + foreach my $mid (@{$n->newnews(0, $group)}) { + is($nn{$mid}++, 0, "MID is unique in NEWNEWS"); + } + is_deeply([sort keys %nn], [sort keys %uniq]); + + my %lg; + foreach my $num (@{$n->listgroup($group)}) { + is($lg{$num}++, 0, "num is unique in LISTGROUP"); + } + is_deeply([sort keys %lg], [sort keys %$x], + 'XOVER and LISTGROUPS return the same article numbers'); + + my $xref = $n->xhdr('Xref', '1-'); + is_deeply([sort keys %lg], [sort keys %$xref], 'Xref range OK'); + + my $mids = $n->xhdr('Message-ID', '1-'); + is_deeply([sort keys %lg], [sort keys %$xref], 'Message-ID range OK'); + + my $rover = $n->xrover('1-'); + is_deeply([sort keys %lg], [sort keys %$rover], 'XROVER range OK'); +}; +{ + local $ENV{NPROC} = 2; + my @before = $git0->qx(qw(log --pretty=oneline)); + my $before = $git0->qx(qw(log --pretty=raw --raw -r --no-abbrev)); + $im = PublicInbox::V2Writable->new($ibx, 1); + is($im->{partitions}, 1, 'detected single partition from previous'); + my $smsg = $im->remove($mime, 'test removal'); + $im->done; + my @after = $git0->qx(qw(log --pretty=oneline)); + my $tip = shift @after; + like($tip, qr/\A[a-f0-9]+ test removal\n\z/s, + 'commit message propagated to git'); + is_deeply(\@after, \@before, 'only one commit written to git'); + is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid'); + my $num = $smsg->{num}; + like($num, qr/\A\d+\z/, 'numeric number in return message'); + is($ibx->mm->mid_for($num), undef, 'no longer in Msgmap by num'); + my $srch = $ibx->search->reopen; + my $mset = $srch->query('m:'.$smsg->mid, { mset => 1}); + is($mset->size, 0, 'no longer found in Xapian'); + my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames); + is($srch->{over_ro}->get_art($num), undef, + 'removal propagated to Over DB'); + + my $after = $git0->qx(@log1); + if ($after =~ m!( [a-f0-9]+ )A\td$!m) { + my $oid = $1; + ok(index($before, $oid) > 0, 'no new blob introduced'); + } else { + fail('failed to extract blob from log output'); + } + is($im->remove($mime, 'test removal'), undef, + 'remove is idempotent'); + $im->done; + is($git0->qx(@log1), + $after, 'no git history made with idempotent remove'); + eval { $im->done }; + ok(!$@, '->done is idempotent'); +} + +{ + ok($im->add($mime), 'add message to be purged'); + local $SIG{__WARN__} = sub {}; + ok(my $cmts = $im->purge($mime), 'purged message'); + like($cmts->[0], qr/\A[a-f0-9]{40}\z/, 'purge returned current commit'); + $im->done; +} + +{ + my @warn; + my $x = 'x'x250; + my $y = 'y'x250; + local $SIG{__WARN__} = sub { push @warn, @_ }; + $mime->header_set('Subject', 'long mid'); + $mime->header_set('Message-ID', "<$x>"); + ok($im->add($mime), 'add excessively long Message-ID'); + + $mime->header_set('Message-ID', "<$y>"); + $mime->header_set('References', "<$x>"); + ok($im->add($mime), 'add excessively long References'); + $im->barrier; + + my $msgs = $ibx->search->reopen->get_thread('x'x244); + is(2, scalar(@$msgs), 'got both messages'); + is($msgs->[0]->{mid}, 'x'x244, 'stored truncated mid'); + is($msgs->[1]->{references}, '<'.('x'x244).'>', 'stored truncated ref'); + is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)'); + $im->done; +} + +done_testing(); diff --git a/t/view.t b/t/view.t index 22f5c7e4..8ae42256 100644 --- a/t/view.t +++ b/t/view.t @@ -16,6 +16,7 @@ my $ctx = { base_url => sub { 'http://example.com/' }, cloneurl => sub {[]}, nntp_url => sub {[]}, + max_git_part => sub { undef }, description => sub { '' }), }; $ctx->{-inbox}->{-primary_address} = 'test@example.com'; diff --git a/t/watch_maildir.t b/t/watch_maildir.t index 30e94c1e..7178f29e 100644 --- a/t/watch_maildir.t +++ b/t/watch_maildir.t @@ -31,7 +31,8 @@ Date: Sat, 18 Jun 2016 00:00:00 +0000 something EOF PublicInbox::Emergency->new($maildir)->prepare(\$msg); -ok(POSIX::mkfifo("$maildir/cur/fifo", 0777)); +ok(POSIX::mkfifo("$maildir/cur/fifo", 0777), + 'create FIFO to ensure we do not get stuck on it :P'); my $sem = PublicInbox::Emergency->new($spamdir); # create dirs my $config = PublicInbox::Config->new({ diff --git a/t/watch_maildir_v2.t b/t/watch_maildir_v2.t new file mode 100644 index 00000000..a76e413f --- /dev/null +++ b/t/watch_maildir_v2.t @@ -0,0 +1,125 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use Test::More; +use File::Temp qw/tempdir/; +use PublicInbox::MIME; +use Cwd; +use PublicInbox::Config; +my @mods = qw(Filesys::Notify::Simple PublicInbox::V2Writable); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for watch_maildir_v2.t" if $@; +} + +my $tmpdir = tempdir('watch_maildir-v2-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $mainrepo = "$tmpdir/v2"; +my $maildir = "$tmpdir/md"; +my $spamdir = "$tmpdir/spam"; +use_ok 'PublicInbox::WatchMaildir'; +use_ok 'PublicInbox::Emergency'; +my $cfgpfx = "publicinbox.test"; +my $addr = 'test-public@example.com'; +my @cmd = ('blib/script/public-inbox-init', '-V2', 'test', $mainrepo, + 'http://example.com/v2list', $addr); +local $ENV{PI_CONFIG} = "$tmpdir/pi_config"; +is(system(@cmd), 0, 'public-inbox init OK'); + +my $msg = < +Date: Sat, 18 Jun 2016 00:00:00 +0000 + +something +EOF +PublicInbox::Emergency->new($maildir)->prepare(\$msg); +ok(POSIX::mkfifo("$maildir/cur/fifo", 0777), + 'create FIFO to ensure we do not get stuck on it :P'); +my $sem = PublicInbox::Emergency->new($spamdir); # create dirs + +my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => $addr, + "$cfgpfx.mainrepo" => $mainrepo, + "$cfgpfx.watch" => "maildir:$maildir", + "$cfgpfx.filter" => 'PublicInbox::Filter::Vger', + "publicinboxlearn.watchspam" => "maildir:$spamdir", +}); +my $ibx = $config->lookup_name('test'); +ok($ibx, 'found inbox by name'); +my $srch = $ibx->search; + +PublicInbox::WatchMaildir->new($config)->scan('full'); +my ($total, undef) = $srch->reopen->query(''); +is($total, 1, 'got one revision'); + +# my $git = PublicInbox::Git->new("$mainrepo/git/0.git"); +# my @list = $git->qx(qw(rev-list refs/heads/master)); +# is(scalar @list, 1, 'one revision in rev-list'); + +my $write_spam = sub { + is(scalar glob("$spamdir/new/*"), undef, 'no spam existing'); + $sem->prepare(\$msg); + $sem->commit; + my @new = glob("$spamdir/new/*"); + is(scalar @new, 1); + my @p = split(m!/+!, $new[0]); + ok(link($new[0], "$spamdir/cur/".$p[-1].":2,S")); + is(unlink($new[0]), 1); +}; +$write_spam->(); +is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam'); +PublicInbox::WatchMaildir->new($config)->scan('full'); +is(($srch->reopen->query(''))[0], 0, 'deleted file'); + +# check with scrubbing +{ + $msg .= qq(-- +To unsubscribe from this list: send the line "unsubscribe git" in +the body of a message to majordomo\@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html\n); + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + PublicInbox::WatchMaildir->new($config)->scan('full'); + my ($nr, $msgs) = $srch->reopen->query(''); + is($nr, 1, 'got one file back'); + my $mref = $ibx->msg_by_smsg($msgs->[0]); + like($$mref, qr/something\n\z/s, 'message scrubbed on import'); + + is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam'); + $write_spam->(); + PublicInbox::WatchMaildir->new($config)->scan('full'); + ($nr, $msgs) = $srch->reopen->query(''); + is($nr, 0, 'inbox is empty again'); +} + +{ + my $fail_bin = getcwd()."/t/fail-bin"; + ok(-x "$fail_bin/spamc", "mock spamc exists"); + my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $fail_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + { + local $SIG{__WARN__} = sub {}; # quiet spam check warning + PublicInbox::WatchMaildir->new($config)->scan('full'); + } + ($nr, $msgs) = $srch->reopen->query(''); + is($nr, 0, 'inbox is still empty'); + is(unlink(glob("$maildir/new/*")), 1); +} + +{ + my $main_bin = getcwd()."/t/main-bin"; + ok(-x "$main_bin/spamc", "mock spamc exists"); + my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $main_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + PublicInbox::WatchMaildir->new($config)->scan('full'); + ($nr, $msgs) = $srch->reopen->query(''); + is($nr, 1, 'inbox has one mail after spamc OK-ed a message'); + my $mref = $ibx->msg_by_smsg($msgs->[0]); + like($$mref, qr/something\n\z/s, 'message scrubbed on import'); +} + +done_testing;