From a367ec1b15a2458e532245f5308565dd84f8ca63 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 22 Aug 2020 06:06:27 +0000 Subject: [PATCH] mbox: disable "&t" on existing Xapian until full reindex Expanding threads via over.sqlite3 for mbox.gz downloads without Xapian effectively collapsing on the THREADID column leads to repeated messages getting downloaded. To avoid that situation, use a "has_threadid" Xapian metadata flag that's only set on --reindex (and brand new Xapian DBs). This allows admins to upgrade WWW or do --reindex in any order; without worrying about users eating up bandwidth and CPU cycles. --- lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/Search.pm | 10 ++++++++- lib/PublicInbox/SearchIdx.pm | 16 ++++++++++++-- lib/PublicInbox/SearchView.pm | 21 ++++++++++++------- lib/PublicInbox/V2Writable.pm | 12 +++++++++++ t/init.t | 1 + t/psgi_search.t | 39 +++++++++++++++++++++++++++++++++-- 7 files changed, 88 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c9b11c21..0223bead 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -262,7 +262,7 @@ sub mbox_all { $ctx->{ids} = $srch->mset_to_artnums($mset); require PublicInbox::MboxGz; my $fn; - if ($q->{t}) { + if ($q->{t} && $srch->has_threadid) { $fn = 'results-thread-'.$q_string; PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); } else { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index bc820b64..01bbe73d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -311,6 +311,12 @@ sub _do_enquire { retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); } +# returns true if all docs have the THREADID value +sub has_threadid ($) { + my ($self) = @_; + (xdb($self)->get_metadata('has_threadid') // '') eq '1'; +} + sub _enquire_once { # retry_reopen callback my ($self, $query, $opts) = @{$_[0]}; my $xdb = xdb($self); @@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback } # `mairix -t / --threads' or JMAP collapseThreads - $enquire->set_collapse_key(THREADID) if $opts->{thread}; + if ($opts->{thread} && has_threadid($self)) { + $enquire->set_collapse_key(THREADID); + } my $offset = $opts->{offset} || 0; my $limit = $opts->{limit} || 50; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index baa6f41a..ade55756 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -131,6 +131,7 @@ sub idx_acquire { ($is_shard && need_xapian($self)))) { File::Path::mkpath($dir); nodatacow_dir($dir); + $self->{-set_has_threadid_once} = 1; } } return unless defined $flag; @@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) { $self->{mm}->{dbh}->commit; if ($newest && need_xapian($self)) { - my $cur = $self->{xdb}->get_metadata('last_commit'); + my $xdb = $self->{xdb}; + my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { - $self->{xdb}->set_metadata('last_commit', $newest); + $xdb->set_metadata('last_commit', $newest); + } + + # let SearchView know a full --reindex was done so it can + # generate ->has_threadid-dependent links + if ($sync->{reindex} && !ref($sync->{reindex})) { + my $n = $xdb->get_metadata('has_threadid'); + $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } } @@ -816,6 +825,9 @@ sub set_metadata_once { return if $self->{shard}; # only continue if undef or 0, not >0 my $xdb = $self->{xdb}; + if (delete($self->{-set_has_threadid_once})) { + $xdb->set_metadata('has_threadid', '1'); + } if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index dd69564a..76428dfb 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -188,13 +188,20 @@ sub search_nav_top { $rv .= qq{summary|nested}; } my $A = $q->qs_html(x => 'A', r => undef); - $rv .= qq{|Atom feed]} . - qq{\n\t\t\tdownload mbox.gz: } . - # we set name=z w/o using it since it seems required for - # lynx (but works fine for w3m). - qq{|} . - qq{} . - qq{
};
+	$rv .= qq{|Atom feed]};
+	if ($ctx->{-inbox}->search->has_threadid) {
+		$rv .= qq{\n\t\t\tdownload mbox.gz: } .
+			# we set name=z w/o using it since it seems required for
+			# lynx (but works fine for w3m).
+			qq{} .
+			qq{|};
+	} else { # BOFH needs to --reindex
+		$rv .= qq{\n\t\t\t\t\t\tdownload: } .
+			qq{}
+	}
+	$rv .= qq{
};
 }
 
 sub search_nav_bot {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0a91a132..b0148dba 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1337,6 +1337,18 @@ sub index_sync {
 		xapian_only($self, $opt, $sync, $art_beg);
 	}
 
+	# --reindex on the command-line
+	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
+		$self->lock_acquire;
+		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
+		if (my $xdb = $s0->idx_acquire) {
+			my $n = $xdb->get_metadata('has_threadid');
+			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
+		}
+		$s0->idx_release;
+		$self->lock_release;
+	}
+
 	# reindex does not pick up new changes, so we rerun w/o it:
 	if ($opt->{reindex}) {
 		my %again = %$opt;
diff --git a/t/init.t b/t/init.t
index dad09435..a5a9debc 100644
--- a/t/init.t
+++ b/t/init.t
@@ -108,6 +108,7 @@ SKIP: {
 		is(PublicInbox::Admin::detect_indexlevel($ibx), 'full',
 			"detected default indexlevel -V$v");
 		ok($ibx->{-skip_docdata}, "docdata skip set -V$v");
+		ok($ibx->search->has_threadid, 'has_threadid flag set on new inbox');
 	}
 
 	# loop for idempotency
diff --git a/t/psgi_search.t b/t/psgi_search.t
index 5d537363..c1677eb3 100644
--- a/t/psgi_search.t
+++ b/t/psgi_search.t
@@ -3,6 +3,7 @@
 use strict;
 use warnings;
 use Test::More;
+use IO::Uncompress::Gunzip qw(gunzip);
 use PublicInbox::Eml;
 use PublicInbox::Config;
 use PublicInbox::Inbox;
@@ -39,6 +40,12 @@ To: git\@vger.kernel.org
 EOF
 $im->add($mime);
 
+$im->add(PublicInbox::Eml->new(<<""));
+Message-ID: 
+From: replier 
+In-Reply-To: <$mid>
+Subject: mismatch
+
 $mime = PublicInbox::Eml->new(<<'EOF');
 Subject:
 Message-ID: 
@@ -79,6 +86,9 @@ test_psgi(sub { $www->call(@_) }, sub {
 	ok(index($html, 'by Ævar Arnfjörð Bjarmason') >= 0,
 		"displayed Ævar's name properly in HTML");
 
+	like($html, qr/download mbox\.gz: .*?"full threads"/s,
+		'"full threads" download option shown');
+
 	my $warn = [];
 	local $SIG{__WARN__} = sub { push @$warn, @_ };
 	$res = $cb->(GET('/test/?q=s:test&l=5e'));
@@ -118,8 +128,33 @@ test_psgi(sub { $www->call(@_) }, sub {
 	$res = $cb->(GET('/test/no-subject-at-all@example.com/t.mbox.gz'));
 	like($res->header('Content-Disposition'),
 		qr/filename=no-subject\.mbox\.gz/);
+
+	# "full threads" mbox.gz download
+	$res = $cb->(POST('/test/?q=s:test&x=m&t'));
+	is($res->code, 200, 'successful mbox download with threads');
+	gunzip(\($res->content) => \(my $before));
+	is_deeply([ "Message-ID: <$mid>\n", "Message-ID: \n" ],
+		[ grep(/^Message-ID:/m, split(/^/m, $before)) ],
+		'got full thread');
+
+	# clobber has_threadid to emulate old versions:
+	{
+		my $sidx = PublicInbox::SearchIdx->new($ibx, 0);
+		my $xdb = $sidx->idx_acquire;
+		$xdb->set_metadata('has_threadid', '0');
+		$sidx->idx_release;
+	}
+	$config->each_inbox(sub { delete $_[0]->{search} });
+	$res = $cb->(GET('/test/?q=s:test'));
+	is($res->code, 200, 'successful search w/o has_threadid');
+	unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
+		'"full threads" download option not shown w/o has_threadid');
+
+	# in case somebody uses curl to bypass 
+ $res = $cb->(POST('/test/?q=s:test&x=m&t')); + is($res->code, 200, 'successful mbox download w/ threads'); + gunzip(\($res->content) => \(my $after)); + isnt($before, $after); }); done_testing(); - -1; -- 2.44.0