From: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org>
Date: Mon, 19 Mar 2018 08:14:39 +0000 (+0000)
Subject: v2writable: implement remove correctly
X-Git-Tag: v1.1.0-pre1~163
X-Git-Url: http://www.git.stargrave.org/?a=commitdiff_plain;h=eb48e7d6675babdda9a36be1a490c29a2ccddbdc;p=public-inbox.git

v2writable: implement remove correctly

We need to hide removals from anybody hitting the search engine.
---

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index a147b9f3..8e81fba0 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -140,6 +140,14 @@ sub mid_delete {
 	$sth->execute;
 }
 
+sub num_delete {
+	my ($self, $num) = @_;
+	my $dbh = $self->{dbh};
+	my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?');
+	$sth->bind_param(1, $num);
+	$sth->execute;
+}
+
 sub create_tables {
 	my ($dbh) = @_;
 	my $e;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ccec0181..ae2544da 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -440,6 +440,31 @@ sub remove_message {
 	}
 }
 
+# MID is a hint in V2
+sub remove_by_oid {
+	my ($self, $oid, $mid) = @_;
+	my $db = $self->{xdb};
+
+	# XXX careful, we cannot use batch_do here since we conditionally
+	# delete documents based on other factors, so we cannot call
+	# find_doc_ids twice.
+	my ($head, $tail) = $self->find_doc_ids('Q' . $mid);
+	return if $head == $tail;
+
+	# there is only ONE element in @delete unless we
+	# have bugs in our v2writable deduplication check
+	my @delete;
+	for (; $head != $tail; $head->inc) {
+		my $docid = $head->get_docid;
+		my $doc = $db->get_document($docid);
+		my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+		$smsg->load_expand;
+		push(@delete, $docid) if $smsg->{blob} eq $oid;
+	}
+	$db->delete_document($_) foreach @delete;
+	scalar(@delete);
+}
+
 sub term_generator { # write-only
 	my ($self) = @_;
 
@@ -896,4 +921,11 @@ sub remote_close {
 	$? == 0 or die ref($self)." pid:$pid exited with: $?";
 }
 
+# triggers remove_by_oid in partition or skeleton
+sub remote_remove {
+	my ($self, $oid, $mid) = @_;
+	print { $self->{w} } "D $oid $mid\n" or
+			die "failed to write remove $!";
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
index dd7ace67..c1660783 100644
--- a/lib/PublicInbox/SearchIdxPart.pm
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -54,6 +54,14 @@ sub partition_worker_loop ($$$) {
 			$txn = undef;
 			print { $self->{skeleton}->{w} } "barrier $part\n" or
 					die "write failed to skeleton: $!\n";
+		} elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) {
+			my ($oid, $mid) = ($1, $2);
+			$xdb ||= $self->_xdb_acquire;
+			if (!$txn) {
+				$xdb->begin_transaction;
+				$txn = 1;
+			}
+			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
 			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4cb10f59..beb17b9f 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -73,6 +73,14 @@ sub skeleton_worker_loop {
 				print $barrier_note "barrier_done\n" or die
 					"print failed to barrier note: $!";
 			}
+		} elsif ($line =~ /\AD ([a-f0-9]{40,}) (.*)\n\z/s) {
+			my ($oid, $mid) = ($1, $2);
+			$xdb ||= $self->_xdb_acquire;
+			if (!$txn) {
+				$xdb->begin_transaction;
+				$txn = 1;
+			}
+			$self->remove_by_oid($oid, $mid);
 		} else {
 			my $len = int($line);
 			my $n = read($r, my $msg, $len) or die "read: $!\n";
@@ -110,6 +118,16 @@ sub index_skeleton {
 	die "print failed: $err\n" if $err;
 }
 
+sub remote_remove {
+	my ($self, $oid, $mid) = @_;
+	my $err;
+	$self->_lock_acquire;
+	eval { $self->SUPER::remote_remove($oid, $mid) };
+	$err = $@;
+	$self->_lock_release;
+	die $err if $err;
+}
+
 # values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ]
 sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 23478a2a..a1cd0c28 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -64,7 +64,9 @@ sub load_doc {
 # :bytes and :lines metadata in RFC 3977
 sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
 sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) }
+sub num ($) {
+	$_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM)
+}
 
 sub __hdr ($$) {
 	my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e673c252..656f0693 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -199,18 +199,47 @@ sub idx_init {
 }
 
 sub remove {
-	my ($self, $mime, $msg) = @_;
-	my $existing = $self->lookup_content($mime) or return;
-
-	# don't touch ghosts or already junked messages
-	return unless $existing->type eq 'mail';
-
-	# always write removals to the current (latest) git repo since
-	# we process chronologically
+	my ($self, $mime, $cmt_msg) = @_;
+	$self->barrier;
+	$self->idx_init;
 	my $im = $self->importer;
-	my ($cmt, undef) = $im->remove($mime, $msg);
-	$cmt = $im->get_mark($cmt);
-	$self->unindex_msg($existing, $cmt);
+	my $ibx = $self->{-inbox};
+	my $srch = $ibx->search;
+	my $cid = content_id($mime);
+	my $skel = $self->{skel};
+	my $parts = $self->{idx_parts};
+	my $mm = $skel->{mm};
+	my $removed;
+	my $mids = mids($mime->header_obj);
+	foreach my $mid (@$mids) {
+		$srch->reopen->each_smsg_by_mid($mid, sub {
+			my ($smsg) = @_;
+			$smsg->load_expand;
+			my $msg = $ibx->msg_by_smsg($smsg);
+			if (!defined($msg)) {
+				warn "broken smsg for $mid\n";
+				return 1; # continue
+			}
+			my $cur = PublicInbox::MIME->new($msg);
+			if (content_id($cur) eq $cid) {
+				$mm->num_delete($smsg->num);
+				# $removed should only be set once assuming
+				# no bugs in our deduplication code:
+				$removed = $smsg;
+				$removed->{mime} = $cur;
+				$im->remove($cur, $cmt_msg);
+				$removed->num; # memoize this for callers
+
+				my $oid = $smsg->{blob};
+				foreach my $idx (@$parts, $skel) {
+					$idx->remote_remove($oid, $mid);
+				}
+			}
+			1; # continue
+		});
+		$self->barrier;
+	}
+	$removed;
 }
 
 sub done {
diff --git a/t/v2writable.t b/t/v2writable.t
index 7d276da7..6e37b722 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -36,12 +36,13 @@ my $im = eval {
 };
 is($im->{partitions}, 1, 'one partition when forced');
 ok($im->add($mime), 'ordinary message added');
+my $git0;
 
 if ('ensure git configs are correct') {
 	my @cmd = (qw(git config), "--file=$mainrepo/all.git/config",
 		qw(core.sharedRepository 0644));
 	is(system(@cmd), 0, "set sharedRepository in all.git");
-	my $git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
+	$git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
 	my $fh = $git0->popen(qw(config core.sharedRepository));
 	my $v = eval { local $/; <$fh> };
 	chomp $v;
@@ -189,8 +190,23 @@ EOF
 };
 {
 	local $ENV{NPROC} = 2;
+	my @before = $git0->qx(qw(log --pretty=oneline));
 	$im = PublicInbox::V2Writable->new($ibx, 1);
 	is($im->{partitions}, 1, 'detected single partition from previous');
+	my $smsg = $im->remove($mime, 'test removal');
+	my @after = $git0->qx(qw(log --pretty=oneline));
+	$im->done;
+	my $tip = shift @after;
+	like($tip, qr/\A[a-f0-9]+ test removal\n\z/s,
+		'commit message propaged to git');
+	is_deeply(\@after, \@before, 'only one commit written to git');
+	is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
+	like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
+	is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+	my $srch = $ibx->search->reopen;
+	my @found = ();
+	$srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 });
+	is(scalar(@found), 0, 'no longer found in Xapian skeleton');
 }
 
 done_testing();