From eb48e7d6675babdda9a36be1a490c29a2ccddbdc Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Mon, 19 Mar 2018 08:14:39 +0000 Subject: [PATCH 1/1] v2writable: implement remove correctly We need to hide removals from anybody hitting the search engine. --- lib/PublicInbox/Msgmap.pm | 8 +++++ lib/PublicInbox/SearchIdx.pm | 32 +++++++++++++++++ lib/PublicInbox/SearchIdxPart.pm | 8 +++++ lib/PublicInbox/SearchIdxSkeleton.pm | 18 ++++++++++ lib/PublicInbox/SearchMsg.pm | 4 ++- lib/PublicInbox/V2Writable.pm | 51 ++++++++++++++++++++++------ t/v2writable.t | 18 +++++++++- 7 files changed, 126 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index a147b9f3..8e81fba0 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -140,6 +140,14 @@ sub mid_delete { $sth->execute; } +sub num_delete { + my ($self, $num) = @_; + my $dbh = $self->{dbh}; + my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?'); + $sth->bind_param(1, $num); + $sth->execute; +} + sub create_tables { my ($dbh) = @_; my $e; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ccec0181..ae2544da 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -440,6 +440,31 @@ sub remove_message { } } +# MID is a hint in V2 +sub remove_by_oid { + my ($self, $oid, $mid) = @_; + my $db = $self->{xdb}; + + # XXX careful, we cannot use batch_do here since we conditionally + # delete documents based on other factors, so we cannot call + # find_doc_ids twice. + my ($head, $tail) = $self->find_doc_ids('Q' . $mid); + return if $head == $tail; + + # there is only ONE element in @delete unless we + # have bugs in our v2writable deduplication check + my @delete; + for (; $head != $tail; $head->inc) { + my $docid = $head->get_docid; + my $doc = $db->get_document($docid); + my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $smsg->load_expand; + push(@delete, $docid) if $smsg->{blob} eq $oid; + } + $db->delete_document($_) foreach @delete; + scalar(@delete); +} + sub term_generator { # write-only my ($self) = @_; @@ -896,4 +921,11 @@ sub remote_close { $? == 0 or die ref($self)." pid:$pid exited with: $?"; } +# triggers remove_by_oid in partition or skeleton +sub remote_remove { + my ($self, $oid, $mid) = @_; + print { $self->{w} } "D $oid $mid\n" or + die "failed to write remove $!"; +} + 1; diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm index dd7ace67..c1660783 100644 --- a/lib/PublicInbox/SearchIdxPart.pm +++ b/lib/PublicInbox/SearchIdxPart.pm @@ -54,6 +54,14 @@ sub partition_worker_loop ($$$) { $txn = undef; print { $self->{skeleton}->{w} } "barrier $part\n" or die "write failed to skeleton: $!\n"; + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $xdb ||= $self->_xdb_acquire; + if (!$txn) { + $xdb->begin_transaction; + $txn = 1; + } + $self->remove_by_oid($oid, $mid); } else { chomp $line; my ($len, $artnum, $oid, $mid0) = split(/ /, $line); diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 4cb10f59..beb17b9f 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -73,6 +73,14 @@ sub skeleton_worker_loop { print $barrier_note "barrier_done\n" or die "print failed to barrier note: $!"; } + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.*)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $xdb ||= $self->_xdb_acquire; + if (!$txn) { + $xdb->begin_transaction; + $txn = 1; + } + $self->remove_by_oid($oid, $mid); } else { my $len = int($line); my $n = read($r, my $msg, $len) or die "read: $!\n"; @@ -110,6 +118,16 @@ sub index_skeleton { die "print failed: $err\n" if $err; } +sub remote_remove { + my ($self, $oid, $mid) = @_; + my $err; + $self->_lock_acquire; + eval { $self->SUPER::remote_remove($oid, $mid) }; + $err = $@; + $self->_lock_release; + die $err if $err; +} + # values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ] sub index_skeleton_real ($$) { my ($self, $values) = @_; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 23478a2a..a1cd0c28 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -64,7 +64,9 @@ sub load_doc { # :bytes and :lines metadata in RFC 3977 sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) } sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) } -sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) } +sub num ($) { + $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM) +} sub __hdr ($$) { my ($self, $field) = @_; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index e673c252..656f0693 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -199,18 +199,47 @@ sub idx_init { } sub remove { - my ($self, $mime, $msg) = @_; - my $existing = $self->lookup_content($mime) or return; - - # don't touch ghosts or already junked messages - return unless $existing->type eq 'mail'; - - # always write removals to the current (latest) git repo since - # we process chronologically + my ($self, $mime, $cmt_msg) = @_; + $self->barrier; + $self->idx_init; my $im = $self->importer; - my ($cmt, undef) = $im->remove($mime, $msg); - $cmt = $im->get_mark($cmt); - $self->unindex_msg($existing, $cmt); + my $ibx = $self->{-inbox}; + my $srch = $ibx->search; + my $cid = content_id($mime); + my $skel = $self->{skel}; + my $parts = $self->{idx_parts}; + my $mm = $skel->{mm}; + my $removed; + my $mids = mids($mime->header_obj); + foreach my $mid (@$mids) { + $srch->reopen->each_smsg_by_mid($mid, sub { + my ($smsg) = @_; + $smsg->load_expand; + my $msg = $ibx->msg_by_smsg($smsg); + if (!defined($msg)) { + warn "broken smsg for $mid\n"; + return 1; # continue + } + my $cur = PublicInbox::MIME->new($msg); + if (content_id($cur) eq $cid) { + $mm->num_delete($smsg->num); + # $removed should only be set once assuming + # no bugs in our deduplication code: + $removed = $smsg; + $removed->{mime} = $cur; + $im->remove($cur, $cmt_msg); + $removed->num; # memoize this for callers + + my $oid = $smsg->{blob}; + foreach my $idx (@$parts, $skel) { + $idx->remote_remove($oid, $mid); + } + } + 1; # continue + }); + $self->barrier; + } + $removed; } sub done { diff --git a/t/v2writable.t b/t/v2writable.t index 7d276da7..6e37b722 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -36,12 +36,13 @@ my $im = eval { }; is($im->{partitions}, 1, 'one partition when forced'); ok($im->add($mime), 'ordinary message added'); +my $git0; if ('ensure git configs are correct') { my @cmd = (qw(git config), "--file=$mainrepo/all.git/config", qw(core.sharedRepository 0644)); is(system(@cmd), 0, "set sharedRepository in all.git"); - my $git0 = PublicInbox::Git->new("$mainrepo/git/0.git"); + $git0 = PublicInbox::Git->new("$mainrepo/git/0.git"); my $fh = $git0->popen(qw(config core.sharedRepository)); my $v = eval { local $/; <$fh> }; chomp $v; @@ -189,8 +190,23 @@ EOF }; { local $ENV{NPROC} = 2; + my @before = $git0->qx(qw(log --pretty=oneline)); $im = PublicInbox::V2Writable->new($ibx, 1); is($im->{partitions}, 1, 'detected single partition from previous'); + my $smsg = $im->remove($mime, 'test removal'); + my @after = $git0->qx(qw(log --pretty=oneline)); + $im->done; + my $tip = shift @after; + like($tip, qr/\A[a-f0-9]+ test removal\n\z/s, + 'commit message propaged to git'); + is_deeply(\@after, \@before, 'only one commit written to git'); + is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid'); + like($smsg->num, qr/\A\d+\z/, 'numeric number in return message'); + is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num'); + my $srch = $ibx->search->reopen; + my @found = (); + $srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 }); + is(scalar(@found), 0, 'no longer found in Xapian skeleton'); } done_testing(); -- 2.44.0