From: Eric Wong (Contractor, The Linux Foundation) Date: Mon, 19 Mar 2018 08:14:41 +0000 (+0000) Subject: import: (v2) delete writes the blob into history in subdir X-Git-Tag: v1.1.0-pre1~161 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=7a3a4b9d310876f68f4ba788afaef77ad15fc62b import: (v2) delete writes the blob into history in subdir This makes it easier to audit deletes with "git log -p" and prevents an unstable specification of "content_id" from being stored in history. This should be cost-free if done in the same partition (and even cheaper than before as it introduces no new blobs). It does have a higher cost across partitions, but is probably irrelevant given the typical ham:spam ratio. --- diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index e20c6e03..94a49fe6 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -11,7 +11,6 @@ use Fcntl qw(:flock :DEFAULT); use PublicInbox::Spawn qw(spawn); use PublicInbox::MID qw(mid_mime mid2path); use PublicInbox::Address; -use PublicInbox::ContentId qw(content_id); use PublicInbox::MsgTime qw(msg_timestamp); sub new { @@ -163,7 +162,6 @@ sub get_mark { # ('MISMATCH', Email::MIME) on mismatch # (:MARK, Email::MIME) on success # -# For v2 inboxes, the content_id is returned instead of the msg # v2 callers should check with Xapian before calling this as # it is not idempotent. sub remove { @@ -179,10 +177,17 @@ sub remove { ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); return ($err, $cur) if $err; } else { - $cur = content_id($mime); - my $len = length($cur); + my $sref; + if (ref($mime) eq 'SCALAR') { # optimization used by V2Writable + $sref = $mime; + } else { # XXX should not be necessary: + my $str = $mime->as_string; + $sref = \$str; + } + my $len = length($$sref); $blob = $self->{mark}++; - print $w "blob\nmark :$blob\ndata $len\n$cur\n" or wfail; + print $w "blob\nmark :$blob\ndata $len\n", + $$sref, "\n" or wfail; } my $ref = $self->{ref}; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 656f0693..fd9bf615 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -220,6 +220,7 @@ sub remove { warn "broken smsg for $mid\n"; return 1; # continue } + my $orig = $$msg; my $cur = PublicInbox::MIME->new($msg); if (content_id($cur) eq $cid) { $mm->num_delete($smsg->num); @@ -227,7 +228,8 @@ sub remove { # no bugs in our deduplication code: $removed = $smsg; $removed->{mime} = $cur; - $im->remove($cur, $cmt_msg); + $im->remove(\$orig, $cmt_msg); + $orig = undef; $removed->num; # memoize this for callers my $oid = $smsg->{blob}; diff --git a/t/v2writable.t b/t/v2writable.t index 6e37b722..a5c982e9 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -191,6 +191,7 @@ EOF { local $ENV{NPROC} = 2; my @before = $git0->qx(qw(log --pretty=oneline)); + my $before = $git0->qx(qw(log --pretty=raw --raw -r --no-abbrev)); $im = PublicInbox::V2Writable->new($ibx, 1); is($im->{partitions}, 1, 'detected single partition from previous'); my $smsg = $im->remove($mime, 'test removal'); @@ -207,6 +208,14 @@ EOF my @found = (); $srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 }); is(scalar(@found), 0, 'no longer found in Xapian skeleton'); + + my $after = $git0->qx(qw(log -1 --pretty=raw --raw -r --no-abbrev)); + if ($after =~ m!( [a-f0-9]+ )A\td$!) { + my $oid = $1; + ok(index($before, $oid) > 0, 'no new blob introduced'); + } else { + fail('failed to extract blob from log output'); + } } done_testing();