X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FV2Writable.pm;h=2cc87305668256709f773415f750e5f52653503c;hb=017fed7bc4d33ac474a19356994be5bd0bfe68ba;hp=1cc4b005f86525352bcd1cb81f7ee19139ec9ee5;hpb=3348ad4b3b1a0865ee58a902953165ea0f4aa4bd;p=public-inbox.git diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 1cc4b005..2cc87305 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -15,15 +15,19 @@ use PublicInbox::ContentId qw(content_id content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; use PublicInbox::Msgmap; -use PublicInbox::Spawn; +use PublicInbox::Spawn qw(spawn); +use PublicInbox::SearchIdx; use IO::Handle; # an estimate of the post-packed size to the raw uncompressed size my $PACKING_FACTOR = 0.4; # assume 2 cores if GNU nproc(1) is not available -sub nproc () { - int($ENV{NPROC} || `nproc 2>/dev/null` || 2); +sub nproc_parts () { + my $n = int($ENV{NPROC} || `nproc 2>/dev/null` || 2); + # subtract for the main process and git-fast-import + $n -= 1; + $n < 1 ? 1 : $n; } sub count_partitions ($) { @@ -73,7 +77,7 @@ sub new { rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR), last_commit => [], # git repo -> commit }; - $self->{partitions} = count_partitions($self) || nproc(); + $self->{partitions} = count_partitions($self) || nproc_parts(); bless $self, $class; } @@ -171,19 +175,19 @@ sub num_for_harder { my $hdr = $mime->header_obj; my $dig = content_digest($mime); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); my $num = $self->{mm}->mid_insert($$mid0); unless (defined $num) { # it's hard to spoof the last Received: header my @recvd = $hdr->header_raw('Received'); $dig->add("Received: $_") foreach (@recvd); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); $num = $self->{mm}->mid_insert($$mid0); # fall back to a random Message-ID and give up determinism: until (defined($num)) { $dig->add(rand); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); warn "using random Message-ID <$$mid0> as fallback\n"; $num = $self->{mm}->mid_insert($$mid0); } @@ -256,12 +260,32 @@ sub purge_oids { $purges; } +sub content_ids ($) { + my ($mime) = @_; + my @cids = ( content_id($mime) ); + + # Email::MIME->as_string doesn't always round-trip, so we may + # use a second content_id + my $rt = content_id(PublicInbox::MIME->new(\($mime->as_string))); + push @cids, $rt if $cids[0] ne $rt; + \@cids; +} + +sub content_matches ($$) { + my ($cids, $existing) = @_; + my $cid = content_id($existing); + foreach (@$cids) { + return 1 if $_ eq $cid + } + 0 +} + sub remove_internal { my ($self, $mime, $cmt_msg, $purge) = @_; $self->idx_init; my $im = $self->importer unless $purge; my $over = $self->{over}; - my $cid = content_id($mime); + my $cids = content_ids($mime); my $parts = $self->{idx_parts}; my $mm = $self->{mm}; my $removed; @@ -284,7 +308,7 @@ sub remove_internal { } my $orig = $$msg; my $cur = PublicInbox::MIME->new($msg); - if (content_id($cur) eq $cid) { + if (content_matches($cids, $cur)) { $smsg->{mime} = $cur; $gone{$smsg->{num}} = [ $smsg, \$orig ]; } @@ -538,7 +562,6 @@ sub import_init { sub diff ($$$) { my ($mid, $cur, $new) = @_; use File::Temp qw(tempfile); - use PublicInbox::Spawn qw(spawn); my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1); print $ah $cur->as_string or die "print: $!"; @@ -569,8 +592,7 @@ sub get_blob ($$) { sub lookup_content { my ($self, $mime, $mid) = @_; my $over = $self->{over}; - my $cid = content_id($mime); - my $found; + my $cids = content_ids($mime); my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -579,16 +601,16 @@ sub lookup_content { next; } my $cur = PublicInbox::MIME->new($msg); - if (content_id($cur) eq $cid) { + if (content_matches($cids, $cur)) { $smsg->{mime} = $cur; - $found = $smsg; - last; + return $smsg; } + # XXX DEBUG_DIFF is experimental and may be removed diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; } - $found; + undef; } sub atfork_child { @@ -716,16 +738,7 @@ sub last_commits { $heads; } -sub is_ancestor ($$$) { - my ($git, $cur, $tip) = @_; - return 0 unless $git->check($cur); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(merge-base --is-ancestor), $cur, $tip ]; - my $pid = spawn($cmd); - defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0; -} +*is_ancestor = *PublicInbox::SearchIdx::is_ancestor; sub index_prepare { my ($self, $opts, $epoch_max, $ranges) = @_;