package PublicInbox::Import;
use strict;
use warnings;
-use Fcntl qw(:flock :DEFAULT);
+use base qw(PublicInbox::Lock);
use PublicInbox::Spawn qw(spawn);
-use PublicInbox::MID qw(mid_mime mid2path);
+use PublicInbox::MID qw(mids mid_mime mid2path);
use PublicInbox::Address;
-use PublicInbox::ContentId qw(content_id);
use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::MDA;
sub new {
my ($class, $git, $name, $email, $ibx) = @_;
ref => $ref,
inbox => $ibx,
path_type => '2/38', # or 'v2'
- ssoma_lock => 1, # disable for v2
+ lock_path => "$git->{git_dir}/ssoma.lock", # v2 changes this
bytes_added => 0,
}, $class
}
pipe($in_r, $in_w) or die "pipe failed: $!";
pipe($out_r, $out_w) or die "pipe failed: $!";
my $git = $self->{git};
- my $git_dir = $git->{git_dir};
- my $lockfh;
- if ($self->{ssoma_lock}) {
- my $lockpath = "$git_dir/ssoma.lock";
- sysopen($lockfh, $lockpath, O_WRONLY|O_CREAT) or
- die "failed to open lock $lockpath: $!";
- # wait for other processes to be done
- flock($lockfh, LOCK_EX) or die "lock failed: $!\n";
- }
+ $self->lock_acquire;
local $/ = "\n";
chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref}));
+ my $git_dir = $git->{git_dir};
my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import
--quiet --done --date-format=raw));
my $rdr = { 0 => fileno($out_r), 1 => fileno($in_w) };
$out_w->autoflush(1);
$self->{in} = $in_r;
$self->{out} = $out_w;
- $self->{lockfh} = $lockfh;
$self->{pid} = $pid;
$self->{nchg} = 0;
binmode $out_w, ':raw' or die "binmode :raw failed: $!";
$n = read($r, my $lf, 1);
defined($n) or die "read final byte of cat-blob failed: $!";
die "bad read on final byte: <$lf>" if $lf ne "\n";
- my $cur = PublicInbox::MIME->new($buf);
+ my $cur = PublicInbox::MIME->new(\$buf);
my $cur_s = $cur->header('Subject');
$cur_s = '' unless defined $cur_s;
my $cur_m = $mime->header('Subject');
(undef, $cur);
}
-# used for v2 (maybe)
sub checkpoint {
my ($self) = @_;
return unless $self->{pid};
undef;
}
+# Write a "progress $msg" command to the fast-import process and block
+# until the very same line is read back from it.
+#
+# Does nothing (empty return) when no fast-import process is running,
+# i.e. $self->{pid} is unset.  Dies if the write fails or if the line
+# read back is missing or different.  Returns undef on success.
+sub progress {
+	my ($self, $msg) = @_;
+	return unless $self->{pid};
+	print { $self->{out} } "progress $msg\n" or wfail;
+
+	# getline yields undef on EOF or read error (e.g. fast-import has
+	# exited); check for that explicitly so we die with our own message
+	# instead of first tripping an "uninitialized value" warning in eq
+	my $reply = $self->{in}->getline;
+	(defined($reply) && $reply eq "progress $msg\n") or die
+		"progress $msg not received\n";
+	undef;
+}
+
+# Refresh auxiliary git data for $self->{git} after changes were made:
+# the legacy ssoma index (if one exists), "git update-server-info",
+# and, for the '2/38' path layout only, the search index.
+#
+# $do_gc: when true, "git gc --auto" is attempted last; any failure
+# of that command is ignored.
+sub _update_git_info ($$) {
+	my ($self, $do_gc) = @_;
+	my $dir = $self->{git}->{git_dir};
+	my @git = ('git', "--git-dir=$dir");
+
+	# for compatibility with existing ssoma installations
+	# we can probably remove this entirely by 2020
+	my $ssoma_index = "$dir/ssoma.index";
+	if (-e $ssoma_index && !$ENV{FAST}) {
+		my @read_tree = (@git, qw(read-tree -m -v -i), $self->{ref});
+		run_die(\@read_tree, { GIT_INDEX_FILE => $ssoma_index });
+	}
+	run_die([@git, 'update-server-info'], undef);
+
+	# Errors from loading or running SearchIdx are swallowed by the
+	# eval and never reach the caller
+	if ($self->{path_type} eq '2/38') {
+		eval {
+			require PublicInbox::SearchIdx;
+			my $target = $self->{inbox} || $dir;
+			PublicInbox::SearchIdx->new($target)->index_sync(
+				{ ref => $self->{ref} });
+		};
+	}
+
+	# best-effort only: a failing gc is not reported
+	eval { run_die([@git, qw(gc --auto)], undef) } if $do_gc;
+}
+
+# If any changes are pending ($self->{nchg} is true): checkpoint, wait
+# for a 'checkpoint' progress line to come back, refresh the git info
+# (without gc) and reset the change counter.  Otherwise do nothing.
+sub barrier {
+	my $self = shift;
+
+	# In v2 the data in git still matters more than what is in
+	# Xapian, so for safety we make sure the git checkpoint has
+	# completed first.  Delaying the ->progress call could gain some
+	# performance, but only by giving up that safety.
+	if ($self->{nchg}) {
+		$self->checkpoint;
+		$self->progress('checkpoint');
+		_update_git_info($self, 0);
+		$self->{nchg} = 0;
+	}
+}
+
# used for v2
sub get_mark {
my ($self, $mark) = @_;
# ('MISMATCH', Email::MIME) on mismatch
# (:MARK, Email::MIME) on success
#
-# For v2 inboxes, the content_id is returned instead of the msg
# v2 callers should check with Xapian before calling this as
# it is not idempotent.
sub remove {
($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime);
return ($err, $cur) if $err;
} else {
- $cur = content_id($mime);
- my $len = length($cur);
+ my $sref;
+ if (ref($mime) eq 'SCALAR') { # optimization used by V2Writable
+ $sref = $mime;
+ } else { # XXX should not be necessary:
+ my $str = $mime->as_string;
+ $sref = \$str;
+ }
+ my $len = length($$sref);
$blob = $self->{mark}++;
- print $w "blob\nmark :$blob\ndata $len\n$cur\n" or wfail;
+ print $w "blob\nmark :$blob\ndata $len\n",
+ $$sref, "\n" or wfail;
}
my $ref = $self->{ref};
if (defined $path) {
print $w "D $path\n\n" or wfail;
} else {
- print $w "M 100644 :$blob d\n\n" or wfail;
+ print $w "M 100644 :$blob _/D\n\n" or wfail;
}
$self->{nchg}++;
(($self->{tip} = ":$commit"), $cur);
my $path;
if ($path_type eq '2/38') {
- $path = mid2path(mid_mime($mime));
+ my $mids = mids($mime->header_obj);
+ if (!scalar(@$mids)) {
+ my $dig = content_digest($mime);
+ @$mids = (digest2mid($dig));
+ }
+ $path = mid2path($mids->[0]);
} else { # v2 layout, one file:
$path = 'm';
}
# kill potentially confusing/misleading headers
$mime->header_set($_) for qw(bytes lines content-length status);
+ $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS;
# spam check:
if ($check_cb) {
my $pid = delete $self->{pid} or die 'BUG: missing {pid} when done';
waitpid($pid, 0) == $pid or die 'fast-import did not finish';
$? == 0 or die "fast-import failed: $?";
- my $nchg = delete $self->{nchg};
- # for compatibility with existing ssoma installations
- # we can probably remove this entirely by 2020
- my $git_dir = $self->{git}->{git_dir};
- my @cmd = ('git', "--git-dir=$git_dir");
- my $index = "$git_dir/ssoma.index";
- if ($nchg && -e $index && !$ENV{FAST}) {
- my $env = { GIT_INDEX_FILE => $index };
- run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env);
- }
- if ($nchg) {
- run_die([@cmd, 'update-server-info'], undef);
- ($self->{path_type} eq '2/38') and eval {
- require PublicInbox::SearchIdx;
- my $inbox = $self->{inbox} || $git_dir;
- my $s = PublicInbox::SearchIdx->new($inbox);
- $s->index_sync({ ref => $self->{ref} });
- };
-
- eval { run_die([@cmd, qw(gc --auto)], undef) };
- }
+ _update_git_info($self, 1) if delete $self->{nchg};
- $self->{ssoma_lock} or return;
- my $lockfh = delete $self->{lockfh} or die "BUG: not locked: $!";
- flock($lockfh, LOCK_UN) or die "unlock failed: $!";
- close $lockfh or die "close lock failed: $!";
+ $self->lock_release;
}
sub atfork_child {
}
}
+# Build a fallback Message-ID from a digest object.
+#
+# $dig must support ->clone and ->b64digest; b64digest is invoked on
+# the clone rather than on $dig itself.  Returns a string of the form
+# "<base64 with '+' => '-', '/' => '_' and '=' removed>@localhost".
+sub digest2mid ($) {
+	my ($dig) = @_;
+	my $local_part = $dig->clone->b64digest;
+
+	# Make our own URLs nicer:
+	# See "Base 64 Encoding with URL and Filename Safe Alphabet" in RFC4648
+	$local_part =~ tr!+/=!-_!d;
+
+	# A date prefix or similar could make this more meaningful, but
+	# it is only a fallback for mail which fails to generate a
+	# Message-ID or reuses one.  That is usually spam, which does not
+	# deserve nice Message-IDs :P
+	return "$local_part\@localhost";
+}
+
1;
__END__
=pod