X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FLeiToMail.pm;h=3f65e9e99f5d1f9ac0a2250adb31a934d9f19532;hb=757652fd1ad6843c984610263a2a0b336c974111;hp=ead00d1a6d890be7720e61804d1495fd1c25dd0a;hpb=d2a7dcb58ffb9604b2023159431fcdc4871f368f;p=public-inbox.git diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index ead00d1a..3f65e9e9 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -1,18 +1,27 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir package PublicInbox::LeiToMail; use strict; use v5.10.1; +use parent qw(PublicInbox::IPC); use PublicInbox::Eml; use PublicInbox::Lock; use PublicInbox::ProcessPipe; use PublicInbox::Spawn qw(which spawn popen_rd); use PublicInbox::LeiDedupe; +use PublicInbox::OnDestroy; +use PublicInbox::Git; +use PublicInbox::GitAsyncCat; use Symbol qw(gensym); use IO::Handle; # ->autoflush -use Fcntl qw(SEEK_SET); +use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); +use Errno qw(EEXIST ESPIPE ENOENT EPIPE); + +# struggles with short-lived repos, Gcf2Client makes little sense with lei; +# but we may use in-process libgit2 in the future. +$PublicInbox::GitAsyncCat::GCF2C = 0; my %kw2char = ( # Maildir characters draft => 'D', @@ -29,10 +38,14 @@ my %kw2status = ( ); sub _mbox_hdr_buf ($$$) { - my ($eml, $type, $kw) = @_; + my ($eml, $type, $smsg) = @_; $eml->header_set($_) for (qw(Lines Bytes Content-Length)); - my %hdr; # set Status, X-Status - for my $k (@$kw) { + + # Messages are always 'O' (non-\Recent in IMAP), it saves + # MUAs the trouble of rewriting the mbox if no other + # changes are made + my %hdr = (Status => [ 'O' ]); # set Status, X-Status + for my $k (@{$smsg->{kw} // []}) { if (my $ent = $kw2status{$k}) { push @{$hdr{$ent->[0]}}, $ent->[1]; } else { # X-Label? @@ -46,25 +59,30 @@ sub _mbox_hdr_buf ($$$) { # fixup old bug from import (pre-a0c07cba0e5d8b6a) $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $ident = $smsg->{blob} // 'lei'; + if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } substr($$buf, 0, 0, # prepend From line - "From lei\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); + "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); $buf; } -sub write_in_full ($$$) { - my ($fh, $buf, $atomic) = @_; - if ($atomic) { - defined(my $w = syswrite($fh, $$buf)) or die "write: $!"; - $w == length($$buf) or die "short write: $w != ".length($$buf); +sub atomic_append { # for on-disk destinations (O_APPEND, or O_EXCL) + my ($lei, $buf) = @_; + if (defined(my $w = syswrite($lei->{1} // return, $$buf))) { + return if $w == length($$buf); + $buf = "short atomic write: $w != ".length($$buf); + } elsif ($! == EPIPE) { + return $lei->note_sigpipe(1); } else { - print $fh $$buf or die "print: $!"; + $buf = "atomic write: $!"; } + $lei->fail($buf); } sub eml2mboxrd ($;$) { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^(>*From )/>$1/gm; $$buf .= $eml->{crlf}; @@ -76,8 +94,8 @@ sub eml2mboxrd ($;$) { } sub eml2mboxo { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; $$buf .= $eml->{crlf}; @@ -88,16 +106,24 @@ sub eml2mboxo { $buf; } +sub _mboxcl_common ($$$) { + my ($buf, $bdy, $crlf) = @_; + # add Lines: so mutt won't have to add it on MUA close + my $lines = $$bdy =~ tr!\n!\n!; + $$buf .= 'Content-Length: '.length($$bdy).$crlf. + 'Lines: '.$lines.$crlf.$crlf; + substr($$bdy, 0, 0, $$buf); # prepend header + $_[0] = $bdy; +} + # mboxcl still escapes "From " lines sub eml2mboxcl { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; - $$buf .= 'Content-Length: '.length($$bdy).$crlf.$crlf; - substr($$bdy, 0, 0, $$buf); # prepend header - $buf = $bdy; + _mboxcl_common($buf, $bdy, $crlf); } $$buf .= $crlf; $buf; @@ -105,30 +131,16 @@ sub eml2mboxcl { # mboxcl2 has no "From " escaping sub eml2mboxcl2 { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { - $$buf .= 'Content-Length: '.length($$bdy).$crlf.$crlf; - substr($$bdy, 0, 0, $$buf); # prepend header - $buf = $bdy; + _mboxcl_common($buf, $bdy, $crlf); } $$buf .= $crlf; $buf; } -sub mkmaildir ($) { - my ($maildir) = @_; - for (qw(new tmp cur)) { - my $d = "$maildir/$_"; - next if -d $d; - require File::Path; - if (!File::Path::mkpath($d) && !-d $d) { - die "failed to mkpath($d): $!\n"; - } - } -} - sub git_to_mail { # git->cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; if ($type ne 'blob') { @@ -138,10 +150,11 @@ sub git_to_mail { # git->cat_async callback warn "unexpected type=$type for $oid\n"; } } - if ($size > 0) { - my ($write_cb, $kw) = @$arg; - $write_cb->($bref, $oid, $kw); + my ($write_cb, $smsg) = @$arg; + if ($smsg->{blob} ne $oid) { + die "BUG: expected=$smsg->{blob} got=$oid"; } + $write_cb->($bref, $smsg) if $size > 0; } sub reap_compress { # dwaitpid callback @@ -156,8 +169,7 @@ sub reap_compress { # dwaitpid callback # { foo => '' } means "--foo" is passed to the command-line, # otherwise { foo => '--bar' } passes "--bar" our %zsfx2cmd = ( - gz => [ qw(GZIP pigz gzip), { - rsyncable => '', threads => '-p' } ], + gz => [ qw(GZIP pigz gzip), { rsyncable => '', threads => '-p' } ], bz2 => [ 'bzip2', {} ], xz => [ 'xz', { threads => '-T' } ], # XXX does anybody care for these? I prefer zstd on entire FSes, @@ -198,25 +210,26 @@ sub zsfx2cmd ($$$) { \@cmd; } -sub compress_dst { - my ($out, $zsfx, $lei) = @_; +sub _post_augment_mbox { # open a compressor process + my ($self, $lei) = @_; + my $zsfx = $self->{zsfx} or return; my $cmd = zsfx2cmd($zsfx, undef, $lei); - pipe(my ($r, $w)) or die "pipe: $!"; - my $rdr = { 0 => $r, 1 => $out, 2 => $lei->{2} }; + my ($r, $w) = @{delete $lei->{zpipe}}; + my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2} }; my $pid = spawn($cmd, $lei->{env}, $rdr); - $lei->{"pid.$pid"} = $cmd; my $pp = gensym; - tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $lei; - my $pipe_lk = ($lei->{opt}->{jobs} // 0) > 1 ? - PublicInbox::Lock->new_tmp($zsfx) : undef; - ($pp, $pipe_lk); + my $dup = bless { "pid.$pid" => $cmd }, ref($lei); + $dup->{$_} = $lei->{$_} for qw(2 sock); + tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $dup; + $lei->{1} = $pp; + die 'BUG: unexpected {ovv}->{lock_path}' if $lei->{ovv}->{lock_path}; + $lei->{ovv}->ovv_out_lk_init; } sub decompress_src ($$$) { my ($in, $zsfx, $lei) = @_; my $cmd = zsfx2cmd($zsfx, 1, $lei); - my $rdr = { 0 => $in, 2 => $lei->{2} }; - popen_rd($cmd, $lei->{env}, $rdr); + popen_rd($cmd, $lei->{env}, { 0 => $in, 2 => $lei->{2} }); } sub dup_src ($) { @@ -229,50 +242,266 @@ sub dup_src ($) { sub _augment { # MboxReader eml_cb my ($eml, $lei) = @_; # ignore return value, just populate the skv - $lei->{dedupe_cb}->is_dup($eml); + $lei->{dedupe}->is_dup($eml); } -sub _mbox_write_cb ($$$$) { - my ($cls, $mbox, $dst, $lei) = @_; - my $m = "eml2$mbox"; - my $eml2mbox = $cls->can($m) or die "$cls->$m missing"; - my ($out, $pipe_lk); - open $out, '+>>', $dst or die "open $dst: $!"; - # Perl does SEEK_END even with O_APPEND :< - seek($out, 0, SEEK_SET) or die "seek $dst: $!"; - my $jobs = $lei->{opt}->{jobs} // 0; - my $atomic = $jobs > 1; - my $dedupe = $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei); - state $zsfx_allow = join('|', keys %zsfx2cmd); - my ($zsfx) = ($dst =~ /\.($zsfx_allow)\z/); - if ($lei->{opt}->{augment}) { - if (-s $out && $dedupe->prepare_dedupe) { - my $rd = $zsfx ? decompress_src($out, $zsfx, $lei) : - dup_src($out); - PublicInbox::MboxReader->$mbox($rd, \&_augment, $lei); +sub _mbox_write_cb ($$) { + my ($self, $lei) = @_; + my $ovv = $lei->{ovv}; + my $m = 'eml2'.$ovv->{fmt}; + my $eml2mbox = $self->can($m) or die "$self->$m missing"; + $lei->{1} // die "no stdout ($m, $ovv->{dst})"; # redirected earlier + $lei->{1}->autoflush(1); + my $atomic_append = !defined($ovv->{lock_path}); + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + $eml //= PublicInbox::Eml->new($buf); + return if $dedupe->is_dup($eml, $smsg->{blob}); + $buf = $eml2mbox->($eml, $smsg); + return atomic_append($lei, $buf) if $atomic_append; + my $lk = $ovv->lock_for_scope; + $lei->out($$buf); + } +} + +sub _maildir_each_file ($$;@) { + my ($dir, $cb, @arg) = @_; + for my $d (qw(new/ cur/)) { + my $pfx = $dir.$d; + opendir my $dh, $pfx or next; + while (defined(my $fn = readdir($dh))) { + $cb->($pfx.$fn, @arg) if $fn =~ /:2,[A-Za-z]*\z/; } - $dedupe->pause_dedupe if $jobs; # are we forking? + } +} + +sub _augment_file { # _maildir_each_file cb + my ($f, $lei) = @_; + my $eml = PublicInbox::InboxWritable::eml_from_path($f) or return; + _augment($eml, $lei); +} + +# _maildir_each_file callback, \&CORE::unlink doesn't work with it +sub _unlink { unlink($_[0]) } + +sub _rand () { + state $seq = 0; + sprintf('%x,%x,%x,%x', rand(0xffffffff), time, $$, ++$seq); +} + +sub _buf2maildir { + my ($dst, $buf, $smsg) = @_; + my $kw = $smsg->{kw} // []; + my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw)); + my $rand = ''; # chosen by die roll :P + my ($tmp, $fh, $final, $ok); + my $common = $smsg->{blob} // _rand; + if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" } + do { + $tmp = $dst.'tmp/'.$rand.$common; + } while (!($ok = sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) && + $! == EEXIST && ($rand = _rand.',')); + if ($ok && print $fh $$buf and close($fh)) { + # ignore new/ and write only to cur/, otherwise MUAs + # with R/W access to the Maildir will end up doing + # a mass rename which can take a while with thousands + # of messages. + $dst .= 'cur/'; + $rand = ''; + do { + $final = $dst.$rand.$common.':2,'.$sfx; + } while (!link($tmp, $final) && $! == EEXIST && + ($rand = _rand.',')); + unlink($tmp) or warn "W: failed to unlink $tmp: $!\n"; } else { - truncate($out, 0) or die "truncate $dst: $!"; - $dedupe->prepare_dedupe if !$jobs; + my $err = "Error writing $smsg->{blob} to $dst: $!\n"; + $_[0] = undef; # clobber dst + unlink($tmp); + die $err; } - ($out, $pipe_lk) = compress_dst($out, $zsfx, $lei) if $zsfx; - sub { - my ($buf, $oid, $kw) = @_; - my $eml = PublicInbox::Eml->new($buf); - if (!$lei->{dedupe}->is_dup($eml, $oid)) { - $buf = $eml2mbox->($eml, $kw); - my $lock = $pipe_lk->lock_for_scope if $pipe_lk; - write_in_full($out, $buf, $atomic); - } +} + +sub _maildir_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + my $dst = $lei->{ovv}->{dst}; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + $dst // return $lei->fail; # dst may be undef-ed in last run + $buf //= \($eml->as_string); + return _buf2maildir($dst, $buf, $smsg) if !$dedupe; + $eml //= PublicInbox::Eml->new($$buf); # copy buf + return if $dedupe->is_dup($eml, $smsg->{blob}); + undef $eml; + _buf2maildir($dst, $buf, $smsg); } } sub write_cb { # returns a callback for git_to_mail - my ($cls, $dst, $lei) = @_; - if ($dst =~ s!\A(mbox(?:rd|cl|cl2|o))?:!!) { - _mbox_write_cb($cls, $1, $dst, $lei); + my ($self, $lei) = @_; + # _mbox_write_cb or _maildir_write_cb + my $m = "_$self->{base_type}_write_cb"; + $self->$m($lei); +} + +sub new { + my ($cls, $lei) = @_; + my $fmt = $lei->{ovv}->{fmt}; + my $dst = $lei->{ovv}->{dst}; + my $self = bless {}, $cls; + if ($fmt eq 'maildir') { + $self->{base_type} = 'maildir'; + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + } elsif (substr($fmt, 0, 4) eq 'mbox') { + (-d $dst || (-e _ && !-w _)) and die + "$dst exists and is not a writable file\n"; + $self->can("eml2$fmt") or die "bad mbox --format=$fmt\n"; + $self->{base_type} = 'mbox'; + } else { + die "bad mail --format=$fmt\n"; + } + $self->{dst} = $dst; + $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei); + $self; +} + +sub _pre_augment_maildir {} # noop + +sub _do_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + if ($lei->{opt}->{augment}) { + my $dedupe = $lei->{dedupe}; + if ($dedupe && $dedupe->prepare_dedupe) { + require PublicInbox::InboxWritable; # eml_from_path + _maildir_each_file($dst, \&_augment_file, $lei); + $dedupe->pause_dedupe; + } + } else { # clobber existing Maildir + _maildir_each_file($dst, \&_unlink); + } +} + +sub _post_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + for my $x (qw(tmp new cur)) { + my $d = $dst.$x; + next if -d $d; + require File::Path; + File::Path::mkpath($d); + -d $d or die "$d is not a directory"; + } +} + +sub _pre_augment_mbox { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + if ($dst ne '/dev/stdout') { + my $mode = -p $dst ? '>' : '+>>'; + if (-f _ && !$lei->{opt}->{augment} and !unlink($dst)) { + $! == ENOENT or die "unlink($dst): $!"; + } + open my $out, $mode, $dst or die "open($dst): $!"; + $lei->{old_1} = $lei->{1}; # keep for spawning MUA + $lei->{1} = $out; + } + # Perl does SEEK_END even with O_APPEND :< + $self->{seekable} = seek($lei->{1}, 0, SEEK_SET); + if (!$self->{seekable} && $! != ESPIPE && $dst ne '/dev/stdout') { + die "seek($dst): $!\n"; + } + state $zsfx_allow = join('|', keys %zsfx2cmd); + ($self->{zsfx}) = ($dst =~ /\.($zsfx_allow)\z/) or return; + pipe(my ($r, $w)) or die "pipe: $!"; + $lei->{zpipe} = [ $r, $w ]; +} + +sub _do_augment_mbox { + my ($self, $lei) = @_; + return if !$lei->{opt}->{augment}; + my $dedupe = $lei->{dedupe}; + my $dst = $lei->{ovv}->{dst}; + die "cannot augment $dst, not seekable\n" if !$self->{seekable}; + my $out = $lei->{1}; + if (-s $out && $dedupe && $dedupe->prepare_dedupe) { + my $zsfx = $self->{zsfx}; + my $rd = $zsfx ? decompress_src($out, $zsfx, $lei) : + dup_src($out); + my $fmt = $lei->{ovv}->{fmt}; + require PublicInbox::MboxReader; + PublicInbox::MboxReader->$fmt($rd, \&_augment, $lei); + } + # maybe some systems don't honor O_APPEND, Perl does this: + seek($out, 0, SEEK_END) or die "seek $dst: $!"; + $dedupe->pause_dedupe if $dedupe; +} + +sub pre_augment { # fast (1 disk seek), runs in main daemon + my ($self, $lei) = @_; + # _pre_augment_maildir, _pre_augment_mbox + my $m = "_pre_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub do_augment { # slow, runs in wq worker + my ($self, $lei) = @_; + # _do_augment_maildir, _do_augment_mbox + my $m = "_do_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon + my ($self, $lei, @args) = @_; + # _post_augment_maildir, _post_augment_mbox + my $m = "_post_augment_$self->{base_type}"; + $self->$m($lei, @args); +} + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = delete $self->{lei}; + $lei->lei_atfork_child; + if (my $zpipe = delete $lei->{zpipe}) { + $lei->{1} = $zpipe->[1]; + close $zpipe->[0]; + } + $self->{wcb} = $self->write_cb($lei); + $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + $self->SUPER::ipc_atfork_child; +} + +sub lock_free { + $_[0]->{base_type} =~ /\A(?:maildir|mh|imap|jmap)\z/ ? 1 : 0; +} + +sub poke_dst { + my ($self) = @_; + if ($self->{base_type} eq 'maildir') { + my $t = time + 1; + utime($t, $t, "$self->{dst}/cur"); + } +} + +sub write_mail { # via ->wq_io_do + my ($self, $git_dir, $smsg) = @_; + my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir); + git_async_cat($git, $smsg->{blob}, \&git_to_mail, + [$self->{wcb}, $smsg]); +} + +sub wq_atexit_child { + my ($self) = @_; + delete $self->{wcb}; + for my $git (delete @$self{grep(/\A$$\0/, keys %$self)}) { + $git->async_wait_all; } + $SIG{__WARN__} = 'DEFAULT'; } 1;