X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FLeiToMail.pm;h=3f65e9e99f5d1f9ac0a2250adb31a934d9f19532;hb=757652fd1ad6843c984610263a2a0b336c974111;hp=be3380065fbda299c6048338572225d9b719d17e;hpb=3dab16e671b344dbfa925ecc640518532a88b16a;p=public-inbox.git diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index be338006..3f65e9e9 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -1,19 +1,27 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ # Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir package PublicInbox::LeiToMail; use strict; use v5.10.1; +use parent qw(PublicInbox::IPC); use PublicInbox::Eml; use PublicInbox::Lock; use PublicInbox::ProcessPipe; use PublicInbox::Spawn qw(which spawn popen_rd); use PublicInbox::LeiDedupe; +use PublicInbox::OnDestroy; +use PublicInbox::Git; +use PublicInbox::GitAsyncCat; use Symbol qw(gensym); use IO::Handle; # ->autoflush use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); -use Errno qw(EEXIST ESPIPE ENOENT); +use Errno qw(EEXIST ESPIPE ENOENT EPIPE); + +# struggles with short-lived repos, Gcf2Client makes little sense with lei; +# but we may use in-process libgit2 in the future. +$PublicInbox::GitAsyncCat::GCF2C = 0; my %kw2char = ( # Maildir characters draft => 'D', @@ -30,10 +38,14 @@ my %kw2status = ( ); sub _mbox_hdr_buf ($$$) { - my ($eml, $type, $kw) = @_; + my ($eml, $type, $smsg) = @_; $eml->header_set($_) for (qw(Lines Bytes Content-Length)); - my %hdr; # set Status, X-Status - for my $k (@$kw) { + + # Messages are always 'O' (non-\Recent in IMAP), it saves + # MUAs the trouble of rewriting the mbox if no other + # changes are made + my %hdr = (Status => [ 'O' ]); # set Status, X-Status + for my $k (@{$smsg->{kw} // []}) { if (my $ent = $kw2status{$k}) { push @{$hdr{$ent->[0]}}, $ent->[1]; } else { # X-Label? @@ -47,26 +59,30 @@ sub _mbox_hdr_buf ($$$) { # fixup old bug from import (pre-a0c07cba0e5d8b6a) $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $ident = $smsg->{blob} // 'lei'; + if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } substr($$buf, 0, 0, # prepend From line - "From lei\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); + "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); $buf; } sub atomic_append { # for on-disk destinations (O_APPEND, or O_EXCL) - my ($fh, $buf) = @_; - defined(my $w = syswrite($fh, $$buf)) or die "write: $!"; - $w == length($$buf) or die "short write: $w != ".length($$buf); -} - -sub _print_full { - my ($fh, $buf) = @_; - print $fh $$buf or die "print: $!"; + my ($lei, $buf) = @_; + if (defined(my $w = syswrite($lei->{1} // return, $$buf))) { + return if $w == length($$buf); + $buf = "short atomic write: $w != ".length($$buf); + } elsif ($! == EPIPE) { + return $lei->note_sigpipe(1); + } else { + $buf = "atomic write: $!"; + } + $lei->fail($buf); } sub eml2mboxrd ($;$) { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^(>*From )/>$1/gm; $$buf .= $eml->{crlf}; @@ -78,8 +94,8 @@ sub eml2mboxrd ($;$) { } sub eml2mboxo { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; $$buf .= $eml->{crlf}; @@ -90,16 +106,24 @@ sub eml2mboxo { $buf; } +sub _mboxcl_common ($$$) { + my ($buf, $bdy, $crlf) = @_; + # add Lines: so mutt won't have to add it on MUA close + my $lines = $$bdy =~ tr!\n!\n!; + $$buf .= 'Content-Length: '.length($$bdy).$crlf. + 'Lines: '.$lines.$crlf.$crlf; + substr($$bdy, 0, 0, $$buf); # prepend header + $_[0] = $bdy; +} + # mboxcl still escapes "From " lines sub eml2mboxcl { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; - $$buf .= 'Content-Length: '.length($$bdy).$crlf.$crlf; - substr($$bdy, 0, 0, $$buf); # prepend header - $buf = $bdy; + _mboxcl_common($buf, $bdy, $crlf); } $$buf .= $crlf; $buf; @@ -107,13 +131,11 @@ sub eml2mboxcl { # mboxcl2 has no "From " escaping sub eml2mboxcl2 { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { - $$buf .= 'Content-Length: '.length($$bdy).$crlf.$crlf; - substr($$bdy, 0, 0, $$buf); # prepend header - $buf = $bdy; + _mboxcl_common($buf, $bdy, $crlf); } $$buf .= $crlf; $buf; @@ -128,10 +150,11 @@ sub git_to_mail { # git->cat_async callback warn "unexpected type=$type for $oid\n"; } } - if ($size > 0) { - my ($write_cb, $kw) = @$arg; - $write_cb->($bref, $oid, $kw); + my ($write_cb, $smsg) = @$arg; + if ($smsg->{blob} ne $oid) { + die "BUG: expected=$smsg->{blob} got=$oid"; } + $write_cb->($bref, $smsg) if $size > 0; } sub reap_compress { # dwaitpid callback @@ -146,8 +169,7 @@ sub reap_compress { # dwaitpid callback # { foo => '' } means "--foo" is passed to the command-line, # otherwise { foo => '--bar' } passes "--bar" our %zsfx2cmd = ( - gz => [ qw(GZIP pigz gzip), { - rsyncable => '', threads => '-p' } ], + gz => [ qw(GZIP pigz gzip), { rsyncable => '', threads => '-p' } ], bz2 => [ 'bzip2', {} ], xz => [ 'xz', { threads => '-T' } ], # XXX does anybody care for these? I prefer zstd on entire FSes, @@ -188,25 +210,26 @@ sub zsfx2cmd ($$$) { \@cmd; } -sub compress_dst { - my ($out, $zsfx, $lei) = @_; +sub _post_augment_mbox { # open a compressor process + my ($self, $lei) = @_; + my $zsfx = $self->{zsfx} or return; my $cmd = zsfx2cmd($zsfx, undef, $lei); - pipe(my ($r, $w)) or die "pipe: $!"; - my $rdr = { 0 => $r, 1 => $out, 2 => $lei->{2} }; + my ($r, $w) = @{delete $lei->{zpipe}}; + my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2} }; my $pid = spawn($cmd, $lei->{env}, $rdr); - $lei->{"pid.$pid"} = $cmd; my $pp = gensym; - tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $lei; - my $pipe_lk = ($lei->{opt}->{jobs} // 0) > 1 ? - PublicInbox::Lock->new_tmp($zsfx) : undef; - ($pp, $pipe_lk); + my $dup = bless { "pid.$pid" => $cmd }, ref($lei); + $dup->{$_} = $lei->{$_} for qw(2 sock); + tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $dup; + $lei->{1} = $pp; + die 'BUG: unexpected {ovv}->{lock_path}' if $lei->{ovv}->{lock_path}; + $lei->{ovv}->ovv_out_lk_init; } sub decompress_src ($$$) { my ($in, $zsfx, $lei) = @_; my $cmd = zsfx2cmd($zsfx, 1, $lei); - my $rdr = { 0 => $in, 2 => $lei->{2} }; - popen_rd($cmd, $lei->{env}, $rdr); + popen_rd($cmd, $lei->{env}, { 0 => $in, 2 => $lei->{2} }); } sub dup_src ($) { @@ -222,50 +245,24 @@ sub _augment { # MboxReader eml_cb $lei->{dedupe}->is_dup($eml); } -sub _mbox_write_cb ($$$$) { - my ($cls, $mbox, $dst, $lei) = @_; - my $m = "eml2$mbox"; - my $eml2mbox = $cls->can($m) or die "$cls->$m missing"; - my ($out, $pipe_lk, $seekable); - # XXX should we support /dev/stdout.gz ? - if ($dst eq '/dev/stdout') { - $out = $lei->{1}; - } else { # TODO: mbox locking (but mairix doesn't...) - my $mode = -p $dst ? '>' : '+>>'; - if (-f _ && !$lei->{opt}->{augment} and !unlink($dst)) { - die "unlink $dst: $!" if $! != ENOENT; - } - open $out, $mode, $dst or die "open $dst: $!"; - # Perl does SEEK_END even with O_APPEND :< - $seekable = seek($out, 0, SEEK_SET); - die "seek $dst: $!\n" if !$seekable && $! != ESPIPE; - } - my $jobs = $lei->{opt}->{jobs} // 0; - state $zsfx_allow = join('|', keys %zsfx2cmd); - my ($zsfx) = ($dst =~ /\.($zsfx_allow)\z/); - my $write = $jobs > 1 && !$zsfx ? \&atomic_append : \&_print_full; - my $dedupe = $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei); - if ($lei->{opt}->{augment}) { - die "cannot augment $dst, not seekable\n" if !$seekable; - if (-s $out && $dedupe->prepare_dedupe) { - my $rd = $zsfx ? decompress_src($out, $zsfx, $lei) : - dup_src($out); - PublicInbox::MboxReader->$mbox($rd, \&_augment, $lei); - } - # maybe some systems don't honor O_APPEND, Perl does this: - seek($out, 0, SEEK_END) or die "seek $dst: $!"; - $dedupe->pause_dedupe if $jobs; # are we forking? - } - $dedupe->prepare_dedupe if !$jobs; - ($out, $pipe_lk) = compress_dst($out, $zsfx, $lei) if $zsfx; +sub _mbox_write_cb ($$) { + my ($self, $lei) = @_; + my $ovv = $lei->{ovv}; + my $m = 'eml2'.$ovv->{fmt}; + my $eml2mbox = $self->can($m) or die "$self->$m missing"; + $lei->{1} // die "no stdout ($m, $ovv->{dst})"; # redirected earlier + $lei->{1}->autoflush(1); + my $atomic_append = !defined($ovv->{lock_path}); + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe; sub { # for git_to_mail - my ($buf, $oid, $kw) = @_; - my $eml = PublicInbox::Eml->new($buf); - if (!$dedupe->is_dup($eml, $oid)) { - $buf = $eml2mbox->($eml, $kw); - my $lock = $pipe_lk->lock_for_scope if $pipe_lk; - $write->($out, $buf); - } + my ($buf, $smsg, $eml) = @_; + $eml //= PublicInbox::Eml->new($buf); + return if $dedupe->is_dup($eml, $smsg->{blob}); + $buf = $eml2mbox->($eml, $smsg); + return atomic_append($lei, $buf) if $atomic_append; + my $lk = $ovv->lock_for_scope; + $lei->out($$buf); } } @@ -289,73 +286,222 @@ sub _augment_file { # _maildir_each_file cb # _maildir_each_file callback, \&CORE::unlink doesn't work with it sub _unlink { unlink($_[0]) } +sub _rand () { + state $seq = 0; + sprintf('%x,%x,%x,%x', rand(0xffffffff), time, $$, ++$seq); +} + sub _buf2maildir { - my ($dst, $buf, $oid, $kw) = @_; + my ($dst, $buf, $smsg) = @_; + my $kw = $smsg->{kw} // []; my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw)); my $rand = ''; # chosen by die roll :P - my ($tmp, $fh, $final); + my ($tmp, $fh, $final, $ok); + my $common = $smsg->{blob} // _rand; + if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" } do { - $tmp = $dst.'tmp/'.$rand."oid=$oid"; - } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) && - $! == EEXIST && ($rand = int(rand 0x7fffffff).',')); - if (print $fh $$buf and close($fh)) { - $dst .= $sfx eq '' ? 'new/' : 'cur/'; + $tmp = $dst.'tmp/'.$rand.$common; + } while (!($ok = sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) && + $! == EEXIST && ($rand = _rand.',')); + if ($ok && print $fh $$buf and close($fh)) { + # ignore new/ and write only to cur/, otherwise MUAs + # with R/W access to the Maildir will end up doing + # a mass rename which can take a while with thousands + # of messages. + $dst .= 'cur/'; $rand = ''; do { - $final = $dst.$rand."oid=$oid:2,$sfx"; + $final = $dst.$rand.$common.':2,'.$sfx; } while (!link($tmp, $final) && $! == EEXIST && - ($rand = int(rand 0x7fffffff).',')); + ($rand = _rand.',')); unlink($tmp) or warn "W: failed to unlink $tmp: $!\n"; } else { - my $err = $!; + my $err = "Error writing $smsg->{blob} to $dst: $!\n"; + $_[0] = undef; # clobber dst unlink($tmp); - die "Error writing $oid to $dst: $err"; + die $err; } } - sub _maildir_write_cb ($$) { - my ($dst, $lei) = @_; - $dst .= '/' unless substr($dst, -1) eq '/'; - my $dedupe = $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei, $dst); - my $jobs = $lei->{opt}->{jobs} // 0; + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + my $dst = $lei->{ovv}->{dst}; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + $dst // return $lei->fail; # dst may be undef-ed in last run + $buf //= \($eml->as_string); + return _buf2maildir($dst, $buf, $smsg) if !$dedupe; + $eml //= PublicInbox::Eml->new($$buf); # copy buf + return if $dedupe->is_dup($eml, $smsg->{blob}); + undef $eml; + _buf2maildir($dst, $buf, $smsg); + } +} + +sub write_cb { # returns a callback for git_to_mail + my ($self, $lei) = @_; + # _mbox_write_cb or _maildir_write_cb + my $m = "_$self->{base_type}_write_cb"; + $self->$m($lei); +} + +sub new { + my ($cls, $lei) = @_; + my $fmt = $lei->{ovv}->{fmt}; + my $dst = $lei->{ovv}->{dst}; + my $self = bless {}, $cls; + if ($fmt eq 'maildir') { + $self->{base_type} = 'maildir'; + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + } elsif (substr($fmt, 0, 4) eq 'mbox') { + (-d $dst || (-e _ && !-w _)) and die + "$dst exists and is not a writable file\n"; + $self->can("eml2$fmt") or die "bad mbox --format=$fmt\n"; + $self->{base_type} = 'mbox'; + } else { + die "bad mail --format=$fmt\n"; + } + $self->{dst} = $dst; + $lei->{dedupe} = PublicInbox::LeiDedupe->new($lei); + $self; +} + +sub _pre_augment_maildir {} # noop + +sub _do_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; if ($lei->{opt}->{augment}) { + my $dedupe = $lei->{dedupe}; if ($dedupe && $dedupe->prepare_dedupe) { require PublicInbox::InboxWritable; # eml_from_path _maildir_each_file($dst, \&_augment_file, $lei); - $dedupe->pause_dedupe if $jobs; # are we forking? + $dedupe->pause_dedupe; } } else { # clobber existing Maildir _maildir_each_file($dst, \&_unlink); } +} + +sub _post_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; for my $x (qw(tmp new cur)) { my $d = $dst.$x; next if -d $d; require File::Path; - if (!File::Path::mkpath($d) && !-d $d) { - die "failed to mkpath($d): $!\n"; + File::Path::mkpath($d); + -d $d or die "$d is not a directory"; + } +} + +sub _pre_augment_mbox { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + if ($dst ne '/dev/stdout') { + my $mode = -p $dst ? '>' : '+>>'; + if (-f _ && !$lei->{opt}->{augment} and !unlink($dst)) { + $! == ENOENT or die "unlink($dst): $!"; } + open my $out, $mode, $dst or die "open($dst): $!"; + $lei->{old_1} = $lei->{1}; # keep for spawning MUA + $lei->{1} = $out; } - $dedupe->prepare_dedupe if $dedupe && !$jobs; - sub { # for git_to_mail - my ($buf, $oid, $kw) = @_; - return _buf2maildir($dst, $buf, $oid, $kw) if !$dedupe; - my $eml = PublicInbox::Eml->new($$buf); # copy buf - return if $dedupe->is_dup($eml, $oid); - undef $eml; - _buf2maildir($dst, $buf, $oid, $kw); + # Perl does SEEK_END even with O_APPEND :< + $self->{seekable} = seek($lei->{1}, 0, SEEK_SET); + if (!$self->{seekable} && $! != ESPIPE && $dst ne '/dev/stdout') { + die "seek($dst): $!\n"; } + state $zsfx_allow = join('|', keys %zsfx2cmd); + ($self->{zsfx}) = ($dst =~ /\.($zsfx_allow)\z/) or return; + pipe(my ($r, $w)) or die "pipe: $!"; + $lei->{zpipe} = [ $r, $w ]; } -sub write_cb { # returns a callback for git_to_mail - my ($cls, $dst, $lei) = @_; - require PublicInbox::LeiDedupe; - if ($dst =~ s!\A(mbox(?:rd|cl|cl2|o))?:!!) { - _mbox_write_cb($cls, $1, $dst, $lei); - } elsif ($dst =~ s!\A[Mm]aildir:!!) { # typically capitalized - _maildir_write_cb($dst, $lei); +sub _do_augment_mbox { + my ($self, $lei) = @_; + return if !$lei->{opt}->{augment}; + my $dedupe = $lei->{dedupe}; + my $dst = $lei->{ovv}->{dst}; + die "cannot augment $dst, not seekable\n" if !$self->{seekable}; + my $out = $lei->{1}; + if (-s $out && $dedupe && $dedupe->prepare_dedupe) { + my $zsfx = $self->{zsfx}; + my $rd = $zsfx ? decompress_src($out, $zsfx, $lei) : + dup_src($out); + my $fmt = $lei->{ovv}->{fmt}; + require PublicInbox::MboxReader; + PublicInbox::MboxReader->$fmt($rd, \&_augment, $lei); + } + # maybe some systems don't honor O_APPEND, Perl does this: + seek($out, 0, SEEK_END) or die "seek $dst: $!"; + $dedupe->pause_dedupe if $dedupe; +} + +sub pre_augment { # fast (1 disk seek), runs in main daemon + my ($self, $lei) = @_; + # _pre_augment_maildir, _pre_augment_mbox + my $m = "_pre_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub do_augment { # slow, runs in wq worker + my ($self, $lei) = @_; + # _do_augment_maildir, _do_augment_mbox + my $m = "_do_augment_$self->{base_type}"; + $self->$m($lei); +} + +sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon + my ($self, $lei, @args) = @_; + # _post_augment_maildir, _post_augment_mbox + my $m = "_post_augment_$self->{base_type}"; + $self->$m($lei, @args); +} + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = delete $self->{lei}; + $lei->lei_atfork_child; + if (my $zpipe = delete $lei->{zpipe}) { + $lei->{1} = $zpipe->[1]; + close $zpipe->[0]; + } + $self->{wcb} = $self->write_cb($lei); + $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + $self->SUPER::ipc_atfork_child; +} + +sub lock_free { + $_[0]->{base_type} =~ /\A(?:maildir|mh|imap|jmap)\z/ ? 1 : 0; +} + +sub poke_dst { + my ($self) = @_; + if ($self->{base_type} eq 'maildir') { + my $t = time + 1; + utime($t, $t, "$self->{dst}/cur"); + } +} + +sub write_mail { # via ->wq_io_do + my ($self, $git_dir, $smsg) = @_; + my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir); + git_async_cat($git, $smsg->{blob}, \&git_to_mail, + [$self->{wcb}, $smsg]); +} + +sub wq_atexit_child { + my ($self) = @_; + delete $self->{wcb}; + for my $git (delete @$self{grep(/\A$$\0/, keys %$self)}) { + $git->async_wait_all; } - # TODO: Maildir, MH, IMAP, JMAP ... + $SIG{__WARN__} = 'DEFAULT'; } 1;