our %STR_TYPE = (text => 1);
our %STR_SUBTYPE = (plain => 1, html => 1);
+# message/* subtypes we descend into (looked up by lowercased subtype)
+our %MESSAGE_DESCEND = (
+ news => 1, # RFC 1849 (obsolete, but archives are forever)
+ rfc822 => 1, # RFC 2046
+ rfc2822 => 1, # gmime handles this (but not rfc5322)
+ global => 1, # RFC 6532
+);
+
my %re_memo;
sub re_memo ($) {
my ($k) = @_;
}
# returns a queue of sub-parts iff it's worth descending into
-# TODO: descend into message/rfc822 parts (Email::MIME didn't)
sub mp_descend ($$) {
my ($self, $nr) = @_; # or $once for top-level
- my $bnd = ct($self)->{attributes}->{boundary} // return; # single-part
+ my $ct = ct($self);
+ my $type = lc($ct->{type});
+ if ($type eq 'message' && $MESSAGE_DESCEND{lc($ct->{subtype})}) {
+ my $nxt = new(undef, body_raw($self));
+ $self->{-call_cb} = $nxt->{is_submsg} = 1;
+ return [ $nxt ];
+ }
+ return if $type ne 'multipart';
+ my $bnd = $ct->{attributes}->{boundary} // return; # single-part
return if $bnd eq '' || length($bnd) >= $mime_boundary_length_limit;
$bnd = quotemeta($bnd);
+ # this is a multipart message that didn't get descended into in
+ # public-inbox <= 1.5.0, so ensure we call the user callback for
+ # this part to not break PSGI downloads.
+ $self->{-call_cb} = $self->{is_submsg};
+
# "multipart" messages can exist w/o a body
my $bdy = ($nr ? delete($self->{bdy}) : \(body_raw($self))) or return;
# compatibility with Email::MIME
$parts[-1] =~ s/\n\r?\n\z/\n/s if $epilogue_missing;
- @parts = grep /[^ \t\r\n]/s, @parts; # ignore empty parts
+ # ignore empty parts
+ @parts = map { new_sub(undef, \$_) } grep /[^ \t\r\n]/s, @parts;
# Keep "From: someone..." from preamble in old,
# buggy versions of git-send-email, otherwise drop it
# There's also a case where quoted text showed up in the
# preamble
# <20060515162817.65F0F1BBAE@citi.umich.edu>
- unshift(@parts, $pre) if $pre =~ /:/s;
+ unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s;
return \@parts;
}
# "multipart", but no boundary found, treat as single part
my ($self, $cb, $arg, $once) = @_;
my $p = mp_descend($self, $once // 0) or
return $cb->([$self, 0, 0], $arg);
+
+ $cb->([$self, 0, 0], $arg) if $self->{-call_cb}; # rare
+
$p = [ $p, 0 ];
my @s; # our virtual stack
my $nr = 0;
my (undef, @idx) = @$p;
@idx = (join('.', @idx));
my $depth = ($idx[0] =~ tr/././) + 1;
- my $sub = new_sub(undef, \(shift @{$p->[0]}));
+ my $sub = shift @{$p->[0]};
if ($depth < $mime_nesting_limit &&
(my $nxt = mp_descend($sub, $nr))) {
push(@s, $p) if scalar @{$p->[0]};
$p = [ $nxt, @idx, 0 ];
+ $cb->([$sub, $depth, @idx], $arg) if $sub->{-call_cb};
} else { # a leaf node
$cb->([$sub, $depth, @idx], $arg);
}
if ($$bdy =~ /^--\Q$bnd\E--[ \t]*\r?\n(.+)\z/sm) {
$self->{epilogue} = $1;
}
- map { new_sub(undef, \$_) } @$parts;
+ @$parts;
}
sub parts_set {
# times when it should not have been:
# <87llgalspt.fsf@free.fr>
# <200308111450.h7BEoOu20077@mail.osdl.org>
+ # But also do not try this with ->{is_submsg} (message/rfc822),
+ # since a broken multipart/mixed inside a message/rfc822 part
+ # has not been seen in the wild, yet...
if ($err && ($ct =~ m!\btext/\b!i ||
- $ct =~ m!\bmultipart/mixed\b!i)) {
+ (!$part->{is_submsg} &&
+ $ct =~ m!\bmultipart/mixed\b!i) ) ) {
my $cte = $part->header_raw('Content-Transfer-Encoding');
if (defined($cte) && $cte =~ /\b7bit\b/i) {
$s = $part->body;
if (defined $fn && $fn ne '') {
index_text($self, $fn, 1, 'XFN');
}
+ if ($part->{is_submsg}) {
+ my $mids = mids_for_index($part);
+ index_ids($self, $doc, $part, $mids);
+ my $smsg = PublicInbox::Smsg->new($part);
+ index_users($self, $smsg);
+ index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+ }
my ($s, undef) = msg_part_text($part, $ct);
defined $s or return;
}
}
+sub index_ids ($$$$) { # index Message-IDs ($mids) and List-Id headers of $hdr into $doc
+ my ($self, $doc, $hdr, $mids) = @_; # $mids is an array ref of Message-ID strings
+ for my $mid (@$mids) {
+ index_text($self, $mid, 1, 'XM'); # the whole Message-ID as text
+
+ # because too many Message-IDs are prefixed with
+ # "Pine.LNX."...
+ if ($mid =~ /\w{12,}/) {
+ my @long = ($mid =~ /(\w{3,}+)/g); # every word run of 3+ chars
+ index_text($self, join(' ', @long), 1, 'XM');
+ }
+ }
+ $doc->add_boolean_term('Q' . $_) for @$mids; # one boolean term per Message-ID
+ for my $l ($hdr->header_raw('List-Id')) {
+ $l =~ /<([^>]+)>/ or next; # skip values without an <angle-bracketed> id
+ my $lid = $1;
+ $doc->add_boolean_term('G' . $lid);
+ index_text($self, $lid, 1, 'XL'); # probabilistic
+ }
+}
+
sub add_xapian ($$$$) {
my ($self, $mime, $smsg, $mids) = @_;
$smsg->{mime} = $mime; # XXX dangerous
add_val($doc, PublicInbox::Search::DT(), $dt);
my $tg = term_generator($self);
-
$tg->set_document($doc);
index_text($self, $subj, 1, 'S') if $subj;
index_users($self, $smsg);
msg_iter($mime, \&index_xapian, [ $self, $doc ]);
- foreach my $mid (@$mids) {
- index_text($self, $mid, 1, 'XM');
-
- # because too many Message-IDs are prefixed with
- # "Pine.LNX."...
- if ($mid =~ /\w{12,}/) {
- my @long = ($mid =~ /(\w{3,}+)/g);
- index_text($self, join(' ', @long), 1, 'XM');
- }
- }
+ index_ids($self, $doc, $hdr, $mids);
$smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
my $data = $smsg->to_doc_data;
}
}
}
- $doc->add_boolean_term('Q' . $_) foreach @$mids;
- for my $l ($hdr->header_raw('List-Id')) {
- $l =~ /<([^>]+)>/ or next;
- my $lid = $1;
- $doc->add_boolean_term('G' . $lid);
- index_text($self, $lid, 1, 'XL'); # probabilistic
- }
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
use PublicInbox::WwwStream;
use PublicInbox::Reply;
use PublicInbox::ViewDiff qw(flush_diff);
+use PublicInbox::Eml;
use POSIX qw(strftime);
use Time::Local qw(timegm);
use PublicInbox::Smsg qw(subject_normalized);
$_[0]->each_part(\&add_text_body, $_[1], 1);
}
+sub submsg_hdr ($$) { # append selected headers of $eml to the buffer at $ctx->{obuf}
+ my ($ctx, $eml) = @_;
+ my $obfs_ibx = $ctx->{-obfs_ibx}; # when true, header values go through obfuscate_addrs
+ my $rv = $ctx->{obuf}; # scalar ref; output is appended in place
+ $$rv .= "\n";
+ for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) {
+ my @v = $eml->header($h); # list context: every value of this header
+ for my $v (@v) {
+ obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; # NOTE(review): runs before ascii_html; confirm its replacement text survives escaping
+ $v = ascii_html($v); # presumably HTML-escapes the value (helper not visible here)
+ $$rv .= "$h: $v\n";
+ }
+ }
+}
+
sub attach_link ($$$$;$) {
my ($ctx, $ct, $p, $fn, $err) = @_;
my ($part, $depth, $idx) = @$p;
$desc = ascii_html($desc);
$$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]";
$$rv .= "</a>\n";
+
+ submsg_hdr($ctx, $part) if $part->{is_submsg};
+
undef;
}
my ($p, $ctx) = @_;
my $upfx = $ctx->{mhref};
my $ibx = $ctx->{-inbox};
+ my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
# $p - from each_part: [ Email::MIME-like, depth, $idx ]
my ($part, $depth, $idx) = @$p;
my $ct = $part->content_type || 'text/plain';
my ($s, $err) = msg_part_text($part, $ct);
return attach_link($ctx, $ct, $p, $fn) unless defined $s;
+ my $rv = $ctx->{obuf};
+ if ($part->{is_submsg}) {
+ submsg_hdr($ctx, $part);
+ $$rv .= "\n";
+ }
+
# makes no difference to browsers, and don't screw up filename
# link generation in diffs with the extra '%0D'
$s =~ s/\r\n/\n/sg;
# split off quoted and unquoted blocks:
my @sections = PublicInbox::MsgIter::split_quotes($s);
undef $s; # free memory
- my $rv = $ctx->{obuf};
- if (defined($fn) || $depth > 0 || $err) {
+ if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) {
# badly-encoded message with $err? tell the world about it!
attach_link($ctx, $ct, $p, $fn, $err);
$$rv .= "\n";
}
- my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
foreach my $cur (@sections) {
if ($cur =~ /\A>/) {
# we use a <span> here to allow users to specify
'', 'each_part can clobber body');
}
+if ('descend into message/rfc822') { # always-true string; doubles as a label for this group
+ my $eml = eml_load 't/data/message_embed.eml';
+ my @parts; # one [ part, depth, index... ] entry per callback invocation
+ $eml->each_part(sub {
+ my ($part, $level, @ex) = @{$_[0]}; # first callback arg is an array ref
+ push @parts, [ $part, $level, @ex ];
+ });
+ is(scalar(@parts), 6, 'got all parts');
+ like($parts[0]->[0]->body, qr/^testing embedded message harder\n/sm,
+ 'first part found');
+ is_deeply([ @{$parts[0]}[1..2] ], [ 1, '1' ],
+ 'got expected depth and level for part #0');
+ is($parts[1]->[0]->filename, 'embed2x.eml',
+ 'attachment filename found');
+ is_deeply([ @{$parts[1]}[1..2] ], [ 1, '2' ],
+ 'got expected depth and level for part #1');
+ is_deeply([ @{$parts[2]}[1..2] ], [ 2, '2.1' ],
+ 'got expected depth and level for part #2');
+ is_deeply([ @{$parts[3]}[1..2] ], [ 3, '2.1.1' ],
+ 'got expected depth and level for part #3');
+ is_deeply([ @{$parts[4]}[1..2] ], [ 3, '2.1.2' ],
+ 'got expected depth and level for part #4');
+ is($parts[4]->[0]->filename, 'test.eml',
+ 'another attachment filename found');
+ is_deeply([ @{$parts[5]}[1..2] ], [ 4, '2.1.2.1' ],
+ 'got expected depth and level for part #5');
+}
+
# body-less, boundary-less
for my $cls (@classes) {
my $call = 0;
$res = $cb->(GET("/test/$mid/"));
like($res->content, qr/\bhref="2-embed2x\.eml"/s,
'href to message/rfc822 attachment visible');
+ like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s,
+ 'href to nested message/rfc822 attachment visible');
+
$res = $cb->(GET("/test/$mid/2-embed2x.eml"));
my $eml = PublicInbox::Eml->new(\($res->content));
is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ],
'1st attachment is as expected');
is($subs[1]->header('Content-Type'), 'message/rfc822',
'2nd attachment is as expected');
+
+ $res = $cb->(GET("/test/$mid/2.1.2-test.eml"));
+ $eml = PublicInbox::Eml->new(\($res->content));
+ is_deeply([ $eml->header_raw('Message-ID') ],
+ [ '<20200418214114.7575-1-e@yhbt.net>' ],
+ 'nested eml retrieved');
});
}
done_testing();
is_deeply($found, [], 'matched on phrase with l:');
}
+$ibx->with_umask(sub { # search coverage for messages embedded in message/rfc822 parts
+ $rw_commit->();
+ my $doc_id = $rw->add_message(eml_load('t/data/message_embed.eml'));
+ ok($doc_id > 0, 'messages within messages');
+ $rw->commit_txn_lazy;
+ $ro->reopen; # presumably needed so the read-only handle sees the new document
+ my $n_test_eml = $ro->query('n:test.eml');
+ is(scalar(@$n_test_eml), 1, 'got a result');
+ my $n_embed2x_eml = $ro->query('n:embed2x.eml');
+ is_deeply($n_test_eml, $n_embed2x_eml, '.eml filenames searchable'); # both filenames must give the same result
+ for my $m (qw(20200418222508.GA13918@dcvr 20200418222020.GA2745@dcvr
+ 20200418214114.7575-1-e@yhbt.net)) {
+ is($ro->query("m:$m")->[0]->{mid}, # all three IDs must resolve to the same message
+ '20200418222508.GA13918@dcvr', 'probabilistic m:'.$m);
+ is($ro->query("mid:$m")->[0]->{mid},
+ '20200418222508.GA13918@dcvr', 'boolean mid:'.$m);
+ }
+ is($ro->query('dfpost:4dc62c50')->[0]->{mid},
+ '20200418222508.GA13918@dcvr',
+ 'diff search reaches inside message/rfc822');
+ is($ro->query('s:"mail header experiments"')->[0]->{mid},
+ '20200418222508.GA13918@dcvr',
+ 'Subject search reaches inside message/rfc822');
+});
+
done_testing();
1;