From 41654a8cd9372c0640c4ca5339e5881927965e41 Mon Sep 17 00:00:00 2001
From: "Eric Wong (Contractor, The Linux Foundation)"
Date: Fri, 23 Mar 2018 01:54:16 +0000
Subject: [PATCH] www: $MESSAGE_ID/raw endpoint supports "duplicates"

Since v2 supports duplicate messages, we need to support
looking up different messages with the same Message-Id.
Fortunately, our "raw" endpoint has always been mboxrd,
so users won't need to change their parsing tools.
---
 MANIFEST                  |   1 +
 lib/PublicInbox/Mbox.pm   |  71 ++++++++++++++++++++----
 lib/PublicInbox/Search.pm |   1 +
 lib/PublicInbox/WWW.pm    |   3 +-
 t/psgi_v2.t               | 110 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 175 insertions(+), 11 deletions(-)
 create mode 100644 t/psgi_v2.t

diff --git a/MANIFEST b/MANIFEST
index 0f889959..8b2b10bd 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -171,6 +171,7 @@ t/psgi_attach.t
 t/psgi_mount.t
 t/psgi_search.t
 t/psgi_text.t
+t/psgi_v2.t
 t/qspawn.t
 t/reply.t
 t/search-thr-index.t
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 84cc3845..79e09a70 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -26,12 +26,68 @@ sub subject_fn ($) {
 	$fn eq '' ? 'no-subject' : $fn;
 }
 
-sub emit1 {
-	my ($ctx, $msg) = @_;
-	$msg = Email::Simple->new($msg);
-	my $fn = subject_fn($msg);
+sub smsg_for ($$$) {
+	my ($head, $db, $mid) = @_;
+	my $doc_id = $head->get_docid;
+	my $doc = $db->get_document($doc_id);
+	PublicInbox::SearchMsg->wrap($doc, $mid)->load_expand;
+}
+
+sub mb_stream {
+	my ($more) = @_;
+	bless $more, 'PublicInbox::Mbox';
+}
+
+# called by PSGI server as body response
+sub getline {
+	my ($more) = @_; # self
+	my ($ctx, $head, $tail, $db, $cur) = @$more;
+	if ($cur) {
+		pop @$more;
+		return msg_str($ctx, $cur);
+	}
+	for (; !defined($cur) && $head != $tail; $head++) {
+		my $smsg = smsg_for($head, $db, $ctx->{mid});
+		next if $smsg->type ne 'mail';
+		my $mref = $ctx->{-inbox}->msg_by_smsg($smsg) or next;
+		$cur = Email::Simple->new($mref);
+		$cur = msg_str($ctx, $cur);
+	}
+	$more->[1] = $head;
+	$cur;
+}
+
+sub close {} # noop
+
+sub emit_raw {
+	my ($ctx) = @_;
+	my $mid = $ctx->{mid};
+	my $ibx = $ctx->{-inbox};
+	my $first;
+	my $more;
+	my ($head, $tail, $db);
+	my %seen;
+	if (my $srch = $ibx->search) {
+		$srch->retry_reopen(sub {
+			($head, $tail, $db) = $srch->each_smsg_by_mid($mid);
+			for (; !defined($first) && $head != $tail; $head++) {
+				my $smsg = smsg_for($head, $db, $mid);
+				next if $smsg->type ne 'mail';
+				my $mref = $ibx->msg_by_smsg($smsg) or next;
+				$first = Email::Simple->new($mref);
+			}
+			if ($head != $tail) {
+				$more = [ $ctx, $head, $tail, $db, $first ];
+			}
+		});
+	} else {
+		my $mref = $ibx->msg_by_mid($mid) or return;
+		$first = Email::Simple->new($mref);
+	}
+	return unless defined $first;
+	my $fn = subject_fn($first);
 	my @hdr = ('Content-Type');
-	if ($ctx->{-inbox}->{obfuscate}) {
+	if ($ibx->{obfuscate}) {
 		# obfuscation is stupid, but maybe scrapers are, too...
 		push @hdr, 'application/mbox';
 		$fn .= '.mbox';
@@ -40,10 +96,7 @@ sub emit1 {
 		$fn .= '.txt';
 	}
 	push @hdr, 'Content-Disposition', "inline; filename=$fn";
-
-	# single message should be easily renderable in browsers,
-	# unless obfuscation is enabled :<
-	[ 200, \@hdr, [ msg_str($ctx, $msg) ] ]
+	[ 200, \@hdr, $more ? mb_stream($more) : [ msg_str($ctx, $first) ] ];
 }
 
 sub msg_str {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index f08b9870..24600ee7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -417,6 +417,7 @@ sub each_smsg_by_mid {
 	my $term = 'Q' . $mid;
 	my $head = $db->postlist_begin($term);
 	my $tail = $db->postlist_end($term);
+	return ($head, $tail, $db) if wantarray;
 	for (; $head->nequal($tail); $head->inc) {
 		my $doc_id = $head->get_docid;
 		my $doc = $db->get_document($doc_id);
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index e95fba08..f86363c6 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -218,9 +218,8 @@ sub mid2blob {
 # /$INBOX/$MESSAGE_ID/raw -> raw mbox
 sub get_mid_txt {
 	my ($ctx) = @_;
-	my $x = mid2blob($ctx) or return r404($ctx);
 	require PublicInbox::Mbox;
-	PublicInbox::Mbox::emit1($ctx, $x);
+	PublicInbox::Mbox::emit_raw($ctx) || r404($ctx);
 }
 
 # /$INBOX/$MESSAGE_ID/ -> HTML content (short quotes)
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
new file mode 100644
index 00000000..5d089dbe
--- /dev/null
+++ b/t/psgi_v2.t
@@ -0,0 +1,110 @@
+# Copyright (C) 2018 all contributors
+# License: AGPL-3.0+
+use strict;
+use warnings;
+use Test::More;
+use File::Temp qw/tempdir/;
+use PublicInbox::MIME;
+use PublicInbox::Config;
+use PublicInbox::WWW;
+my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test
+		URI::Escape Plack::Builder);
+foreach my $mod (@mods) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for psgi_v2_dupes.t" if $@;
+}
+use_ok($_) for @mods;
+use_ok 'PublicInbox::V2Writable';
+my $mainrepo = tempdir('pi-v2_dupes-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $ibx = {
+	mainrepo => $mainrepo,
+	name => 'test-v2writable',
+	version => 2,
+	-primary_address => 'test@example.com',
+};
+$ibx = PublicInbox::Inbox->new($ibx);
+my $new_mid;
+
+my $im = PublicInbox::V2Writable->new($ibx, 1);
+$im->{parallel} = 0;
+
+my $mime = PublicInbox::MIME->create(
+	header => [
+		From => 'a@example.com',
+		To => 'test@example.com',
+		Subject => 'this is a subject',
+		'Message-ID' => '<a-mid@b>',
+		Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+	],
+	body => "hello world\n",
+);
+ok($im->add($mime), 'added one message');
+$mime->body_set("hello world!\n");
+
+my @warn;
+local $SIG{__WARN__} = sub { push @warn, @_ };
+ok($im->add($mime), 'added duplicate-but-different message');
+is(scalar(@warn), 1, 'got one warning');
+my @mids = $mime->header_obj->header_raw('Message-Id');
+$new_mid = PublicInbox::MID::mid_clean($mids[0]);
+$im->done;
+
+my $cfgpfx = "publicinbox.v2test";
+my %cfg = (
+	"$cfgpfx.address" => $ibx->{-primary_address},
+	"$cfgpfx.mainrepo" => $mainrepo,
+);
+
+my $config = PublicInbox::Config->new({ %cfg });
+my $www = PublicInbox::WWW->new($config);
+my ($res, $raw, @from_);
+test_psgi(sub { $www->call(@_) }, sub {
+	my ($cb) = @_;
+	$res = $cb->(GET('/v2test/a-mid@b/raw'));
+	$raw = $res->content;
+	like($raw, qr/^hello world$/m, 'got first message');
+	like($raw, qr/^hello world!$/m, 'got second message');
+	@from_ = ($raw =~ m/^From /mg);
+	is(scalar(@from_), 2, 'two From_ lines');
+
+	$res = $cb->(GET("/v2test/$new_mid/raw"));
+	$raw = $res->content;
+	like($raw, qr/^hello world!$/m, 'second message with new Message-Id');
+	@from_ = ($raw =~ m/^From /mg);
+	is(scalar(@from_), 1, 'only one From_ line');
+});
+
+$mime->header_set('Message-Id', 'a-mid@b');
+$mime->body_set("hello ghosts\n");
+ok($im->add($mime), 'added 3rd duplicate-but-different message');
+is(scalar(@warn), 2, 'got another warning');
+like($warn[0], qr/mismatched/, 'warned about mismatched messages');
+is($warn[0], $warn[1], 'both warnings are the same');
+
+@mids = $mime->header_obj->header_raw('Message-Id');
+my $third = PublicInbox::MID::mid_clean($mids[0]);
+$im->done;
+
+# need to reload...
+$config = PublicInbox::Config->new({ %cfg });
+$www = PublicInbox::WWW->new($config);
+test_psgi(sub { $www->call(@_) }, sub {
+	my ($cb) = @_;
+	$res = $cb->(GET("/v2test/$third/raw"));
+	$raw = $res->content;
+	like($raw, qr/^hello ghosts$/m, 'got third message');
+	@from_ = ($raw =~ m/^From /mg);
+	is(scalar(@from_), 1, 'one From_ line');
+
+	$res = $cb->(GET('/v2test/a-mid@b/raw'));
+	$raw = $res->content;
+	like($raw, qr/^hello world$/m, 'got first message');
+	like($raw, qr/^hello world!$/m, 'got second message');
+	like($raw, qr/^hello ghosts$/m, 'got third message');
+	@from_ = ($raw =~ m/^From /mg);
+	is(scalar(@from_), 3, 'three From_ lines');
+});
+
+done_testing();
+
+1;
-- 
2.44.0