1 Received: from localhost (dcvr.yhbt.net [127.0.0.1])
2 by dcvr.yhbt.net (Postfix) with ESMTP id 977481F45A;
3 Sat, 18 Apr 2020 22:25:08 +0000 (UTC)
4 Date: Sat, 18 Apr 2020 22:25:08 +0000
5 From: Eric Wong <e@yhbt.net>
6 To: test@public-inbox.org
7 Subject: Re: embedded message test
8 Message-ID: <20200418222508.GA13918@dcvr>
9 References: <20200418222020.GA2745@dcvr>
11 Content-Type: multipart/mixed; boundary="TB36FDmn/VVEgNH/"
12 Content-Disposition: inline
13 In-Reply-To: <20200418222020.GA2745@dcvr>
17 Content-Type: text/plain; charset=utf-8
18 Content-Disposition: inline
20 testing embedded message harder
23 Content-Type: message/rfc822
24 Content-Disposition: attachment; filename="embed2x.eml"
26 Date: Sat, 18 Apr 2020 22:20:20 +0000
27 From: Eric Wong <e@yhbt.net>
28 To: test@public-inbox.org
29 Subject: embedded message test
30 Message-ID: <20200418222020.GA2745@dcvr>
32 Content-Type: multipart/mixed; boundary="/04w6evG8XlLl3ft"
33 Content-Disposition: inline
36 Content-Type: text/plain; charset=utf-8
37 Content-Disposition: inline
39 testing embedded message
42 Content-Type: message/rfc822
43 Content-Disposition: attachment; filename="test.eml"
45 From: Eric Wong <e@yhbt.net>
47 Subject: [PATCH] mail header experiments
48 Date: Sat, 18 Apr 2020 21:41:14 +0000
49 Message-Id: <20200418214114.7575-1-e@yhbt.net>
51 Content-Transfer-Encoding: 8bit
54 lib/PublicInbox/MailHeader.pm | 55 +++++++++++++++++++++++++++++++++++
55 t/mail_header.t | 31 ++++++++++++++++++++
56 2 files changed, 86 insertions(+)
57 create mode 100644 lib/PublicInbox/MailHeader.pm
58 create mode 100644 t/mail_header.t
60 diff --git a/lib/PublicInbox/MailHeader.pm b/lib/PublicInbox/MailHeader.pm
62 index 00000000..166baf91
64 +++ b/lib/PublicInbox/MailHeader.pm
66 +# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
67 +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
68 +package PublicInbox::MailHeader;
70 +use HTTP::Parser::XS qw(parse_http_response HEADERS_AS_ARRAYREF);
71 +use bytes (); #bytes::length
74 +sub _headerx_to_list {
75 + my (undef, $head, $crlf) = @_;
77 + # picohttpparser uses `int' as the return value, so the
78 + # actual limit is 2GB on most platforms. However, headers
79 + # exceeding (or even close to) 1MB seems unreasonable
80 + die 'headers too big' if bytes::length($$head) > 0x100000;
81 + my ($ret, undef, undef, undef, $headers) =
82 + parse_http_response('HTTP/1.0 1 X'. $crlf . $$head,
83 + HEADERS_AS_ARRAYREF);
84 + die 'failed to parse headers' if $ret <= 0;
85 + # %casemap = map {; lc($_) => $_ } ($$head =~ m/^([^:]+):/gsm);
86 + # my $nr = @$headers;
87 + for (my $i = 0; $i < @$headers; $i += 2) {
88 + my $key = $headers->[$i]; # = $casemap{$headers->[$i]};
89 + my $val = $headers->[$i + 1];
90 + (my $trimmed = $val) =~ s/\r?\n\s+/ /;
91 + $headers->[$i + 1] = [
99 +sub _header_to_list {
100 + my (undef, $head, $crlf) = @_;
101 + my @tmp = ($$head =~ m/^(([^ \t:][^:\n]*):[ \t]*
102 + ([^\n]*\n(?:[ \t]+[^\n]*\n)*))/gsmx);
104 + $#headers = scalar @tmp;
107 + my ($orig, $key, $val) = splice(@tmp, 0, 3);
108 + # my $v = $tmp[$i + 2];
109 + # $v =~ s/\r?\n[ \t]+/ /sg;
110 + # $v =~ s/\r?\n\z//s;
111 + $val =~ s/\n[ \t]+/ /sg;
112 + chomp($val, $orig);
113 + # $val =~ s/\r?\n\z//s;
114 + # $orig =~ s/\r?\n\z//s;
115 + push @headers, $key, [ $val, $orig ];
121 diff --git a/t/mail_header.t b/t/mail_header.t
123 index 00000000..4dc62c50
125 +++ b/t/mail_header.t
127 +# Copyright (C) 2020 all contributors <meta@public-inbox.org>
128 +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
131 +use PublicInbox::TestCommon;
132 +require_mods('PublicInbox::MailHeader');
135 +From d0147582e289fdd4cdd84e91d8b0f8ae9c230124 Mon Sep 17 00:00:00 2001
136 +From: Eric Wong <e@yhbt.net>
137 +Date: Fri, 17 Apr 2020 09:28:49 +0000
138 +Subject: [PATCH] searchthread: reduce indirection by removing container
143 +my $xshdr = PublicInbox::MailHeader->_header_to_list(\$head, "\n");
144 +my $simpl = Email::Simple::Header->_header_to_list(\$head, "\n");
145 +is_deeply($xshdr, $simpl);
146 +use Benchmark qw(:all);
147 +my $res = timethese(100000, {
149 + PublicInbox::MailHeader->_header_to_list(\$head, "\n");
152 + PublicInbox::MailHeader->_header_to_list(\$head, "\n");
156 +use Data::Dumper; diag Dumper($res);