From 4030525cb228eb3837f5260637bd7a5a861e81e2 Mon Sep 17 00:00:00 2001
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
Date: Sat, 3 Mar 2018 05:14:33 +0000
Subject: [PATCH] mid: be strict with References, but loose on Message-Id

Traditionally we've been lax when parsing Message-Id and
allowed it without the angle brackets.  We've always been
strict on References and can't let that list become
pointlessly large when some MUA decides to use HTML-escaped
angle brackets ("&lt;", "&gt;").
---
 lib/PublicInbox/MID.pm | 45 +++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 4ccb704d..96085399 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -49,16 +49,39 @@ sub mid2path {
 
 sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') }
 
-sub uniq_mids {
-	my ($hdr, @fields) = @_;
-	my %seen;
-	my @raw;
-	foreach my $f (@fields) {
-		push @raw, $hdr->header_raw($f);
+sub mids ($) {
+	my ($hdr) = @_;
+	my @mids;
+	my @v = $hdr->header_raw('Message-Id');
+	foreach my $v (@v) {
+		my @cur = ($v =~ /<([^>]+)>/sg);
+		if (@cur) {
+			push(@mids, @cur);
+		} else {
+			push(@mids, $v);
+		}
 	}
-	my @mids = (join(' ', @raw) =~ /<([^>]+)>/g);
-	my $mids = scalar(@mids) == 0 ? \@raw: \@mids;
+	uniq_mids(\@mids);
+}
+
+# The last References entry should match In-Reply-To (IRT), but some mail
+# clients do things out of order, so trust IRT over References iff IRT exists
+sub references ($) {
+	my ($hdr) = @_;
+	my @mids;
+	foreach my $f (qw(References In-Reply-To)) {
+		my @v = $hdr->header_raw($f);
+		foreach my $v (@v) {
+			push(@mids, ($v =~ /<([^>]+)>/sg));
+		}
+	}
+	uniq_mids(\@mids);
+}
+
+sub uniq_mids ($) {
+	my ($mids) = @_;
 	my @ret;
+	my %seen;
 	foreach (@$mids) {
 		next if $seen{$_};
 		push @ret, $_;
@@ -67,12 +90,6 @@ sub uniq_mids {
 	\@ret;
 }
 
-sub mids { uniq_mids($_[0], 'Message-Id') }
-
-# last References should be IRT, but some mail clients do things
-# out of order, so trust IRT over References iff IRT exists
-sub references { uniq_mids($_[0], 'References', 'In-Reply-To') }
-
 # RFC3986, section 3.3:
 sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' }
 sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) }
-- 
2.51.0