lib/PublicInbox/ContentId.pm | 16 +++++++++++++++- diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm index 65d5a76c09b43f4dec2edb08998c22ffe6d6e95e..7ec638ca5ef2ece2eb95912d126338e22f851d3b 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentId.pm @@ -11,7 +11,7 @@ # not sure if less-widely supported hash families are worth bothering with use Digest::SHA; # Content-* headers are often no-ops, so maybe we don't need them -my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To); +my @ID_HEADERS = qw(Subject From Date To Cc); sub content_id ($;$) { my ($mime, $alg) = @_; @@ -19,6 +19,20 @@ $alg ||= 256; my $dig = Digest::SHA->new($alg); my $hdr = $mime->header_obj; + # References: and In-Reply-To: get used interchangeably + # in some "duplicates" in LKML. We treat them the same + # in SearchIdx, so treat them the same for this: + my @mid = $hdr->header_raw('Message-ID'); + @mid = (join(' ', @mid) =~ /<([^>]+)>/g); + my $refs = join(' ', $hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + my @refs = ($refs =~ /<([^>]+)>/g); + my %seen; + foreach my $mid (@mid, @refs) { + next if $seen{$mid}; + $dig->add($mid); + $seen{$mid} = 1; + } foreach my $h (@ID_HEADERS) { my @v = $hdr->header_raw($h); $dig->add($_) foreach @v;