# Copyright (C) 2018 all contributors
# License: AGPL-3.0+

# Computes a digest-based identifier for a message from its Message-IDs,
# references, a fixed set of headers and its raw body.
package PublicInbox::ContentId;
use strict;
use warnings;
use base qw/Exporter/;
our @EXPORT_OK = qw/content_id/;
use Carp qw(croak);
use PublicInbox::MID qw(mids references);

# not sure if less-widely supported hash families are worth bothering with
use Digest::SHA;

# Content-* headers are often no-ops, so maybe we don't need them
my @ID_HEADERS = qw(Subject From Date To Cc);

# content_id($mime [, $alg])
#
# $mime - message object; must provide ->header_obj (whose result must
#         provide ->header_raw) and ->body_raw
# $alg  - optional Digest::SHA algorithm, defaults to 256 when false
#
# Returns a string of the form "SHA-<algorithm>:<hexdigest>".
# Croaks if Digest::SHA does not support $alg.
#
# The digest is fed, in order: every Message-ID returned by mids(),
# every reference returned by references() which is not also one of
# those Message-IDs, every raw value of each header in @ID_HEADERS,
# and finally the raw body.
sub content_id ($;$) {
    my ($mime, $alg) = @_;
    $alg ||= 256;

    # Digest::SHA->new gives back undef for an unsupported algorithm;
    # fail here with a clear message instead of an obscure
    # "Can't call method on an undefined value" further down.
    my $dig = Digest::SHA->new($alg)
        or croak "unsupported SHA algorithm: $alg";
    my $hdr = $mime->header_obj;

    # References: and In-Reply-To: get used interchangeably
    # in some "duplicates" in LKML.  We treat them the same
    # in SearchIdx, so treat them the same for this:
    my %seen;
    foreach my $mid (@{mids($hdr)}) {
        $dig->add('mid: '.$mid);
        $seen{$mid} = 1;
    }
    foreach my $mid (@{references($hdr)}) {
        # already accounted for as a Message-ID above
        next if $seen{$mid};
        $dig->add('ref: '.$mid);
    }
    foreach my $h (@ID_HEADERS) {
        my @v = $hdr->header_raw($h);
        $dig->add("$h: $_") foreach @v;
    }
    $dig->add($mime->body_raw);

    # ->algorithm must be read before ->hexdigest, matching the
    # original evaluation order
    return 'SHA-' . $dig->algorithm . ':' . $dig->hexdigest;
}

1;