X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FContentHash.pm;h=bacc9cdda12498abbb0ada5d2a2e2faec10190f2;hb=23af251dd607c4e75ab1e68063f2c885c48cc035;hp=420dc5e7c92d28c05cfa183cf4059d5258afd786;hpb=098fecd1fe516a00fbfd49622b82be382ebcdab6;p=public-inbox.git diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index 420dc5e7..bacc9cdd 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # Unstable internal API. @@ -8,9 +8,9 @@ # See L manpage for more details. package PublicInbox::ContentHash; use strict; -use warnings; -use base qw/Exporter/; -our @EXPORT_OK = qw/content_hash content_digest/; +use v5.10.1; +use parent qw(Exporter); +our @EXPORT_OK = qw(content_hash content_digest git_sha); use PublicInbox::MID qw(mids references); use PublicInbox::MsgIter; @@ -20,6 +20,7 @@ use Digest::SHA; sub digest_addr ($$$) { my ($dig, $h, $v) = @_; $v =~ tr/"//d; + $v =~ tr/\r\n\t / /s; $v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge; utf8::encode($v); $dig->add("$h\0$v\0"); @@ -52,30 +53,28 @@ sub content_dig_i { $dig->add($s); } -sub content_digest ($) { - my ($mime) = @_; - my $dig = Digest::SHA->new(256); - my $hdr = $mime->header_obj; +sub content_digest ($;$) { + my ($eml, $dig) = @_; + $dig //= Digest::SHA->new(256); # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same # in SearchIdx, so treat them the same for this: # do NOT consider the Message-ID as part of the content_hash # if we got here, we've already got Message-ID reuse - my %seen = map { $_ => 1 } @{mids($hdr)}; - foreach my $mid (@{references($hdr)}) { + my %seen = map { $_ => 1 } @{mids($eml)}; + foreach my $mid (@{references($eml)}) { $dig->add("ref\0$mid\0") unless $seen{$mid}++; } # Only use Sender: if From is not present foreach my $h (qw(From Sender)) { - my @v = $hdr->header($h); - if (@v) { - digest_addr($dig, $h, $_) foreach @v; - } + my @v = $eml->header($h) or next; + digest_addr($dig, $h, $_) foreach @v; + last; } foreach my $h (qw(Subject Date)) { - my @v = $hdr->header($h); + my @v = $eml->header($h); foreach my $v (@v) { utf8::encode($v); $dig->add("$h\0$v\0"); @@ -85,10 +84,10 @@ sub content_digest ($) { # not in the original message. For the purposes of deduplication, # do not take it into account: foreach my $h (qw(To Cc)) { - my @v = $hdr->header($h); + my @v = $eml->header($h); digest_addr($dig, $h, $_) foreach @v; } - msg_iter($mime, \&content_dig_i, $dig); + msg_iter($eml, \&content_dig_i, $dig); $dig; } @@ -96,4 +95,13 @@ sub content_hash ($) { content_digest($_[0])->digest; } +sub git_sha ($$) { + my ($n, $eml) = @_; + my $dig = Digest::SHA->new($n); + my $bref = ref($eml) eq 'SCALAR' ? $eml : \($eml->as_string); + $dig->add('blob '.length($$bref)."\0"); + $dig->add($$bref); + $dig; +} + 1;