=head1 OBJECT IDENTIFIERS
-There are three distinct type of identifiers. content_id is the
+There are three distinct types of identifiers. content_hash is the
new one for v2 and should make message removal and deduplication
easier. object_id and Message-ID are already known.
This remains a searchable field in Xapian. Note: it's possible
for emails to have multiple Message-ID headers (and L<git-send-email(1)>
had that bug for a bit); so we take all of them into account.
-In case of conflicts detected by content_id below, we generate a new
-Message-ID based on content_id; if the generated Message-ID still
+In case of conflicts detected by content_hash below, we generate a new
+Message-ID based on content_hash; if the generated Message-ID still
conflicts, a random one is generated.
-=item content_id
+=item content_hash
A hash of relevant headers and raw body content for
purging of unwanted content. This is not stored anywhere,
Subject, From, Date, References, In-Reply-To, To, Cc
-Received, List-Id, and similar headers are NOT part of content_id as
+Received, List-Id, and similar headers are NOT part of content_hash as
they differ across lists and we will want removal to be able to cross
lists.
filters (e.g. PublicInbox::Filter::Vger) to clean the body for
imports.
-content_id is SHA-256 for now; but can be changed at any time
+content_hash is SHA-256 for now; but can be changed at any time
without making DB changes.
=back
lib/PublicInbox/AltId.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/Config.pm
-lib/PublicInbox/ContentId.pm
+lib/PublicInbox/ContentHash.pm
lib/PublicInbox/DS.pm
lib/PublicInbox/DSKQXS.pm
lib/PublicInbox/DSPoll.pm
t/check-www-inbox.perl
t/config.t
t/config_limiter.t
-t/content_id.t
+t/content_hash.t
t/convert-compact.t
t/data/0001.patch
t/ds-kqxs.t
# This is not stored in any database anywhere and may change
# as changes in duplicate detection are needed.
# See L<public-inbox-v2-format(5)> manpage for more details.
-package PublicInbox::ContentId;
+package PublicInbox::ContentHash;
use strict;
use warnings;
use base qw/Exporter/;
-our @EXPORT_OK = qw/content_id content_digest/;
+our @EXPORT_OK = qw/content_hash content_digest/;
use PublicInbox::MID qw(mids references);
use PublicInbox::MsgIter;
# References: and In-Reply-To: get used interchangeably
# in some "duplicates" in LKML. We treat them the same
# in SearchIdx, so treat them the same for this:
- # do NOT consider the Message-ID as part of the content_id
+ # do NOT consider the Message-ID as part of the content_hash
# if we got here, we've already got Message-ID reuse
my %seen = map { $_ => 1 } @{mids($hdr)};
foreach my $mid (@{references($hdr)}) {
$dig;
}
# Return the result of calling ->digest on the object that
# content_digest() builds for the message passed as the sole argument.
# Callers that need the digest object itself can import content_digest.
# NOTE(review): presumably a raw SHA-256 digest, per the v2 format
# documentation -- confirm against content_digest().
-sub content_id ($) {
+sub content_hash ($) {
content_digest($_[0])->digest;
}
use PublicInbox::MID qw(mids mid2path);
use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use PublicInbox::MDA;
use PublicInbox::Eml;
use POSIX qw(strftime);
use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::MID qw(mids references);
-use PublicInbox::ContentId qw(content_id content_digest);
+use PublicInbox::ContentHash qw(content_hash content_digest);
use PublicInbox::Inbox;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
$rewrites;
}
# Build the list of hashes that may identify $mime.
# The first element is always the hash of $mime as given.  A second
# element is appended only when the message re-parsed from
# $mime->as_string via PublicInbox::Eml hashes to a different value.
# Returns an array ref with one or two elements.
-sub content_ids ($) {
+sub content_hashes ($) {
my ($mime) = @_;
- my @cids = ( content_id($mime) );
+ my @chashes = ( content_hash($mime) );
# We still support Email::MIME, here, and
# Email::MIME->as_string doesn't always round-trip, so we may
- # use a second content_id
- my $rt = content_id(PublicInbox::Eml->new(\($mime->as_string)));
- push @cids, $rt if $cids[0] ne $rt;
- \@cids;
+ # use a second content_hash
+ my $rt = content_hash(PublicInbox::Eml->new(\($mime->as_string)));
+ push @chashes, $rt if $chashes[0] ne $rt;
+ \@chashes;
}
# Return 1 if the hash of the $existing message is string-equal to any
# entry in the array ref of candidate hashes given as the first
# argument; return 0 otherwise.  The hash of $existing is computed
# once, before the loop.
sub content_matches ($$) {
- my ($cids, $existing) = @_;
- my $cid = content_id($existing);
- foreach (@$cids) {
- return 1 if $_ eq $cid
+ my ($chashes, $existing) = @_;
+ my $chash = content_hash($existing);
+ foreach (@$chashes) {
+ return 1 if $_ eq $chash
}
0
}
$im = $self->importer;
}
my $over = $self->{over};
- my $cids = content_ids($old_mime);
+ my $chashes = content_hashes($old_mime);
my @removed;
my $mids = mids($old_mime->header_obj);
# We avoid introducing new blobs into git since the raw content
# can be slightly different, so we do not need the user-supplied
- # message now that we have the mids and content_id
+ # message now that we have the mids and content_hash
$old_mime = undef;
my $mark;
}
my $orig = $$msg;
my $cur = PublicInbox::Eml->new($msg);
- if (content_matches($cids, $cur)) {
+ if (content_matches($chashes, $cur)) {
$gone{$smsg->{num}} = [ $smsg, $cur, \$orig ];
}
}
sub content_exists ($$$) {
my ($self, $mime, $mid) = @_;
my $over = $self->{over};
- my $cids = content_ids($mime);
+ my $chashes = content_hashes($mime);
my ($id, $prev);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $msg = get_blob($self, $smsg);
next;
}
my $cur = PublicInbox::Eml->new($msg);
- return 1 if content_matches($cids, $cur);
+ return 1 if content_matches($chashes, $cur);
# XXX DEBUG_DIFF is experimental and may be removed
diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
my $msgref = $git->cat_file($oid);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
foreach my $mid (@$mids) {
- $sync->{D}->{"$mid\0$cid"} = $oid;
+ $sync->{D}->{"$mid\0$chash"} = $oid;
}
}
my $msgref = $git->cat_file($oid, \$len);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
die "BUG: reindex_oid_m called for <=1 mids" if scalar(@$mids) <= 1;
for my $mid (reverse @$mids) {
- delete($sync->{D}->{"$mid\0$cid"}) and
+ delete($sync->{D}->{"$mid\0$chash"}) and
die "BUG: reindex_oid should handle <$mid> delete";
}
my $over = $self->{over};
return if $len == 0; # purged
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
my $mid = $mids->[0];
# was the file previously marked as deleted?, skip if so
- if (delete($sync->{D}->{"$mid\0$cid"})) {
+ if (delete($sync->{D}->{"$mid\0$chash"})) {
if (!$sync->{reindex}) {
$num = $sync->{regen}--;
$self->{mm}->num_highwater($num);
} else { # multiple MIDs are a weird case:
my $del = 0;
for (@$mids) {
- $del += delete($sync->{D}->{"$_\0$cid"}) // 0;
+ $del += delete($sync->{D}->{"$_\0$chash"}) // 0;
}
if ($del) {
unindex_oid_remote($self, $oid, $_) for @$mids;
return unless defined $latest;
$self->idx_init($opt); # acquire lock
my $sync = {
- D => {}, # "$mid\0$cid" => $oid
+ D => {}, # "$mid\0$chash" => $oid
unindex_range => {}, # EPOCH => oid_old..oid_new
reindex => $opt->{reindex},
-opt => $opt
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
use PublicInbox::AdminEdit;
use File::Temp 0.19 (); # 0.19 for TMPDIR
-use PublicInbox::ContentId qw(content_id);
+use PublicInbox::ContentHash qw(content_hash);
use PublicInbox::MID qw(mid_clean mids);
PublicInbox::Admin::check_require('-index');
use PublicInbox::Eml;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::AdminEdit::check_editable(\@ibxs);
-my $found = {}; # cid => [ [ibx, smsg] [, [ibx, smsg] ] ]
+my $found = {}; # chash => [ [ibx, smsg] [, [ibx, smsg] ] ]
sub find_mid ($$$) {
my ($found, $mid, $ibxs) = @_;
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $ref = $ibx->msg_by_smsg($smsg);
my $mime = PublicInbox::Eml->new($ref);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
my $tuple = [ $ibx, $smsg ];
- push @{$found->{$cid} ||= []}, $tuple
+ push @{$found->{$chash} ||= []}, $tuple
}
PublicInbox::InboxWritable::cleanup($ibx);
}
die "open($file) failed: $!";
my $mids = mids($mime->header_obj);
find_mid($found, $_, \@ibxs) for (@$mids); # populates $found
- my $cid = content_id($mime);
- my $to_edit = $found->{$cid};
+ my $chash = content_hash($mime);
+ my $to_edit = $found->{$chash};
unless ($to_edit) {
my $nr = scalar(keys %$found);
if ($nr > 0) {
}
exit 1;
}
- $found = { $cid => $to_edit };
+ $found = { $chash => $to_edit };
}
my %tmpopt = (
my $nhdr = $new_mime->header_obj;
my $ohdr = $old_mime->header_obj;
if (($nhdr->as_string eq $ohdr->as_string) &&
- (content_id($new_mime) eq content_id($old_mime))) {
+ (content_hash($new_mime) eq content_hash($old_mime))) {
warn "No change detected to:\n", show_cmd($ibx, $smsg);
next unless $opt->{verbose};
use strict;
use warnings;
use Test::More;
-use PublicInbox::ContentId qw(content_id);
+use PublicInbox::ContentHash qw(content_hash);
use PublicInbox::Eml;
my $mime = PublicInbox::Eml->new(<<'EOF');
hello world
EOF
-my $orig = content_id($mime);
-my $reload = content_id(PublicInbox::Eml->new($mime->as_string));
-is($orig, $reload, 'content_id matches after serialization');
+my $orig = content_hash($mime);
+my $reload = content_hash(PublicInbox::Eml->new($mime->as_string));
+is($orig, $reload, 'content_hash matches after serialization');
# For each address header, check two things:
#  1. hashing the message leaves the header value untouched;
#  2. replacing the value with a variant that differs in quoting and
#     in the case of the domain yields the same hash.
foreach my $h (qw(From To Cc)) {
my $n = q("Quoted N'Ame" <foo@EXAMPLE.com>);
$mime->header_set($h, "$n");
- my $q = content_id($mime);
- is($mime->header($h), $n, "content_id does not mutate $h:");
+ my $q = content_hash($mime);
+ is($mime->header($h), $n, "content_hash does not mutate $h:");
$mime->header_set($h, 'Quoted N\'Ame <foo@example.com>');
- my $nq = content_id($mime);
+ my $nq = content_hash($mime);
is($nq, $q, "quotes ignored in $h:");
}
use strict;
use warnings;
use Test::More;
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
use PublicInbox::Eml;
use warnings;
use Test::More;
use PublicInbox::Eml;
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
require_git(2.6);
use warnings;
use Test::More;
use PublicInbox::Eml;
-use PublicInbox::ContentId qw(content_digest content_id);
+use PublicInbox::ContentHash qw(content_digest content_hash);
use PublicInbox::TestCommon;
use Cwd qw(abs_path);
require_git(2.6);
$im = PublicInbox::V2Writable->new($ibx, {nproc => 2});
is($im->{shards}, 1, 'detected single shard from previous');
my ($mark, $rm_mime, $smsg) = $im->remove($mime, 'test removal');
- is(content_id($rm_mime), content_id($mime),
+ is(content_hash($rm_mime), content_hash($mime),
'removed object returned matches');
ok(defined($mark), 'mark set');
$im->done;