The old name may be confused with "Content-ID" as described in
RFC 2392, so use an alternate name to avoid confusing future
readers.
=head1 OBJECT IDENTIFIERS
=head1 OBJECT IDENTIFIERS
-There are three distinct type of identifiers. content_id is the
+There are three distinct types of identifiers. content_hash is the
new one for v2 and should make message removal and deduplication
easier. object_id and Message-ID are already known.
new one for v2 and should make message removal and deduplication
easier. object_id and Message-ID are already known.
This remains a searchable field in Xapian. Note: it's possible
for emails to have multiple Message-ID headers (and L<git-send-email(1)>
had that bug for a bit); so we take all of them into account.
This remains a searchable field in Xapian. Note: it's possible
for emails to have multiple Message-ID headers (and L<git-send-email(1)>
had that bug for a bit); so we take all of them into account.
-In case of conflicts detected by content_id below, we generate a new
-Message-ID based on content_id; if the generated Message-ID still
+In case of conflicts detected by content_hash below, we generate a new
+Message-ID based on content_hash; if the generated Message-ID still
conflicts, a random one is generated.
conflicts, a random one is generated.
A hash of relevant headers and raw body content for
purging of unwanted content. This is not stored anywhere,
A hash of relevant headers and raw body content for
purging of unwanted content. This is not stored anywhere,
Subject, From, Date, References, In-Reply-To, To, Cc
Subject, From, Date, References, In-Reply-To, To, Cc
-Received, List-Id, and similar headers are NOT part of content_id as
+Received, List-Id, and similar headers are NOT part of content_hash as
they differ across lists and we will want removal to be able to cross
lists.
they differ across lists and we will want removal to be able to cross
lists.
filters (e.g. PublicInbox::Filter::Vger) to clean the body for
imports.
filters (e.g. PublicInbox::Filter::Vger) to clean the body for
imports.
-content_id is SHA-256 for now; but can be changed at any time
+content_hash is SHA-256 for now, but can be changed at any time
without making DB changes.
=back
without making DB changes.
=back
lib/PublicInbox/AltId.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/Config.pm
lib/PublicInbox/AltId.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/Config.pm
-lib/PublicInbox/ContentId.pm
+lib/PublicInbox/ContentHash.pm
lib/PublicInbox/DS.pm
lib/PublicInbox/DSKQXS.pm
lib/PublicInbox/DSPoll.pm
lib/PublicInbox/DS.pm
lib/PublicInbox/DSKQXS.pm
lib/PublicInbox/DSPoll.pm
t/check-www-inbox.perl
t/config.t
t/config_limiter.t
t/check-www-inbox.perl
t/config.t
t/config_limiter.t
t/convert-compact.t
t/data/0001.patch
t/ds-kqxs.t
t/convert-compact.t
t/data/0001.patch
t/ds-kqxs.t
# This is not stored in any database anywhere and may change
# as changes in duplicate detection are needed.
# See L<public-inbox-v2-format(5)> manpage for more details.
# This is not stored in any database anywhere and may change
# as changes in duplicate detection are needed.
# See L<public-inbox-v2-format(5)> manpage for more details.
-package PublicInbox::ContentId;
+package PublicInbox::ContentHash;
use strict;
use warnings;
use base qw/Exporter/;
use strict;
use warnings;
use base qw/Exporter/;
-our @EXPORT_OK = qw/content_id content_digest/;
+our @EXPORT_OK = qw/content_hash content_digest/;
use PublicInbox::MID qw(mids references);
use PublicInbox::MsgIter;
use PublicInbox::MID qw(mids references);
use PublicInbox::MsgIter;
# References: and In-Reply-To: get used interchangeably
# in some "duplicates" in LKML. We treat them the same
# in SearchIdx, so treat them the same for this:
# References: and In-Reply-To: get used interchangeably
# in some "duplicates" in LKML. We treat them the same
# in SearchIdx, so treat them the same for this:
- # do NOT consider the Message-ID as part of the content_id
+ # do NOT consider the Message-ID as part of the content_hash
# if we got here, we've already got Message-ID reuse
my %seen = map { $_ => 1 } @{mids($hdr)};
foreach my $mid (@{references($hdr)}) {
# if we got here, we've already got Message-ID reuse
my %seen = map { $_ => 1 } @{mids($hdr)};
foreach my $mid (@{references($hdr)}) {
content_digest($_[0])->digest;
}
content_digest($_[0])->digest;
}
use PublicInbox::MID qw(mids mid2path);
use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
use PublicInbox::MID qw(mids mid2path);
use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use PublicInbox::MDA;
use PublicInbox::Eml;
use POSIX qw(strftime);
use PublicInbox::MDA;
use PublicInbox::Eml;
use POSIX qw(strftime);
use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::MID qw(mids references);
use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::MID qw(mids references);
-use PublicInbox::ContentId qw(content_id content_digest);
+use PublicInbox::ContentHash qw(content_hash content_digest);
use PublicInbox::Inbox;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Inbox;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
+sub content_hashes ($) {
- my @cids = ( content_id($mime) );
+ my @chashes = ( content_hash($mime) );
# We still support Email::MIME, here, and
# Email::MIME->as_string doesn't always round-trip, so we may
# We still support Email::MIME, here, and
# Email::MIME->as_string doesn't always round-trip, so we may
- # use a second content_id
- my $rt = content_id(PublicInbox::Eml->new(\($mime->as_string)));
- push @cids, $rt if $cids[0] ne $rt;
- \@cids;
+ # use a second content_hash
+ my $rt = content_hash(PublicInbox::Eml->new(\($mime->as_string)));
+ push @chashes, $rt if $chashes[0] ne $rt;
+ \@chashes;
}
sub content_matches ($$) {
}
sub content_matches ($$) {
- my ($cids, $existing) = @_;
- my $cid = content_id($existing);
- foreach (@$cids) {
- return 1 if $_ eq $cid
+ my ($chashes, $existing) = @_;
+ my $chash = content_hash($existing);
+ foreach (@$chashes) {
+ return 1 if $_ eq $chash
$im = $self->importer;
}
my $over = $self->{over};
$im = $self->importer;
}
my $over = $self->{over};
- my $cids = content_ids($old_mime);
+ my $chashes = content_hashes($old_mime);
my @removed;
my $mids = mids($old_mime->header_obj);
# We avoid introducing new blobs into git since the raw content
# can be slightly different, so we do not need the user-supplied
my @removed;
my $mids = mids($old_mime->header_obj);
# We avoid introducing new blobs into git since the raw content
# can be slightly different, so we do not need the user-supplied
- # message now that we have the mids and content_id
+ # message now that we have the mids and content_hash
$old_mime = undef;
my $mark;
$old_mime = undef;
my $mark;
}
my $orig = $$msg;
my $cur = PublicInbox::Eml->new($msg);
}
my $orig = $$msg;
my $cur = PublicInbox::Eml->new($msg);
- if (content_matches($cids, $cur)) {
+ if (content_matches($chashes, $cur)) {
$gone{$smsg->{num}} = [ $smsg, $cur, \$orig ];
}
}
$gone{$smsg->{num}} = [ $smsg, $cur, \$orig ];
}
}
sub content_exists ($$$) {
my ($self, $mime, $mid) = @_;
my $over = $self->{over};
sub content_exists ($$$) {
my ($self, $mime, $mid) = @_;
my $over = $self->{over};
- my $cids = content_ids($mime);
+ my $chashes = content_hashes($mime);
my ($id, $prev);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $msg = get_blob($self, $smsg);
my ($id, $prev);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $msg = get_blob($self, $smsg);
next;
}
my $cur = PublicInbox::Eml->new($msg);
next;
}
my $cur = PublicInbox::Eml->new($msg);
- return 1 if content_matches($cids, $cur);
+ return 1 if content_matches($chashes, $cur);
# XXX DEBUG_DIFF is experimental and may be removed
diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
# XXX DEBUG_DIFF is experimental and may be removed
diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
my $msgref = $git->cat_file($oid);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
my $msgref = $git->cat_file($oid);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
foreach my $mid (@$mids) {
foreach my $mid (@$mids) {
- $sync->{D}->{"$mid\0$cid"} = $oid;
+ $sync->{D}->{"$mid\0$chash"} = $oid;
my $msgref = $git->cat_file($oid, \$len);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
my $msgref = $git->cat_file($oid, \$len);
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
die "BUG: reindex_oid_m called for <=1 mids" if scalar(@$mids) <= 1;
for my $mid (reverse @$mids) {
die "BUG: reindex_oid_m called for <=1 mids" if scalar(@$mids) <= 1;
for my $mid (reverse @$mids) {
- delete($sync->{D}->{"$mid\0$cid"}) and
+ delete($sync->{D}->{"$mid\0$chash"}) and
die "BUG: reindex_oid should handle <$mid> delete";
}
my $over = $self->{over};
die "BUG: reindex_oid should handle <$mid> delete";
}
my $over = $self->{over};
return if $len == 0; # purged
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
return if $len == 0; # purged
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
my $mid = $mids->[0];
# was the file previously marked as deleted?, skip if so
my $mid = $mids->[0];
# was the file previously marked as deleted?, skip if so
- if (delete($sync->{D}->{"$mid\0$cid"})) {
+ if (delete($sync->{D}->{"$mid\0$chash"})) {
if (!$sync->{reindex}) {
$num = $sync->{regen}--;
$self->{mm}->num_highwater($num);
if (!$sync->{reindex}) {
$num = $sync->{regen}--;
$self->{mm}->num_highwater($num);
} else { # multiple MIDs are a weird case:
my $del = 0;
for (@$mids) {
} else { # multiple MIDs are a weird case:
my $del = 0;
for (@$mids) {
- $del += delete($sync->{D}->{"$_\0$cid"}) // 0;
+ $del += delete($sync->{D}->{"$_\0$chash"}) // 0;
}
if ($del) {
unindex_oid_remote($self, $oid, $_) for @$mids;
}
if ($del) {
unindex_oid_remote($self, $oid, $_) for @$mids;
return unless defined $latest;
$self->idx_init($opt); # acquire lock
my $sync = {
return unless defined $latest;
$self->idx_init($opt); # acquire lock
my $sync = {
- D => {}, # "$mid\0$cid" => $oid
+ D => {}, # "$mid\0$chash" => $oid
unindex_range => {}, # EPOCH => oid_old..oid_new
reindex => $opt->{reindex},
-opt => $opt
unindex_range => {}, # EPOCH => oid_old..oid_new
reindex => $opt->{reindex},
-opt => $opt
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
use PublicInbox::AdminEdit;
use File::Temp 0.19 (); # 0.19 for TMPDIR
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
use PublicInbox::AdminEdit;
use File::Temp 0.19 (); # 0.19 for TMPDIR
-use PublicInbox::ContentId qw(content_id);
+use PublicInbox::ContentHash qw(content_hash);
use PublicInbox::MID qw(mid_clean mids);
PublicInbox::Admin::check_require('-index');
use PublicInbox::Eml;
use PublicInbox::MID qw(mid_clean mids);
PublicInbox::Admin::check_require('-index');
use PublicInbox::Eml;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::AdminEdit::check_editable(\@ibxs);
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::AdminEdit::check_editable(\@ibxs);
-my $found = {}; # cid => [ [ibx, smsg] [, [ibx, smsg] ] ]
+my $found = {}; # chash => [ [ibx, smsg] [, [ibx, smsg] ] ]
sub find_mid ($$$) {
my ($found, $mid, $ibxs) = @_;
sub find_mid ($$$) {
my ($found, $mid, $ibxs) = @_;
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $ref = $ibx->msg_by_smsg($smsg);
my $mime = PublicInbox::Eml->new($ref);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $ref = $ibx->msg_by_smsg($smsg);
my $mime = PublicInbox::Eml->new($ref);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
my $tuple = [ $ibx, $smsg ];
my $tuple = [ $ibx, $smsg ];
- push @{$found->{$cid} ||= []}, $tuple
+ push @{$found->{$chash} ||= []}, $tuple
}
PublicInbox::InboxWritable::cleanup($ibx);
}
}
PublicInbox::InboxWritable::cleanup($ibx);
}
die "open($file) failed: $!";
my $mids = mids($mime->header_obj);
find_mid($found, $_, \@ibxs) for (@$mids); # populates $found
die "open($file) failed: $!";
my $mids = mids($mime->header_obj);
find_mid($found, $_, \@ibxs) for (@$mids); # populates $found
- my $cid = content_id($mime);
- my $to_edit = $found->{$cid};
+ my $chash = content_hash($mime);
+ my $to_edit = $found->{$chash};
unless ($to_edit) {
my $nr = scalar(keys %$found);
if ($nr > 0) {
unless ($to_edit) {
my $nr = scalar(keys %$found);
if ($nr > 0) {
- $found = { $cid => $to_edit };
+ $found = { $chash => $to_edit };
my $nhdr = $new_mime->header_obj;
my $ohdr = $old_mime->header_obj;
if (($nhdr->as_string eq $ohdr->as_string) &&
my $nhdr = $new_mime->header_obj;
my $ohdr = $old_mime->header_obj;
if (($nhdr->as_string eq $ohdr->as_string) &&
- (content_id($new_mime) eq content_id($old_mime))) {
+ (content_hash($new_mime) eq content_hash($old_mime))) {
warn "No change detected to:\n", show_cmd($ibx, $smsg);
next unless $opt->{verbose};
warn "No change detected to:\n", show_cmd($ibx, $smsg);
next unless $opt->{verbose};
use strict;
use warnings;
use Test::More;
use strict;
use warnings;
use Test::More;
-use PublicInbox::ContentId qw(content_id);
+use PublicInbox::ContentHash qw(content_hash);
use PublicInbox::Eml;
my $mime = PublicInbox::Eml->new(<<'EOF');
use PublicInbox::Eml;
my $mime = PublicInbox::Eml->new(<<'EOF');
-my $orig = content_id($mime);
-my $reload = content_id(PublicInbox::Eml->new($mime->as_string));
-is($orig, $reload, 'content_id matches after serialization');
+my $orig = content_hash($mime);
+my $reload = content_hash(PublicInbox::Eml->new($mime->as_string));
+is($orig, $reload, 'content_hash matches after serialization');
foreach my $h (qw(From To Cc)) {
my $n = q("Quoted N'Ame" <foo@EXAMPLE.com>);
$mime->header_set($h, "$n");
foreach my $h (qw(From To Cc)) {
my $n = q("Quoted N'Ame" <foo@EXAMPLE.com>);
$mime->header_set($h, "$n");
- my $q = content_id($mime);
- is($mime->header($h), $n, "content_id does not mutate $h:");
+ my $q = content_hash($mime);
+ is($mime->header($h), $n, "content_hash does not mutate $h:");
$mime->header_set($h, 'Quoted N\'Ame <foo@example.com>');
$mime->header_set($h, 'Quoted N\'Ame <foo@example.com>');
- my $nq = content_id($mime);
+ my $nq = content_hash($mime);
is($nq, $q, "quotes ignored in $h:");
}
is($nq, $q, "quotes ignored in $h:");
}
use strict;
use warnings;
use Test::More;
use strict;
use warnings;
use Test::More;
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
use PublicInbox::Eml;
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
use PublicInbox::Eml;
use warnings;
use Test::More;
use PublicInbox::Eml;
use warnings;
use Test::More;
use PublicInbox::Eml;
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentHash qw(content_digest);
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
require_git(2.6);
use File::Path qw(remove_tree);
use PublicInbox::TestCommon;
require_git(2.6);
use warnings;
use Test::More;
use PublicInbox::Eml;
use warnings;
use Test::More;
use PublicInbox::Eml;
-use PublicInbox::ContentId qw(content_digest content_id);
+use PublicInbox::ContentHash qw(content_digest content_hash);
use PublicInbox::TestCommon;
use Cwd qw(abs_path);
require_git(2.6);
use PublicInbox::TestCommon;
use Cwd qw(abs_path);
require_git(2.6);
$im = PublicInbox::V2Writable->new($ibx, {nproc => 2});
is($im->{shards}, 1, 'detected single shard from previous');
my ($mark, $rm_mime, $smsg) = $im->remove($mime, 'test removal');
$im = PublicInbox::V2Writable->new($ibx, {nproc => 2});
is($im->{shards}, 1, 'detected single shard from previous');
my ($mark, $rm_mime, $smsg) = $im->remove($mime, 'test removal');
- is(content_id($rm_mime), content_id($mime),
+ is(content_hash($rm_mime), content_hash($mime),
'removed object returned matches');
ok(defined($mark), 'mark set');
$im->done;
'removed object returned matches');
ok(defined($mark), 'mark set');
$im->done;