use base qw(PublicInbox::Lock);
use 5.010_001;
use PublicInbox::SearchIdxShard;
-use PublicInbox::MIME;
+use PublicInbox::Eml;
use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::MID qw(mids references);
-use PublicInbox::ContentId qw(content_id content_digest);
+use PublicInbox::ContentHash qw(content_hash content_digest);
use PublicInbox::Inbox;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
# Also, shard count may change while -watch is running
# due to "xcpdb --reshard"
if (-d $xpfx) {
- require PublicInbox::Search;
- PublicInbox::Search::load_xapian();
- my $XapianDatabase = $PublicInbox::Search::X{Database};
+ my $XapianDatabase;
foreach my $shard (<$xpfx/*>) {
-d $shard && $shard =~ m!/[0-9]+\z! or next;
+ $XapianDatabase //= do {
+ require PublicInbox::Search;
+ PublicInbox::Search::load_xapian();
+ $PublicInbox::Search::X{Database};
+ };
eval {
$XapianDatabase->new($shard)->close;
$n++;
last_commit => [], # git repo -> commit
};
$self->{shards} = count_shards($self) || nproc_shards($creat);
+ $self->{index_max_size} = $v2ibx->{index_max_size};
bless $self, $class;
}
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
my $n = $self->{transact_bytes} += $smsg->{bytes};
- $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
+ $n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
}
sub _add {
my ($num, $mid0) = v2_num_for($self, $mime);
defined $num or return; # duplicate
- defined $mid0 or die "BUG: $mid0 undefined\n";
+ defined $mid0 or die "BUG: \$mid0 undefined\n";
my $im = $self->importer;
my $smsg = bless { mid => $mid0, num => $num }, 'PublicInbox::Smsg';
my $cmt = $im->add($mime, undef, $smsg); # sets $smsg->{ds|ts|blob}
$rewrites;
}
# content_hashes($mime) -- renamed from content_ids() by this change.
# Returns an array ref holding one or two content hashes for $mime:
#   [0] content_hash() of $mime as given
#   [1] content_hash() of $mime after a serialize/re-parse round trip,
#       present only when it differs from [0]
# The ($) prototype declares a single scalar argument.
-sub content_ids ($) {
+sub content_hashes ($) {
my ($mime) = @_;
- my @cids = ( content_id($mime) );
+ my @chashes = ( content_hash($mime) );
+ # We still support Email::MIME, here, and
# Email::MIME->as_string doesn't always round-trip, so we may
- # use a second content_id
- my $rt = content_id(PublicInbox::MIME->new(\($mime->as_string)));
- push @cids, $rt if $cids[0] ne $rt;
- \@cids;
+ # use a second content_hash
# $rt is the hash of the message re-parsed from its own as_string output
+ my $rt = content_hash(PublicInbox::Eml->new(\($mime->as_string)));
# keep the round-tripped hash only if it is a distinct value
+ push @chashes, $rt if $chashes[0] ne $rt;
+ \@chashes;
}
# content_matches($chashes, $existing)
# $chashes:  array ref of candidate hashes (this change renames it from $cids)
# $existing: message object handed to content_hash()
# Returns 1 if the hash of $existing is string-equal (eq) to any element
# of @$chashes, 0 otherwise.
sub content_matches ($$) {
- my ($cids, $existing) = @_;
- my $cid = content_id($existing);
- foreach (@$cids) {
- return 1 if $_ eq $cid
+ my ($chashes, $existing) = @_;
# hash $existing once, then compare against every candidate
+ my $chash = content_hash($existing);
+ foreach (@$chashes) {
+ return 1 if $_ eq $chash
}
# no candidate matched
0
}
$im = $self->importer;
}
my $over = $self->{over};
- my $cids = content_ids($old_mime);
+ my $chashes = content_hashes($old_mime);
my @removed;
my $mids = mids($old_mime->header_obj);
# We avoid introducing new blobs into git since the raw content
# can be slightly different, so we do not need the user-supplied
- # message now that we have the mids and content_id
+ # message now that we have the mids and content_hash
$old_mime = undef;
my $mark;
next; # continue
}
my $orig = $$msg;
- my $cur = PublicInbox::MIME->new($msg);
- if (content_matches($cids, $cur)) {
+ my $cur = PublicInbox::Eml->new($msg);
+ if (content_matches($chashes, $cur)) {
$gone{$smsg->{num}} = [ $smsg, $cur, \$orig ];
}
}
my $bnote = $self->{bnote} or return;
my $r = $bnote->[0];
while (scalar keys %$barrier) {
- defined(my $l = $r->getline) or die "EOF on barrier_wait: $!";
+ defined(my $l = readline($r)) or die "EOF on barrier_wait: $!";
$l =~ /\Abarrier (\d+)/ or die "bad line on barrier_wait: $l";
delete $barrier->{$1} or die "bad shard[$1] on barrier wait";
}
sub content_exists ($$$) {
my ($self, $mime, $mid) = @_;
my $over = $self->{over};
- my $cids = content_ids($mime);
+ my $chashes = content_hashes($mime);
my ($id, $prev);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
my $msg = get_blob($self, $smsg);
warn "broken smsg for $mid\n";
next;
}
- my $cur = PublicInbox::MIME->new($msg);
- return 1 if content_matches($cids, $cur);
+ my $cur = PublicInbox::Eml->new($msg);
+ return 1 if content_matches($chashes, $cur);
# XXX DEBUG_DIFF is experimental and may be removed
diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
# mark_deleted($self, $sync, $git, $oid)
# Reads blob $oid via $git->cat_file, parses it, and records $oid in
# $sync->{D} once per Message-ID under the key "$mid\0<content hash>".
# This change adds an early return when too_big() is true, and switches
# the parser/hash from PublicInbox::MIME/content_id to
# PublicInbox::Eml/content_hash.
sub mark_deleted ($$$$) {
my ($self, $sync, $git, $oid) = @_;
# NOTE(review): presumably skips blobs over a configured size limit;
# too_big() is defined outside this view -- confirm
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my $msgref = $git->cat_file($oid);
# NOTE(review): the dereferenced string is passed here, while some other
# call sites in this file pass the scalar ref itself -- confirm intent
- my $mime = PublicInbox::MIME->new($$msgref);
+ my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
# one entry per Message-ID; "\0" separates the Message-ID from the hash
foreach my $mid (@$mids) {
- $sync->{D}->{"$mid\0$cid"} = $oid;
+ $sync->{D}->{"$mid\0$chash"} = $oid;
}
}
$self->{current_info} = "multi_mid $oid";
my ($num, $mid0, $len);
my $msgref = $git->cat_file($oid, \$len);
- my $mime = PublicInbox::MIME->new($$msgref);
+ my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
die "BUG: reindex_oid_m called for <=1 mids" if scalar(@$mids) <= 1;
for my $mid (reverse @$mids) {
- delete($sync->{D}->{"$mid\0$cid"}) and
+ delete($sync->{D}->{"$mid\0$chash"}) and
die "BUG: reindex_oid should handle <$mid> delete";
}
my $over = $self->{over};
sub reindex_oid ($$$$) {
my ($self, $sync, $git, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my ($num, $mid0, $len);
my $msgref = $git->cat_file($oid, \$len);
return if $len == 0; # purged
- my $mime = PublicInbox::MIME->new($$msgref);
+ my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
- my $cid = content_id($mime);
+ my $chash = content_hash($mime);
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
my $mid = $mids->[0];
# was the file previously marked as deleted?, skip if so
- if (delete($sync->{D}->{"$mid\0$cid"})) {
+ if (delete($sync->{D}->{"$mid\0$chash"})) {
if (!$sync->{reindex}) {
$num = $sync->{regen}--;
$self->{mm}->num_highwater($num);
} else { # multiple MIDs are a weird case:
my $del = 0;
for (@$mids) {
- $del += delete($sync->{D}->{"$_\0$cid"}) // 0;
+ $del += delete($sync->{D}->{"$_\0$chash"}) // 0;
}
if ($del) {
unindex_oid_remote($self, $oid, $_) for @$mids;
my ($self, $git, $oid, $unindexed) = @_;
my $mm = $self->{mm};
my $msgref = $git->cat_file($oid);
- my $mime = PublicInbox::MIME->new($msgref);
+ my $mime = PublicInbox::Eml->new($msgref);
my $mids = mids($mime->header_obj);
$mime = $msgref = undef;
my $over = $self->{over};
return unless defined $latest;
$self->idx_init($opt); # acquire lock
my $sync = {
- D => {}, # "$mid\0$cid" => $oid
+ D => {}, # "$mid\0$chash" => $oid
unindex_range => {}, # EPOCH => oid_old..oid_new
reindex => $opt->{reindex},
-opt => $opt