use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd);
-use PublicInbox::SearchIdx;
+use PublicInbox::SearchIdx qw(too_big log2stack crlf_adjust is_ancestor);
use IO::Handle; # ->autoflush
use File::Temp qw(tempfile);
total_bytes => 0,
current_info => '',
xpfx => $xpfx,
- over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3", 1),
+ over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3"),
lock_path => "$dir/inbox.lock",
# limit each git repo (epoch) to 1GB or so
rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
- last_commit => [], # git repo -> commit
+ last_commit => [], # git epoch -> commit
};
+ $self->{over}->{-no_sync} = 1 if $v2ibx->{-no_sync};
$self->{shards} = count_shards($self) || nproc_shards($creat);
$self->{index_max_size} = $v2ibx->{index_max_size};
bless $self, $class;
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
- $smsg->{bytes} = $smsg->{raw_bytes} +
- PublicInbox::SearchIdx::crlf_adjust($$msgref);
+ $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
$self->{over}->add_overview($mime, $smsg);
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
# Now that all subprocesses are up, we can open the FDs
# for SQLite:
my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
- "$self->{ibx}->{inboxdir}/msgmap.sqlite3", 1);
+ "$self->{ibx}->{inboxdir}/msgmap.sqlite3",
+ $self->{ibx}->{-no_sync} ? 2 : 1);
$mm->{dbh}->begin_work;
}
sub atfork_child {
my ($self) = @_;
- my $fh = delete $self->{reindex_pipe};
- close $fh if $fh;
if (my $shards = $self->{idx_shards}) {
$_->atfork_child foreach @$shards;
}
$self->{bnote}->[1];
}
-sub reindex_checkpoint ($$$) {
- my ($self, $sync, $git) = @_;
+sub reindex_checkpoint ($$) {
+ my ($self, $sync) = @_;
- $git->cleanup;
$sync->{mm_tmp}->atfork_prepare;
$self->done; # release lock
if (my $pr = $sync->{-opt}->{-progress}) {
- my ($bn) = (split('/', $git->{git_dir}))[-1];
- $pr->("$bn ".sprintf($sync->{-regen_fmt}, $sync->{nr}));
+ $pr->(sprintf($sync->{-regen_fmt}, $sync->{nr}));
}
# allow -watch or -mda to write...
$sync->{mm_tmp}->atfork_parent;
}
-sub reindex_oid ($$$$) {
- my ($self, $sync, $git, $oid) = @_;
- return if PublicInbox::SearchIdx::too_big($self, $oid);
+sub reindex_oid ($$$) {
+ my ($self, $sync, $oid) = @_;
+ return if too_big($self, $oid);
my ($num, $mid0, $len);
- my $msgref = $git->cat_file($oid, \$len);
+ my $msgref = $self->{ibx}->git->cat_file($oid, \$len);
return if $len == 0; # purged
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
}, 'PublicInbox::Smsg';
$smsg->populate($mime, $sync);
if (do_idx($self, $msgref, $mime, $smsg)) {
- reindex_checkpoint($self, $sync, $git);
+ reindex_checkpoint($self, $sync);
}
}
$heads;
}
-*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
-
# returns a revision range for git-log(1)
sub log_range ($$$$$) {
my ($self, $sync, $git, $i, $tip) = @_;
$range;
}
-sub prepare_range_stack {
- my ($git, $sync, $range) = @_;
- # Don't bump num_highwater on --reindex by using {D}.
- # We intentionally do NOT use {D} in the non-reindex case because
- # we want NNTP article number gaps from unindexed messages to
- # show up in mirrors, too.
- my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
-
- my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
- --no-notes --no-color --no-renames --no-abbrev),
- $range);
- my ($at, $ct, $stk);
- while (<$fh>) {
- if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
- ($at, $ct) = ($1 + 0, $2 + 0);
- $stk //= PublicInbox::IdxStack->new($3);
- } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\td$/o) {
- my $oid = $1;
- if ($D) { # reindex case
- $D->{pack('H*', $oid)}++;
- } else { # non-reindex case:
- $stk->push_rec('d', $at, $ct, $oid);
- }
- } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o) {
- my $oid = $1;
- if ($D) {
- my $oid_bin = pack('H*', $oid);
- my $nr = --$D->{$oid_bin};
- delete($D->{$oid_bin}) if $nr <= 0;
-
- # nr < 0 (-1) means it never existed
- $stk->push_rec('m', $at, $ct, $oid) if $nr < 0;
- } else {
- $stk->push_rec('m', $at, $ct, $oid);
- }
- }
- }
- close $fh or die "git log failed: \$?=$?";
- $stk ? $stk->read_prepare : undef;
-}
-
sub sync_prepare ($$$) {
my ($self, $sync, $epoch_max) = @_;
my $pr = $sync->{-opt}->{-progress};
my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex};
for (my $i = $epoch_max; $i >= 0; $i--) {
- die 'BUG: already indexing!' if $self->{reindex_pipe};
my $git_dir = git_dir_n($self, $i);
-d $git_dir or next; # missing epochs are fine
my $git = PublicInbox::Git->new($git_dir);
my $range = log_range($self, $sync, $git, $i, $tip) or next;
# can't use 'rev-list --count' if we use --diff-filter
$pr->("$i.git counting $range ... ") if $pr;
- my $stk = prepare_range_stack($git, $sync, $range);
+ # Use {D} so that --reindex does not bump num_highwater.
+ # We intentionally do NOT use {D} in the non-reindex case
+ # because we want NNTP article number gaps from unindexed
+ # messages to show up in mirrors, too.
+ $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
+ my $stk = log2stack($sync, $git, $range, $self->{ibx});
my $nr = $stk ? $stk->num_records : 0;
$pr->("$nr\n") if $pr;
$sync->{stacks}->[$i] = $stk if $stk;
# our code and blindly injects "d" file history into git repos
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
- my $git = $self->{ibx}->git;
for my $oid (@leftovers) {
$oid = unpack('H*', $oid);
$self->{current_info} = "leftover $oid";
- unindex_oid($self, $git, $oid);
+ unindex_oid($self, $oid);
}
- $git->cleanup;
}
return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
}
}
-sub unindex_oid ($$$;$) {
- my ($self, $git, $oid, $unindexed) = @_;
+sub unindex_oid ($$;$) {
+ my ($self, $oid, $unindexed) = @_;
my $mm = $self->{mm};
- my $msgref = $git->cat_file($oid);
+ my $msgref = $self->{ibx}->git->cat_file($oid);
my $mime = PublicInbox::Eml->new($msgref);
my $mids = mids($mime->header_obj);
$mime = $msgref = undef;
}
}
+# this is rare; it only happens when we get discontiguous history in
+# a mirror because the source used -purge or -edit
sub unindex ($$$$) {
my ($self, $sync, $git, $unindex_range) = @_;
my $unindexed = $self->{unindexed} ||= {}; # $mid0 => $num
# order does not matter, here:
my @cmd = qw(log --raw -r
--no-notes --no-color --no-abbrev --no-renames);
- my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $unindex_range);
+ my $fh = $git->popen(@cmd, $unindex_range);
while (<$fh>) {
/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next;
- unindex_oid($self, $git, $1, $unindexed);
+ unindex_oid($self, $1, $unindexed);
}
- delete $self->{reindex_pipe};
close $fh or die "git log failed: \$?=$?";
return unless $sync->{-opt}->{prune};
my ($self, $sync, $i) = @_;
my $git_dir = git_dir_n($self, $i);
- die 'BUG: already reindexing!' if $self->{reindex_pipe};
-d $git_dir or return; # missing epochs are fine
my $git = PublicInbox::Git->new($git_dir);
- if (my $unindex_range = delete $sync->{unindex_range}->{$i}) {
+ if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare
unindex($self, $sync, $git, $unindex_range);
}
defined(my $stk = $sync->{stacks}->[$i]) or return;
if ($f eq 'm') {
$sync->{autime} = $at;
$sync->{cotime} = $ct;
- reindex_oid($self, $sync, $git, $oid);
+ reindex_oid($self, $sync, $oid);
} elsif ($f eq 'd') {
- unindex_oid($self, $git, $oid);
+ unindex_oid($self, $oid);
}
}
delete @$sync{qw(autime cotime)};