my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION;
my $self = {
- -inbox => $v2ibx,
+ ibx => $v2ibx,
im => undef, # PublicInbox::Import
parallel => 1,
transact_bytes => 0,
total_bytes => 0,
current_info => '',
xpfx => $xpfx,
- over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3", 1),
+ over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3"),
lock_path => "$dir/inbox.lock",
# limit each git repo (epoch) to 1GB or so
rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
- last_commit => [], # git repo -> commit
+ last_commit => [], # git epoch -> commit
};
+ $self->{over}->{-no_sync} = 1 if $v2ibx->{-no_sync};
$self->{shards} = count_shards($self) || nproc_shards($creat);
$self->{index_max_size} = $v2ibx->{index_max_size};
bless $self, $class;
# mimics Import::add and wraps it for v2
# Forwards ($self, $eml, $check_cb) to _add by way of the inbox
# object's with_umask; whatever with_umask evaluates to is the value
# of this sub, since the call is its last expression.
# NOTE(review): with_umask is not defined in this excerpt; presumably it
# runs the callback under the inbox's umask -- confirm.
# Diff excerpt: the "-" line is the old {-inbox} accessor, the "+" line
# is its {ibx} replacement.
sub add {
my ($self, $eml, $check_cb) = @_;
- $self->{-inbox}->with_umask(\&_add, $self, $eml, $check_cb);
+ $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb);
}
# indexes a message, returns true if checkpointing is needed
# spam check:
if ($check_cb) {
- $mime = $check_cb->($mime, $self->{-inbox}) or return;
+ $mime = $check_cb->($mime, $self->{ibx}) or return;
}
# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
# AltId may pre-populate article numbers (e.g. X-Mail-Count
# or NNTP article number), use that article number if it's
# not in Over.
- my $altid = $self->{-inbox}->{altid};
+ my $altid = $self->{ibx}->{altid};
if ($altid && grep(/:file=msgmap\.sqlite3\z/, @$altid)) {
my $num = $self->{mm}->num_for($mid);
# Now that all subprocesses are up, we can open the FDs
# for SQLite:
my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
- "$self->{-inbox}->{inboxdir}/msgmap.sqlite3", 1);
+ "$self->{ibx}->{inboxdir}/msgmap.sqlite3",
+ $self->{ibx}->{-no_sync} ? 2 : 1);
$mm->{dbh}->begin_work;
}
sub idx_init {
my ($self, $opt) = @_;
return if $self->{idx_shards};
- my $ibx = $self->{-inbox};
+ my $ibx = $self->{ibx};
# do not leak read-only FDs to child processes, we only have these
# FDs for duplicate detection so they should not be
sub _replace_oids ($$$) {
my ($self, $mime, $replace_map) = @_;
$self->done;
- my $pfx = "$self->{-inbox}->{inboxdir}/git";
+ my $pfx = "$self->{ibx}->{inboxdir}/git";
my $rewrites = []; # epoch => commit
my $max = $self->{epoch_max};
# (retval[2]) is not part of the stable API shared with Import->remove
# Calls rewrite_internal($self, $eml, $cmt_msg) through the inbox
# object's with_umask and unpacks the result for the caller.
# The result ($r) is treated as an array reference: when both $r and
# its first element are defined, the elements of @$r are the value of
# this sub; otherwise the value is undef.
# Diff excerpt: the "-" line is the old {-inbox} accessor, the "+" line
# is its {ibx} replacement.
sub remove {
my ($self, $eml, $cmt_msg) = @_;
- my $r = $self->{-inbox}->with_umask(\&rewrite_internal,
+ my $r = $self->{ibx}->with_umask(\&rewrite_internal,
$self, $eml, $cmt_msg);
# no explicit return: the ternary below is the sub's value
defined($r) && defined($r->[0]) ? @$r: undef;
}
sub _replace ($$;$$) {
my ($self, $old_eml, $new_eml, $sref) = @_;
my $arg = [ $self, $old_eml, undef, $new_eml, $sref ];
- my $rewritten = $self->{-inbox}->with_umask(\&rewrite_internal,
+ my $rewritten = $self->{ibx}->with_umask(\&rewrite_internal,
$self, $old_eml, undef, $new_eml, $sref) or return;
my $rewrites = $rewritten->{rewrites};
my ($self, $raw) = @_;
# grab the expected OID we have to reindex:
pipe(my($in, $w)) or die "pipe: $!";
- my $git_dir = $self->{-inbox}->git->{git_dir};
+ my $git_dir = $self->{ibx}->git->{git_dir};
my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)];
my $r = popen_rd($cmd, undef, { 0 => $in });
print $w $$raw or die "print \$w: $!";
}
# make sure we really got the OID:
- my ($blob, $type, $bytes) = $self->{-inbox}->git->check($expect_oid);
+ my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid);
$blob eq $expect_oid or die "BUG: $expect_oid not found after replace";
# don't leak FDs to Xapian:
- $self->{-inbox}->git->cleanup;
+ $self->{ibx}->git->cleanup;
# reindex modified messages:
for my $smsg (@$need_reindex) {
my $nbytes = $self->{total_bytes};
$self->{total_bytes} = 0;
$self->lock_release(!!$nbytes) if $shards;
- $self->{-inbox}->git->cleanup;
+ $self->{ibx}->git->cleanup;
}
sub fill_alternates ($$) {
my ($self, $epoch) = @_;
- my $pfx = "$self->{-inbox}->{inboxdir}/git";
- my $all = "$self->{-inbox}->{inboxdir}/all.git";
+ my $pfx = "$self->{ibx}->{inboxdir}/git";
+ my $all = "$self->{ibx}->{inboxdir}/all.git";
PublicInbox::Import::init_bare($all) unless -d $all;
my $info_dir = "$all/objects/info";
my $alt = "$info_dir/alternates";
sub git_init {
my ($self, $epoch) = @_;
- my $git_dir = "$self->{-inbox}->{inboxdir}/git/$epoch.git";
+ my $git_dir = "$self->{ibx}->{inboxdir}/git/$epoch.git";
PublicInbox::Import::init_bare($git_dir);
my @cmd = (qw/git config/, "--file=$git_dir/config",
'include.path', '../../all.git/config');
sub git_dir_latest {
my ($self, $max) = @_;
$$max = -1;
- my $pfx = "$self->{-inbox}->{inboxdir}/git";
+ my $pfx = "$self->{ibx}->{inboxdir}/git";
return unless -d $pfx;
my $latest;
opendir my $dh, $pfx or die "opendir $pfx: $!\n";
sub import_init {
my ($self, $git, $packed_bytes, $tmp) = @_;
- my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
+ my $im = PublicInbox::Import->new($git, undef, undef, $self->{ibx});
$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
$im->{lock_path} = undef;
$im->{path_type} = 'v2';
return $msg if $msg;
}
# older message, should be in alternates
- my $ibx = $self->{-inbox};
- $ibx->msg_by_smsg($smsg);
+ $self->{ibx}->msg_by_smsg($smsg);
}
sub content_exists ($$$) {
# Invokes ->atfork_child on every element of $self->{idx_shards} when
# that field is set, and evaluates to $self->{bnote}->[1] as its final
# expression.
# Diff excerpt: the two "-" lines are the removed code that deleted and
# closed $self->{reindex_pipe}.
# NOTE(review): this excerpt may omit lines between the shard loop and
# the final expression (possible hunk boundary) -- confirm against the
# full file before relying on this description.
sub atfork_child {
my ($self) = @_;
- my $fh = delete $self->{reindex_pipe};
- close $fh if $fh;
if (my $shards = $self->{idx_shards}) {
$_->atfork_child foreach @$shards;
}
$self->{bnote}->[1];
}
# Checkpoint taken during reindexing: calls atfork_prepare on
# $sync->{mm_tmp}, calls $self->done (releasing the lock), optionally
# reports progress, then calls atfork_parent on $sync->{mm_tmp}.
# Diff excerpt: the old ("-") form took a third $git argument, called
# $git->cleanup first, and prefixed the progress text with the last
# path component of $git->{git_dir}; the new ("+") form takes only
# ($self, $sync) and reports just the formatted counter.
# NOTE(review): no step that re-acquires the lock released by ->done is
# visible in this excerpt -- confirm against the full file.
-sub reindex_checkpoint ($$$) {
- my ($self, $sync, $git) = @_;
+sub reindex_checkpoint ($$) {
+ my ($self, $sync) = @_;
- $git->cleanup;
$sync->{mm_tmp}->atfork_prepare;
$self->done; # release lock
# progress callback is optional; it receives $sync->{nr} formatted
# with the sprintf template stored in $sync->{-regen_fmt}
if (my $pr = $sync->{-opt}->{-progress}) {
- my ($bn) = (split('/', $git->{git_dir}))[-1];
- $pr->("$bn ".sprintf($sync->{-regen_fmt}, $sync->{nr}));
+ $pr->(sprintf($sync->{-regen_fmt}, $sync->{nr}));
}
# allow -watch or -mda to write...
$sync->{mm_tmp}->atfork_parent;
}
-sub reindex_oid ($$$$) {
- my ($self, $sync, $git, $oid) = @_;
- return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
+sub reindex_oid ($$$) {
+ my ($self, $sync, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $oid);
my ($num, $mid0, $len);
- my $msgref = $git->cat_file($oid, \$len);
+ my $msgref = $self->{ibx}->git->cat_file($oid, \$len);
return if $len == 0; # purged
my $mime = PublicInbox::Eml->new($$msgref);
my $mids = mids($mime->header_obj);
}, 'PublicInbox::Smsg';
$smsg->populate($mime, $sync);
if (do_idx($self, $msgref, $mime, $smsg)) {
- reindex_checkpoint($self, $sync, $git);
+ reindex_checkpoint($self, $sync);
}
}
last_epoch_commit($self, $i, $cmt);
}
# Builds the path of a numbered git repository under the inbox:
# "<inboxdir>/git/<N>.git", where <inboxdir> is read from the inbox
# object held in the first argument and <N> is the second argument.
# Diff excerpt: the "-" line reads the inbox from {-inbox}, the "+"
# line from {ibx}; the resulting string has the same layout.
-sub git_dir_n ($$) { "$_[0]->{-inbox}->{inboxdir}/git/$_[1].git" }
+sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" }
sub last_commits ($$) {
my ($self, $epoch_max) = @_;
my ($self, $sync, $epoch_max) = @_;
my $pr = $sync->{-opt}->{-progress};
my $regen_max = 0;
- my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
+ my $head = $self->{ibx}->{ref_head} || 'refs/heads/master';
# reindex stops at the current heads and we later rerun index_sync
# without {reindex}
my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex};
for (my $i = $epoch_max; $i >= 0; $i--) {
- die 'BUG: already indexing!' if $self->{reindex_pipe};
my $git_dir = git_dir_n($self, $i);
-d $git_dir or next; # missing epochs are fine
my $git = PublicInbox::Git->new($git_dir);
next if $?; # new repo
my $range = log_range($self, $sync, $git, $i, $tip) or next;
- $sync->{ranges}->[$i] = $range;
-
# can't use 'rev-list --count' if we use --diff-filter
$pr->("$i.git counting $range ... ") if $pr;
my $stk = prepare_range_stack($git, $sync, $range);
# our code and blindly injects "d" file history into git repos
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
- my $git = $self->{-inbox}->git;
for my $oid (@leftovers) {
$oid = unpack('H*', $oid);
$self->{current_info} = "leftover $oid";
- unindex_oid($self, $git, $oid);
+ unindex_oid($self, $oid);
}
- $git->cleanup;
}
return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
}
}
-sub unindex_oid ($$$;$) {
- my ($self, $git, $oid, $unindexed) = @_;
+sub unindex_oid ($$;$) {
+ my ($self, $oid, $unindexed) = @_;
my $mm = $self->{mm};
- my $msgref = $git->cat_file($oid);
+ my $msgref = $self->{ibx}->git->cat_file($oid);
my $mime = PublicInbox::Eml->new($msgref);
my $mids = mids($mime->header_obj);
$mime = $msgref = undef;
}
}
+# this is rare, it only happens when we get discontiguous history in
+# a mirror because the source used -purge or -edit
sub unindex ($$$$) {
my ($self, $sync, $git, $unindex_range) = @_;
my $unindexed = $self->{unindexed} ||= {}; # $mid0 => $num
# order does not matter, here:
my @cmd = qw(log --raw -r
--no-notes --no-color --no-abbrev --no-renames);
- my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $unindex_range);
+ my $fh = $git->popen(@cmd, $unindex_range);
while (<$fh>) {
/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next;
- unindex_oid($self, $git, $1, $unindexed);
+ unindex_oid($self, $1, $unindexed);
}
- delete $self->{reindex_pipe};
close $fh or die "git log failed: \$?=$?";
return unless $sync->{-opt}->{prune};
my ($self, $sync, $i) = @_;
my $git_dir = git_dir_n($self, $i);
- die 'BUG: already reindexing!' if $self->{reindex_pipe};
-d $git_dir or return; # missing epochs are fine
my $git = PublicInbox::Git->new($git_dir);
- if (my $unindex_range = delete $sync->{unindex_range}->{$i}) {
+ if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare
unindex($self, $sync, $git, $unindex_range);
}
defined(my $stk = $sync->{stacks}->[$i]) or return;
$sync->{stacks}->[$i] = undef;
- my $range = $sync->{ranges}->[$i];
- if (my $pr = $sync->{-opt}->{-progress}) {
- $pr->("$i.git indexing $range\n");
- }
while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
$self->{current_info} = "$i.git $oid";
if ($f eq 'm') {
$sync->{autime} = $at;
$sync->{cotime} = $ct;
- reindex_oid($self, $sync, $git, $oid);
+ reindex_oid($self, $sync, $oid);
} elsif ($f eq 'd') {
- unindex_oid($self, $git, $oid);
+ unindex_oid($self, $oid);
}
}
delete @$sync{qw(autime cotime)};