+ eidx_gc_scan_inboxes($self, $sync);
+ eidx_gc_scan_shards($self, $sync);
+ done($self);
+}
+
+# Look up the inbox object a message record belongs to.
+#
+# Consumes (deletes) $smsg->{ibx_id}, translates it through
+# $sync->{id2pos} into an index and returns that slot of
+# $self->{-ibx_ary_known}.  Dies if any of the three lookups
+# comes back undefined.
+sub _ibx_for ($$$) {
+    my ($self, $sync, $smsg) = @_;
+    my $id = delete $smsg->{ibx_id};
+    defined($id) or die '{ibx_id} unset';
+    my $slot = $sync->{id2pos}->{$id};
+    defined($slot) or die "$id no pos";
+    my $found = $self->{-ibx_ary_known}->[$slot];
+    defined($found) or die "BUG: ibx for $smsg->{blob} not mapped";
+    $found;
+}
+
+# Decide whether the soft RLIMIT_NOFILE looks too small for the number
+# of active inboxes, so callers can close DB handles early.
+#
+# The verdict is computed once and cached in $self->{-fd_constrained}.
+# The limit is read via BSD::Resource when that module loads, otherwise
+# from `ulimit -n' in sh.  Returns true when constrained (or when the
+# limit cannot be determined), false otherwise.
+sub _fd_constrained ($) {
+    my ($self) = @_;
+    $self->{-fd_constrained} //= do {
+        my $soft;
+        if (eval { require BSD::Resource; 1 }) {
+            my $NOFILE = BSD::Resource::RLIMIT_NOFILE();
+            ($soft, undef) = BSD::Resource::getrlimit($NOFILE);
+        } else {
+            # backticks yield undef when the command cannot be run
+            $soft = `sh -c 'ulimit -n'`;
+            chomp($soft) if defined($soft);
+        }
+        if (!defined($soft) || $soft eq '') {
+            # an empty answer is no more usable than a missing one
+            warn "Unable to determine RLIMIT_NOFILE: $@\n";
+            1;
+        } elsif ($soft =~ /\Aunlimited\z/i) {
+            # `ulimit -n' prints a word, not a number, when there is
+            # no limit; comparing it numerically would treat it as 0
+            0;
+        } else {
+            # $want is an estimate
+            my $want = scalar(@{$self->{ibx_active}}) + 64;
+            my $ret = $want > $soft;
+            if ($ret) {
+                warn "RLIMIT_NOFILE=$soft insufficient (want: $want), " .
+                    "will close DB handles early\n";
+            }
+            $ret;
+        }
+    };
+}
+
+# Final step of reindexing one docid, run once every xref3 blob of the
+# request has been read into $req->{by_chash}.
+#
+# $smsg/$eml are the last (highest priority) message read.  Its
+# overview is rewritten under the original docid and it is indexed
+# into the shard; the other records sharing its content hash only
+# contribute their headers via add_eidx_info.  Records with a
+# different content hash are unreferenced from this docid and
+# handed to reindex_unseen.
+sub _reindex_finalize ($$$) {
+    my ($req, $smsg, $eml) = @_;
+    my $sync = $req->{sync};
+    my $self = $sync->{self};
+    my $by_chash = delete $req->{by_chash};
+    $by_chash or die 'BUG: no {by_chash}';
+    my $nr = scalar(keys(%$by_chash));
+    $nr or die 'BUG: no content hashes';
+    my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}';
+    my $docid = $orig_smsg->{num};
+    $smsg->{num} = $docid;
+    $self->{oidx}->add_overview($eml, $smsg); # may rethread
+    check_batch_limit({ %$sync, new_smsg => $smsg });
+
+    my $chash0 = $smsg->{chash};
+    defined($chash0) or die "BUG: $smsg->{blob} no {chash}";
+    my $stable = delete $by_chash->{$chash0};
+    defined($stable) or die "BUG: $smsg->{blob} chash missing";
+    my $idx = $self->idx_shard($docid);
+    # the record pushed last must be the very message we were handed
+    my $top_smsg = pop @$stable;
+    die 'BUG: top_smsg != smsg' unless $top_smsg == $smsg;
+    my $ibx = _ibx_for($self, $sync, $smsg);
+    $idx->index_eml($eml, $smsg, $ibx->eidx_key);
+    for my $same (reverse @$stable) {
+        $ibx = _ibx_for($self, $sync, $same);
+        my $hdr = delete($same->{hdr}) // die 'BUG: no {hdr}';
+        $idx->ipc_do('add_eidx_info', $docid, $ibx->eidx_key, $hdr);
+    }
+    $nr == 1 and return; # likely, all good
+
+    # more than one content hash was seen for this docid
+    $self->git->async_wait_all;
+    warn "W: #$docid split into $nr due to deduplication change\n";
+    my @todo; # [ $ibx, $art ] pairs, processed after $by_chash is dropped
+    for my $ary (values %$by_chash) {
+        for my $gone (reverse @$ary) {
+            warn "removing #$docid xref3 $gone->{blob}\n";
+            my $bin = $gone->oidbin;
+            my $n = _unref_doc($sync, $docid, undef, undef, $bin);
+            $n == 0 and die "BUG: $gone->{blob} invalidated #$docid";
+        }
+        my $last = pop(@$ary) // die "BUG: #$docid {by_chash} empty";
+        $last->{num} = delete($last->{xnum}) // die '{xnum} unset';
+        $ibx = _ibx_for($self, $sync, $last);
+        my $over = $ibx->over or
+            die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n";
+        my $art = $over->get_art($last->{num});
+        if ($art->{blob} ne $last->{blob}) {
+            die <<EOF;
+$last->{blob} != $art->{blob} (${\$ibx->eidx_key}:$art->{num});
+EOF
+        }
+        push @todo, [ $ibx, $art ];
+        $over->dbh_close if _fd_constrained($self);
+    }
+    undef $by_chash;
+    while (my $pair = shift @todo) {
+        reindex_unseen($self, $sync, @$pair);
+    }
+}
+
+# git->cat_async callback: handles one blob of the xref3 list stored
+# in $req->{xr3r}, with $req->{ix} being the entry currently read.
+#
+# A bad blob is unreferenced from the docid.  A good blob becomes a
+# fresh PublicInbox::Smsg which is filed under its content hash in
+# $req->{by_chash}.  Either the next blob is requested, or (after the
+# last one) _reindex_finalize is run.
+sub _reindex_oid { # git->cat_async callback
+    my ($bref, $oid, $type, $size, $req) = @_;
+    my $sync = $req->{sync};
+    my $self = $sync->{self};
+    my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}';
+    my $xr3r = $req->{xr3r};
+    my $ix = $req->{ix}; # entry this callback was scheduled for
+    my $expect_oid = $xr3r->[$ix]->[2];
+    my $docid = $orig_smsg->{num};
+    if (is_bad_blob($oid, $type, $size, $expect_oid)) {
+        my $oidbin = pack('H*', $expect_oid);
+        my $remain = _unref_doc($sync, $docid, undef, undef, $oidbin);
+        if ($remain == 0) {
+            warn "W: #$docid ($oid) gone or corrupt\n";
+            return;
+        }
+        my $next_oid = $xr3r->[++$req->{ix}]->[2];
+        if ($next_oid) {
+            $self->git->cat_async($next_oid, \&_reindex_oid, $req);
+        } else {
+            warn "BUG: #$docid ($oid) gone (UNEXPECTED)\n";
+        }
+        return;
+    }
+    my $ci = $self->{current_info};
+    local $self->{current_info} = "$ci #$docid $oid";
+    my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+    $re_smsg->set_bytes($$bref, $size);
+    my $eml = PublicInbox::Eml->new($bref);
+    # timestamps are carried over from the original overview record
+    my %times = (autime => $orig_smsg->{ds}, cotime => $orig_smsg->{ts});
+    $re_smsg->populate($eml, \%times);
+    my $chash = $re_smsg->{chash} = content_hash($eml);
+    $re_smsg->{xnum} = $xr3r->[$ix]->[1];
+    $re_smsg->{ibx_id} = $xr3r->[$ix]->[0];
+    $re_smsg->{hdr} = $eml->header_obj;
+    push @{$req->{by_chash}->{$chash}}, $re_smsg;
+    my $next_oid = $xr3r->[++$req->{ix}]->[2];
+    if ($next_oid) {
+        $self->git->cat_async($next_oid, \&_reindex_oid, $req);
+    } else { # last $re_smsg is the highest priority xref3
+        local $self->{current_info} = "$ci #$docid";
+        _reindex_finalize($req, $re_smsg, $eml);
+    }
+}
+
+# Start the asynchronous reindex of one docid.
+#
+# Fetches the xref3 rows of $smsg->{num}, orders them and requests
+# the first blob from git; _reindex_oid walks the rest of the list.
+# A docid without any xref3 row is reported and removed instead.
+sub _reindex_smsg ($$$) {
+    my ($self, $sync, $smsg) = @_;
+    my $docid = $smsg->{num};
+    my $xr3 = $self->{oidx}->get_xref3($docid, 1);
+    unless (@$xr3) { # _reindex_check_stale should've covered this
+        warn "BUG? #$docid $smsg->{blob} is not referenced " .
+            "by inboxes during reindex\n";
+        remove_doc($self, $docid);
+        return;
+    }
+
+    # we sort {xr3r} in the reverse order of ibx_sorted so we can
+    # hit the common case in _reindex_finalize without rereading
+    # from git (or holding multiple messages in memory).
+    my $id2pos = $sync->{id2pos}; # index in ibx_sorted
+    @$xr3 = map {
+        my ($ibx_id, $xnum, $oidbin) = @$_;
+        [ $ibx_id, $xnum, unpack('H*', $oidbin) ] # git wants hex OIDs
+    } sort {
+        $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
+            ||
+        $b->[1] <=> $a->[1] # break ties with {xnum}
+    } @$xr3;
+    my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 };
+    $self->git->cat_async($xr3->[0]->[2], \&_reindex_oid, $req);
+}
+
+# True when a checkpoint was explicitly requested through the scalar
+# referenced by $sync->{need_checkpoint}, or when the current time
+# has passed $sync->{next_check}.
+sub checkpoint_due ($) {
+    my ($sync) = @_;
+    my $requested = ${$sync->{need_checkpoint}};
+    return $requested if $requested;
+    now() > $sync->{next_check};
+}
+
+# Identifier for this host: "<hostname>-<machine-id or hostid>".
+sub host_ident () {
+    # I've copied FS images and only changed the hostname before,
+    # so prepend hostname. Use `state' since a BOFH can change
+    # these while this process is running and we always want to be
+    # able to release locks taken by this process.
+    state $ident = do {
+        my $host = hostname();
+        my $m; # machine-id(5) is systemd
+        if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> }
+        # (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs
+        chomp($m ||= `{ sysctl -n kern.hostid ||
+ hostid || ghostid; } 2>/dev/null`
+            || "no-machine-id-or-hostid-on-$^O");
+        $host . '-' . $m;
+    };
+    $ident;
+}
+
+# Give up the eidxq lock recorded in $self->{-eidxq_locked}.
+#
+# Returns 1 when the stored `eidxq_lock' value was ours and has been
+# cleared, undef (after a warning) when it was changed or cleared
+# by someone else, and nothing when we hold no lock or the lock was
+# taken by a different PID.
+sub eidxq_release {
+    my ($self) = @_;
+    my $held = delete $self->{-eidxq_locked};
+    return unless $held;
+    my ($owner_pid) = split(/-/, $held, 2);
+    $owner_pid == $$ or return; # shards may fork
+    my $oidx = $self->{oidx};
+    $oidx->begin_lazy;
+    my $stored = $oidx->eidx_meta('eidxq_lock') // '';
+    if ($stored eq $held) {
+        $oidx->eidx_meta('eidxq_lock', '');
+        return 1;
+    }
+    warn($stored ne ''
+        ? "E: eidxq_lock($held) stolen by $stored\n"
+        : "E: eidxq_lock($held) released by another process\n");
+    undef;
+}
+
+# Destructor: release our eidxq lock and, only when the release
+# actually cleared it, commit that change.
+sub DESTROY {
+    my ($self) = @_;
+    $self->{oidx}->commit_lazy if eidxq_release($self);
+}
+
+# Record ourselves as holder of the eidxq lock.  The lock value is
+# "PID-time-EUID-host_ident"; it is stored under `eidxq_lock' and
+# remembered in $self->{-eidxq_locked}.  Returns the lock value.
+sub _eidxq_take ($) {
+    my ($self) = @_;
+    my $val = join('-', $$, time, $>, host_ident());
+    $self->{oidx}->eidx_meta('eidxq_lock', $val);
+    $self->{-eidxq_locked} = $val;
+}
+
+# Try to take the eidxq lock.
+#
+# Returns the lock value when the lock is free, already ours, or held
+# by a PID of ours on this host which no longer exists.  Returns
+# nothing when a live process with our EUID on this host holds it,
+# and undef after telling the user how to clear the lock by hand in
+# every other case.
+sub eidxq_lock_acquire ($) {
+    my ($self) = @_;
+    my $oidx = $self->{oidx};
+    $oidx->begin_lazy;
+    my $cur = $oidx->eidx_meta('eidxq_lock');
+    return _eidxq_take($self) unless $cur;
+    my $mine = $self->{-eidxq_locked}; # be lazy
+    return $mine if $mine && $mine eq $cur;
+
+    my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4);
+    my $t = strftime('%Y-%m-%d %k:%M %z', localtime($time));
+    local $self->{current_info} = 'eidxq';
+    if ($euid == $> && $ident eq host_ident()) {
+        if (kill(0, $pid)) { # holder is alive
+            warn "# PID:$pid (re)indexing since $t, " .
+                "it will continue our work\n";
+            return;
+        }
+        if ($!{ESRCH}) {
+            warn "# eidxq_lock is stale ($cur), clobbering\n";
+            return _eidxq_take($self);
+        }
+        warn "E: kill(0, $pid) failed: $!\n"; # fall-through:
+    }
+    my $fn = $oidx->dbh->sqlite_db_filename;
+    warn <<EOF;
+W: PID:$pid, UID:$euid on $ident is indexing Xapian since $t
+W: If this is unexpected, delete `eidxq_lock' from the `eidx_meta' table:
+W: sqlite3 $fn 'DELETE FROM eidx_meta WHERE key = "eidxq_lock"'
+EOF
+    undef;
+}
+
+# Return (and cache in $self->{"-ibx_ary_$type"}) an array ref of the
+# inboxes in $self->{"ibx_$type"}, ordered by descending {boost};
+# a missing {boost} counts as 0.  Dies for an unknown $type.
+sub ibx_sorted ($$) {
+    my ($self, $type) = @_;
+    $self->{"-ibx_ary_$type"} //= do {
+        my $unsorted = $self->{"ibx_$type"} // die "BUG: $type unknown";
+        # highest boost first, stable for config-ordering tiebreaker
+        use sort 'stable';
+        my @by_boost = sort {
+            ($b->{boost} // 0) <=> ($a->{boost} // 0)
+        } @$unsorted;
+        \@by_boost;
+    }
+}
+
+# Build a hash ref mapping each {-ibx_id} of the `known' inboxes to
+# its position in ibx_sorted($self, 'known').
+sub prep_id2pos ($) {
+    my ($self) = @_;
+    my $sorted = ibx_sorted($self, 'known');
+    my %id2pos = map {
+        ($sorted->[$_]->{-ibx_id} => $_)
+    } 0 .. $#$sorted;
+    \%id2pos;
+}