+
+ # we sort {xr3r} in the reverse order of ibx_sorted so we can
+ # hit the common case in _reindex_finalize without rereading
+ # from git (or holding multiple messages in memory).
+ my $id2pos = $sync->{id2pos}; # index in ibx_sorted
+ @$xr3 = sort {
+ $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
+ ||
+ $b->[1] <=> $a->[1] # break ties with {xnum}
+ } @$xr3;
+ @$xr3 = map { [ $_->[0], $_->[1], unpack('H*', $_->[2]) ] } @$xr3;
+ my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 };
+ $self->git->cat_async($xr3->[$req->{ix}]->[2], \&_reindex_oid, $req);
+}
+
+# Tell the caller whether it is time to checkpoint: either the scalar
+# referenced by $sync->{need_checkpoint} is true, or now() has moved past
+# $sync->{next_check}.  The deadline is only consulted when no checkpoint
+# was explicitly requested.
+sub checkpoint_due ($) {
+	my ($sync) = @_;
+	my $requested = ${$sync->{need_checkpoint}};
+	return $requested if $requested;
+	now() > $sync->{next_check};
+}
+
+# Returns "<hostname>-<machine identifier>" for use in lock values.
+# The identifier is the first line of /etc/machine-id when that is
+# non-empty, otherwise the output of sysctl/hostid/ghostid, otherwise a
+# fixed "no-machine-id-or-hostid-on-$^O" placeholder.  The value is
+# computed once per process and reused on every later call.
+sub host_ident () {
+	# I've copied FS images and only changed the hostname before,
+	# so prepend hostname.  Use `state' since a BOFH can change
+	# these while this process is running and we always want to be
+	# able to release locks taken by this process.
+	state $retval = hostname . '-' . do {
+		my $m; # machine-id(5) is systemd
+		if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> }
+		# strip the newline before testing truthiness, otherwise a
+		# blank line in machine-id would give us an empty identifier
+		chomp($m) if defined($m);
+		# (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs
+		chomp($m ||=
+			`{ sysctl -n kern.hostid || hostid || ghostid; } 2>/dev/null`
+			|| "no-machine-id-or-hostid-on-$^O");
+		$m;
+	};
+}
+
+# Give up the eidxq_lock remembered in $self->{-eidxq_locked}.
+# Returns nothing if no lock was remembered or it was taken under a
+# different PID, 1 once the `eidxq_lock' meta value has been cleared, and
+# undef (after a warning) when the stored value is no longer ours.
+# The remembered value is always forgotten, whatever the outcome.
+sub eidxq_release {
+	my ($self) = @_;
+	my $mine = delete($self->{-eidxq_locked}) or return;
+	my ($locker_pid) = split(/-/, $mine); # the PID is the first field
+	return if $locker_pid != $$; # shards may fork
+	my $oidx = $self->{oidx};
+	$oidx->begin_lazy;
+	my $held = $oidx->eidx_meta('eidxq_lock') // '';
+	if ($held eq $mine) {
+		$oidx->eidx_meta('eidxq_lock', '');
+		return 1;
+	}
+	warn($held ne '' ?
+		"E: eidxq_lock($mine) stolen by $held\n" :
+		"E: eidxq_lock($mine) released by another process\n");
+	undef;
+}
+
+# Destructor: release our eidxq_lock and, only if eidxq_release reports
+# that it cleared the lock, commit that change.
+sub DESTROY {
+	my ($self) = @_;
+	if (eidxq_release($self)) {
+		$self->{oidx}->commit_lazy;
+	}
+}
+
+# Store a fresh lock value, "PID-TIME-EUID-HOST_IDENT", under the
+# `eidxq_lock' meta key and remember it in $self->{-eidxq_locked}.
+# Returns the lock value.
+sub _eidxq_take ($) {
+	my ($self) = @_;
+	my $token = join('-', $$, time(), $>, host_ident());
+	$self->{oidx}->eidx_meta('eidxq_lock', $token);
+	$self->{-eidxq_locked} = $token;
+}
+
+# Try to become the holder of `eidxq_lock'.
+# Returns the lock value (true) when we hold the lock: it was free, it
+# was already ours, or it was left by a dead process with our EUID on
+# this host.  Otherwise warns and returns a false value: either a live
+# local process holds it, or the holder cannot be probed from here.
+sub eidxq_lock_acquire ($) {
+	my ($self) = @_;
+	my $oidx = $self->{oidx};
+	$oidx->begin_lazy;
+	my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self);
+	if (my $locked = $self->{-eidxq_locked}) { # be lazy
+		return $locked if $locked eq $cur;
+	}
+	# limit of 4: the host ident (last field) may itself contain `-'
+	my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4);
+	# stick to ANSI C conversions for portability (`%k' is an
+	# extension); the timestamp is formatted from gmtime, i.e. UTC
+	my $t = strftime('%Y-%m-%d %H:%M:%S', gmtime($time));
+	if ($euid == $> && $ident eq host_ident) {
+		# same user on the same host, so we may probe the PID
+		if (kill(0, $pid)) {
+			warn <<EOM; return;
+I: PID:$pid (re)indexing Xapian since $t, it will continue our work
+EOM
+		}
+		if ($!{ESRCH}) {
+			warn "I: eidxq_lock is stale ($cur), clobbering\n";
+			return _eidxq_take($self);
+		}
+		warn "E: kill(0, $pid) failed: $!\n"; # fall-through:
+	}
+	my $fn = $oidx->dbh->sqlite_db_filename;
+	warn <<EOF;
+W: PID:$pid, UID:$euid on $ident is indexing Xapian since $t
+W: If this is unexpected, delete `eidxq_lock' from the `eidx_meta' table:
+W:	sqlite3 $fn 'DELETE FROM eidx_meta WHERE key = "eidxq_lock"'
+EOF
+	undef;
+}
+
+# Returns an arrayref holding the elements of $self->{"ibx_$type"}
+# ordered by descending {boost} (a missing boost counts as 0).  The
+# result is cached in $self->{"-ibx_ary_$type"} and reused on later
+# calls.  Dies with "BUG: $type unknown" if there is no such list.
+sub ibx_sorted ($$) {
+	my ($self, $type) = @_;
+	my $cache_key = "-ibx_ary_$type";
+	my $cached = $self->{$cache_key};
+	return $cached if defined($cached);
+
+	# highest boost first, stable for config-ordering tiebreaker
+	use sort 'stable';
+	my $unsorted = $self->{'ibx_'.$type} // die "BUG: $type unknown";
+	$self->{$cache_key} = [
+		sort { ($b->{boost} // 0) <=> ($a->{boost} // 0) } @$unsorted
+	];
+}
+
+# Returns a hashref mapping each {-ibx_id} to the index of that inbox
+# within ibx_sorted($self, 'known').
+sub prep_id2pos ($) {
+	my ($self) = @_;
+	my $sorted = ibx_sorted($self, 'known');
+	my %id2pos = map { $sorted->[$_]->{-ibx_id} => $_ } 0..$#$sorted;
+	\%id2pos;
+}
+
+# Drain the `eidxq' table: each queued docid is looked up via
+# $self->{oidx}->get_art and handed to _reindex_smsg, then its row is
+# deleted from the queue.  ${$sync->{nr}} counts processed rows.
+# Returns early without doing anything when $self->{cfg} is unset, when
+# eidxq_lock_acquire fails, or when the row count of eidxq is false.
+sub eidxq_process ($$) { # for reindexing
+ my ($self, $sync) = @_;
+ return unless $self->{cfg};
+
+ return unless eidxq_lock_acquire($self);
+ my $dbh = $self->{oidx}->dbh;
+ my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return;
+ ${$sync->{nr}} = 0;
+ local $sync->{-regen_fmt} = "%u/$tot\n";
+ my $pr = $sync->{-opt}->{-progress};
+ if ($pr) {
+ my $min = $dbh->selectrow_array('SELECT MIN(docid) FROM eidxq');
+ my $max = $dbh->selectrow_array('SELECT MAX(docid) FROM eidxq');
+ $pr->("Xapian indexing $min..$max (total=$tot)\n");
+ }
+ # build the ibx_id => position map only if the caller has not
+ $sync->{id2pos} //= prep_id2pos($self);
+ my ($del, $iter);
+ # we jump back here after every checkpoint because the handles
+ # below are dropped before reindex_checkpoint is called
+restart:
+ $del = $dbh->prepare('DELETE FROM eidxq WHERE docid = ?');
+ $iter = $dbh->prepare('SELECT docid FROM eidxq ORDER BY docid ASC');
+ $iter->execute;
+ while (defined(my $docid = $iter->fetchrow_array)) {
+ last if $sync->{quit};
+ if (my $smsg = $self->{oidx}->get_art($docid)) {
+ _reindex_smsg($self, $sync, $smsg);
+ } else {
+ warn "E: #$docid does not exist in over\n";
+ }
+ # the queue entry is removed whether or not the article existed
+ $del->execute($docid);
+ ++${$sync->{nr}};
+
+ if (checkpoint_due($sync)) {
+ # drop every DB handle we hold before checkpointing
+ $dbh = $del = $iter = undef;
+ reindex_checkpoint($self, $sync); # release lock
+ $dbh = $self->{oidx}->dbh;
+ goto restart;
+ }
+ }
+ # also reached when {quit} ended the loop early
+ $self->git->async_wait_all;
+ $pr->("reindexed ${$sync->{nr}}/$tot\n") if $pr;
+}
+
+# Callback for reindex_unseen: unless is_bad_blob rejects the blob, fill
+# in the {eml}, {new_smsg}, {chash} and {mids} fields of $req from the
+# blob contents and pass $req to do_step.
+sub _reindex_unseen { # git->cat_async callback
+	my ($bref, $oid, $type, $size, $req) = @_;
+	is_bad_blob($oid, $type, $size, $req->{oid}) and return;
+	my $self = $req->{self} // die 'BUG: {self} unset';
+	local $self->{current_info} = "$self->{current_info} $oid";
+	my $smsg = bless({ blob => $oid }, 'PublicInbox::Smsg');
+	$smsg->set_bytes($$bref, $size);
+	my $eml = PublicInbox::Eml->new($bref);
+	@$req{qw(eml new_smsg)} = ($eml, $smsg);
+	$req->{chash} = content_hash($eml);
+	$req->{mids} = mids($eml); # do_step iterates through this
+	do_step($req); # enter the normal indexing flow
+}
+
+# --reindex may come across messages we have never seen; this queues an
+# async blob read for one of them.  The request starts as a shallow copy
+# of $sync and is completed by _reindex_unseen once the blob arrives.
+sub reindex_unseen ($$$$) {
+	my ($self, $sync, $ibx, $xsmsg) = @_;
+	my ($oid, $xnum) = @$xsmsg{qw(blob num)};
+	my %req = (
+		%$sync, # has {self}
+		autime => $xsmsg->{ds},
+		cotime => $xsmsg->{ts},
+		oid => $oid,
+		ibx => $ibx,
+		xnum => $xnum,
+		# {mids} and {chash} will be filled in at _reindex_unseen
+	);
+	warn "I: reindex_unseen ${\$ibx->eidx_key}:$xnum:$oid\n";
+	$self->git->cat_async($oid, \&_reindex_unseen, \%req);
+}
+
+# Pass every xref3 row of $ibx whose xnum satisfies $lt_or_gt (a
+# caller-built SQL fragment such as "< 5") to _unref_doc, working in
+# batches of 10000 rows until a batch comes back short.
+# Returns 1 when finished, nothing if $sync->{quit} was set meanwhile.
+sub _unref_stale_range ($$$) {
+	my ($sync, $ibx, $lt_or_gt) = @_;
+	my $lim = 10000;
+	my $sql = <<EOS;
+SELECT docid,xnum,oidbin FROM xref3
+WHERE ibx_id = ? AND xnum $lt_or_gt LIMIT $lim
+EOS
+	while (1) {
+		my $rows = $sync->{self}->{oidx}->dbh->selectall_arrayref(
+					$sql, undef, $ibx->{-ibx_id});
+		return if $sync->{quit};
+		for my $row (@$rows) { # hopefully rare, not worth optimizing:
+			my ($docid, $xnum, $oidbin) = @$row;
+			my $hex = unpack('H*', $oidbin);
+			warn("# $xnum:$hex (#$docid): stale\n");
+			_unref_doc($sync, $docid, $ibx, $xnum, $oidbin);
+		}
+		last if scalar(@$rows) != $lim;
+	}
+	1;
+}
+
+# Compare our xref3 rows for $ibx against the inbox's own over data,
+# one slice of article numbers at a time:
+# - messages in the inbox with no matching xref3 row go to reindex_unseen
+# - matching docids are queued with eidxq_add unless {-opt}->{fast} is set
+# - xref3 rows without a counterpart in the inbox go to _unref_doc
+# xref3 rows below the first and above the last article number seen are
+# handed to _unref_stale_range.  Returns early when sync_inbox returns a
+# true value or $sync->{quit} becomes true.
+sub _reindex_check_ibx ($$$) {
+ my ($self, $sync, $ibx) = @_;
+ my $ibx_id = $ibx->{-ibx_id};
+ my $slice = 10000;
+ my $opt = { limit => $slice };
+ my ($beg, $end) = (1, $slice);
+ # a true value from sync_inbox means an error, give up on this inbox
+ my $err = sync_inbox($self, $sync, $ibx) and return;
+ my $max = $ibx->over->max;
+ $end = $max if $end > $max;
+
+ # first, check if we missed any messages in target $ibx
+ my $msgs;
+ my $pr = $sync->{-opt}->{-progress};
+ my $ekey = $ibx->eidx_key;
+ local $sync->{-regen_fmt} = "$ekey checking %u/$max\n";
+ ${$sync->{nr}} = 0;
+ my $fast = $sync->{-opt}->{fast};
+ my $dsu; # _unref_stale_range (< $lo) called
+ my ($lo, $hi);
+ while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end, $opt)})) {
+ ${$sync->{nr}} = $beg;
+ # the next slice starts right after the last number we got back
+ $beg = $msgs->[-1]->{num} + 1;
+ $end = $beg + $slice;
+ $end = $max if $end > $max;
+ if (checkpoint_due($sync)) {
+ reindex_checkpoint($self, $sync); # release lock
+ }
+ ($lo, $hi) = ($msgs->[0]->{num}, $msgs->[-1]->{num});
+ # not repeated once _unref_stale_range has returned a defined value
+ $dsu //= _unref_stale_range($sync, $ibx, "< $lo");
+ my $x3a = $self->{oidx}->dbh->selectall_arrayref(
+ <<"", undef, $ibx_id, $lo, $hi);
+SELECT xnum,oidbin,docid FROM xref3 WHERE
+ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+ # key: packed xnum followed by the binary OID, value: [ docids ]
+ my %x3m;
+ for (@$x3a) {
+ my $k = pack('J', $_->[0]) . $_->[1];
+ push @{$x3m{$k}}, $_->[2];
+ }
+ undef $x3a;
+ for my $xsmsg (@$msgs) {
+ # same key layout, built from the inbox's num and hex blob
+ my $k = pack('JH*', $xsmsg->{num}, $xsmsg->{blob});
+ my $docids = delete($x3m{$k});
+ if (!defined($docids)) {
+ reindex_unseen($self, $sync, $ibx, $xsmsg);
+ } elsif (!$fast) {
+ for my $num (@$docids) {
+ $self->{oidx}->eidxq_add($num);
+ }
+ return if $sync->{quit};
+ }
+ }
+ return if $sync->{quit};
+ # whatever is left in %x3m matched nothing in @$msgs
+ next unless scalar keys %x3m;
+
+ # eliminate stale/mismatched entries
+ # num => blob for this slice, tells a mismatch from a stale row
+ my %mismatch = map { $_->{num} => $_->{blob} } @$msgs;
+ while (my ($k, $docids) = each %x3m) {
+ my ($xnum, $hex) = unpack('JH*', $k);
+ my $bin = pack('H*', $hex);
+ my $exp = $mismatch{$xnum};
+ my $m = defined($exp) ? "mismatch (!= $exp)" : 'stale';
+ warn("# $xnum:$hex (#@$docids): $m\n");
+ for my $i (@$docids) {
+ _unref_doc($sync, $i, $ibx, $xnum, $bin);
+ }
+ }
+ }
+ # $hi stays undefined if the loop body never ran
+ _unref_stale_range($sync, $ibx, "> $hi") if defined($hi);
+}
+
+# Run _reindex_check_ibx on one inbox unless _ibx_index_reject gives a
+# reason not to (which is then only warned about).  Either way, the
+# {over}, {mm}, {search} and {git} fields of $ibx are dropped afterwards.
+sub _reindex_inbox ($$$) {
+	my ($self, $sync, $ibx) = @_;
+	my $ekey = $ibx->eidx_key;
+	local $self->{current_info} = $ekey;
+	my $why_not = _ibx_index_reject($ibx);
+	if (!defined($why_not)) {
+		_reindex_check_ibx($self, $sync, $ibx);
+	} else {
+		warn "W: cannot reindex $ekey ($why_not)\n";
+	}
+	delete @$ibx{qw(over mm search git)}; # won't need these for a bit
+}
+
+# Full reindex: check every inbox from ibx_sorted($self, 'active') in
+# order, then process the resulting eidxq.  Does nothing without
+# $self->{cfg}; warns and gives up if the eidxq_lock cannot be acquired;
+# stops early once $sync->{quit} is set.
+sub eidx_reindex {
+	my ($self, $sync) = @_;
+	$self->{cfg} or return;
+
+	# acquire eidxq_lock early because full reindex takes forever
+	# and incremental -extindex processes can run during our checkpoints
+	unless (eidxq_lock_acquire($self)) {
+		warn "E: aborting --reindex\n";
+		return;
+	}
+	my $active = ibx_sorted($self, 'active');
+	for my $ibx (@$active) {
+		_reindex_inbox($self, $sync, $ibx);
+		$sync->{quit} and last;
+	}
+	$self->git->async_wait_all; # ensure eidxq gets filled completely
+	$sync->{quit} or eidxq_process($self, $sync);
+}
+
+# Wrapper around _sync_inbox which drops the {mm} and {over} fields of
+# $ibx afterwards and warns about a defined result.
+# Returns whatever _sync_inbox returned.
+sub sync_inbox {
+	my ($self, $sync, $ibx) = @_;
+	my $err = _sync_inbox($self, $sync, $ibx);
+	delete($ibx->{$_}) for qw(mm over);
+	defined($err) and warn($err, "\n");
+	return $err;
+}
+
+# Callback for dedupe: group the smsg of each readable blob by content
+# hash in $per_mid->{dd_chash}.  Once called for $per_mid->{last_smsg},
+# every group with more than one member keeps its first smsg; the others
+# are counted in {dedupe_cull}, reported on STDERR and, unless the
+# `dry-run' option is set, merged into the kept one and removed.
+sub dd_smsg { # git->cat_async callback
+	my ($bref, $oid, $type, $size, $dd) = @_;
+	my $smsg = $dd->{smsg} // die 'BUG: dd->{smsg} missing';
+	my $self = $dd->{self} // die 'BUG: {self} missing';
+	my $per_mid = $dd->{per_mid} // die 'BUG: {per_mid} missing';
+	if ($type eq 'missing') {
+		_blob_missing($dd, $smsg);
+	} elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) {
+		local $self->{current_info} = "$self->{current_info} $oid";
+		my $eml = PublicInbox::Eml->new($bref);
+		my $chash = content_hash($eml);
+		push(@{$per_mid->{dd_chash}->{$chash}}, $smsg);
+	}
+	# nothing more to do until the last smsg of this Message-ID arrives
+	$per_mid->{last_smsg} == $smsg or return;
+	for my $dups (values %{$per_mid->{dd_chash}}) {
+		my $keep = shift @$dups;
+		scalar(@$dups) or next;
+		$per_mid->{sync}->{dedupe_cull} += scalar(@$dups);
+		my $dropping = join(', ', map { "#$_->{num}" } @$dups);
+		print STDERR
+			"# <$keep->{mid}> keeping #$keep->{num}, dropping ",
+			$dropping, "\n";
+		next if $per_mid->{sync}->{-opt}->{'dry-run'};
+		my $oidx = $self->{oidx};
+		for my $dup (@$dups) {
+			my $gone = $dup->{num};
+			$oidx->merge_xref3($keep->{num}, $gone, $dup->{blob});
+			remove_doc($self, $gone);
+		}
+	}
+}
+
+sub eidx_dedupe ($$$) {
+ my ($self, $sync, $msgids) = @_;
+ $sync->{dedupe_cull} = 0;
+ my $candidates = 0;
+ my $nr_mid = 0;
+ return unless eidxq_lock_acquire($self);
+ my ($iter, $cur_mid);
+ my $min_id = 0;
+ my $idx = 0;
+ my ($max_id) = $self->{oidx}->dbh->selectrow_array(<<EOS);
+SELECT MAX(id) FROM msgid
+EOS
+ local $sync->{-regen_fmt} = "dedupe %u/$max_id\n";
+
+ # note: we could write this query more intelligently,
+ # but that causes lock contention with read-only processes
+dedupe_restart:
+ $cur_mid = $msgids->[$idx];
+ if ($cur_mid eq '') { # all Message-IDs
+ $iter = $self->{oidx}->dbh->prepare(<<EOS);
+SELECT mid,id FROM msgid WHERE id > ? ORDER BY id ASC
+EOS
+ $iter->execute($min_id);
+ } else {
+ $iter = $self->{oidx}->dbh->prepare(<<EOS);
+SELECT mid,id FROM msgid WHERE mid = ? AND id > ? ORDER BY id ASC
+EOS
+ $iter->execute($cur_mid, $min_id);
+ }
+ while (my ($mid, $id) = $iter->fetchrow_array) {
+ last if $sync->{quit};
+ $self->{current_info} = "dedupe $mid";
+ ${$sync->{nr}} = $min_id = $id;
+ my ($prv, @smsg);
+ while (my $x = $self->{oidx}->next_by_mid($mid, \$id, \$prv)) {
+ push @smsg, $x;
+ }
+ next if scalar(@smsg) < 2;
+ my $per_mid = {
+ dd_chash => {}, # chash => [ary of smsgs]
+ last_smsg => $smsg[-1],
+ sync => $sync
+ };
+ $nr_mid++;
+ $candidates += scalar(@smsg) - 1;
+ for my $smsg (@smsg) {
+ my $dd = {
+ per_mid => $per_mid,
+ smsg => $smsg,
+ self => $self,
+ };
+ $self->git->cat_async($smsg->{blob}, \&dd_smsg, $dd);
+ }
+ # need to wait on every single one @smsg contents can get
+ # invalidated inside dd_smsg for messages with multiple
+ # Message-IDs.
+ $self->git->async_wait_all;
+
+ if (checkpoint_due($sync)) {
+ undef $iter;
+ reindex_checkpoint($self, $sync);
+ goto dedupe_restart;
+ }
+ }
+ goto dedupe_restart if defined($msgids->[++$idx]);
+
+ my $n = delete $sync->{dedupe_cull};
+ if (my $pr = $sync->{-opt}->{-progress}) {
+ $pr->("culled $n/$candidates candidates ($nr_mid msgids)\n");
+ }
+ ${$sync->{nr}} = 0;