+ my ($self, $mime, $mid) = @_;
+ my $over = $self->{over};
+ my $cids = content_ids($mime);
+ my ($id, $prev);
+ while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+ my $msg = get_blob($self, $smsg);
+ if (!defined($msg)) {
+ warn "broken smsg for $mid\n";
+ next;
+ }
+ my $cur = PublicInbox::MIME->new($msg);
+ if (content_matches($cids, $cur)) {
+ $smsg->{mime} = $cur;
+ return $smsg;
+ }
+
+
+ # XXX DEBUG_DIFF is experimental and may be removed
+ diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
+ }
+ undef;
+}
+
+# Release handles this object should not keep after a fork (going by the
+# sub's name, this is meant to run in the child process).
+#
+# - closes and forgets {reindex_pipe} when one is set
+# - forwards atfork_child to every entry of {idx_parts} and to {im}
+# - dies if {mm} is still set
+# - closes element 0 of {bnote} (dying on failure)
+#
+# Returns element 1 of {bnote}.
+sub atfork_child {
+	my ($self) = @_;
+	if (my $pipe = delete $self->{reindex_pipe}) {
+		close $pipe;
+	}
+	foreach my $part (@{$self->{idx_parts} || []}) {
+		$part->atfork_child;
+	}
+	my $im = $self->{im};
+	$im->atfork_child if $im;
+	die "unexpected mm" if $self->{mm};
+	close $self->{bnote}->[0] or die "close bnote[0]: $!\n";
+	return $self->{bnote}->[1];
+}
+
+# Record in $D that the message stored in blob $oid was deleted.
+#
+# The blob is read from $git and parsed; for every Message-ID returned by
+# mids() the key "$mid\0$content_id" is set to 1 in the $D hashref.
+sub mark_deleted {
+	my ($self, $D, $git, $oid) = @_;
+	my $raw = $git->cat_file($oid);
+	my $mime = PublicInbox::MIME->new($$raw);
+	my $mids = mids($mime->header_obj);
+	my $cid = content_id($mime);
+	my @keys = map { join("\0", $_, $cid) } @$mids;
+	@{$D}{@keys} = (1) x scalar(@keys);
+}
+
+# (Re)index the message stored in blob $oid.
+#
+# $mm_tmp - temporary Message-ID map; consulted for the article number
+#           used before, and the chosen Message-ID is removed from it
+# $D      - hashref of pending deletions keyed by "$mid\0$content_id";
+#           matching entries are removed
+# $git    - git object the blob is read from
+# $regen  - scalar ref to a countdown of article numbers, used when no
+#           previous number is known
+#
+# Returns nothing useful; warns and returns early when the message cannot
+# be given an article number or was marked deleted.
+sub reindex_oid {
+ my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_;
+ # $len is filled in by cat_file; it is used below as the message size
+ my $len;
+ my $msgref = $git->cat_file($oid, \$len);
+ my $mime = PublicInbox::MIME->new($$msgref);
+ my $mids = mids($mime->header_obj);
+ my $cid = content_id($mime);
+
+ # get the NNTP article number we used before, highest number wins
+ # and gets deleted from mm_tmp;
+ my $mid0;
+ my $num = -1;
+ my $del = 0;
+ foreach my $mid (@$mids) {
+ # count (and consume) deletion marks recorded for this exact content
+ $del += (delete $D->{"$mid\0$cid"} || 0);
+ my $n = $mm_tmp->num_for($mid);
+ if (defined $n && $n > $num) {
+ $mid0 = $mid;
+ $num = $n;
+ }
+ }
+ # no previous number and not deleted: take the next number from the
+ # $regen countdown and try to claim it in {mm} for one of the
+ # Message-IDs, trying them in reverse order
+ if (!defined($mid0) && $regen && !$del) {
+ $num = $$regen--;
+ die "BUG: ran out of article numbers\n" if $num <= 0;
+ my $mm = $self->{mm};
+ foreach my $mid (reverse @$mids) {
+ if ($mm->mid_set($num, $mid) == 1) {
+ $mid0 = $mid;
+ last;
+ }
+ }
+ if (!defined($mid0)) {
+ my $id = '<' . join('> <', @$mids) . '>';
+ warn "Message-ID $id unusable for $num\n";
+ foreach my $mid (@$mids) {
+ defined(my $n = $mm->num_for($mid)) or next;
+ warn "#$n previously mapped for <$mid>\n";
+ }
+ }
+ }
+
+ # nothing is indexed when no article number was found or the message
+ # was marked deleted
+ if (!defined($mid0) || $del) {
+ if (!defined($mid0) && $del) { # expected for deletes
+ $$regen--;
+ return
+ }
+
+ my $id = '<' . join('> <', @$mids) . '>';
+ defined($mid0) or
+ warn "Skipping $id, no article number found\n";
+ if ($del && defined($mid0)) {
+ warn "$id was deleted $del " .
+ "time(s) but mapped to article #$num\n";
+ }
+ return;
+
+ }
+ $mm_tmp->mid_delete($mid0) or
+ die "failed to delete <$mid0> for article #$num\n";
+
+ $self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+ # the partition is chosen by article number modulo the partition count
+ my $nparts = $self->{partitions};
+ my $part = $num % $nparts;
+ my $idx = $self->idx_part($part);
+ $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+ # once enough bytes have been indexed, drop and retake the lock
+ my $n = $self->{transact_bytes} += $len;
+ if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+ $git->cleanup;
+ $mm_tmp->atfork_prepare;
+ $self->done; # release lock
+ # allow -watch or -mda to write...
+ $self->idx_init; # reacquire lock
+ $mm_tmp->atfork_parent;
+ }
+}
+
+# Store $cmt as the last commit of epoch $i, unless the commit already
+# stored is an ancestor of $cmt with zero commits between the two (i.e.
+# $cmt is not newer than what is recorded).
+sub update_last_commit {
+	my ($self, $git, $i, $cmt) = @_;
+	my $prev = last_commit_part($self, $i);
+	if (defined($prev) && is_ancestor($git, $prev, $cmt)) {
+		my $count = $git->qx('rev-list', '--count', "$prev..$cmt");
+		chomp($count);
+		# an empty count is not treated as zero
+		return if $count ne '' && $count == 0;
+	}
+	return last_commit_part($self, $i, $cmt);
+}
+
+# Path of the git repository for epoch $n below the inbox's mainrepo:
+# "<mainrepo>/git/<n>.git".
+sub git_dir_n ($$) {
+	my ($self, $epoch) = @_;
+	my $top = $self->{-inbox}->{mainrepo};
+	return "$top/git/$epoch.git";
+}
+
+# Ask last_commit_part() for every epoch from $epoch_max down to 0.
+# Returns an arrayref of the results, indexed by epoch number.
+sub last_commits {
+	my ($self, $epoch_max) = @_;
+	my @heads;
+	my $i = $epoch_max;
+	while ($i >= 0) {
+		$heads[$i] = last_commit_part($self, $i);
+		$i--;
+	}
+	return \@heads;
+}
+
+# alias is_ancestor from PublicInbox::SearchIdx into this package so the
+# subs here can call it unqualified
+*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
+
+# Decide which commit range of every epoch repository has to be indexed.
+#
+# $opts      - options hashref (not consulted in this sub)
+# $epoch_max - highest epoch number; epochs are visited from it down to 0
+# $ranges    - arrayref indexed by epoch holding the last indexed commit
+#              (or undef). It is rewritten in place: each entry becomes
+#              the range to hand to "git log", or undef when the epoch is
+#              already up-to-date.
+#
+# When the stored commit is not an ancestor of the current tip (rewritten
+# history), the commits that must be unindexed are stored in
+# $self->{"unindex-range.$i"}.
+#
+# Returns a reference to the number of commits, summed over all ranges,
+# that added or modified the path "m".
+sub index_prepare {
+	my ($self, $opts, $epoch_max, $ranges) = @_;
+	my $regen_max = 0;
+	my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
+	for (my $i = $epoch_max; $i >= 0; $i--) {
+		# NOTE(review): the rest of this file uses {reindex_pipe};
+		# confirm whether {index_pipe} is set anywhere.
+		die "already indexing!\n" if $self->{index_pipe};
+		my $git_dir = git_dir_n($self, $i);
+		-d $git_dir or next; # missing parts are fine
+		my $git = PublicInbox::Git->new($git_dir);
+		chomp(my $tip = $git->qx('rev-parse', $head));
+		my $range;
+		if (defined(my $cur = $ranges->[$i])) {
+			$range = "$cur..$tip";
+			if (is_ancestor($git, $cur, $tip)) { # common case
+				my $n = $git->qx(qw(rev-list --count), $range);
+				chomp($n);
+				# An empty count means rev-list printed nothing
+				# (e.g. it failed); do not mistake that for
+				# "zero new commits" and drop the range.
+				if ($n ne '' && $n == 0) {
+					$ranges->[$i] = undef;
+					next;
+				}
+			} else {
+				warn "discontiguous range: $range\n",
+					"Rewritten history? (in $git_dir)\n";
+				my $base = $git->qx('merge-base', $tip, $cur);
+				chomp $base;
+				my $unindex;
+				if ($base) {
+					$range = "$base..$tip";
+					$unindex = "$base..$cur";
+					warn "found merge-base: $base\n"
+				} else {
+					$range = $tip;
+					# No common ancestor: everything reachable
+					# from $cur is stale. "..$cur" would be read
+					# by git as "HEAD..$cur", which is only right
+					# when HEAD is the indexed ref, so name $cur
+					# on its own.
+					$unindex = $cur;
+					warn "discarding history at $cur\n";
+				}
+				warn "reindexing $git_dir starting at\n$range\n";
+				$self->{"unindex-range.$i"} = $unindex;
+			}
+		} else {
+			$range = $tip; # all of it
+		}
+		$ranges->[$i] = $range;
+
+		# can't use 'rev-list --count' if we use --diff-filter
+		my $fh = $git->popen(qw(log --pretty=tformat:%h
+				--no-notes --no-color --no-renames
+				--diff-filter=AM), $range, '--', 'm');
+		++$regen_max while <$fh>;
+	}
+	\$regen_max;
+}
+
+# Ask every entry of {idx_parts} to remote_remove() the ($oid, $mid)
+# pair, then remove it from {over} as well. Returns whatever
+# {over}->remove_oid returns.
+sub unindex_oid_remote {
+	my ($self, $oid, $mid) = @_;
+	foreach my $part (@{$self->{idx_parts}}) {
+		$part->remote_remove($oid, $mid);
+	}
+	return $self->{over}->remove_oid($oid, $mid);
+}
+
+# Remove the message stored in blob $oid from the indices.
+#
+# The blob is read from $git only to learn its Message-IDs. For each of
+# them, every {over} entry whose blob equals $oid has its article number
+# counted in $self->{unindexed} (num => removal count), and the
+# ($oid, $mid) pair is passed to unindex_oid_remote(). Message-IDs with
+# no matching entry are skipped.
+sub unindex_oid {
+	my ($self, $git, $oid) = @_;
+	my $msgref = $git->cat_file($oid);
+	# NOTE(review): the reference itself is passed here, unlike the
+	# other callers in this file which pass $$msgref; presumably the
+	# constructor accepts a string reference - confirm.
+	my $mime = PublicInbox::MIME->new($msgref);
+	my $mids = mids($mime->header_obj);
+	$mime = $msgref = undef; # only the Message-IDs are needed from here on
+	my $over = $self->{over};
+	foreach my $mid (@$mids) {
+		my %gone;
+		my ($id, $prev);
+		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+			$gone{$smsg->{num}} = 1 if $oid eq $smsg->{blob};
+		}
+		# article numbers are integers: sort them numerically so the
+		# diagnostic below lists 9 before 10
+		my @nums = sort { $a <=> $b } keys %gone;
+		next unless @nums;
+		if (@nums > 1) {
+			warn "BUG: multiple articles linked to $oid\n",
+				join(',', @nums), "\n";
+		}
+		$self->{unindexed}->{$_}++ foreach @nums;
+		$self->unindex_oid_remote($oid, $mid);
+	}
+}
+
+# matches 40 lowercase hexadecimal characters (a full git object ID in
+# the "git log --raw --no-abbrev" output parsed below); not anchored
+my $x40 = qr/[a-f0-9]{40}/;
+# Unindex every blob added or modified at path "m" by the commits in
+# $unindex_range of $git.
+#
+# While "git log" is being read its pipe is kept in $self->{reindex_pipe}.
+# When $opts->{prune} is set and this call added new article numbers to
+# $self->{unindexed}, "git gc --prune=all" is run on the repository
+# afterwards.
+sub unindex {
+	my ($self, $opts, $git, $unindex_range) = @_;
+	my $un = $self->{unindexed} ||= {}; # num => removal count
+	my $before = scalar keys %$un;
+	my $fh = $git->popen(qw(log --raw -r --no-notes --no-color
+				--no-abbrev --no-renames), $unindex_range);
+	$self->{reindex_pipe} = $fh;
+	while (<$fh>) {
+		next unless /\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o;
+		$self->unindex_oid($git, $1);
+	}
+	delete $self->{reindex_pipe};
+	$fh = undef;
+
+	# nothing to prune unless asked to and something was unindexed
+	return unless $opts->{prune};
+	return if $before == scalar(keys %$un);
+
+	# ensure any blob can no longer be accessed via dumb HTTP
+	my $git_dir_opt = "--git-dir=$git->{git_dir}";
+	PublicInbox::Import::run_die(['git', $git_dir_opt,
+			qw(-c gc.reflogExpire=now gc --prune=all)]);
+}
+
+sub index_sync {
+ my ($self, $opts) = @_;
+ $opts ||= {};
+ my $epoch_max;
+ my $latest = git_dir_latest($self, \$epoch_max);
+ return unless defined $latest;
+ $self->idx_init; # acquire lock
+ my $mm_tmp = $self->{mm}->tmp_clone;
+ my $ranges = $opts->{reindex} ? [] : $self->last_commits($epoch_max);
+
+ my ($min, $max) = $mm_tmp->minmax;
+ my $regen = $self->index_prepare($opts, $epoch_max, $ranges);
+ $$regen += $max if $max;
+ my $D = {};
+ my @cmd = qw(log --raw -r --pretty=tformat:%h
+ --no-notes --no-color --no-abbrev --no-renames);
+
+ # work backwards through history
+ my $last_commit = [];
+ for (my $i = $epoch_max; $i >= 0; $i--) {
+ my $git_dir = git_dir_n($self, $i);
+ die "already reindexing!\n" if delete $self->{reindex_pipe};
+ -d $git_dir or next; # missing parts are fine
+ my $git = PublicInbox::Git->new($git_dir);
+ my $unindex = delete $self->{"unindex-range.$i"};
+ $self->unindex($opts, $git, $unindex) if $unindex;
+ defined(my $range = $ranges->[$i]) or next;
+ my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
+ my $cmt;
+ while (<$fh>) {
+ if (/\A$x40$/o && !defined($cmt)) {
+ chomp($cmt = $_);
+ } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+ $self->reindex_oid($mm_tmp, $D, $git, $1,
+ $regen);
+ } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
+ $self->mark_deleted($D, $git, $1);
+ }
+ }
+ $fh = undef;
+ delete $self->{reindex_pipe};
+ $self->update_last_commit($git, $i, $cmt) if defined $cmt;
+ }
+ my @d = sort keys %$D;
+ if (@d) {
+ warn "BUG: ", scalar(@d)," unseen deleted messages marked\n";
+ foreach (@d) {
+ my ($mid, undef) = split(/\0/, $_, 2);
+ warn "<$mid>\n";
+ }
+ }
+ $self->done;