package PublicInbox::SearchIdx;
use strict;
use v5.10.1;
-use parent qw(PublicInbox::Search PublicInbox::Lock);
+use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
use PublicInbox::Eml;
use PublicInbox::InboxWritable;
use PublicInbox::MID qw(mid_mime mids_for_index mids);
use PublicInbox::Spawn qw(spawn);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+our @EXPORT_OK = qw(too_big crlf_adjust log2stack is_ancestor);
my $X = \%PublicInbox::Search::X;
my ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
-my $addmsg = qr!^:000000 100644 \S+ ($OID) A\t${hex}{2}/${hex}{38}$!;
-my $delmsg = qr!^:100644 000000 ($OID) \S+ D\t${hex}{2}/${hex}{38}$!;
sub new {
my ($class, $ibx, $creat, $shard) = @_;
$batch_cb->($nr, $stk);
}
-sub prepare_stack ($$$) {
- my ($self, $sync, $range) = @_;
- my $git = $self->{ibx}->git;
-
- if (index($range, '..') < 0) {
- # don't show annoying git errors to users who run -index
- # on empty inboxes
- $git->qx(qw(rev-parse -q --verify), "$range^0");
- return PublicInbox::IdxStack->new->read_prepare if $?;
+sub log2stack ($$$$) {
+ my ($sync, $git, $range, $ibx) = @_;
+ my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise)
+ my ($add, $del);
+ if ($ibx->version == 1) {
+ my $path = $hex.'{2}/'.$hex.'{38}';
+ $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!;
+ $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!;
+ } else {
+ $del = qr!\A:\d{6} 100644 $OID ($OID) [AM]\td$!;
+ $add = qr!\A:\d{6} 100644 $OID ($OID) [AM]\tm$!;
}
- my $D = $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
# Count the new files so they can be added newest to oldest
# and still have numbers increasing from oldest to newest
if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
($at, $ct) = ($1 + 0, $2 + 0);
$stk //= PublicInbox::IdxStack->new($3);
- } elsif (/$delmsg/) {
+ } elsif (/$del/) {
my $oid = $1;
if ($D) { # reindex case
$D->{pack('H*', $oid)}++;
} else { # non-reindex case:
$stk->push_rec('d', $at, $ct, $oid);
}
- } elsif (/$addmsg/) {
+ } elsif (/$add/) {
my $oid = $1;
if ($D) {
my $oid_bin = pack('H*', $oid);
$stk->read_prepare;
}
+sub prepare_stack ($$$) { # turn $range into a stack via log2stack()
+ my ($self, $sync, $range) = @_;
+ my $git = $self->{ibx}->git;
+ # a $range without ".." names a lone revision; probe it before git-log
+ if (index($range, '..') < 0) {
+ # don't show annoying git errors to users who run -index
+ # on empty inboxes
+ $git->qx(qw(rev-parse -q --verify), "$range^0");
+ return PublicInbox::IdxStack->new->read_prepare if $?; # fresh stack, no records pushed; presumably $? is set by the qx above - confirm
+ }
+ $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR (hash only when reindexing, undef otherwise)
+ log2stack($sync, $git, $range, $self->{ibx}); # last statement: its result is our return value
+}
+
# --is-ancestor requires git 1.8.0+
sub is_ancestor ($$$) {
my ($git, $cur, $tip) = @_;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd);
-use PublicInbox::SearchIdx;
+use PublicInbox::SearchIdx qw(too_big log2stack crlf_adjust is_ancestor);
use IO::Handle; # ->autoflush
use File::Temp qw(tempfile);
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
- $smsg->{bytes} = $smsg->{raw_bytes} +
- PublicInbox::SearchIdx::crlf_adjust($$msgref);
+ $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
$self->{over}->add_overview($mime, $smsg);
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
sub reindex_oid ($$$) {
my ($self, $sync, $oid) = @_;
- return if PublicInbox::SearchIdx::too_big($self, $oid);
+ return if too_big($self, $oid);
my ($num, $mid0, $len);
my $msgref = $self->{ibx}->git->cat_file($oid, \$len);
return if $len == 0; # purged
$heads;
}
-*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
-
# returns a revision range for git-log(1)
sub log_range ($$$$$) {
my ($self, $sync, $git, $i, $tip) = @_;
$range;
}
-sub prepare_range_stack {
- my ($git, $sync, $range) = @_;
- # Don't bump num_highwater on --reindex by using {D}.
- # We intentionally do NOT use {D} in the non-reindex case because
- # we want NNTP article number gaps from unindexed messages to
- # show up in mirrors, too.
- my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
-
- my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
- --no-notes --no-color --no-renames --no-abbrev),
- $range);
- my ($at, $ct, $stk);
- while (<$fh>) {
- if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
- ($at, $ct) = ($1 + 0, $2 + 0);
- $stk //= PublicInbox::IdxStack->new($3);
- } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\td$/o) {
- my $oid = $1;
- if ($D) { # reindex case
- $D->{pack('H*', $oid)}++;
- } else { # non-reindex case:
- $stk->push_rec('d', $at, $ct, $oid);
- }
- } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o) {
- my $oid = $1;
- if ($D) {
- my $oid_bin = pack('H*', $oid);
- my $nr = --$D->{$oid_bin};
- delete($D->{$oid_bin}) if $nr <= 0;
-
- # nr < 0 (-1) means it never existed
- $stk->push_rec('m', $at, $ct, $oid) if $nr < 0;
- } else {
- $stk->push_rec('m', $at, $ct, $oid);
- }
- }
- }
- close $fh or die "git log failed: \$?=$?";
- $stk ? $stk->read_prepare : undef;
-}
-
sub sync_prepare ($$$) {
my ($self, $sync, $epoch_max) = @_;
my $pr = $sync->{-opt}->{-progress};
my $range = log_range($self, $sync, $git, $i, $tip) or next;
# can't use 'rev-list --count' if we use --diff-filter
$pr->("$i.git counting $range ... ") if $pr;
- my $stk = prepare_range_stack($git, $sync, $range);
+ # Don't bump num_highwater on --reindex by using {D}.
+ # We intentionally do NOT use {D} in the non-reindex case
+ # because we want NNTP article number gaps from unindexed
+ # messages to show up in mirrors, too.
+ $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
+ my $stk = log2stack($sync, $git, $range, $self->{ibx});
my $nr = $stk ? $stk->num_records : 0;
$pr->("$nr\n") if $pr;
$sync->{stacks}->[$i] = $stk if $stk;