package PublicInbox::SearchIdx;
use strict;
use v5.10.1;
-use parent qw(PublicInbox::Search PublicInbox::Lock);
+use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
use PublicInbox::Eml;
use PublicInbox::InboxWritable;
use PublicInbox::MID qw(mid_mime mids_for_index mids);
use PublicInbox::Spawn qw(spawn);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size);
my $X = \%PublicInbox::Search::X;
my ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
-my $addmsg = qr!^:000000 100644 \S+ ($OID) A\t${hex}{2}/${hex}{38}$!;
-my $delmsg = qr!^:100644 000000 ($OID) \S+ D\t${hex}{2}/${hex}{38}$!;
sub new {
my ($class, $ibx, $creat, $shard) = @_;
}
}
-sub too_big ($$) {
- my ($self, $oid) = @_;
- my $max_size = $self->{index_max_size} or return;
- my (undef, undef, $size) = $self->{ibx}->git->check($oid);
- die "E: bad $oid in $self->{ibx}->{inboxdir}\n" if !defined($size);
- return if $size <= $max_size;
- warn "W: skipping $oid ($size > $max_size)\n";
- 1;
+sub check_size { # check_async cb for -index --max-size=...: only blobs within the limit go on to cat_async
+ my ($oid, $type, $size, $arg, $git) = @_; # $arg supplies index_max_size and index_oid
+ (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; # undef or non-blob $type is fatal
+ if ($size <= $arg->{index_max_size}) {
+ $git->cat_async($oid, $arg->{index_oid}, $arg); # within limit: pass $arg->{index_oid} and $arg to cat_async
+ } else {
+ warn "W: skipping $oid ($size > $arg->{index_max_size})\n"; # too big: warn only, cat_async is not called
+ }
+}
+
+sub v1_checkpoint ($$;$) { # commit msgmap/index state; $stk is passed only by the final call in process_stack
+ my ($self, $sync, $stk) = @_;
+ $self->{ibx}->git->check_async_wait; # presumably blocks until queued check_async requests finish -- confirm
+ $self->{ibx}->git->cat_async_wait; # likewise for queued cat_async requests
+
+ # latest_cmt may be undef
+ my $newest = $stk ? $stk->{latest_cmt} : undef;
+ if ($newest) {
+ my $cur = $self->{mm}->last_commit || '';
+ if (need_update($self, $cur, $newest)) {
+ $self->{mm}->last_commit($newest); # store $newest as last_commit in msgmap
+ }
+ } else {
+ ${$sync->{max}} = $BATCH_BYTES; # no $newest: reset the budget process_stack shares via $sync->{max}
+ }
+
+ $self->{mm}->{dbh}->commit; # ends the transaction opened with begin_work
+ if ($newest && need_xapian($self)) {
+ my $cur = $self->{xdb}->get_metadata('last_commit');
+ if (need_update($self, $cur, $newest)) {
+ $self->{xdb}->set_metadata('last_commit', $newest);
+ }
+ }
+
+ $self->{over}->rethread_done($sync->{-opt}) if $newest; # all done
+ commit_txn_lazy($self);
+ $self->{ibx}->git->cleanup;
+ my $nr = ${$sync->{nr}}; # NOTE(review): requires $sync->{nr} to be a scalar ref; where it is set is not visible here -- confirm
+ idx_release($self, $nr);
+ # let another process do some work...
+ if (my $pr = $sync->{-opt}->{-progress}) {
+ $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; # stays quiet while $nr is 0
+ }
+ if (!$stk) { # more to come
+ begin_txn_lazy($self);
+ $self->{mm}->{dbh}->begin_work; # reopen the msgmap transaction for the next batch
+ }
}
# only for v1
sub process_stack {
- my ($self, $stk, $sync, $batch_cb) = @_;
+ my ($self, $sync, $stk) = @_; # $stk->pop_rec yields ($f, $at, $ct, $oid) records
my $git = $self->{ibx}->git;
my $max = $BATCH_BYTES;
my $nr = 0;
$sync->{max} = \$max;
$sync->{sidx} = $self;
+ $self->{mm}->{dbh}->begin_work; # msgmap transaction; v1_checkpoint commits it
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
for my $oid (@leftovers) {
$git->cat_async($oid, \&unindex_both, $self);
}
}
+ if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) { # assignment intended: copy the limit, test its truth
+ $sync->{index_oid} = \&index_both; # check_size hands this to cat_async for blobs within the limit
+ }
while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
if ($f eq 'm') {
- $sync->{autime} = $at;
- $sync->{cotime} = $ct;
- next if too_big($self, $oid);
- $git->cat_async($oid, \&index_both, { %$sync });
- if ($max <= 0) {
- $git->cat_async_wait;
- $max = $BATCH_BYTES;
- $batch_cb->($nr);
+ my $arg = { %$sync, autime => $at, cotime => $ct }; # shallow per-record copy of $sync plus this record's times
+ if ($sync->{index_max_size}) {
+ $git->check_async($oid, \&check_size, $arg); # size limit set: check_size decides whether to index
+ } else {
+ $git->cat_async($oid, \&index_both, $arg); # no limit: request the blob directly
}
+ v1_checkpoint($self, $sync) if $max <= 0; # NOTE(review): $max is presumably decremented through $sync->{max} elsewhere -- confirm
} elsif ($f eq 'd') {
$git->cat_async($oid, \&unindex_both, $self);
}
}
- $git->cat_async_wait;
- $batch_cb->($nr, $stk);
+ v1_checkpoint($self, $sync, $stk); # final checkpoint: $stk lets it read $stk->{latest_cmt}
}
-sub prepare_stack ($$$) {
- my ($self, $sync, $range) = @_;
- my $git = $self->{ibx}->git;
-
- if (index($range, '..') < 0) {
- # don't show annoying git errors to users who run -index
- # on empty inboxes
- $git->qx(qw(rev-parse -q --verify), "$range^0");
- return PublicInbox::IdxStack->new->read_prepare if $?;
+sub log2stack ($$$$) { # exportable via @EXPORT_OK; takes $ibx so the patterns can depend on $ibx->version
+ my ($sync, $git, $range, $ibx) = @_;
+ my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise)
+ my ($add, $del); # patterns whose capture group 1 is the OID to index or unindex
+ if ($ibx->version == 1) { # v1: paths of 2 + 38 hex chars, status A (added) or D (deleted)
+ my $path = $hex.'{2}/'.$hex.'{38}';
+ $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!;
+ $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!;
+ } else { # other versions: files named "d" and "m" with status A or M; the second OID is captured
+ $del = qr!\A:\d{6} 100644 $OID ($OID) [AM]\td$!;
+ $add = qr!\A:\d{6} 100644 $OID ($OID) [AM]\tm$!;
}
- my $D = $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
# Count the new files so they can be added newest to oldest
# and still have numbers increasing from oldest to newest
if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
($at, $ct) = ($1 + 0, $2 + 0);
$stk //= PublicInbox::IdxStack->new($3);
- } elsif (/$delmsg/) {
+ } elsif (/$del/) { # deletion record: counted in $D when reindexing, otherwise pushed as 'd'
my $oid = $1;
if ($D) { # reindex case
$D->{pack('H*', $oid)}++;
} else { # non-reindex case:
$stk->push_rec('d', $at, $ct, $oid);
}
- } elsif (/$addmsg/) {
+ } elsif (/$add/) { # addition record
my $oid = $1;
if ($D) {
my $oid_bin = pack('H*', $oid);
$stk->read_prepare;
}
+sub prepare_stack ($$$) { # v1 wrapper: validate $range, set $sync->{D}, return log2stack's result
+ my ($self, $sync, $range) = @_;
+ my $git = $self->{ibx}->git;
+
+ if (index($range, '..') < 0) { # $range contains no ".."
+ # don't show annoying git errors to users who run -index
+ # on empty inboxes
+ $git->qx(qw(rev-parse -q --verify), "$range^0");
+ return PublicInbox::IdxStack->new->read_prepare if $?; # rev-parse failed: return a new stack with nothing pushed
+ }
+ $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
+ log2stack($sync, $git, $range, $self->{ibx}); # last statement, so its value is the return value
+}
+
# --is-ancestor requires git 1.8.0+
sub is_ancestor ($$$) {
my ($git, $cur, $tip) = @_;
my $git = $self->{ibx}->git;
$git->batch_prepare;
my $pr = $opts->{-progress};
- my $sync = { reindex => $opts->{reindex} };
+ my $sync = { reindex => $opts->{reindex}, -opt => $opts };
my $xdb = $self->begin_txn_lazy;
$self->{over}->rethread_prepare($opts);
my $mm = _msgmap_init($self);
my $stk = prepare_stack($self, $sync, $range);
$sync->{ntodo} = $stk ? $stk->num_records : 0;
$pr->("$sync->{ntodo}\n") if $pr; # continue previous line
-
- my $dbh = $mm->{dbh};
- my $batch_cb = sub {
- my ($nr, $stk) = @_;
- # latest_cmt may be undef
- my $newest = $stk ? $stk->{latest_cmt} : undef;
- if ($newest) {
- my $cur = $mm->last_commit || '';
- if (need_update($self, $cur, $newest)) {
- $mm->last_commit($newest);
- }
- }
- $dbh->commit;
- if ($newest && need_xapian($self)) {
- my $cur = $xdb->get_metadata('last_commit');
- if (need_update($self, $cur, $newest)) {
- $xdb->set_metadata('last_commit', $newest);
- }
- }
-
- $self->{over}->rethread_done($opts) if $newest; # all done
- $self->commit_txn_lazy;
- $git->cleanup;
- $xdb = idx_release($self, $nr);
- # let another process do some work...
- $pr->("indexed $nr/$sync->{ntodo}\n") if $pr && $nr;
- if (!$stk) { # more to come
- $xdb = $self->begin_txn_lazy;
- $dbh->begin_work;
- }
- };
-
- $dbh->begin_work;
- process_stack($self, $stk, $sync, $batch_cb);
+ process_stack($self, $sync, $stk);
}
sub DESTROY {