use PublicInbox::MID qw(mids_for_index mids);
use PublicInbox::MsgIter;
use PublicInbox::IdxStack;
-use Carp qw(croak);
+use Carp qw(croak carp);
use POSIX qw(strftime);
use Time::Local qw(timegm);
use PublicInbox::OverIdx;
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
- index_text term_generator add_val);
+ index_text term_generator add_val is_bad_blob);
my $X = \%PublicInbox::Search::X;
our ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
index_list_id($self, $doc, $hdr);
}
-sub add_xapian ($$$$) {
+sub eml2doc ($$$;$) {
my ($self, $eml, $smsg, $mids) = @_;
+ $mids //= mids_for_index($eml);
my $doc = $X->{Document}->new;
add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
my @ds = gmtime($smsg->{ds});
}
}
}
+ $doc;
+}
+
+sub add_xapian ($$$$) {
+ my ($self, $eml, $smsg, $mids) = @_;
+ my $doc = eml2doc($self, $eml, $smsg, $mids);
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
$smsg->{num};
}
-sub _get_doc ($$$) {
- my ($self, $docid, $oid) = @_;
+sub _get_doc ($$) { # fetch Xapian document $docid; warns and yields undef when it cannot be loaded
+ my ($self, $docid) = @_;
my $doc = eval { $self->{xdb}->get_document($docid) };
$doc // do {
warn "E: $@\n" if $@;
- warn "E: #$docid $oid missing in Xapian\n";
+ warn "E: #$docid missing in Xapian\n"; # emitted whenever get_document yielded nothing, with or without an exception
undef;
}
}
sub add_eidx_info {
- my ($self, $docid, $oid, $eidx_key, $eml) = @_;
+ my ($self, $docid, $eidx_key, $eml) = @_; # $eidx_key becomes the 'O'-prefixed boolean term added below
begin_txn_lazy($self);
- my $doc = _get_doc($self, $docid, $oid) or return;
+ my $doc = _get_doc($self, $docid) or return; # nothing to annotate when the document could not be fetched
term_generator($self)->set_document($doc);
$doc->add_boolean_term('O'.$eidx_key);
index_list_id($self, $doc, $eml);
}
sub remove_eidx_info {
- my ($self, $docid, $oid, $eidx_key, $eml) = @_;
+ my ($self, $docid, $eidx_key, $eml) = @_;
begin_txn_lazy($self);
- my $doc = _get_doc($self, $docid, $oid) or return;
+ my $doc = _get_doc($self, $docid) or return;
eval { $doc->remove_term('O'.$eidx_key) };
warn "W: ->remove_term O$eidx_key: $@\n" if $@;
for my $l ($eml ? $eml->header_raw('List-Id') : ()) {
$self->{xdb}->replace_document($docid, $doc);
}
-sub get_val ($$) {
+sub int_val ($$) { # numeric value stored in slot $col of $doc; returns nothing when the slot holds a false value
my ($doc, $col) = @_;
- sortable_unserialise($doc->get_value($col));
+ my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
+ sortable_unserialise($val) + 0; # PV => IV conversion
}
sub smsg_from_doc ($) {
my ($doc) = @_;
my $data = $doc->get_data or return;
my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->{ts} = get_val($doc, PublicInbox::Search::TS());
- my $dt = get_val($doc, PublicInbox::Search::DT());
+ $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); # TS value slot is copied straight into {ts}
+ my $dt = int_val($doc, PublicInbox::Search::DT()); # DT value slot; unpacked below as YYYYmmddHHMMSS digits
my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt);
$smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy);
$smsg->load_from_data($data);
}
+# Delete every document listed in @docids from Xapian; does nothing when
+# $self has no {xdb}.  Failures are reported via warn and never propagate.
sub xdb_remove {
- my ($self, $oid, @removed) = @_;
+ my ($self, @docids) = @_; # @docids: Xapian document IDs to delete
my $xdb = $self->{xdb} or return;
- for my $num (@removed) {
- my $doc = _get_doc($self, $num, $oid) or next;
- my $smsg = smsg_from_doc($doc);
- my $blob = $smsg->{blob}; # may be undef if --skip-docdata
- if (!defined($blob) || $blob eq $oid) {
- $xdb->delete_document($num);
- } else {
- warn "E: #$num $oid != $blob in Xapian\n";
- }
+ for my $docid (@docids) {
+ eval { $xdb->delete_document($docid) }; # trap per-document failures so the remaining docids are still attempted
+ warn "E: #$docid not in Xapian? $@\n" if $@;
}
}
-sub remove_by_oid {
- my ($self, $oid, $num) = @_;
- die "BUG: remove_by_oid is v2-only\n" if $self->{oidx};
+sub remove_by_docid { # hand document number $num to xdb_remove after begin_txn_lazy
+ my ($self, $num) = @_;
+ die "BUG: remove_by_docid is v2-only\n" if $self->{oidx}; # refuses to run when $self carries an {oidx}
$self->begin_txn_lazy;
- xdb_remove($self, $oid, $num) if need_xapian($self);
+ xdb_remove($self, $num) if need_xapian($self); # skipped entirely unless need_xapian($self) is true
}
sub index_git_blob_id {
} else { # just in case msgmap and over.sqlite3 become desynched:
$self->{mm}->mid_delete($mids->[0]);
}
- xdb_remove($self, $oid, keys %tmp) if need_xapian($self);
+ xdb_remove($self, keys %tmp) if need_xapian($self);
}
sub index_mm {
}
}
+sub is_bad_blob ($$$$) { # 1 if the fetched object must be skipped, 0 if it is usable
+	my ($oid, $type, $size, $expect_oid) = @_;
+	unless ($type eq 'blob') { # anything but a blob: warn and skip
+		carp "W: $expect_oid is not a blob (type=$type)";
+		return 1;
+	}
+	# a blob whose OID differs from the one expected is fatal
+	$oid eq $expect_oid or croak "BUG: $oid != $expect_oid";
+	return 1 if $size == 0; # size == 0 means purged
+	0;
+}
+
sub index_both { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
+ return if is_bad_blob($oid, $type, $size, $sync->{oid});
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
$size += crlf_adjust($$bref);
my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
my $self = $sync->{sidx};
+ local $self->{current_info} = "$self->{current_info}: $oid";
my $eml = PublicInbox::Eml->new($bref);
$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
die "E: could not generate NNTP article number for $oid";
sub unindex_both { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
- unindex_eml($sync->{sidx}, $oid, PublicInbox::Eml->new($bref));
+ return if is_bad_blob($oid, $type, $size, $sync->{oid});
+ my $self = $sync->{sidx};
+ local $self->{current_info} = "$self->{current_info}: $oid";
+ unindex_eml($self, $oid, PublicInbox::Eml->new($bref));
# may be undef if leftover
if (defined(my $cur_cmt = $sync->{cur_cmt})) {
${$sync->{latest_cmt}} = $cur_cmt;
$sync->{index_oid} = \&index_both;
}
while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) {
- my $arg = { %$sync, cur_cmt => $cur_cmt };
+ my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid };
last if $sync->{quit};
if ($f eq 'm') {
$arg->{autime} = $at;
my ($self, $opt) = @_;
my $tip = $opt->{ref} || 'HEAD';
my $ibx = $self->{ibx};
+ local $self->{current_info} = "$ibx->{inboxdir}";
$self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES;
$ibx->git->batch_prepare;
my $pr = $opt->{-progress};
sub _commit_txn {
my ($self) = @_;
+ if (my $eidx = $self->{eidx}) {
+ $eidx->git->async_wait_all;
+ $eidx->{transact_bytes} = 0;
+ }
if (my $xdb = $self->{xdb}) {
set_metadata_once($self);
$xdb->commit_transaction;