use v5.10.1;
use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
use PublicInbox::Eml;
+use PublicInbox::Search qw(xap_terms);
use PublicInbox::InboxWritable;
use PublicInbox::MID qw(mids_for_index mids);
use PublicInbox::MsgIter;
use PublicInbox::Spawn qw(spawn nodatacow_dir);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Address;
+use Config;
our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
index_text term_generator add_val is_bad_blob);
my $X = \%PublicInbox::Search::X;
our ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
-our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000;
+our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
+ # assume a typical 64-bit system has 8x more RAM than a
+ # typical 32-bit system:
+ (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); # i.e. 8 MiB when pointers are >= 8 bytes, 1 MiB otherwise
+
use constant DEBUG => !!$ENV{DEBUG};
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
+my @VMD_MAP = (kw => 'K', L => 'L'); # flat list of (vmd field name => term prefix) pairs, consumed two at a time
our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
sub new {
sub idx_release {
my ($self, $wake) = @_;
if (need_xapian($self)) {
- my $xdb = delete $self->{xdb} or croak 'not acquired';
+ my $xdb = delete $self->{xdb} or croak '{xdb} not acquired';
$xdb->close;
}
$self->lock_release($wake) if $self->{creat};
sub load_xapian_writable () {
return 1 if $X->{WritableDatabase};
- PublicInbox::Search::load_xapian() or return;
+ PublicInbox::Search::load_xapian() or die "failed to load Xapian: $@\n";
my $xap = $PublicInbox::Search::Xap;
for (qw(Document TermGenerator WritableDatabase)) {
$X->{$_} = $xap.'::'.$_;
}
}
+sub index_phrase ($$$$) {
+    # Index $text under $prefix via TermGenerator->index_text, then bump
+    # the term position so the next indexed text does not run on from
+    # this one.  Used regardless of $self->{indexlevel}.
+    my $self = shift;
+    my ($text, $wdf_inc, $prefix) = @_;
+
+    my $term_gen = term_generator($self);
+    $term_gen->index_text($text, $wdf_inc, $prefix);
+    $term_gen->increase_termpos;
+}
+
sub index_text ($$$$) {
my ($self, $text, $wdf_inc, $prefix) = @_;
- my $tg = term_generator($self); # man Search::Xapian::TermGenerator
if ($self->{indexlevel} eq 'full') {
- $tg->index_text($text, $wdf_inc, $prefix);
- $tg->increase_termpos;
+ index_phrase($self, $text, $wdf_inc, $prefix); # 'full' level: delegate to the positional indexer
} else {
+ my $tg = term_generator($self); # any other level: same text, but indexed without positions below
$tg->index_text_without_positions($text, $wdf_inc, $prefix);
}
}
sub index_headers ($$) {
my ($self, $smsg) = @_;
- my @x = (from => 'A', # Author
- subject => 'S', to => 'XTO', cc => 'XCC');
+ my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $val = $smsg->{$field};
+ next if $val eq '';
+ # include "(comments)" after the address, too, so not using
+ # PublicInbox::Address::names or pairs
+ index_text($self, $val, 1, $pfx);
+
+ # we need positional info for email addresses since they
+ # can be considered phrases
+ if ($self->{indexlevel} eq 'medium') {
+ for my $addr (PublicInbox::Address::emails($val)) {
+ index_phrase($self, $addr, 1, $pfx);
+ }
+ }
+ }
+ @x = (subject => 'S');
while (my ($field, $pfx) = splice(@x, 0, 2)) {
my $val = $smsg->{$field};
index_text($self, $val, 1, $pfx) if $val ne '';
index_text($self, join("\n", @$xnq), 1, 'XNQ');
@$xnq = ();
}
- index_text($self, $text, 1, $pfx);
+ if ($pfx eq 'XDFN') {
+ index_phrase($self, $text, 1, $pfx);
+ } else {
+ index_text($self, $text, 1, $pfx);
+ }
}
sub index_old_diff_fn {
my $ct = $part->content_type || 'text/plain';
my $fn = $part->filename;
if (defined $fn && $fn ne '') {
- index_text($self, $fn, 1, 'XFN');
+ index_phrase($self, $fn, 1, 'XFN');
}
if ($part->{is_submsg}) {
my $mids = mids_for_index($part);
$l =~ /<([^>]+)>/ or next;
my $lid = lc $1;
$doc->add_boolean_term('G' . $lid);
- index_text($self, $lid, 1, 'XL'); # probabilistic
+ index_phrase($self, $lid, 1, 'XL'); # probabilistic
}
}
sub index_ids ($$$$) {
my ($self, $doc, $hdr, $mids) = @_;
for my $mid (@$mids) {
- index_text($self, $mid, 1, 'XM');
+ index_phrase($self, $mid, 1, 'XM');
# because too many Message-IDs are prefixed with
# "Pine.LNX."...
if ($mid =~ /\w{12,}/) {
my @long = ($mid =~ /(\w{3,}+)/g);
- index_text($self, join(' ', @long), 1, 'XM');
+ index_phrase($self, join(' ', @long), 1, 'XM');
}
}
$doc->add_boolean_term('Q' . $_) for @$mids;
if (!$self->{-skip_docdata}) {
# WWW doesn't need {to} or {cc}, only NNTP
$smsg->{to} = $smsg->{cc} = '';
- PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
+ $smsg->parse_references($eml, $mids);
my $data = $smsg->to_doc_data;
$doc->set_data($data);
}
sub add_xapian ($$$$) {
my ($self, $eml, $smsg, $mids) = @_;
begin_txn_lazy($self);
+ my $merge_vmd = delete $smsg->{-merge_vmd}; # one-shot flag: removed from $smsg as it is read
my $doc = eml2doc($self, $eml, $smsg, $mids);
+ if (my $old = $merge_vmd ? _get_doc($self, $smsg->{num}) : undef) { # only look up the existing doc when a merge was requested
+ my @x = @VMD_MAP; # work on a copy, splice() below consumes it
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ for my $term (xap_terms($pfx, $old)) { # presumably the $old terms under $pfx with the prefix stripped -- confirm in PublicInbox::Search
+ $doc->add_boolean_term($pfx.$term); # carry the old term over onto the freshly built doc
+ }
+ }
+ }
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
term_generator($self)->set_document($doc);
+
+ # '.' is special for lei_store
$doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
+
index_list_id($self, $doc, $eml);
$self->{xdb}->replace_document($docid, $doc);
}
$self->{xdb}->replace_document($docid, $doc);
}
-sub set_keywords {
- my ($self, $docid, @kw) = @_;
+sub set_vmd { # for each @VMD_MAP field present in $vmd, make $docid's terms under that prefix match the given list
+ my ($self, $docid, $vmd) = @_;
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
- my %keep = map { $_ => 1 } @kw;
- my %add = %keep;
- my @rm;
- my $end = $doc->termlist_end;
- for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
- $cur->skip_to('K');
- last if $cur == $end;
- my $kw = $cur->get_termname;
- $kw =~ s/\AK//s or next;
- $keep{$kw} ? delete($add{$kw}) : push(@rm, $kw);
- }
- return unless (scalar(@rm) + scalar(keys %add));
- $doc->remove_term('K'.$_) for @rm;
- $doc->add_boolean_term('K'.$_) for (keys %add);
+ my ($end, @rm, @add); # @rm/@add collect fully-prefixed terms across all fields
+ my @x = @VMD_MAP; # work on a copy, splice() below consumes it
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $set = $vmd->{$field} // next; # a field missing from $vmd is left untouched
+ my %keep = map { $_ => 1 } @$set;
+ my %add = %keep; # starts as the full wanted set; entries already on the doc are deleted below
+ $end //= $doc->termlist_end; # fetched once, on the first field actually processed
+ for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
+ $cur->skip_to($pfx);
+ last if $cur == $end;
+ my $v = $cur->get_termname;
+ $v =~ s/\A$pfx//s or next; # terms not starting with this prefix are ignored
+ $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v); # wanted and present: nothing to add; present but unwanted: queue removal
+ }
+ push(@add, map { $pfx.$_ } keys %add); # whatever is left was wanted but not on the doc
+ }
+ return unless scalar(@rm) || scalar(@add); # no difference found: skip the write
+ $doc->remove_term($_) for @rm;
+ $doc->add_boolean_term($_) for @add;
$self->{xdb}->replace_document($docid, $doc);
}
-sub add_keywords {
- my ($self, $docid, @kw) = @_;
+sub apply_vmd_mod ($$) {
+    # Apply the "-field" (remove) and "+field" (add) lists in $vmd_mod to
+    # $doc as prefixed boolean terms.  Returns the number of terms removed
+    # plus the number of terms added; the caller decides whether to write
+    # $doc back.
+    my ($doc, $vmd_mod) = @_;
+    my $updated = 0;
+    # @VMD_MAP is a flat (field => prefix) list, so walk it in pairs
+    for (my $i = 0; $i < @VMD_MAP; $i += 2) {
+        my ($field, $pfx) = @VMD_MAP[$i, $i + 1];
+        # field: "L" or "kw"
+        my $del = $vmd_mod->{"-$field"} // [];
+        for my $val (@$del) {
+            # a removal is only counted when remove_term did not die
+            $updated++ if eval { $doc->remove_term($pfx . $val); 1 };
+        }
+        my $add = $vmd_mod->{"+$field"} // [];
+        for my $val (@$add) {
+            $doc->add_boolean_term($pfx . $val);
+            $updated++;
+        }
+    }
+    $updated;
+}
+
+sub add_vmd { # add the terms listed per field in $vmd to $docid, then apply any "+field"/"-field" lists also found in $vmd
+ my ($self, $docid, $vmd) = @_;
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
- $doc->add_boolean_term('K'.$_) for @kw;
- $self->{xdb}->replace_document($docid, $doc);
+ my @x = @VMD_MAP; # work on a copy, splice() below consumes it
+ my $updated = 0;
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $add = $vmd->{$field} // next; # field not supplied: nothing to add for it
+ $doc->add_boolean_term($pfx . $_) for @$add;
+ $updated += scalar(@$add); # counts requested terms, whether or not they were already on the doc
+ }
+ $updated += apply_vmd_mod($doc, $vmd);
+ $self->{xdb}->replace_document($docid, $doc) if $updated; # skip the write when nothing was requested
}
-sub remove_keywords {
- my ($self, $docid, @kw) = @_;
+sub remove_vmd { # remove the terms listed per field in $vmd from $docid
+ my ($self, $docid, $vmd) = @_;
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
my $replace;
- eval {
- $doc->remove_term('K'.$_);
- $replace = 1
- } for @kw;
+ my @x = @VMD_MAP; # work on a copy, splice() below consumes it
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $rm = $vmd->{$field} // next; # field not supplied: nothing to remove for it
+ for (@$rm) {
+ eval { # an exception from remove_term is deliberately swallowed (presumably thrown when the term is absent -- confirm)
+ $doc->remove_term($pfx . $_);
+ $replace = 1; # only reached when remove_term did not die
+ };
+ }
+ }
$self->{xdb}->replace_document($docid, $doc) if $replace;
}
-sub smsg_from_doc ($) {
- my ($doc) = @_;
- my $data = $doc->get_data or return;
- my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->{ts} = int_val($doc, PublicInbox::Search::TS());
- my $dt = int_val($doc, PublicInbox::Search::DT());
- my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt);
- $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy);
- $smsg->load_from_data($data);
- $smsg;
+sub update_vmd { # apply the "+field"/"-field" lists in $vmd_mod to $docid; returns the count from apply_vmd_mod
+ my ($self, $docid, $vmd_mod) = @_;
+ begin_txn_lazy($self);
+ my $doc = _get_doc($self, $docid) or return; # no such doc: bare return
+ my $updated = apply_vmd_mod($doc, $vmd_mod);
+ $self->{xdb}->replace_document($docid, $doc) if $updated; # write back only when apply_vmd_mod reported a change
+ $updated;
}
sub xdb_remove {
${$sync->{max}} = $self->{batch_bytes};
$self->{mm}->{dbh}->commit;
- my $xdb = need_xapian($self) ? $self->{xdb} : undef;
+ my $xdb = $self->{xdb};
if ($newest && $xdb) {
my $cur = $xdb->get_metadata('last_commit');
if (need_update($self, $cur, $newest)) {