X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=88349faa38b7a9325a11900155d4715c9faa90de;hb=af0b0fb7a454470a32c452119d0392e0dedb3fe1;hp=0fbe6560d793d463ac841976c25e642c607b0c4d;hpb=db0fff29f2ee736117ba03e8d1e0ad8a251013f3;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 0fbe6560..88349faa 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -1,6 +1,6 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ -# based on notmuch, but with no concept of folders, files or flags +# based on notmuch, but with no concept of folders, files # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread @@ -54,20 +54,16 @@ sub new { } } $ibx = PublicInbox::InboxWritable->new($ibx); - my $self = bless { - ibx => $ibx, - xpfx => $inboxdir, # for xpfx_init - -altid => $altid, - ibx_ver => $version, - indexlevel => $indexlevel, - }, $class; - $self->xpfx_init; + my $self = PublicInbox::Search->new($ibx); + bless $self, $class; + $self->{ibx} = $ibx; + $self->{-altid} = $altid; + $self->{indexlevel} = $indexlevel; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; if ($ibx->{-skip_docdata}) { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; @@ -106,7 +102,6 @@ sub load_xapian_writable () { } eval 'require '.$X->{WritableDatabase} or die; *sortable_serialise = $xap.'::sortable_serialise'; - *sortable_unserialise = $xap.'::sortable_unserialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); my $ver = (eval($xap.'::major_version()') << 16) | @@ -352,8 +347,9 @@ sub index_ids ($$$$) { index_list_id($self, $doc, $hdr); } -sub add_xapian ($$$$) { +sub eml2doc ($$$;$) { my ($self, $eml, $smsg, $mids) = @_; + $mids //= mids_for_index($eml); my $doc = $X->{Document}->new; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); @@ -370,7 +366,7 @@ sub add_xapian ($$$$) { index_headers($self, $smsg); if (defined(my $eidx_key = $smsg->{eidx_key})) { - $doc->add_boolean_term('O'.$eidx_key); + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; } msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); @@ -396,12 +392,18 @@ sub add_xapian ($$$$) { } } } + $doc; +} + +sub add_xapian ($$$$) { + my ($self, $eml, $smsg, $mids) = @_; + my $doc = eml2doc($self, $eml, $smsg, $mids); $self->{xdb}->replace_document($smsg->{num}, $doc); } sub _msgmap_init ($) { my ($self) = @_; - die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1; + die "BUG: _msgmap_init is only for v1\n" if $self->{ibx}->version != 1; $self->{mm} //= eval { require PublicInbox::Msgmap; my $rw = $self->{ibx}->{-no_fsync} ? 2 : 1; @@ -460,7 +462,7 @@ sub add_eidx_info { begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; term_generator($self)->set_document($doc); - $doc->add_boolean_term('O'.$eidx_key); + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; index_list_id($self, $doc, $eml); $self->{xdb}->replace_document($docid, $doc); } @@ -494,17 +496,53 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub get_val ($$) { - my ($doc, $col) = @_; - sortable_unserialise($doc->get_value($col)); +sub set_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my %keep = map { $_ => 1 } @kw; + my %add = %keep; + my @rm; + my $end = $doc->termlist_end; + for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { + $cur->skip_to('K'); + last if $cur == $end; + my $kw = $cur->get_termname; + $kw =~ s/\AK//s or next; + $keep{$kw} ? delete($add{$kw}) : push(@rm, $kw); + } + return unless (scalar(@rm) + scalar(keys %add)); + $doc->remove_term('K'.$_) for @rm; + $doc->add_boolean_term('K'.$_) for (keys %add); + $self->{xdb}->replace_document($docid, $doc); +} + +sub add_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + $doc->add_boolean_term('K'.$_) for @kw; + $self->{xdb}->replace_document($docid, $doc); +} + +sub remove_keywords { + my ($self, $docid, @kw) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my $replace; + eval { + $doc->remove_term('K'.$_); + $replace = 1 + } for @kw; + $self->{xdb}->replace_document($docid, $doc) if $replace; } sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); $smsg->load_from_data($data); @@ -610,10 +648,12 @@ sub index_both { # git->cat_async callback $size += crlf_adjust($$bref); my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; ${$sync->{latest_cmt}} = $cur_cmt; } @@ -621,11 +661,14 @@ sub index_both { # git->cat_async callback sub unindex_both { # git->cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; return if is_bad_blob($oid, $type, $size, $sync->{oid}); - unindex_eml($sync->{sidx}, $oid, PublicInbox::Eml->new($bref)); + my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; + unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); # may be undef if leftover if (defined(my $cur_cmt = $sync->{cur_cmt})) { ${$sync->{latest_cmt}} = $cur_cmt; } + ++$self->{nidx}; } sub with_umask { @@ -865,6 +908,7 @@ sub _index_sync { my ($self, $opt) = @_; my $tip = $opt->{ref} || 'HEAD'; my $ibx = $self->{ibx}; + local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; $ibx->git->batch_prepare; my $pr = $opt->{-progress}; @@ -941,6 +985,10 @@ sub set_metadata_once { sub _commit_txn { my ($self) = @_; + if (my $eidx = $self->{eidx}) { + $eidx->git->async_wait_all; + $eidx->{transact_bytes} = 0; + } if (my $xdb = $self->{xdb}) { set_metadata_once($self); $xdb->commit_transaction;