X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=ade5575669c1b601162280369eb795aa3b0c40e1;hb=a367ec1b15a2458e532245f5308565dd84f8ca63;hp=a1baa65bd2945dcc8885fc70735a1d60ecd5853b;hpb=77eafbd653d2efac546f2c330d8cf5e84bef2712;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index a1baa65b..ade55756 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -18,15 +18,14 @@ use PublicInbox::IdxStack; use Carp qw(croak); use POSIX qw(strftime); use PublicInbox::OverIdx; -use PublicInbox::Spawn qw(spawn); +use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size nodatacow_dir); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size); my $X = \%PublicInbox::Search::X; my ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; -our $BATCH_BYTES = defined($ENV{XAPIAN_FLUSH_THRESHOLD}) ? - 0x7fffffff : 1_000_000; +our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; @@ -62,13 +61,16 @@ sub new { }, $class; $self->xpfx_init; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; + if ($ibx->{-skip_docdata}) { + $self->{-set_skip_docdata_once} = 1; + $self->{-skip_docdata} = 1; + } $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); - $self->{over}->{-no_sync} = 1 if $ibx->{-no_sync}; - $self->{index_max_size} = $ibx->{index_max_size}; + $self->{over}->{-no_fsync} = 1 if $ibx->{-no_fsync}; } elsif ($version == 2) { defined $shard or die "shard is required for v2\n"; # shard is a number @@ -110,12 +112,6 @@ sub load_xapian_writable () { 1; } -sub nodatacow_dir ($) { - my ($dir) = @_; - opendir my $dh, $dir or die "opendir($dir): $!\n"; - PublicInbox::Spawn::set_nodatacow(fileno($dh)); -} - sub idx_acquire { my ($self) = @_; my $flag; @@ -135,10 +131,11 @@ sub idx_acquire { ($is_shard && need_xapian($self)))) { File::Path::mkpath($dir); nodatacow_dir($dir); + $self->{-set_has_threadid_once} = 1; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_sync}; + $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; if ($@) { die "Failed opening $dir: ", $@; @@ -360,6 +357,7 @@ sub add_xapian ($$$$) { add_val($doc, PublicInbox::Search::DT(), $dt); add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes}); add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); + add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); my $tg = term_generator($self); $tg->set_document($doc); @@ -367,10 +365,18 @@ sub add_xapian ($$$$) { msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); - $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); - my $data = $smsg->to_doc_data; - $doc->set_data($data); + + # by default, we maintain compatibility with v1.5.0 and earlier + # by writing to docdata.glass, users who never exect to downgrade can + # use --skip-docdata + if (!$self->{-skip_docdata}) { + # WWW doesn't need {to} or {cc}, only NNTP + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + my $data = $smsg->to_doc_data; + $doc->set_data($data); + } + if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { my $pfx = $alt->{xprefix}; @@ -389,7 +395,7 @@ sub _msgmap_init ($) { die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1; $self->{mm} //= eval { require PublicInbox::Msgmap; - my $rw = $self->{ibx}->{-no_sync} ? 2 : 1; + my $rw = $self->{ibx}->{-no_fsync} ? 2 : 1; PublicInbox::Msgmap->new($self->{ibx}->{inboxdir}, $rw); }; } @@ -547,11 +553,11 @@ sub unindex_both { # git->cat_async callback # called by public-inbox-index sub index_sync { - my ($self, $opts) = @_; - delete $self->{lock_path} if $opts->{-skip_lock}; - $self->{ibx}->with_umask(\&_index_sync, $self, $opts); - if ($opts->{reindex}) { - my %again = %$opts; + my ($self, $opt) = @_; + delete $self->{lock_path} if $opt->{-skip_lock}; + $self->{ibx}->with_umask(\&_index_sync, $self, $opt); + if ($opt->{reindex}) { + my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); } @@ -560,10 +566,10 @@ sub index_sync { sub check_size { # check_async cb for -index --max-size=... my ($oid, $type, $size, $arg, $git) = @_; (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; - if ($size <= $arg->{index_max_size}) { + if ($size <= $arg->{max_size}) { $git->cat_async($oid, $arg->{index_oid}, $arg); } else { - warn "W: skipping $oid ($size > $arg->{index_max_size})\n"; + warn "W: skipping $oid ($size > $arg->{max_size})\n"; } } @@ -580,14 +586,22 @@ sub v1_checkpoint ($$;$) { $self->{mm}->last_commit($newest); } } else { - ${$sync->{max}} = $BATCH_BYTES; + ${$sync->{max}} = $self->{batch_bytes}; } $self->{mm}->{dbh}->commit; if ($newest && need_xapian($self)) { - my $cur = $self->{xdb}->get_metadata('last_commit'); + my $xdb = $self->{xdb}; + my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { - $self->{xdb}->set_metadata('last_commit', $newest); + $xdb->set_metadata('last_commit', $newest); + } + + # let SearchView know a full --reindex was done so it can + # generate ->has_threadid-dependent links + if ($sync->{reindex} && !ref($sync->{reindex})) { + my $n = $xdb->get_metadata('has_threadid'); + $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } } @@ -610,7 +624,7 @@ sub v1_checkpoint ($$;$) { sub process_stack { my ($self, $sync, $stk) = @_; my $git = $self->{ibx}->git; - my $max = $BATCH_BYTES; + my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; @@ -624,13 +638,13 @@ sub process_stack { $git->cat_async($oid, \&unindex_both, $self); } } - if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) { + if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { if ($f eq 'm') { my $arg = { %$sync, autime => $at, cotime => $ct }; - if ($sync->{index_max_size}) { + if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { $git->cat_async($oid, \&index_both, $arg); @@ -753,14 +767,15 @@ sub reindex_from ($$) { # indexes all unindexed messages (v1 only) sub _index_sync { - my ($self, $opts) = @_; - my $tip = $opts->{ref} || 'HEAD'; + my ($self, $opt) = @_; + my $tip = $opt->{ref} || 'HEAD'; my $git = $self->{ibx}->git; + $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; $git->batch_prepare; - my $pr = $opts->{-progress}; - my $sync = { reindex => $opts->{reindex}, -opt => $opts }; + my $pr = $opt->{-progress}; + my $sync = { reindex => $opt->{reindex}, -opt => $opt }; my $xdb = $self->begin_txn_lazy; - $self->{over}->rethread_prepare($opts); + $self->{over}->rethread_prepare($opt); my $mm = _msgmap_init($self); if ($sync->{reindex}) { my $last = $mm->last_commit; @@ -788,40 +803,6 @@ sub DESTROY { $_[0]->{lockfh} = undef; } -# remote_* subs are only used by SearchIdxPart -sub remote_commit { - my ($self) = @_; - if (my $w = $self->{w}) { - print $w "commit\n" or die "failed to write commit: $!"; - } else { - $self->commit_txn_lazy; - } -} - -sub remote_close { - my ($self) = @_; - if (my $w = delete $self->{w}) { - my $pid = delete $self->{pid} or die "no process to wait on\n"; - print $w "close\n" or die "failed to write to pid:$pid: $!\n"; - close $w or die "failed to close pipe for pid:$pid: $!\n"; - waitpid($pid, 0) == $pid or die "remote process did not finish"; - $? == 0 or die ref($self)." pid:$pid exited with: $?"; - } else { - die "transaction in progress $self\n" if $self->{txn}; - idx_release($self) if $self->{xdb}; - } -} - -sub remote_remove { - my ($self, $oid, $num) = @_; - if (my $w = $self->{w}) { - # triggers remove_by_oid in a shard - print $w "D $oid $num\n" or die "failed to write remove $!"; - } else { - $self->remove_by_oid($oid, $num); - } -} - sub _begin_txn { my ($self) = @_; my $xdb = $self->{xdb} || idx_acquire($self); @@ -838,23 +819,31 @@ sub begin_txn_lazy { # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) # This metadata is read by Admin::detect_indexlevel: -sub set_indexlevel { +sub set_metadata_once { my ($self) = @_; - if (!$self->{shard} && # undef or 0, not >0 - delete($self->{-set_indexlevel_once})) { - my $xdb = $self->{xdb}; + return if $self->{shard}; # only continue if undef or 0, not >0 + my $xdb = $self->{xdb}; + + if (delete($self->{-set_has_threadid_once})) { + $xdb->set_metadata('has_threadid', '1'); + } + if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { $xdb->set_metadata('indexlevel', 'medium'); } } + if (delete($self->{-set_skip_docdata_once})) { + $xdb->get_metadata('skip_docdata') or + $xdb->set_metadata('skip_docdata', '1'); + } } sub _commit_txn { my ($self) = @_; if (my $xdb = $self->{xdb}) { - set_indexlevel($self); + set_metadata_once($self); $xdb->commit_transaction; } $self->{over}->commit_lazy if $self->{over};