Documentation/public-inbox-config.pod | 8 ++++++++ lib/PublicInbox/Config.pm | 2 +- lib/PublicInbox/ExtSearchIdx.pm | 43 ++++++++++++++++++++++++++++--------------- t/extsearch.t | 32 ++++++++++++++++++++++++++++++++ diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 05d9ca6266cd4ca3fd1b1fe4509c59455d43ab4a..5b86ef6cbfcd0152cb8ff41a285c28e31835fea3 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -124,6 +124,14 @@ (e.g. C<"looking for a complete sentence">) Default: C<full> +=item publicinbox.<name>.boost + +Control indexing order for L<public-inbox-extindex(1)>, with ties +broken by config file order. This only affects indexing and does +not affect messages which are already indexed. + +Default: C<0> + =item publicinbox.<name>.indexSequentialShard See L<public-inbox-index(1)/publicInbox.indexSequentialShard> diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 8e46328dbd70288291b423236c8a4c049eceb26a..7aa1f6c898ab7f75c3d2da3955a1f182fda7c501 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -429,7 +429,7 @@ my $v = $self->{"$pfx.$k"}; $ibx->{$k} = $v if defined $v; } for my $k (qw(filter inboxdir newsgroup replyto httpbackendmax feedmax - indexlevel indexsequentialshard)) { + indexlevel indexsequentialshard boost)) { my $v = get_1($self, $pfx, $k) // next; $ibx->{$k} = $v; } diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 0e27bba6573f91d125d95d1933b04c0fd5eaeb3a..357312b8d602591b4e89642964cf9e646b2c7b7c 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -44,7 +44,7 @@ xpfx => "$dir/ei".PublicInbox::Search::SCHEMA_VERSION, topdir => $dir, creat => $opt->{creat}, ibx_map => {}, # (newsgroup//inboxdir) => $ibx - ibx_list => [], + ibx_cfg => [], # by config section order indexlevel => $l, transact_bytes => 0, total_bytes => 0, @@ -62,7 +62,8 @@ sub attach_inbox { my ($self, $ibx) = @_; $self->{ibx_map}->{$ibx->eidx_key} //= do { - push @{$self->{ibx_list}}, 
$ibx; + delete $self->{-ibx_ary}; # invalidate cache + push @{$self->{ibx_cfg}}, $ibx; $ibx; } } @@ -388,7 +389,7 @@ sub _ibx_for ($$$) { my ($self, $sync, $smsg) = @_; my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset'; my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos"; - $self->{ibx_list}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" + $self->{-ibx_ary}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" } sub _fd_constrained ($) { @@ -402,7 +403,7 @@ } else { chomp($soft = `sh -c 'ulimit -n'`); } if (defined($soft)) { - my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate + my $want = scalar(@{$self->{-ibx_ary}}) + 64; # estimate my $ret = $want > $soft; if ($ret) { warn <idx_shard($docid)->ipc_do('xdb_remove', $docid); return; } - # we sort {xr3r} in the reverse order of {ibx_list} so we can + # we sort {xr3r} in the reverse order of ibx_sorted so we can # hit the common case in _reindex_finalize without rereading # from git (or holding multiple messages in memory). 
- my $id2pos = $sync->{id2pos}; # index in {ibx_list} + my $id2pos = $sync->{id2pos}; # index in ibx_sorted @$xr3 = sort { $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} || @@ -621,6 +622,17 @@ EOF undef; } +sub ibx_sorted ($) { + my ($self) = @_; + $self->{-ibx_ary} //= do { + # highest boost first, stable for config-ordering tiebreaker + use sort 'stable'; + [ sort { + ($b->{boost} // 0) <=> ($a->{boost} // 0) + } @{$self->{ibx_cfg}} ]; + } +} + sub eidxq_process ($$) { # for reindexing my ($self, $sync) = @_; @@ -638,7 +650,7 @@ } $sync->{id2pos} //= do { my %id2pos; my $pos = 0; - $id2pos{$_->{-ibx_id}} = $pos++ for @{$self->{ibx_list}}; + $id2pos{$_->{-ibx_id}} = $pos++ for (@{ibx_sorted($self)}); \%id2pos; }; my ($del, $iter); @@ -829,7 +841,7 @@ if (!eidxq_lock_acquire($self)) { warn "E: aborting --reindex\n"; return; } - for my $ibx (@{$self->{ibx_list}}) { + for my $ibx (@{ibx_sorted($self)}) { _reindex_inbox($self, $sync, $ibx); last if $sync->{quit}; } @@ -959,7 +971,7 @@ my $quit = PublicInbox::SearchIdx::quit_cb($sync); local $SIG{QUIT} = $quit; local $SIG{INT} = $quit; local $SIG{TERM} = $quit; - for my $ibx (@{$self->{ibx_list}}) { + for my $ibx (@{ibx_sorted($self)}) { $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); } if (delete($opt->{dedupe})) { @@ -973,7 +985,7 @@ } # don't use $_ here, it'll get clobbered by reindex_checkpoint if ($opt->{scan} // 1) { - for my $ibx (@{$self->{ibx_list}}) { + for my $ibx (@{ibx_sorted($self)}) { last if $sync->{quit}; sync_inbox($self, $sync, $ibx); } @@ -1115,7 +1127,7 @@ } } undef $dh; } - for my $ibx (@{$self->{ibx_list}}) { + for my $ibx (@{ibx_sorted($self)}) { # create symlinks for multi-pack-index $git_midx += symlink_packs($ibx, $pd); # add new lines to our alternates file @@ -1180,7 +1192,8 @@ if ($self->{cfg}) { my $pr = $self->{-watch_sync}->{-opt}->{-progress}; $pr->('reloading ...') if $pr; delete $self->{-resync_queue}; - @{$self->{ibx_list}} = (); + delete $self->{-ibx_ary}; + 
$self->{ibx_cfg} = []; %{$self->{ibx_map}} = (); delete $self->{-watch_sync}->{id2pos}; my $cfg = PublicInbox::Config->new; @@ -1194,7 +1207,7 @@ } sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler my ($self) = @_; - $self->{-resync_queue} //= [ @{$self->{ibx_list}} ]; + $self->{-resync_queue} //= [ @{ibx_sorted($self)} ]; PublicInbox::DS::requeue($self); # trigger our ->event_step } @@ -1225,9 +1238,9 @@ require PublicInbox::Syscall; require PublicInbox::Sigfd; my $idler = PublicInbox::InboxIdle->new($self->{cfg}); if (!$self->{cfg}) { - $idler->watch_inbox($_) for @{$self->{ibx_list}}; + $idler->watch_inbox($_) for (@{ibx_sorted($self)}); } - $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + $_->subscribe_unlock(__PACKAGE__, $self) for (@{ibx_sorted($self)}); my $pr = $opt->{-progress}; $pr->("performing initial scan ...\n") if $pr; my $sync = eidx_sync($self, $opt); # initial sync diff --git a/t/extsearch.t b/t/extsearch.t index 5f0cd8662bfc0a102bbc32223e4c42a0fa199a30..46a6f2ec5eb80b1f81a911486839c42cc5b0a654 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -60,6 +60,38 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex"); ok($es->has_threadid, '->has_threadid'); } +if ('with boost') { + xsys([qw(git config publicinbox.v1test.boost), 10], + { GIT_CONFIG => $cfg_path }); + ok(run_script([qw(-extindex --all), "$home/extindex-b"]), + 'extindex init with boost'); + my $es = PublicInbox::ExtSearch->new("$home/extindex-b"); + my $smsg = $es->over->get_art(1); + ok($smsg, 'got first article'); + my $xref3 = $es->over->get_xref3($smsg->{num}); + my @v1 = grep(/\Av1/, @$xref3); + my @v2 = grep(/\Av2/, @$xref3); + like($v1[0], qr/\Av1\.example.*?\b\Q$smsg->{blob}\E\b/, + 'smsg->{blob} respected boost'); + is(scalar(@$xref3), 2, 'only to entries'); + undef $es; + + xsys([qw(git config publicinbox.v2test.boost), 20], + { GIT_CONFIG => $cfg_path }); + ok(run_script([qw(-extindex --all --reindex), "$home/extindex-b"]), + 'extindex 
--reindex with altered boost'); + + $es = PublicInbox::ExtSearch->new("$home/extindex-b"); + $smsg = $es->over->get_art(1); + like($v2[0], qr/\Av2\.example.*?\b\Q$smsg->{blob}\E\b/, + 'smsg->{blob} respects boost after reindex'); + + xsys([qw(git config --unset publicinbox.v1test.boost)], + { GIT_CONFIG => $cfg_path }); + xsys([qw(git config --unset publicinbox.v2test.boost)], + { GIT_CONFIG => $cfg_path }); +} + { # TODO: -extindex should write this to config open $fh, '>>', $cfg_path or BAIL_OUT $!; print $fh <