lib/PublicInbox/LEI.pm
lib/PublicInbox/LeiDedupe.pm
lib/PublicInbox/LeiExternal.pm
+lib/PublicInbox/LeiQuery.pm
lib/PublicInbox/LeiSearch.pm
lib/PublicInbox/LeiStore.pm
lib/PublicInbox/LeiToMail.pm
package PublicInbox::LEI;
use strict;
use v5.10.1;
-use parent qw(PublicInbox::DS PublicInbox::LeiExternal);
+use parent qw(PublicInbox::DS PublicInbox::LeiExternal
+ PublicInbox::LeiQuery);
use Getopt::Long ();
use Socket qw(AF_UNIX SOCK_STREAM pack_sockaddr_un);
use Errno qw(EAGAIN ECONNREFUSED ENOENT);
our %CMD = ( # sorted in order of importance/use:
'q' => [ 'SEARCH_TERMS...', 'search for messages matching terms', qw(
save-as=s output|mfolder|o=s format|f=s dedupe|d=s thread|t augment|a
- sort|s=s@ reverse|r offset=i remote local! external!
+ sort|s=s reverse|r offset=i remote local! external! pretty
since|after=s until|before=s), opt_dash('limit|n=i', '[0-9]+') ],
'show' => [ 'MID|OID', 'show a given object (Message-ID or object ID)',
'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ],
'offset=i' => ['OFF', 'search result offset (default: 0)'],
-'sort|s=s@' => [ 'VAL|internaldate,date,relevance,docid',
+'sort|s=s' => [ 'VAL|received,relevance,docid',
"order of results `--output'-dependent"],
+'reverse|r' => [ 'reverse search results' ], # like sort(1)
'boost=i' => 'increase/decrease priority of results (default: 0)',
my ($self, @argv) = @_;
}
-sub lei_query {
- my ($self, @argv) = @_;
-}
-
# "mark" command handler.  Currently a stub: it only unpacks the
# invocant and the remaining command-line arguments and does no work.
sub lei_mark {
my ($self, @argv) = @_;
}
use parent qw(Exporter);
our @EXPORT = qw(lei_ls_external lei_add_external lei_forget_external);
-sub lei_ls_external {
- my ($self, @argv) = @_;
- my $stor = $self->_lei_store(0);
# Walks every "external.*" section of the lei config and gathers the
# "boost" value configured for each location (the section name with
# the "external." prefix removed).
#
# Arguments: ($self, $cb, @arg)
#   $cb  - optional callback, invoked as $cb->(@arg, $loc, $boost)
#          once per location
#   @arg - extra leading arguments handed to every $cb invocation
#
# Returns:
#   - without $cb, in non-list context: a hashref of { $loc => $boost }
#   - without $cb, in list context: the ordered list of locations
#   - with $cb: the ordered locations (their count in scalar context)
+sub _externals_each {
+ my ($self, $cb, @arg) = @_;
my $cfg = $self->_lei_cfg(0);
- my $out = $self->{1};
- my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
- my (%boost, @loc);
+ my %boost;
for my $sec (grep(/\Aexternal\./, @{$cfg->{-section_order}})) {
my $loc = substr($sec, length('external.'));
$boost{$loc} = $cfg->{"$sec.boost"};
- push @loc, $loc;
}
- use sort 'stable';
+ return \%boost if !wantarray && !$cb;
+
# highest boost first, but stable for alphabetic tie break
# (keys are sorted alphabetically first, then stably by descending boost)
- for (sort { $boost{$b} <=> $boost{$a} } sort keys %boost) {
- # TODO: use miscidx and show docid so forget/set is easier
- print $out $_, $OFS, 'boost=', $boost{$_}, $ORS;
+ use sort 'stable';
+ my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost;
+ return @order if !$cb;
+ for my $loc (@order) {
+ $cb->(@arg, $loc, $boost{$loc});
}
+ @order; # scalar or array
+}
+
+# "ls-external" command: prints one "LOCATION boost=N" record per
+# configured external to the handle stored in $self->{1}.  With the
+# "z" option, fields are separated by NUL and records by a double NUL
+# instead of space and newline.
+sub lei_ls_external {
+	my ($self, @argv) = @_;
+	my $sto = $self->_lei_store(0); # result is not used below
+	my $fh = $self->{1};
+	my @sep = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
+	my $show = sub {
+		my ($loc, $boost_val) = @_;
+		print {$fh} $loc, $sep[0], 'boost=', $boost_val, $sep[1];
+	};
+	$self->_externals_each($show);
+}
sub lei_add_external {
--- /dev/null
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# handles lei <q|ls-query|rm-query|mv-query> commands
+package PublicInbox::LeiQuery;
+use strict;
+use v5.10.1;
+use PublicInbox::MID qw($MID_EXTRACT);
+use POSIX qw(strftime);
+use PublicInbox::Address qw(pairs);
+use PublicInbox::Search qw(get_pct);
+
+# formats epoch seconds as a UTC timestamp: YYYY-MM-DDTHH:MM:SSZ
+sub _iso8601 ($) {
+	my ($epoch) = @_;
+	strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($epoch));
+}
+
+# Prepares an smsg for JSON.  The smsg is modified in place:
+#   lines, bytes   => dropped
+#   ts, ds         => rcvd, dt (timestamps via _iso8601)
+#   references     => array of "<mid>" strings
+#   mid            => m, wrapped in angle brackets
+#   from, to, cc   => f, t, c (only when true)
+#   subject        => s (always assigned, even when absent)
+# Returns an unblessed shallow copy of the resulting hash.
+sub _smsg_unbless ($) {
+	my ($smsg) = @_;
+
+	delete @$smsg{qw(lines bytes)};
+	$smsg->{rcvd} = _iso8601(delete $smsg->{ts}); # JMAP receivedAt
+	$smsg->{dt} = _iso8601(delete $smsg->{ds}); # JMAP UTCDate
+
+	my $refs = delete $smsg->{references};
+	if ($refs) {
+		my @mids = ($refs =~ m/$MID_EXTRACT/go);
+		$smsg->{references} = [ map { "<$_>" } @mids ];
+	}
+	my $mid = delete($smsg->{mid});
+	$smsg->{'m'} = "<$mid>" if $mid;
+
+	# XXX From/To/Cc stay plain strings: splitting them into
+	# structured "$phrase <$address>" arrays or tables makes
+	# pretty-printed JSON take up too much vertical space, and
+	# neither Cpanel::JSON::XS, JSON::XS nor jq(1) can be told to
+	# indent only when wrapping is actually necessary.
+	for my $hdr (qw(from to cc)) {
+		my $val = delete $smsg->{$hdr};
+		$smsg->{substr($hdr, 0, 1)} = $val if $val;
+	}
+	$smsg->{'s'} = delete $smsg->{subject};
+	# can we be bothered to parse From/To/Cc into arrays?
+	return +{ %$smsg }; # unbless
+}
+
+# _externals_each callback: appends a search source object for $dir
+# to the array referenced by $src.  The type is chosen by probing the
+# directory: an "ei.lock" file means an ExtSearch, while an
+# "inbox.lock" file or a "public-inbox" directory means an Inbox.
+# Anything else is skipped with a warning.
+sub _vivify_external { # _externals_each callback
+	my ($src, $dir) = @_;
+	if (-f "$dir/ei.lock") {
+		require PublicInbox::ExtSearch;
+		return push(@$src, PublicInbox::ExtSearch->new($dir));
+	}
+	if (-f "$dir/inbox.lock" || -d "$dir/public-inbox") { # v2, v1
+		require PublicInbox::Inbox;
+		my $ibx = bless { inboxdir => $dir }, 'PublicInbox::Inbox';
+		return push(@$src, $ibx);
+	}
+	warn "W: ignoring $dir, unable to determine type\n";
+}
+
+# the main "lei q SEARCH_TERMS" method
+#
+# Joins @argv into a single query string, runs it against every
+# selected source and streams each matching smsg through $self->out
+# as one element of a hand-assembled JSON array.
+#
+# Options read from $self->{opt}: local, external, thread, limit
+# (set to 10000 here when unset; a negative value removes the limit),
+# offset, sort, reverse, pretty, indent and ascii.
+# Dies on an unrecognized --sort value.
+sub lei_q {
+	my ($self, @argv) = @_;
+	my $sto = $self->_lei_store(1);
+	my $cfg = $self->_lei_cfg(1);
+	my $opt = $self->{opt};
+	my $qstr = join(' ', map {;
+		# Consider spaces in argv to be for phrase search in Xapian.
+		# In other words, the users should need only care about
+		# normal shell quotes and not have to learn Xapian quoting.
+		/\s/ ? (s/\A(\w+:)// ? qq{$1"$_"} : qq{"$_"}) : $_
+	} @argv);
+	$opt->{limit} //= 10000;
+
+	# --local is enabled by default
+	# NOTE(review): unlike {external} below, nothing visible here
+	# defaults {local} to true, so the local store appears to be
+	# searched only when --local is given explicitly -- confirm
+	my @src = $opt->{'local'} ? ($sto->search) : ();
+
+	# --external is enabled by default, but allow --no-external
+	if ($opt->{external} // 1) {
+		$self->_externals_each(\&_vivify_external, \@src);
+		# {tid} is not unique between indices, so we have to search
+		# each src individually
+		if (!$opt->{thread}) {
+			require PublicInbox::LeiXSearch;
+			my $lxs = PublicInbox::LeiXSearch->new;
+			# local is always first
+			$lxs->attach_external($_) for @src;
+			@src = ($lxs);
+		}
+	}
+	# NOTE(review): this reads $self->{output}, not $opt->{output};
+	# presumably --output is not wired up yet -- confirm
+	my $out = $self->{output} // '-';
+	$out = 'json:/dev/stdout' if $out eq '-';
+	my $isatty = -t $self->{1};
+	$self->start_pager if $isatty;
+	my $json = substr($out, 0, 5) eq 'json:' ?
+		ref(PublicInbox::Config->json)->new : undef;
+	if ($json) {
+		# pretty-print by default only when writing to a terminal
+		if ($opt->{pretty} //= $isatty) {
+			$json->pretty(1)->space_before(0);
+			$json->indent_length($opt->{indent} // 2);
+		}
+		$json->utf8; # avoid Wide character in print warnings
+		$json->ascii(1) if $opt->{ascii}; # for "\uXXXX"
+		$json->canonical;
+	}
+
+	# src: LeiXSearch || LeiSearch || Inbox
+	my %mset_opt = map { $_ => $opt->{$_} } qw(thread limit offset);
+	delete $mset_opt{limit} if $opt->{limit} < 0;
+	$mset_opt{asc} = $opt->{'reverse'} ? 1 : 0;
+	if (defined(my $sort = $opt->{'sort'})) {
+		if ($sort eq 'relevance') {
+			$mset_opt{relevance} = 1;
+		} elsif ($sort eq 'docid') {
+			$mset_opt{relevance} = $mset_opt{asc} ? -1 : -2;
+		} elsif ($sort =~ /\Areceived(?:-?[aA]t)?\z/) {
+			# the default
+		} else {
+			die "unrecognized --sort=$sort\n";
+		}
+	}
+	# $self->out($json->encode(\%mset_opt));
+	# descending docid order
+	$mset_opt{relevance} //= -2 if $opt->{thread};
+	# my $wcb = PublicInbox::LeiToMail->write_cb($out, $self);
+
+	# even w/o pretty, do the equivalent of a --pretty=oneline
+	# output so "lei q SEARCH_TERMS | wc -l" can be useful:
+	my $ORS = $json ? ($opt->{pretty} ? ', ' : ",\n") : "\n";
+
+	# $buf holds the most recently encoded record which has not been
+	# written yet.  It is flushed (with $ORS appended) only once the
+	# next record shows up, so the separator lands strictly between
+	# records and the final record can be closed with "]\n" instead.
+	my $buf;
+
+	# we can generate too many records to hold in RAM, so we stream
+	# and fake a JSON array starting here:
+	$self->out('[') if $json;
+	my $emit_cb = sub {
+		my ($smsg) = @_;
+		delete @$smsg{qw(tid num)}; # only makes sense if single src
+		chomp($buf = $json->encode(_smsg_unbless($smsg)));
+	};
+	for my $src (@src) {
+		my $srch = $src->search;
+		my $over = $src->over;
+		my $smsg_for = $src->can('smsg_for'); # LeiXSearch
+		my $mo = { %mset_opt };
+		my $mset = $srch->mset($qstr, $mo);
+		my $ctx = {};
+		if ($smsg_for) {
+			for my $it ($mset->items) {
+				my $smsg = $smsg_for->($srch, $it) or next;
+				$self->out($buf .= $ORS) if defined $buf;
+				$smsg->{relevance} = get_pct($it);
+				$emit_cb->($smsg);
+			}
+		} else { # --thread
+			my $ids = $srch->mset_to_artnums($mset, $mo);
+			$ctx->{ids} = $ids;
+			my $i = 0;
+			# article number => match percentage of direct hits
+			my %n2p = map {
+				($ids->[$i++], get_pct($_));
+			} $mset->items;
+			undef $mset;
+			while ($over && $over->expand_thread($ctx)) {
+				for my $n (@{$ctx->{xids}}) {
+					my $t = $over->get_art($n) or next;
+					if (my $p = delete $n2p{$t->{num}}) {
+						$t->{relevance} = $p;
+					}
+					# nothing is pending before the first
+					# record; writing a bare separator
+					# here would corrupt the JSON array
+					$self->out($buf .= $ORS) if defined $buf;
+					$emit_cb->($t);
+				}
+				@{$ctx->{xids}} = ();
+			}
+		}
+	}
+	$self->out($buf .= "]\n"); # done
+}
+
+1;
my (undef, $dir, $opt) = @_;
my $eidx = PublicInbox::ExtSearchIdx->new($dir, $opt);
my $self = bless { priv_eidx => $eidx }, __PACKAGE__;
- eidx_init($self) if $opt->{creat};
+ eidx_init($self)->done if $opt->{creat};
$self;
}
_mbox_write_cb($cls, $1, $dst, $lei);
} elsif ($dst =~ s!\A[Mm]aildir:!!) { # typically capitalized
_maildir_write_cb($dst, $lei);
+ } else {
+ undef;
}
# TODO: Maildir, MH, IMAP, JMAP ...
}
# Registers $ibxish as a search source.  Objects which have no ->over
# method, or whose ->over returns false, are pushed onto {remotes}.
# Otherwise the Xapian shards of $ibxish are appended to {shards_flat}
# and $ibxish is recorded once per shard in {shard2ibx}.  Warns and
# returns early when ->search is false or yields no shards.
sub attach_external {
my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox
- if (!$ibxish->can('over')) {
- push @{$self->{remotes}}, $ibxish
+
+ if (!$ibxish->can('over') || !$ibxish->over) {
+ return push(@{$self->{remotes}}, $ibxish)
}
+ my $desc = $ibxish->{inboxdir} // $ibxish->{topdir};
+ my $srch = $ibxish->search or
+ return warn("$desc not indexed for Xapian\n");
+ my @shards = $srch->xdb_shards_flat or
+ return warn("$desc has no Xapian shards\n");
+
if (delete $self->{xdb}) { # XXX: do we need this?
# clobber existing {xdb} if amending
my $expect = delete $self->{nshard};
$nr == $expect or die
"BUG: reloaded $nr shards, expected $expect"
}
- my @shards = $ibxish->search->xdb_shards_flat;
push @{$self->{shards_flat}}, @shards;
push(@{$self->{shard2ibx}}, $ibxish) for (@shards);
}
+# returns a list of local inboxes (or count in scalar context)
+# Entries of {shard2ibx} are de-duplicated by their stringified
+# value, since one inbox is recorded once per shard.
+sub locals {
+	my ($self) = @_;
+	my %seen;
+	for my $ibx (@{$self->{shard2ibx} // []}) {
+		$seen{"$ibx"} = $ibx;
+	}
+	values %seen;
+}
+
# called by PublicInbox::Search::xdb
-sub xdb_shards_flat { @{$_[0]->{shards_flat}} }
+# returns every attached shard as a flat list (the count in scalar
+# context); yields nothing when no shard has been attached yet
+sub xdb_shards_flat {
+	my ($self) = @_;
+	@{$self->{shards_flat} // []};
+}
# like over->get_art
sub smsg_for {
$self->mset($qstr //= 'bytes:1..', $opt);
}
+# always returns nothing (undef in scalar context)
+sub over { return }
+
1;
package PublicInbox::Search;
use strict;
use parent qw(Exporter);
-our @EXPORT_OK = qw(retry_reopen int_val);
+our @EXPORT_OK = qw(retry_reopen int_val get_pct);
use List::Util qw(max);
# values for searching, changing the numeric value breaks
sortable_unserialise($val) + 0; # PV => IV conversion
}
+# Returns the match percentage of an mset item, clamped to at most 99.
+sub get_pct ($) { # mset item
+	my ($item) = @_;
+	my $pct = $item->get_percent;
+	# Capped at "99%" since "100%" takes an extra column in the
+	# thread skeleton view.  <xapian/mset.h> says the value isn't
+	# very meaningful, anyways.
+	return 99 if $pct > 99;
+	$pct;
+}
+
1;
use PublicInbox::WwwStream qw(html_oneshot);
use PublicInbox::SearchThread;
use PublicInbox::SearchQuery;
-use PublicInbox::Search;
+use PublicInbox::Search qw(get_pct);
my %rmap_inc;
sub mbox_results {
} @{$_[0]} ]
}
-sub get_pct ($) {
- # Capped at "99%" since "100%" takes an extra column in the
- # thread skeleton view. <xapian/mset.h> says the value isn't
- # very meaningful, anyways.
- my $n = $_[0]->get_percent;
- $n > 99 ? 99 : $n;
-}
-
sub mset_thread {
my ($ctx, $mset, $q) = @_;
my $ibx = $ctx->{ibx};
return if $done eq $home;
use PublicInbox::InboxWritable;
for my $V (1, 2) {
- run_script([qw(-init -Lmedium), "-V$V", "t$V",
+ run_script([qw(-init), "-V$V", "t$V",
'--newsgroup', "t.$V",
"$home/t$V", "http://example.com/t$V",
"t$V\@example.com" ]) or BAIL_OUT "init v$V";
});
$lei->('ls-external');
like($out, qr/boost=0\n/s, 'ls-external has output');
+
+ # note, on a Bourne shell users should be able to use either:
+ # s:"use boolean prefix"
+ # "s:use boolean prefix"
+ # or use single quotes, it should not matter. Users only need
+ # to know shell quoting rules, not Xapian quoting rules.
+ # No double-quoting should be imposed on users on the CLI
+ $lei->('q', 's:use boolean prefix');
+ like($out, qr/search: use boolean prefix/, 'phrase search got result');
};
my $test_lei_common = sub {
is($lxs->smsg_for(($mset->items)[0])->{docid}, $max,
'got highest docid');
+my @ibxish = $lxs->locals;
+is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back');
+is($lxs->search, $lxs, '->search works');
+is($lxs->over, undef, '->over fails');
+
done_testing;