From fad9acd35e56a289ade90a62d056b2a6663d448c Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Fri, 6 Apr 2018 21:44:38 +0000 Subject: [PATCH] www: favor reading more from SQLite, and less from Xapian Favor simpler internal APIs this time around, this cuts a fair amount of code out and takes another step towards removing Xapian as a dependency for v2 repos. --- lib/PublicInbox/Inbox.pm | 2 +- lib/PublicInbox/Mbox.pm | 39 ++++++++++---------------- lib/PublicInbox/Over.pm | 29 +++++++++++++++++++ lib/PublicInbox/Search.pm | 52 +++-------------------------------- lib/PublicInbox/SearchIdx.pm | 3 +- lib/PublicInbox/V2Writable.pm | 34 ++++++++++------------- lib/PublicInbox/View.pm | 35 ++++++++--------------- t/psgi_v2.t | 1 + t/search.t | 16 ++++------- t/v2writable.t | 18 ++++++------ 10 files changed, 92 insertions(+), 137 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 0ea18b4c..f71493a0 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -273,7 +273,7 @@ sub msg_by_smsg ($$;$) { my ($self, $smsg, $ref) = @_; # ghosts may have undef smsg (from SearchThread.node) or - # no {blob} field (from each_smsg_by_mid) + # no {blob} field return unless defined $smsg; defined(my $blob = $smsg->{blob}) or return; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c5e1cb9c..4427ae5d 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -34,19 +34,17 @@ sub mb_stream { # called by PSGI server as body response sub getline { my ($more) = @_; # self - my ($ctx, $head, $tail, $db, $cur) = @$more; - if ($cur) { + my ($ctx, $id, $prev, $next, $cur) = @$more; + if ($cur) { # first pop @$more; return msg_str($ctx, $cur); } - for (; !defined($cur) && $head != $tail; $head++) { - my $smsg = PublicInbox::SearchMsg->get($head, $db, $ctx->{mid}); - my $mref = $ctx->{-inbox}->msg_by_smsg($smsg) or next; - $cur = Email::Simple->new($mref); - $cur = msg_str($ctx, $cur); - } - $more->[1] = $head; - $cur; + $cur = $next or return; + my $ibx = $ctx->{-inbox}; + $next = $ibx->search->next_by_mid($ctx->{mid}, \$id, \$prev); + @$more = ($ctx, $id, $prev, $next); # $next may be undef, here + my $mref = $ibx->msg_by_smsg($cur) or return; + msg_str($ctx, Email::Simple->new($mref)); } sub close {} # noop @@ -57,21 +55,14 @@ sub emit_raw { my $ibx = $ctx->{-inbox}; my $first; my $more; - my ($head, $tail, $db); - my %seen; if (my $srch = $ibx->search) { - $srch->retry_reopen(sub { - ($head, $tail, $db) = $srch->each_smsg_by_mid($mid); - for (; !defined($first) && $head != $tail; $head++) { - my @args = ($head, $db, $mid); - my $smsg = PublicInbox::SearchMsg->get(@args); - my $mref = $ibx->msg_by_smsg($smsg) or next; - $first = Email::Simple->new($mref); - } - if ($head != $tail) { - $more = [ $ctx, $head, $tail, $db, $first ]; - } - }); + my ($id, $prev); + my $smsg = $srch->next_by_mid($mid, \$id, \$prev) or return; + my $mref = $ibx->msg_by_smsg($smsg) or return; + $first = Email::Simple->new($mref); + my $next = $srch->next_by_mid($mid, \$id, \$prev); + # $more is for ->getline + $more = [ $ctx, $id, $prev, $next, $first ] if $next; } else { my $mref = $ibx->msg_by_mid($mid) or return; $first = Email::Simple->new($mref); diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 0bd6008b..309e044a 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -147,4 +147,33 @@ SELECT * from OVER where num = ? LIMIT 1 undef; } +sub next_by_mid { + my ($self, $mid, $id, $prev) = @_; + my $dbh = $self->connect; + + unless (defined $$id) { + my $sth = $dbh->prepare_cached(<<'', undef, 1); + SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + $$id = $sth->fetchrow_array; + defined $$id or return; + } + my $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT num FROM id2num WHERE id = ? AND num > ? +ORDER BY num ASC LIMIT 1 + + $$prev ||= 0; + $sth->execute($$id, $$prev); + my $num = $sth->fetchrow_array or return; + $$prev = $num; + + $sth = $dbh->prepare_cached(<<"", undef, 1); +SELECT num,ts,ds,ddd FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + my $smsg = $sth->fetchrow_hashref or return; + load_from_row($smsg); +} + 1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 34ebd1a6..7175ddc5 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -304,58 +304,14 @@ sub query_ts { $self->{over_ro}->query_ts($ts, $prev); } -sub first_smsg_by_mid { - my ($self, $mid) = @_; - my $smsg; - retry_reopen($self, sub { - each_smsg_by_mid($self, $mid, sub { $smsg = $_[0]; undef }); - }); - $smsg; -} - sub lookup_article { my ($self, $num) = @_; - my $term = 'XNUM'.$num; - my $db = $self->{xdb}; - retry_reopen($self, sub { - my $head = $db->postlist_begin($term); - my $tail = $db->postlist_end($term); - return if $head->equal($tail); - my $doc_id = $head->get_docid; - return unless defined $doc_id; - $head->inc; - if ($head->nequal($tail)) { - warn "article #$num is not unique\n"; - } - # raises on error: - my $doc = $db->get_document($doc_id); - my $smsg = PublicInbox::SearchMsg->wrap($doc); - $smsg->{doc_id} = $doc_id; - $smsg->load_expand; - }); + $self->{over_ro}->get_art($num); } -sub each_smsg_by_mid { - my ($self, $mid, $cb) = @_; - # XXX retry_reopen isn't necessary for V2Writable, but the PSGI - # interface will need it... - my $db = $self->{xdb}; - my $term = 'Q' . $mid; - my $head = $db->postlist_begin($term); - my $tail = $db->postlist_end($term); - if ($head == $tail) { - $db->reopen; - $head = $db->postlist_begin($term); - $tail = $db->postlist_end($term); - } - return ($head, $tail, $db) if wantarray; - for (; $head->nequal($tail); $head->inc) { - my $doc_id = $head->get_docid; - my $doc = $db->get_document($doc_id); - my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $smsg->{doc_id} = $doc_id; - $cb->($smsg) or return; - } +sub next_by_mid { + my ($self, $mid, $id, $prev) = @_; + $self->{over_ro}->next_by_mid($mid, $id, $prev); } # normalize subjects so they are suitable as pathnames for URLs diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 42562631..3596972f 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -76,7 +76,8 @@ sub new { if ($version == 1) { $self->{lock_path} = "$mainrepo/ssoma.lock"; my $dir = $self->xdir; - $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); + $self->{over_ro} = $self->{over} = + PublicInbox::OverIdx->new("$dir/over.sqlite3"); } elsif ($version == 2) { defined $part or die "partition is required for v2\n"; # partition is a number diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index a8117aaa..877a4591 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -271,22 +271,20 @@ sub remove_internal { foreach my $mid (@$mids) { my %gone; - $srch->reopen->each_smsg_by_mid($mid, sub { - my ($smsg) = @_; - $smsg->load_expand; + my ($id, $prev); + while (my $smsg = $srch->next_by_mid($mid, \$id, \$prev)) { my $msg = $ibx->msg_by_smsg($smsg); if (!defined($msg)) { warn "broken smsg for $mid\n"; - return 1; # continue + next; # continue } my $orig = $$msg; my $cur = PublicInbox::MIME->new($msg); if (content_id($cur) eq $cid) { $smsg->{mime} = $cur; - $gone{$smsg->num} = [ $smsg, \$orig ]; + $gone{$smsg->{num}} = [ $smsg, \$orig ]; } - 1; # continue - }); + } my $n = scalar keys %gone; next unless $n; if ($n > 1) { @@ -552,26 +550,23 @@ sub lookup_content { my $srch = $ibx->search->reopen; my $cid = content_id($mime); my $found; - $srch->each_smsg_by_mid($mid, sub { - my ($smsg) = @_; - $smsg->load_expand; + my ($id, $prev); + while (my $smsg = $srch->next_by_mid($mid, \$id, \$prev)) { my $msg = $ibx->msg_by_smsg($smsg); if (!defined($msg)) { warn "broken smsg for $mid\n"; - return 1; # continue + next; } my $cur = PublicInbox::MIME->new($msg); if (content_id($cur) eq $cid) { $smsg->{mime} = $cur; $found = $smsg; - return 0; # break out of loop + last; } # XXX DEBUG_DIFF is experimental and may be removed diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; - - 1; # continue - }); + } $found; } @@ -770,15 +765,14 @@ sub unindex_oid { my $mime = PublicInbox::MIME->new($msgref); my $mids = mids($mime->header_obj); $mime = $msgref = undef; - + my $srch = $self->{-inbox}->search; foreach my $mid (@$mids) { my %gone; - $self->{-inbox}->search->reopen->each_smsg_by_mid($mid, sub { - my ($smsg) = @_; - $smsg->load_expand; + my ($id, $prev); + while (my $smsg = $srch->next_by_mid($mid, \$id, \$prev)) { $gone{$smsg->num} = 1 if $oid eq $smsg->{blob}; 1; # continue - }); + } my $n = scalar keys %gone; next unless $n; if ($n > 1) { diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index cbed9163..94058ed0 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -58,20 +58,16 @@ sub msg_page { my ($ctx) = @_; my $mid = $ctx->{mid}; my $ibx = $ctx->{-inbox}; - my ($first, $more, $head, $tail, $db); + my ($first, $more); my $smsg; if (my $srch = $ibx->search) { - $srch->retry_reopen(sub { - ($head, $tail, $db) = $srch->each_smsg_by_mid($mid); - for (; !defined($first) && $head != $tail; $head++) { - my @args = ($head, $db, $mid); - $smsg = PublicInbox::SearchMsg->get(@args); - $first = $ibx->msg_by_smsg($smsg); - } - if ($head != $tail) { - $more = [ $head, $tail, $db ]; - } - }); + my ($id, $prev); + $smsg = $srch->next_by_mid($mid, \$id, \$prev); + $first = $ibx->msg_by_smsg($smsg) if $smsg; + if ($first) { + my $next = $srch->next_by_mid($mid, \$id, \$prev); + $more = [ $id, $prev, $next ] if $next; + } return unless $first; } else { $first = $ibx->msg_by_mid($mid) or return; @@ -82,18 +78,11 @@ sub msg_page { sub msg_html_more { my ($ctx, $more, $nr) = @_; my $str = eval { - my $smsg; - my ($head, $tail, $db) = @$more; + my ($id, $prev, $smsg) = @$more; my $mid = $ctx->{mid}; - for (; !defined($smsg) && $head != $tail; $head++) { - my $m = PublicInbox::SearchMsg->get($head, $db, $mid); - $smsg = $ctx->{-inbox}->smsg_mime($m); - } - if ($head == $tail) { # done - @$more = (); - } else { - $more->[0] = $head; - } + $smsg = $ctx->{-inbox}->smsg_mime($smsg); + my $next = $ctx->{srch}->next_by_mid($mid, \$id, \$prev); + @$more = $next ? ($id, $prev, $next) : (); if ($smsg) { my $mime = $smsg->{mime}; my $upfx = '../' . mid_escape($smsg->mid) . '/'; diff --git a/t/psgi_v2.t b/t/psgi_v2.t index bdf23deb..faa139fb 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -129,6 +129,7 @@ test_psgi(sub { $www->call(@_) }, sub { is(scalar(@from_), 3, 'three From_ lines in t.mbox.gz'); # search interface + $config->each_inbox(sub { $_[0]->search->reopen }); $res = $cb->(POST('/v2test/?q=m:a-mid@b&x=m')); $in = $res->content; $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); diff --git a/t/search.t b/t/search.t index 2f7b795e..fda32d36 100644 --- a/t/search.t +++ b/t/search.t @@ -89,10 +89,9 @@ sub filter_mids { { $rw_commit->(); $ro->reopen; - my $found = $ro->first_smsg_by_mid('root@s'); - ok($found, "message found"); - is($root_id, $found->{doc_id}, 'doc_id set correctly'); - is($found->mid, 'root@s', 'mid set correctly'); + my $found = $ro->query('m:root@s'); + is(scalar(@$found), 1, "message found"); + is($found->[0]->mid, 'root@s', 'mid set correctly'); my ($res, @res); my @exp = sort qw(root@s last@s); @@ -276,10 +275,9 @@ sub filter_mids { ], body => "LOOP!\n")); ok($doc_id > 0, "doc_id defined with circular reference"); - my $smsg = $rw->first_smsg_by_mid('circle@a'); + my $smsg = $rw->query('m:circle@a', {limit=>1})->[0]; is($smsg->references, '', "no references created"); - my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); - is($s, $msg->subject, 'long subject not rewritten'); + is($s, $smsg->subject, 'long subject not rewritten'); } { @@ -293,9 +291,7 @@ sub filter_mids { my $mime = Email::MIME->new($str); my $doc_id = $rw->add_message($mime); ok($doc_id > 0, 'message indexed doc_id with UTF-8'); - my $smsg = $rw->first_smsg_by_mid('testmessage@example.com'); - my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); - + my $msg = $rw->query('m:testmessage@example.com', {limit => 1})->[0]; is($mime->header('Subject'), $msg->subject, 'UTF-8 subject preserved'); } diff --git a/t/v2writable.t b/t/v2writable.t index ab85e9af..4a42c016 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -108,14 +108,13 @@ if ('ensure git configs are correct') { $mime->header_set('References', ''); ok($im->add($mime), 'message with multiple Message-ID'); $im->done; - my @found; my $srch = $ibx->search; - $srch->reopen->each_smsg_by_mid('abcde@1', sub { push @found, @_; 1 }); - is(scalar(@found), 1, 'message found by first MID'); - $srch->reopen->each_smsg_by_mid('abcde@2', sub { push @found, @_; 1 }); - is(scalar(@found), 2, 'message found by second MID'); - is($found[0]->{doc_id}, $found[1]->{doc_id}, 'same document'); - ok($found[1]->{doc_id} > 0, 'doc_id is positive'); + my $mset1 = $srch->reopen->query('m:abcde@1', { mset => 1 }); + is($mset1->size, 1, 'message found by first MID'); + my $mset2 = $srch->reopen->query('m:abcde@2', { mset => 1 }); + is($mset2->size, 1, 'message found by second MID'); + is((($mset1->items)[0])->get_docid, (($mset2->items)[0])->get_docid, + 'same document'); } SKIP: { @@ -224,9 +223,8 @@ EOF like($smsg->num, qr/\A\d+\z/, 'numeric number in return message'); is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num'); my $srch = $ibx->search->reopen; - my @found = (); - $srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 }); - is(scalar(@found), 0, 'no longer found in Xapian'); + my $mset = $srch->query('m:'.$smsg->mid, { mset => 1}); + is($mset->size, 0, 'no longer found in Xapian'); my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames); is($srch->{over_ro}->get_art($smsg->num), undef, 'removal propagated to Over DB'); -- 2.44.0