X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=0a4eb39e0a9d112260eb417c035846c5bac5e7e4;hb=af0b0fb7a454470a32c452119d0392e0dedb3fe1;hp=6f0477f0a7150587e4b64a0bf0470736b8caba75;hpb=7f4a11a6499aade26b418b7ae13cd12859f75bdd;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 6f0477f0..0a4eb39e 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -79,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -170,8 +175,14 @@ sub _resolve_mid_to_tid { $$tid = $cur_tid; } else { # rethreading, queue up dead ghosts $$tid = next_tid($self); - my $num = $smsg->{num}; - push(@{$self->{-ghosts_to_delete}}, $num) if $num < 0; + my $n = $smsg->{num}; + if ($n > 0) { + $self->{dbh}->prepare_cached(<<'')->execute($$tid, $n); +UPDATE over SET tid = ? WHERE num = ? + + } elsif ($n < 0) { + push(@{$self->{-ghosts_to_delete}}, $n); + } } 1; } @@ -261,21 +272,27 @@ sub subject_path ($) { lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; my $mids = mids_for_index($eml); my $refs = parse_references($smsg, $eml, $mids); + $mids->[0] //= $smsg->{mid} //= $eml->{-lei_fake_mid}; + $smsg->{mid} //= ''; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, $smsg, $mids, $refs, $xpath, $dd); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -294,7 +311,7 @@ sub _add_over { } } elsif ($n < 0) { # ghost $$old_tid //= $cur_valid ? $cur_tid : next_tid($self); - link_refs($self, $refs, $$old_tid); + $$old_tid = link_refs($self, $refs, $$old_tid); delete_by_num($self, $n); $$v++; } @@ -379,13 +396,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, - tid INTEGER NOT NULL, - sid INTEGER, - ts INTEGER, - ds INTEGER, - ddd VARBINARY, /* doc-data-deflated */ - UNIQUE (num) + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ + tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ + sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ + ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ + ds INTEGER, /* RFC-2822 sent Date: header, git author time */ + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); @@ -406,13 +422,13 @@ CREATE TABLE IF NOT EXISTS counter ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS subject ( sid INTEGER PRIMARY KEY AUTOINCREMENT, - path VARCHAR(40) NOT NULL, + path VARCHAR(40) NOT NULL, /* SHA-1 of normalized subject */ UNIQUE (path) ) $dbh->do(<<''); CREATE TABLE IF NOT EXISTS id2num ( - id INTEGER NOT NULL, + id INTEGER NOT NULL, /* <=> msgid.id */ num INTEGER NOT NULL, UNIQUE (id, num) ) @@ -423,7 +439,7 @@ CREATE TABLE IF NOT EXISTS id2num ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS msgid ( - id INTEGER PRIMARY KEY AUTOINCREMENT, + id INTEGER PRIMARY KEY AUTOINCREMENT, /* <=> id2num.id */ mid VARCHAR(244) NOT NULL, UNIQUE (mid) ) @@ -459,10 +475,14 @@ sub dbh_close { sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: PublicInbox::Over::dbh($self); @@ -512,4 +532,172 @@ EOM $pr->("I: rethread culled $total ghosts\n") if $pr && $total; } +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<""); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. + 'xref3 (oidbin,xnum,ibx_id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq ( + docid INTEGER PRIMARY KEY NOT NULL +) + + $dbh; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# returns remaining reference count to $docid +sub remove_xref3 { + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; + begin_lazy($self); + my $oidbin = pack('H*', $oidhex); + my ($sth, $ibx_id); + if (defined $eidx_key) { + $ibx_id = ibx_id($self, $eidx_key); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $oidbin, SQL_BLOB); + } else { + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + } + $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? + +} + +sub blob_exists { + my ($self, $oidhex) = @_; + my $sth = $self->dbh->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE oidbin = ? + + $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); + $sth->execute; + $sth->fetchrow_array; +} + 1;