X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=8bec08da0b168746535bd32c445b87207fe72f5e;hb=811b8d3cbaa790f59b7b107140b86248da16499b;hp=4543bfa1d9acba6e06c4dc3bbe3a91c309123404;hpb=8b1950055d51d4360e596446e5ac0f41008e357d;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 4543bfa1..8bec08da 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -9,25 +9,19 @@ # are denoted by a negative NNTP article number. package PublicInbox::OverIdx; use strict; -use warnings; -use base qw(PublicInbox::Over); +use v5.10.1; +use parent qw(PublicInbox::Over); use IO::Handle; use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); -use PublicInbox::Search; use Carp qw(croak); sub dbh_new { my ($self) = @_; my $dbh = $self->SUPER::dbh_new($self->{-no_fsync} ? 2 : 1); - # TRUNCATE reduces I/O compared to the default (DELETE) - # We do not use WAL since we're optimized for read-only ops, - # (and read-only requires SQLite 3.22.0 (2018-01-22)). - $dbh->do('PRAGMA journal_mode = TRUNCATE'); - # 80000 pages (80MiB on SQLite <3.12.0, 320MiB on 3.12.0+) # was found to be good in 2018 during the large LKML import # at the time. This ought to be configurable based on HW @@ -190,23 +184,20 @@ sub resolve_mid_to_tid { if (my $del = delete $self->{-ghosts_to_delete}) { delete_by_num($self, $_) for @$del; } - $tid // create_ghost($self, $mid); -} - -sub create_ghost { - my ($self, $mid) = @_; - my $id = mid2id($self, $mid); - my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; - my $tid = next_tid($self); - my $dbh = $self->{dbh}; - $dbh->prepare_cached(<<'')->execute($num, $tid); + $tid // do { # create a new ghost + my $id = mid2id($self, $mid); + my $num = next_ghost_num($self); + $num < 0 or die "ghost num is non-negative: $num\n"; + $tid = next_tid($self); + my $dbh = $self->{dbh}; + $dbh->prepare_cached(<<'')->execute($num, $tid); INSERT INTO over (num, tid) VALUES (?,?) - $dbh->prepare_cached(<<'')->execute($id, $num); + $dbh->prepare_cached(<<'')->execute($id, $num); INSERT INTO id2num (id, num) VALUES (?,?) - $tid; + $tid; + }; } sub merge_threads { @@ -270,6 +261,13 @@ sub subject_path ($) { lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; @@ -281,10 +279,7 @@ sub add_overview { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -311,10 +306,10 @@ sub _add_over { } sub add_over { - my ($self, $values) = @_; - my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; + my ($self, $smsg, $mids, $refs, $xpath, $ddd) = @_; my $old_tid; my $vivified = 0; + my $num = $smsg->{num}; begin_lazy($self); delete_by_num($self, $num, \$old_tid); @@ -326,17 +321,17 @@ sub add_over { $v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n"; $vivified += $v; } - my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); - my $sid = sid($self, $xpath); + $smsg->{tid} = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); + $smsg->{sid} = sid($self, $xpath); my $dbh = $self->{dbh}; my $sth = $dbh->prepare_cached(<<''); INSERT INTO over (num, tid, sid, ts, ds, ddd) VALUES (?,?,?,?,?,?) - my $n = 0; - my @v = ($num, $tid, $sid, $ts, $ds); - foreach (@v) { $sth->bind_param(++$n, $_) } - $sth->bind_param(++$n, $ddd, SQL_BLOB); + my $nc = 1; + $sth->bind_param($nc, $num); + $sth->bind_param(++$nc, $smsg->{$_}) for (qw(tid sid ts ds)); + $sth->bind_param(++$nc, $ddd, SQL_BLOB); $sth->execute; $sth = $dbh->prepare_cached(<<''); INSERT INTO id2num (id, num) VALUES (?,?) @@ -388,12 +383,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, - tid INTEGER NOT NULL, - sid INTEGER, - ts INTEGER, - ds INTEGER, - ddd VARBINARY, /* doc-data-deflated */ + num INTEGER NOT NULL, /* NNTP article number == IMAP UID */ + tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ + sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ + ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ + ds INTEGER, /* RFC-2822 sent Date: header, git author time */ + ddd VARBINARY, /* doc-data-deflated (->to_doc_data, ->load_from_data) */ UNIQUE (num) ) @@ -415,13 +410,13 @@ CREATE TABLE IF NOT EXISTS counter ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS subject ( sid INTEGER PRIMARY KEY AUTOINCREMENT, - path VARCHAR(40) NOT NULL, + path VARCHAR(40) NOT NULL, /* SHA-1 of normalized subject */ UNIQUE (path) ) $dbh->do(<<''); CREATE TABLE IF NOT EXISTS id2num ( - id INTEGER NOT NULL, + id INTEGER NOT NULL, /* <=> msgid.id */ num INTEGER NOT NULL, UNIQUE (id, num) ) @@ -432,7 +427,7 @@ CREATE TABLE IF NOT EXISTS id2num ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS msgid ( - id INTEGER PRIMARY KEY AUTOINCREMENT, + id INTEGER PRIMARY KEY AUTOINCREMENT, /* <=> id2num.id */ mid VARCHAR(244) NOT NULL, UNIQUE (mid) ) @@ -448,7 +443,7 @@ sub commit_lazy { sub begin_lazy { my ($self) = @_; return if $self->{txn}; - my $dbh = $self->connect or return; + my $dbh = $self->dbh or return; $dbh->begin_work; # $dbh->{Profile} = 2; $self->{txn} = 1; @@ -460,10 +455,10 @@ sub rollback_lazy { $self->{dbh}->rollback; } -sub disconnect { +sub dbh_close { my ($self) = @_; die "in transaction" if $self->{txn}; - $self->SUPER::disconnect; + $self->SUPER::dbh_close; } sub create { @@ -474,8 +469,8 @@ sub create { File::Path::mkpath(File::Basename::dirname($self->{filename})); } # create the DB: - PublicInbox::Over::connect($self); - $self->disconnect; + PublicInbox::Over::dbh($self); + $self->dbh_close; } sub rethread_prepare { @@ -521,4 +516,119 @@ EOM $pr->("I: rethread culled $total ghosts\n") if $pr && $total; } +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<""); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. + 'xref3 (oidbin,xnum,ibx_id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + $dbh; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +sub remove_xref3 { + my ($self, $docid, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $oidbin = pack('H*', $oidhex); + my $sth; + if (defined $eidx_key) { + my $ibx_id = id_for($self, 'inboxes', 'ibx_id', + eidx_key => $eidx_key); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $oidbin, SQL_BLOB); + } else { + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + } + $sth->execute; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + 1;