X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=0a4eb39e0a9d112260eb417c035846c5bac5e7e4;hb=af0b0fb7a454470a32c452119d0392e0dedb3fe1;hp=29c6e0b933aae2e757f6307ec3758c53fc1dec17;hpb=406b9eb2906518fb8c3999f2a826eeae86fa3d30;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 29c6e0b9..0a4eb39e 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -9,23 +9,18 @@ # are denoted by a negative NNTP article number. package PublicInbox::OverIdx; use strict; -use warnings; -use base qw(PublicInbox::Over); +use v5.10.1; +use parent qw(PublicInbox::Over); use IO::Handle; -use DBI; +use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); -use PublicInbox::Search; +use Carp qw(croak); sub dbh_new { my ($self) = @_; - my $dbh = $self->SUPER::dbh_new(1); - - # TRUNCATE reduces I/O compared to the default (DELETE) - # We do not use WAL since we're optimized for read-only ops, - # (and read-only requires SQLite 3.22.0 (2018-01-22)). - $dbh->do('PRAGMA journal_mode = TRUNCATE'); + my $dbh = $self->SUPER::dbh_new($self->{-no_fsync} ? 2 : 1); # 80000 pages (80MiB on SQLite <3.12.0, 320MiB on 3.12.0+) # was found to be good in 2018 during the large LKML import @@ -37,6 +32,13 @@ sub dbh_new { $dbh; } +sub new { + my ($class, $f) = @_; + my $self = $class->SUPER::new($f); + $self->{min_tid} = 0; + $self; +} + sub get_counter ($$) { my ($dbh, $key) = @_; my $sth = $dbh->prepare_cached(<<'', undef, 1); @@ -77,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -164,8 +171,18 @@ sub _resolve_mid_to_tid { my $cur_tid = $smsg->{tid}; if (defined $$tid) { merge_threads($self, $$tid, $cur_tid); - } else { + } elsif ($cur_tid > $self->{min_tid}) { $$tid = $cur_tid; + } else { # rethreading, queue up dead ghosts + $$tid = next_tid($self); + my $n = $smsg->{num}; + if ($n > 0) { + $self->{dbh}->prepare_cached(<<'')->execute($$tid, $n); +UPDATE over SET tid = ? WHERE num = ? + + } elsif ($n < 0) { + push(@{$self->{-ghosts_to_delete}}, $n); + } } 1; } @@ -175,23 +192,23 @@ sub resolve_mid_to_tid { my ($self, $mid) = @_; my $tid; each_by_mid($self, $mid, ['tid'], \&_resolve_mid_to_tid, \$tid); - defined $tid ? $tid : create_ghost($self, $mid); -} - -sub create_ghost { - my ($self, $mid) = @_; - my $id = mid2id($self, $mid); - my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; - my $tid = next_tid($self); - my $dbh = $self->{dbh}; - $dbh->prepare_cached(<<'')->execute($num, $tid); + if (my $del = delete $self->{-ghosts_to_delete}) { + delete_by_num($self, $_) for @$del; + } + $tid // do { # create a new ghost + my $id = mid2id($self, $mid); + my $num = next_ghost_num($self); + $num < 0 or die "ghost num is non-negative: $num\n"; + $tid = next_tid($self); + my $dbh = $self->{dbh}; + $dbh->prepare_cached(<<'')->execute($num, $tid); INSERT INTO over (num, tid) VALUES (?,?) - $dbh->prepare_cached(<<'')->execute($id, $num); + $dbh->prepare_cached(<<'')->execute($id, $num); INSERT INTO id2num (id, num) VALUES (?,?) - $tid; + $tid; + }; } sub merge_threads { @@ -221,7 +238,7 @@ sub link_refs { merge_threads($self, $tid, $ptid); } } else { - $tid = defined $old_tid ? $old_tid : next_tid($self); + $tid = $old_tid // next_tid($self); } $tid; } @@ -255,22 +272,27 @@ sub subject_path ($) { lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { - my ($self, $mime, $smsg) = @_; - $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!; - my $hdr = $mime->header_obj; - my $mids = mids_for_index($hdr); - my $refs = parse_references($smsg, $hdr, $mids); + my ($self, $eml, $smsg) = @_; + $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; + my $mids = mids_for_index($eml); + my $refs = parse_references($smsg, $eml, $mids); + $mids->[0] //= $smsg->{mid} //= $eml->{-lei_fake_mid}; + $smsg->{mid} //= ''; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -278,11 +300,18 @@ sub _add_over { my $cur_tid = $smsg->{tid}; my $n = $smsg->{num}; die "num must not be zero for $mid" if !$n; - $$old_tid = $cur_tid unless defined $$old_tid; + my $cur_valid = $cur_tid > $self->{min_tid}; + if ($n > 0) { # regular mail - merge_threads($self, $$old_tid, $cur_tid); + if ($cur_valid) { + $$old_tid //= $cur_tid; + merge_threads($self, $$old_tid, $cur_tid); + } else { + $$old_tid //= next_tid($self); + } } elsif ($n < 0) { # ghost - link_refs($self, $refs, $$old_tid); + $$old_tid //= $cur_valid ? $cur_tid : next_tid($self); + $$old_tid = link_refs($self, $refs, $$old_tid); delete_by_num($self, $n); $$v++; } @@ -290,13 +319,14 @@ sub _add_over { } sub add_over { - my ($self, $values) = @_; - my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; + my ($self, $smsg, $mids, $refs, $xpath, $ddd) = @_; my $old_tid; my $vivified = 0; + my $num = $smsg->{num}; begin_lazy($self); delete_by_num($self, $num, \$old_tid); + $old_tid = undef if ($old_tid // 0) <= $self->{min_tid}; foreach my $mid (@$mids) { my $v = 0; each_by_mid($self, $mid, ['tid'], \&_add_over, @@ -304,17 +334,17 @@ sub add_over { $v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n"; $vivified += $v; } - my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); - my $sid = sid($self, $xpath); + $smsg->{tid} = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); + $smsg->{sid} = sid($self, $xpath); my $dbh = $self->{dbh}; my $sth = $dbh->prepare_cached(<<''); INSERT INTO over (num, tid, sid, ts, ds, ddd) VALUES (?,?,?,?,?,?) - my $n = 0; - my @v = ($num, $tid, $sid, $ts, $ds); - foreach (@v) { $sth->bind_param(++$n, $_) } - $sth->bind_param(++$n, $ddd); + my $nc = 1; + $sth->bind_param($nc, $num); + $sth->bind_param(++$nc, $smsg->{$_}) for (qw(tid sid ts ds)); + $sth->bind_param(++$nc, $ddd, SQL_BLOB); $sth->execute; $sth = $dbh->prepare_cached(<<''); INSERT INTO id2num (id, num) VALUES (?,?) @@ -326,22 +356,23 @@ INSERT INTO id2num (id, num) VALUES (?,?) } sub _remove_oid { - my ($self, $smsg, $oid, $nr) = @_; + my ($self, $smsg, $oid, $removed) = @_; if (!defined($oid) || $smsg->{blob} eq $oid) { delete_by_num($self, $smsg->{num}); - $$nr++; + push @$removed, $smsg->{num}; } 1; } -# returns number of removed messages +# returns number of removed messages in scalar context, +# array of removed article numbers in array context. # $oid may be undef to match only on $mid sub remove_oid { my ($self, $oid, $mid) = @_; - my $nr = 0; + my $removed = []; begin_lazy($self); - each_by_mid($self, $mid, ['ddd'], \&_remove_oid, $oid, \$nr); - $nr; + each_by_mid($self, $mid, ['ddd'], \&_remove_oid, $oid, $removed); + @$removed; } sub _num_mid0_for_oid { @@ -365,13 +396,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, - tid INTEGER NOT NULL, - sid INTEGER, - ts INTEGER, - ds INTEGER, - ddd VARBINARY, /* doc-data-deflated */ - UNIQUE (num) + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ + tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ + sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ + ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ + ds INTEGER, /* RFC-2822 sent Date: header, git author time */ + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); @@ -392,13 +422,13 @@ CREATE TABLE IF NOT EXISTS counter ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS subject ( sid INTEGER PRIMARY KEY AUTOINCREMENT, - path VARCHAR(40) NOT NULL, + path VARCHAR(40) NOT NULL, /* SHA-1 of normalized subject */ UNIQUE (path) ) $dbh->do(<<''); CREATE TABLE IF NOT EXISTS id2num ( - id INTEGER NOT NULL, + id INTEGER NOT NULL, /* <=> msgid.id */ num INTEGER NOT NULL, UNIQUE (id, num) ) @@ -409,7 +439,7 @@ CREATE TABLE IF NOT EXISTS id2num ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS msgid ( - id INTEGER PRIMARY KEY AUTOINCREMENT, + id INTEGER PRIMARY KEY AUTOINCREMENT, /* <=> id2num.id */ mid VARCHAR(244) NOT NULL, UNIQUE (mid) ) @@ -425,7 +455,7 @@ sub commit_lazy { sub begin_lazy { my ($self) = @_; return if $self->{txn}; - my $dbh = $self->connect or return; + my $dbh = $self->dbh or return; $dbh->begin_work; # $dbh->{Profile} = 2; $self->{txn} = 1; @@ -437,22 +467,237 @@ sub rollback_lazy { $self->{dbh}->rollback; } -sub disconnect { +sub dbh_close { my ($self) = @_; die "in transaction" if $self->{txn}; - $self->SUPER::disconnect; + $self->SUPER::dbh_close; } sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: - PublicInbox::Over::connect($self); - $self->disconnect; + PublicInbox::Over::dbh($self); + $self->dbh_close; +} + +sub rethread_prepare { + my ($self, $opt) = @_; + return unless $opt->{rethread}; + begin_lazy($self); + my $min = $self->{min_tid} = get_counter($self->{dbh}, 'thread') // 0; + my $pr = $opt->{-progress}; + $pr->("rethread min THREADID ".($min + 1)."\n") if $pr && $min; +} + +sub rethread_done { + my ($self, $opt) = @_; + return unless $opt->{rethread} && $self->{txn}; + defined(my $min = $self->{min_tid}) or croak('BUG: no min_tid'); + my $dbh = $self->{dbh} or croak('BUG: no dbh'); + my $rows = $dbh->selectall_arrayref(<<'', { Slice => {} }, $min); +SELECT num,tid FROM over WHERE num < 0 AND tid < ? + + my $show_id = $dbh->prepare('SELECT id FROM id2num WHERE num = ?'); + my $show_mid = $dbh->prepare('SELECT mid FROM msgid WHERE id = ?'); + my $pr = $opt->{-progress}; + my $total = 0; + for my $r (@$rows) { + my $exp = 0; + $show_id->execute($r->{num}); + while (defined(my $id = $show_id->fetchrow_array)) { + ++$exp; + $show_mid->execute($id); + my $mid = $show_mid->fetchrow_array; + if (!defined($mid)) { + warn <{num} ID=$id THREADID=$r->{tid} has no Message-ID +EOF + next; + } + $pr->(<{num} <$mid> THREADID=$r->{tid} culled +EOM + } + delete_by_num($self, $r->{num}); + } + $pr->("I: rethread culled $total ghosts\n") if $pr && $total; +} + +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<""); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. + 'xref3 (oidbin,xnum,ibx_id)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq ( + docid INTEGER PRIMARY KEY NOT NULL +) + + $dbh; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# returns remaining reference count to $docid +sub remove_xref3 { + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; + begin_lazy($self); + my $oidbin = pack('H*', $oidhex); + my ($sth, $ibx_id); + if (defined $eidx_key) { + $ibx_id = ibx_id($self, $eidx_key); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $oidbin, SQL_BLOB); + } else { + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + } + $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? + +} + +sub blob_exists { + my ($self, $oidhex) = @_; + my $sth = $self->dbh->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE oidbin = ? + + $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); + $sth->execute; + $sth->fetchrow_array; } 1;