X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=6cc86d5d038d3669b0991d270aa74234f3937560;hb=HEAD;hp=4543bfa1d9acba6e06c4dc3bbe3a91c309123404;hpb=8b1950055d51d4360e596446e5ac0f41008e357d;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 4543bfa1..6cc86d5d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -9,25 +9,19 @@ # are denoted by a negative NNTP article number. package PublicInbox::OverIdx; use strict; -use warnings; -use base qw(PublicInbox::Over); +use v5.10.1; +use parent qw(PublicInbox::Over); use IO::Handle; use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); -use PublicInbox::Search; use Carp qw(croak); sub dbh_new { my ($self) = @_; my $dbh = $self->SUPER::dbh_new($self->{-no_fsync} ? 2 : 1); - # TRUNCATE reduces I/O compared to the default (DELETE) - # We do not use WAL since we're optimized for read-only ops, - # (and read-only requires SQLite 3.22.0 (2018-01-22)). - $dbh->do('PRAGMA journal_mode = TRUNCATE'); - # 80000 pages (80MiB on SQLite <3.12.0, 320MiB on 3.12.0+) # was found to be good in 2018 during the large LKML import # at the time. This ought to be configurable based on HW @@ -85,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -159,7 +158,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1 foreach (@$nums) { $sth->execute($_->[0]); - my $smsg = $sth->fetchrow_hashref; + # $cb may delete rows and invalidate nums + my $smsg = $sth->fetchrow_hashref // next; $smsg = PublicInbox::Over::load_from_row($smsg); $cb->($self, $smsg, @arg) or return; } @@ -176,8 +176,14 @@ sub _resolve_mid_to_tid { $$tid = $cur_tid; } else { # rethreading, queue up dead ghosts $$tid = next_tid($self); - my $num = $smsg->{num}; - push(@{$self->{-ghosts_to_delete}}, $num) if $num < 0; + my $n = $smsg->{num}; + if ($n > 0) { + $self->{dbh}->prepare_cached(<<'')->execute($$tid, $n); +UPDATE over SET tid = ? WHERE num = ? + + } elsif ($n < 0) { + push(@{$self->{-ghosts_to_delete}}, $n); + } } 1; } @@ -190,23 +196,20 @@ sub resolve_mid_to_tid { if (my $del = delete $self->{-ghosts_to_delete}) { delete_by_num($self, $_) for @$del; } - $tid // create_ghost($self, $mid); -} - -sub create_ghost { - my ($self, $mid) = @_; - my $id = mid2id($self, $mid); - my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; - my $tid = next_tid($self); - my $dbh = $self->{dbh}; - $dbh->prepare_cached(<<'')->execute($num, $tid); + $tid // do { # create a new ghost + my $id = mid2id($self, $mid); + my $num = next_ghost_num($self); + $num < 0 or die "ghost num is non-negative: $num\n"; + $tid = next_tid($self); + my $dbh = $self->{dbh}; + $dbh->prepare_cached(<<'')->execute($num, $tid); INSERT INTO over (num, tid) VALUES (?,?) - $dbh->prepare_cached(<<'')->execute($id, $num); + $dbh->prepare_cached(<<'')->execute($id, $num); INSERT INTO id2num (id, num) VALUES (?,?) - $tid; + $tid; + }; } sub merge_threads { @@ -241,50 +244,39 @@ sub link_refs { $tid; } -sub parse_references ($$$) { - my ($smsg, $hdr, $mids) = @_; - my $refs = references($hdr); - push(@$refs, @$mids) if scalar(@$mids) > 1; - return $refs if scalar(@$refs) == 0; - - # prevent circular references here: - my %seen = ( $smsg->{mid} => 1 ); - my @keep; - foreach my $ref (@$refs) { - if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - next; - } - push(@keep, $ref) unless $seen{$ref}++; - } - $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; - \@keep; -} - -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; my $mids = mids_for_index($eml); - my $refs = parse_references($smsg, $eml, $mids); + my $refs = $smsg->parse_references($eml, $mids); + $mids->[0] //= do { + $smsg->{mid} //= ''; + $eml->{-lei_fake_mid}; + }; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { @@ -303,7 +295,7 @@ sub _add_over { } } elsif ($n < 0) { # ghost $$old_tid //= $cur_valid ? $cur_tid : next_tid($self); - link_refs($self, $refs, $$old_tid); + $$old_tid = link_refs($self, $refs, $$old_tid); delete_by_num($self, $n); $$v++; } @@ -311,10 +303,10 @@ sub _add_over { } sub add_over { - my ($self, $values) = @_; - my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; + my ($self, $smsg, $mids, $refs, $xpath, $ddd) = @_; my $old_tid; my $vivified = 0; + my $num = $smsg->{num}; begin_lazy($self); delete_by_num($self, $num, \$old_tid); @@ -326,17 +318,17 @@ sub add_over { $v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n"; $vivified += $v; } - my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); - my $sid = sid($self, $xpath); + $smsg->{tid} = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); + $smsg->{sid} = sid($self, $xpath); my $dbh = $self->{dbh}; my $sth = $dbh->prepare_cached(<<''); INSERT INTO over (num, tid, sid, ts, ds, ddd) VALUES (?,?,?,?,?,?) - my $n = 0; - my @v = ($num, $tid, $sid, $ts, $ds); - foreach (@v) { $sth->bind_param(++$n, $_) } - $sth->bind_param(++$n, $ddd, SQL_BLOB); + my $nc = 1; + $sth->bind_param($nc, $num); + $sth->bind_param(++$nc, $smsg->{$_}) for (qw(tid sid ts ds)); + $sth->bind_param(++$nc, $ddd, SQL_BLOB); $sth->execute; $sth = $dbh->prepare_cached(<<''); INSERT INTO id2num (id, num) VALUES (?,?) @@ -388,13 +380,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, - tid INTEGER NOT NULL, - sid INTEGER, - ts INTEGER, - ds INTEGER, - ddd VARBINARY, /* doc-data-deflated */ - UNIQUE (num) + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ + tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ + sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ + ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ + ds INTEGER, /* RFC-2822 sent Date: header, git author time */ + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); @@ -415,13 +406,13 @@ CREATE TABLE IF NOT EXISTS counter ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS subject ( sid INTEGER PRIMARY KEY AUTOINCREMENT, - path VARCHAR(40) NOT NULL, + path VARCHAR(40) NOT NULL, /* SHA-1 of normalized subject */ UNIQUE (path) ) $dbh->do(<<''); CREATE TABLE IF NOT EXISTS id2num ( - id INTEGER NOT NULL, + id INTEGER NOT NULL, /* <=> msgid.id */ num INTEGER NOT NULL, UNIQUE (id, num) ) @@ -432,7 +423,7 @@ CREATE TABLE IF NOT EXISTS id2num ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS msgid ( - id INTEGER PRIMARY KEY AUTOINCREMENT, + id INTEGER PRIMARY KEY AUTOINCREMENT, /* <=> id2num.id */ mid VARCHAR(244) NOT NULL, UNIQUE (mid) ) @@ -443,12 +434,13 @@ sub commit_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{dbh}->commit; + eval { $self->{dbh}->do('PRAGMA optimize') }; } sub begin_lazy { my ($self) = @_; return if $self->{txn}; - my $dbh = $self->connect or return; + my $dbh = $self->dbh or return; $dbh->begin_work; # $dbh->{Profile} = 2; $self->{txn} = 1; @@ -460,22 +452,26 @@ sub rollback_lazy { $self->{dbh}->rollback; } -sub disconnect { +sub dbh_close { my ($self) = @_; die "in transaction" if $self->{txn}; - $self->SUPER::disconnect; + $self->SUPER::dbh_close; } sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + croak('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; - require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + my ($dir) = ($fn =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($dir); } # create the DB: - PublicInbox::Over::connect($self); - $self->disconnect; + PublicInbox::Over::dbh($self); + $self->dbh_close; } sub rethread_prepare { @@ -513,12 +509,177 @@ EOF next; } $pr->(<{num} <$mid> THREADID=$r->{tid} culled +# ghost $r->{num} <$mid> THREADID=$r->{tid} culled EOM } delete_by_num($self, $r->{num}); } - $pr->("I: rethread culled $total ghosts\n") if $pr && $total; + $pr->("# rethread culled $total ghosts\n") if $pr && $total; +} + +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} //= do { + my $dbh = $self->dbh; + $dbh->do(<<''); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors. n.b. we used + # to index oidbin here, but leaving it out speeds up reindexing + # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin + $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '. + 'xref3 (xnum,ibx_id)'); + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL) + + 1; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub merge_xref3 { # used for "-extindex --dedupe" + my ($self, $keep_docid, $drop_docid, $oidbin) = @_; + my $sth = $self->{dbh}->prepare_cached(<<''); +UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $keep_docid); + $sth->bind_param(2, $drop_docid); + $sth->bind_param(3, $oidbin, SQL_BLOB); + $sth->execute; + + # drop anything that conflicted + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $drop_docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? + +} + +# returns true if we're vivifying a message for lei/store that was +# previously external-metadata only +sub vivify_xvmd { + my ($self, $smsg) = @_; + my @docids = $self->blob_exists($smsg->{blob}); + my @vivify_xvmd; + for my $id (@docids) { + if (my $cur = $self->get_art($id)) { + # already indexed if bytes > 0 + return if $cur->{bytes} > 0; + push @vivify_xvmd, $id; + } else { + warn "W: $smsg->{blob} #$id gone (bug?)\n"; + } + } + $smsg->{-vivify_xvmd} = \@vivify_xvmd; +} + +sub fork_ok { + return 1 if $DBD::SQLite::sqlite_version >= 3008003; + my ($opt) = @_; + my @j = split(/,/, $opt->{jobs} // ''); + state $warned; + grep { $_ > 1 } @j and $warned //= warn('DBD::SQLite version is ', + $DBD::SQLite::sqlite_version, + ", need >= 3008003 (3.8.3) for --jobs > 1\n"); + $opt->{jobs} = '1,1'; + undef; } 1;