X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=6cc86d5d038d3669b0991d270aa74234f3937560;hb=HEAD;hp=e606dcf564bfb8e59bf517558002a8b005579769;hpb=31e39c446d8b9534f0b91ffd8e517955aed65d82;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index e606dcf5..6cc86d5d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -158,7 +158,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1 foreach (@$nums) { $sth->execute($_->[0]); - my $smsg = $sth->fetchrow_hashref; + # $cb may delete rows and invalidate nums + my $smsg = $sth->fetchrow_hashref // next; $smsg = PublicInbox::Over::load_from_row($smsg); $cb->($self, $smsg, @arg) or return; } @@ -243,32 +244,13 @@ sub link_refs { $tid; } -sub parse_references ($$$) { - my ($smsg, $hdr, $mids) = @_; - my $refs = references($hdr); - push(@$refs, @$mids) if scalar(@$mids) > 1; - return $refs if scalar(@$refs) == 0; - - # prevent circular references here: - my %seen = ( $smsg->{mid} => 1 ); - my @keep; - foreach my $ref (@$refs) { - if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - next; - } - push(@keep, $ref) unless $seen{$ref}++; - } - $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; - \@keep; -} - -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } @@ -283,9 +265,11 @@ sub add_overview { my ($self, $eml, $smsg) = @_; $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; my $mids = mids_for_index($eml); - my $refs = parse_references($smsg, $eml, $mids); - $mids->[0] //= $smsg->{mid} //= $eml->{-lei_fake_mid}; - $smsg->{mid} //= ''; + my $refs = $smsg->parse_references($eml, $mids); + $mids->[0] //= do { + $smsg->{mid} //= ''; + $eml->{-lei_fake_mid}; + }; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { @@ -450,6 +434,7 @@ sub commit_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{dbh}->commit; + eval { $self->{dbh}->do('PRAGMA optimize') }; } sub begin_lazy { @@ -476,13 +461,13 @@ sub dbh_close { sub create { my ($self) = @_; my $fn = $self->{filename} // do { - Carp::confess('BUG: no {filename}') unless $self->{dbh}; + croak('BUG: no {filename}') unless $self->{dbh}; return; }; unless (-r $fn) { require File::Path; - require File::Basename; - File::Path::mkpath(File::Basename::dirname($fn)); + my ($dir) = ($fn =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($dir); } # create the DB: PublicInbox::Over::dbh($self); @@ -524,12 +509,12 @@ EOF next; } $pr->(<{num} <$mid> THREADID=$r->{tid} culled +# ghost $r->{num} <$mid> THREADID=$r->{tid} culled EOM } delete_by_num($self, $r->{num}); } - $pr->("I: rethread culled $total ghosts\n") if $pr && $total; + $pr->("# rethread culled $total ghosts\n") if $pr && $total; } # used for cross-inbox search @@ -559,9 +544,13 @@ CREATE TABLE IF NOT EXISTS xref3 ( $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); # performance critical, this is not UNIQUE since we may need to - # tolerate some old bugs from indexing mirrors - $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. - 'xref3 (oidbin,xnum,ibx_id)'); + # tolerate some old bugs from indexing mirrors. n.b. we used + # to index oidbin here, but leaving it out speeds up reindexing + # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin + $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '. + 'xref3 (xnum,ibx_id)'); + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)'); $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidx_meta ( @@ -618,50 +607,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } -# returns remaining reference count to $docid -sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; - begin_lazy($self); - my $oidbin = pack('H*', $oidhex); - my ($sth, $ibx_id); - if (defined $eidx_key) { - $ibx_id = ibx_id($self, $eidx_key); - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $ibx_id); - $sth->bind_param(3, $oidbin, SQL_BLOB); - } else { - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $oidbin, SQL_BLOB); - } - $sth->execute; - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? - - $sth->execute($docid); - my $nr = $sth->fetchrow_array; - if ($nr == 0) { - delete_by_num($self, $docid); - } elsif (defined($ibx_id) && $rm_eidx_info) { - # if deduplication rules in ContentHash change, it's - # possible a docid can have multiple rows with the - # same ibx_id. This governs whether or not we call - # ->shard_remove_eidx_info in ExtSearchIdx. - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? - - $sth->execute($docid, $ibx_id); - my $count = $sth->fetchrow_array; - $$rm_eidx_info = ($count == 0); - } - $nr; -} - # for when an xref3 goes missing, this does NOT update {ts} sub update_blob { my ($self, $smsg, $oidhex) = @_; @@ -674,6 +619,25 @@ UPDATE over SET ddd = ? WHERE num = ? $sth->execute; } +sub merge_xref3 { # used for "-extindex --dedupe" + my ($self, $keep_docid, $drop_docid, $oidbin) = @_; + my $sth = $self->{dbh}->prepare_cached(<<''); +UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $keep_docid); + $sth->bind_param(2, $drop_docid); + $sth->bind_param(3, $oidbin, SQL_BLOB); + $sth->execute; + + # drop anything that conflicted + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $drop_docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + $sth->execute; +} + sub eidxq_add { my ($self, $docid) = @_; $self->dbh->prepare_cached(<<'')->execute($docid); @@ -688,14 +652,34 @@ DELETE FROM eidxq WHERE docid = ? } -sub blob_exists { - my ($self, $oidhex) = @_; - my $sth = $self->dbh->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE oidbin = ? - - $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); - $sth->execute; - $sth->fetchrow_array; +# returns true if we're vivifying a message for lei/store that was +# previously external-metadata only +sub vivify_xvmd { + my ($self, $smsg) = @_; + my @docids = $self->blob_exists($smsg->{blob}); + my @vivify_xvmd; + for my $id (@docids) { + if (my $cur = $self->get_art($id)) { + # already indexed if bytes > 0 + return if $cur->{bytes} > 0; + push @vivify_xvmd, $id; + } else { + warn "W: $smsg->{blob} #$id gone (bug?)\n"; + } + } + $smsg->{-vivify_xvmd} = \@vivify_xvmd; +} + +sub fork_ok { + return 1 if $DBD::SQLite::sqlite_version >= 3008003; + my ($opt) = @_; + my @j = split(/,/, $opt->{jobs} // ''); + state $warned; + grep { $_ > 1 } @j and $warned //= warn('DBD::SQLite version is ', + $DBD::SQLite::sqlite_version, + ", need >= 3008003 (3.8.3) for --jobs > 1\n"); + $opt->{jobs} = '1,1'; + undef; } 1;