X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FOverIdx.pm;h=d6d706f7fed04363d409eb7aa34564f08ad7dc2e;hb=ed0167d2a851b4f5128f57ad60309a0b76e62cfa;hp=985c5473e7c7cf491386de5fdf07164de982db54;hpb=9dfc0b670fc634b54998c3020f173b82de1915ac;p=public-inbox.git diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 985c5473..d6d706f7 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -158,7 +158,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1 foreach (@$nums) { $sth->execute($_->[0]); - my $smsg = $sth->fetchrow_hashref; + # $cb may delete rows and invalidate nums + my $smsg = $sth->fetchrow_hashref // next; $smsg = PublicInbox::Over::load_from_row($smsg); $cb->($self, $smsg, @arg) or return; } @@ -243,12 +244,13 @@ sub link_refs { $tid; } -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } @@ -264,8 +266,10 @@ sub add_overview { $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; my $mids = mids_for_index($eml); my $refs = $smsg->parse_references($eml, $mids); - $mids->[0] //= $smsg->{mid} //= $eml->{-lei_fake_mid}; - $smsg->{mid} //= ''; + $mids->[0] //= do { + $smsg->{mid} //= ''; + $eml->{-lei_fake_mid}; + }; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { @@ -456,13 +460,13 @@ sub dbh_close { sub create { my ($self) = @_; my $fn = $self->{filename} // do { - Carp::confess('BUG: no {filename}') unless $self->{dbh}; + croak('BUG: no {filename}') unless $self->{dbh}; return; }; unless (-r $fn) { require File::Path; - require File::Basename; - File::Path::mkpath(File::Basename::dirname($fn)); + my ($dir) = ($fn =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($dir); } # create the DB: PublicInbox::Over::dbh($self); @@ -539,9 +543,13 @@ CREATE TABLE IF NOT EXISTS xref3 ( $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); # performance critical, this is not UNIQUE since we may need to - # tolerate some old bugs from indexing mirrors - $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. - 'xref3 (oidbin,xnum,ibx_id)'); + # tolerate some old bugs from indexing mirrors. n.b. we used + # to index oidbin here, but leaving it out speeds up reindexing + # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin + $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '. + 'xref3 (xnum,ibx_id)'); + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)'); $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidx_meta ( @@ -598,50 +606,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } -# returns remaining reference count to $docid -sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; - begin_lazy($self); - my $oidbin = pack('H*', $oidhex); - my ($sth, $ibx_id); - if (defined $eidx_key) { - $ibx_id = ibx_id($self, $eidx_key); - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $ibx_id); - $sth->bind_param(3, $oidbin, SQL_BLOB); - } else { - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $oidbin, SQL_BLOB); - } - $sth->execute; - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? - - $sth->execute($docid); - my $nr = $sth->fetchrow_array; - if ($nr == 0) { - delete_by_num($self, $docid); - } elsif (defined($ibx_id) && $rm_eidx_info) { - # if deduplication rules in ContentHash change, it's - # possible a docid can have multiple rows with the - # same ibx_id. This governs whether or not we call - # ->shard_remove_eidx_info in ExtSearchIdx. - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? - - $sth->execute($docid, $ibx_id); - my $count = $sth->fetchrow_array; - $$rm_eidx_info = ($count == 0); - } - $nr; -} - # for when an xref3 goes missing, this does NOT update {ts} sub update_blob { my ($self, $smsg, $oidhex) = @_; @@ -654,6 +618,26 @@ UPDATE over SET ddd = ? WHERE num = ? $sth->execute; } +sub merge_xref3 { # used for "-extindex --dedupe" + my ($self, $keep_docid, $drop_docid, $oidhex) = @_; + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $keep_docid); + $sth->bind_param(2, $drop_docid); + $sth->bind_param(3, $oidbin, SQL_BLOB); + $sth->execute; + + # drop anything that conflicted + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $drop_docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + $sth->execute; +} + sub eidxq_add { my ($self, $docid) = @_; $self->dbh->prepare_cached(<<'')->execute($docid); @@ -668,14 +652,22 @@ DELETE FROM eidxq WHERE docid = ? } -sub blob_exists { - my ($self, $oidhex) = @_; - my $sth = $self->dbh->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE oidbin = ? - - $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); - $sth->execute; - $sth->fetchrow_array; +# returns true if we're vivifying a message for lei/store that was +# previously external-metadata only +sub vivify_xvmd { + my ($self, $smsg) = @_; + my @docids = $self->blob_exists($smsg->{blob}); + my @vivify_xvmd; + for my $id (@docids) { + if (my $cur = $self->get_art($id)) { + # already indexed if bytes > 0 + return if $cur->{bytes} > 0; + push @vivify_xvmd, $id; + } else { + warn "W: $smsg->{blob} #$id gone (bug?)\n"; + } + } + $smsg->{-vivify_xvmd} = \@vivify_xvmd; } 1;