X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=30d3fe926a14cef02834bfe3511ac37022e14bee;hb=970eb1fd83b93c790d2faed6bf64a97d6d5fe126;hp=25452daec4280eae41a3d8f311c49bd2d5c0cbde;hpb=6ca633a1360a0974a8ebb117554a856022d797c6;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 25452dae..30d3fe92 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,13 +4,13 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; use Fcntl qw(:flock :DEFAULT); -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); @@ -19,15 +19,15 @@ use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); require PublicInbox::Git; -*xpfx = *PublicInbox::Search::xpfx; -use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian use constant { + MAX_MID_SIZE => 244, # max term size - 1 in Xapian PERM_UMASK => 0, OLD_PERM_GROUP => 1, OLD_PERM_EVERYBODY => 2, PERM_GROUP => 0660, PERM_EVERYBODY => 0664, + BATCH_BYTES => 1_000_000, }; sub new { @@ -72,7 +72,6 @@ sub _xdb_acquire { require File::Path; _lock_acquire($self); File::Path::mkpath($dir); - $self->{batch_size} = 100; $flag = Search::Xapian::DB_CREATE_OR_OPEN; } $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); @@ -117,7 +116,7 @@ sub add_values ($$$) { $smsg->{mime}->body_raw =~ tr!\n!\n!); my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); - $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); } sub index_users ($$) { @@ -135,13 +134,19 @@ sub index_users ($$) { $tg->increase_termpos; } +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); + $tg->increase_termpos; +} + sub add_message { my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; @@ -149,17 +154,17 @@ sub add_message { if ($smsg) { # convert a ghost to a regular message # it will also clobber any existing regular message - $doc_id = $smsg->doc_id; + $doc_id = $smsg->{doc_id}; $old_tid = $smsg->thread_id; } $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; - $doc->add_term(xpfx('mid') . $mid); + $doc->add_term('Q' . $mid); my $subj = $smsg->subject; if ($subj ne '') { my $path = $self->subject_path($subj); - $doc->add_term(xpfx('path') . id_compress($path)); + $doc->add_term('XPATH' . id_compress($path)); } add_values($smsg, $bytes, $num); @@ -174,34 +179,41 @@ sub add_message { msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || $ct_msg; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - my $s = join("\n", @quot); - @quot = (); - $tg->index_text($s, 0, 'XQUOT'); - $tg->increase_termpos; - } - if (@orig) { - my $s = join("\n", @orig); - @orig = (); - $tg->index_text($s, 1, 'XNQ'); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); link_message($self, $smsg, $old_tid); @@ -276,15 +288,14 @@ sub link_message { my ($self, $smsg, $old_tid) = @_; my $doc = $smsg->{doc}; my $mid = $smsg->mid; - my $mime = $smsg->mime; + my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; - my $refs = $hdr->header_raw('References'); - my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - if (my $irt = $hdr->header_raw('In-Reply-To')) { - # last References should be $irt - # we will de-dupe later - push @refs, mid_clean($irt); - } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + my @refs = ($hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + @refs = ((join(' ', @refs)) =~ /<([^>]+)>/g); my $tid; if (@refs) { @@ -302,6 +313,7 @@ sub link_message { push @refs, $ref; } } + if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; @@ -317,9 +329,9 @@ sub link_message { merge_threads($self, $tid, $ptid); } } else { - $tid = $self->next_thread_id; + $tid = defined $old_tid ? $old_tid : $self->next_thread_id; } - $doc->add_term(xpfx('thread') . $tid); + $doc->add_term('G' . $tid); } sub index_blob { @@ -373,7 +385,7 @@ sub do_cat_mail { my $str = $git->cat_file($blob, $sizeref); # fixup bugs from import: $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - Email::MIME->new($str); + PublicInbox::MIME->new($str); }; $@ ? undef : $mime; } @@ -383,6 +395,15 @@ sub index_sync { with_umask($self, sub { $self->_index_sync($opts) }); } +sub batch_adjust ($$$$) { + my ($max, $bytes, $batch_cb, $latest) = @_; + $$max -= $bytes; + if ($$max <= 0) { + $$max = BATCH_BYTES; + $batch_cb->($latest, 1); + } +} + sub rlog { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; @@ -392,23 +413,21 @@ sub rlog { my $git = $self->{git}; my $latest; my $bytes; - my $max = $self->{batch_size}; # may be undef + my $max = BATCH_BYTES; local $/ = "\n"; my $line; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - my $mime = do_cat_mail($git, $blob) or next; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); $del_cb->($self, $mime); } elsif ($line =~ /^commit ($h40)/o) { - if (defined $max && --$max <= 0) { - $max = $self->{batch_size}; - $batch_cb->($latest, 1); - } $latest = $1; } } @@ -529,9 +548,9 @@ sub create_ghost { my $tid = $self->next_thread_id; my $doc = Search::Xapian::Document->new; - $doc->add_term(xpfx('mid') . $mid); - $doc->add_term(xpfx('thread') . $tid); - $doc->add_term(xpfx('type') . 'ghost'); + $doc->add_term('Q' . $mid); + $doc->add_term('G' . $tid); + $doc->add_term('T' . 'ghost'); my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); $self->{xdb}->add_document($doc); @@ -542,15 +561,14 @@ sub create_ghost { sub merge_threads { my ($self, $winner_tid, $loser_tid) = @_; return if $winner_tid == $loser_tid; - my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); - my $thread_pfx = xpfx('thread'); + my ($head, $tail) = $self->find_doc_ids('G' . $loser_tid); my $db = $self->{xdb}; for (; $head != $tail; $head->inc) { my $docid = $head->get_docid; my $doc = $db->get_document($docid); - $doc->remove_term($thread_pfx . $loser_tid); - $doc->add_term($thread_pfx . $winner_tid); + $doc->remove_term('G' . $loser_tid); + $doc->add_term('G' . $winner_tid); $db->replace_document($docid, $doc); } } @@ -607,7 +625,7 @@ sub with_umask { my $rv = eval { $cb->() }; my $err = $@; umask $old; - die $err if $@; + die $err if $err; $rv; }