X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=4aac0281304146c39e2eb4d25cf8575c121a84a0;hb=172416d1cd465da4242cc744a3f309d307f1311d;hp=cd27a29459e2360d1607cc7267d910af3aa8acb4;hpb=a9c903a57ff9a18c56a53bcba4316eade423fef6;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cd27a294..4aac0281 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,8 +4,8 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; @@ -117,7 +117,7 @@ sub add_values ($$$) { $smsg->{mime}->body_raw =~ tr!\n!\n!); my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); - $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); } sub index_users ($$) { @@ -129,15 +129,16 @@ sub index_users ($$) { $tg->index_text($from, 1, 'A'); # A - author $tg->increase_termpos; - $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->increase_termpos; $tg->index_text($cc, 1, 'XCC') if $cc ne ''; - my $tc = join("\t", $to, $cc); - $tg->index_text($tc, 1, 'XTC') if $tc ne ''; - my $tcf = join("\t", $tc, $from); - $tg->index_text($tcf, 1, 'XTCF') if $tcf ne ''; + $tg->increase_termpos; +} - $tg->index_text($from); +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); $tg->increase_termpos; } @@ -147,7 +148,6 @@ sub add_message { my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; @@ -173,56 +173,52 @@ sub add_message { my $tg = $self->term_generator; $tg->set_document($doc); - if ($subj) { - $tg->index_text($subj, 1, 'S'); - $tg->index_text($subj, 1, 'XBS'); - } - $tg->increase_termpos; - $tg->index_text($subj) if $subj; + $tg->index_text($subj, 1, 'S') if $subj; $tg->increase_termpos; index_users($tg, $smsg); msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || $ct_msg; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { - if ($l =~ /^\s*>/) { + if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - my $s = join("\n", @quot); - @quot = (); - $tg->index_text($s, 1, 'XQUOT'); - $tg->index_text($s, 0, 'XBS'); - $tg->index_text($s, 0, 'XBODY'); - $tg->index_text($s, 0); - $tg->increase_termpos; - } - if (@orig) { - my $s = join("\n", @orig); - @orig = (); - $tg->index_text($s, 1, 'XNQ'); - $tg->index_text($s, 1, 'XBS'); - $tg->index_text($s, 1, 'XBODY'); - $tg->index_text($s); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); link_message($self, $smsg, $old_tid); - $tg->index_text($mid, 1); + $tg->index_text($mid, 1, 'XMID'); $doc->set_data($smsg->to_doc_data($blob)); if (my $altid = $self->{-altid}) {