X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=d63dd7c742b4c21ef9420d810954df3e8332f45e;hb=3fc59df0d633a17e0c5e43d633d12e8772c06ec3;hp=6efc1f3fb937ebb17726cdbddbd3defd77c85e4b;hpb=3713c727cda431a0dc2865a7878c13ecf9f21851;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 6efc1f3f..d63dd7c7 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,19 +4,20 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; use Fcntl qw(:flock :DEFAULT); -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); use PublicInbox::MID qw/mid_clean id_compress mid_mime/; use PublicInbox::MsgIter; use Carp qw(croak); +use POSIX qw(strftime); require PublicInbox::Git; *xpfx = *PublicInbox::Search::xpfx; @@ -30,9 +31,21 @@ use constant { }; sub new { - my ($class, $git_dir, $creat) = @_; + my ($class, $inbox, $creat) = @_; + my $git_dir = $inbox; + my $altid; + if (ref $inbox) { + $git_dir = $inbox->{mainrepo}; + $altid = $inbox->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { + PublicInbox::AltId->new($inbox, $_); + } @$altid ]; + } + } require Search::Xapian::WritableDatabase; - my $self = bless { git_dir => $git_dir }, $class; + my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; my $perm = $self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; @@ -83,19 +96,58 @@ sub _lock_release { close $lockfh or die "close failed: $!\n"; } -sub add_val { +sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } +sub add_values ($$$) { + my ($smsg, $bytes, $num) = @_; + + my $ts = $smsg->ts; + my $doc = $smsg->{doc}; + add_val($doc, &PublicInbox::Search::TS, $ts); + + defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $smsg->{mime}->body_raw =~ tr!\n!\n!); + + my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); +} + +sub index_users ($$) { + my ($tg, $smsg) = @_; + + my $from = $smsg->from; + my $to = $smsg->to; + my $cc = $smsg->cc; + + $tg->index_text($from, 1, 'A'); # A - author + $tg->increase_termpos; + $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->increase_termpos; + $tg->index_text($cc, 1, 'XCC') if $cc ne ''; + $tg->increase_termpos; +} + +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); + $tg->increase_termpos; +} + sub add_message { my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; @@ -103,7 +155,7 @@ sub add_message { if ($smsg) { # convert a ghost to a regular message # it will also clobber any existing regular message - $doc_id = $smsg->doc_id; + $doc_id = $smsg->{doc_id}; $old_tid = $smsg->thread_id; } $smsg = PublicInbox::SearchMsg->new($mime); @@ -116,60 +168,66 @@ sub add_message { $doc->add_term(xpfx('path') . id_compress($path)); } - add_val($doc, &PublicInbox::Search::TS, $smsg->ts); - - defined($num) and - add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and - add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $mime->body_raw =~ tr!\n!\n!); + add_values($smsg, $bytes, $num); my $tg = $self->term_generator; $tg->set_document($doc); $tg->index_text($subj, 1, 'S') if $subj; $tg->increase_termpos; - $tg->index_text($subj) if $subj; - $tg->increase_termpos; - $tg->index_text($smsg->from); - $tg->increase_termpos; + index_users($tg, $smsg); msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || $ct_msg; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { - if ($l =~ /^\s*>/) { + if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - $tg->index_text(join("\n", @quot), 0); - @quot = (); - $tg->increase_termpos; - } - if (@orig) { - $tg->index_text(join("\n", @orig)); - @orig = (); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); link_message($self, $smsg, $old_tid); + $tg->index_text($mid, 1, 'XMID'); $doc->set_data($smsg->to_doc_data($blob)); + + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . $id); + } + } if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { @@ -231,14 +289,14 @@ sub link_message { my ($self, $smsg, $old_tid) = @_; my $doc = $smsg->{doc}; my $mid = $smsg->mid; - my $mime = $smsg->mime; + my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; my $refs = $hdr->header_raw('References'); my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - if (my $irt = $hdr->header_raw('In-Reply-To')) { - # last References should be $irt - # we will de-dupe later - push @refs, mid_clean($irt); + my $irt = $hdr->header_raw('In-Reply-To'); + if (defined $irt) { + $irt = mid_clean($irt); + $irt = undef if $mid eq $irt; } my $tid; @@ -247,6 +305,15 @@ sub link_message { my @orig_refs = @refs; @refs = (); + if (defined $irt) { + # to check MAX_MID_SIZE + push @orig_refs, $irt; + + # below, we will ensure IRT (if specified) + # is the last References + $uniq{$irt} = 1; + } + # prevent circular references via References: here: foreach my $ref (@orig_refs) { if (length($ref) > MAX_MID_SIZE) { @@ -257,6 +324,11 @@ sub link_message { push @refs, $ref; } } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + push @refs, $irt if defined $irt; + if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; @@ -328,7 +400,7 @@ sub do_cat_mail { my $str = $git->cat_file($blob, $sizeref); # fixup bugs from import: $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - Email::MIME->new($str); + PublicInbox::MIME->new($str); }; $@ ? undef : $mime; } @@ -390,6 +462,7 @@ sub _index_sync { my $tip = $opts->{ref} || 'HEAD'; my $reindex = $opts->{reindex}; my ($mkey, $last_commit, $lx, $xlog); + $self->{git}->batch_prepare; my $xdb = _xdb_acquire($self); $xdb->begin_transaction; do { @@ -414,19 +487,24 @@ sub _index_sync { my $mm = _msgmap_init($self); my $dbh = $mm->{dbh} if $mm; + my $mm_only; my $cb = sub { my ($commit, $more) = @_; if ($dbh) { $mm->last_commit($commit) if $commit; $dbh->commit; } - $xdb->set_metadata($mkey, $commit) if $mkey && $commit; - $xdb->commit_transaction; - $xdb = _xdb_release($self); + if (!$mm_only) { + $xdb->set_metadata($mkey, $commit) if $mkey && $commit; + $xdb->commit_transaction; + $xdb = _xdb_release($self); + } # let another process do some work... < if ($more) { - $xdb = _xdb_acquire($self); - $xdb->begin_transaction; + if (!$mm_only) { + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } $dbh->begin_work if $dbh; } }; @@ -450,14 +528,13 @@ sub _index_sync { my $mkey_prev = $mkey; $mkey = undef; # ignore xapian, for now my $mlog = _git_log($self, $r); + $mm_only = 1; rlog($self, $mlog, *index_mm, *unindex_mm, $cb); - $mlog = undef; + $mm_only = $mlog = undef; # now deal with Xapian $mkey = $mkey_prev; $dbh = undef; - $xdb = _xdb_acquire($self); - $xdb->begin_transaction; rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); } } else {