X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=4e951bbedc7449d899a683b8ce6f965f3d291e5e;hb=227a1d886672767e37cc86a3432952c14eb8a143;hp=21ab8119e70fe559fbe5435fea50a443b0b1459a;hpb=cd8dd7b08fddc7c2b5f218c3fcaa5dca5f9ad945;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 21ab8119..4e951bbe 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -12,7 +12,7 @@ use warnings; use base qw(PublicInbox::Search PublicInbox::Lock); use PublicInbox::MIME; use PublicInbox::InboxWritable; -use PublicInbox::MID qw/mid_clean id_compress mid_mime mids_for_index/; +use PublicInbox::MID qw/mid_clean mid_mime mids_for_index/; use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); @@ -34,7 +34,7 @@ sub new { ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; my $levels = qr/\A(?:full|medium|basic)\z/; my $inboxdir = $ibx->{inboxdir}; - my $version = $ibx->{version} || 1; + my $version = $ibx->version; my $indexlevel = 'full'; my $altid = $ibx->{altid}; if ($altid) { @@ -133,16 +133,25 @@ sub add_val ($$$) { $doc->add_value($col, $num); } -sub index_text ($$$$) -{ - my ($self, $field, $n, $text) = @_; - my $tg = $self->term_generator; +sub term_generator ($) { # write-only + my ($self) = @_; + + $self->{term_generator} //= do { + my $tg = $X->{TermGenerator}->new; + $tg->set_stemmer($self->stemmer); + $tg; + } +} + +sub index_text ($$$$) { + my ($self, $text, $wdf_inc, $prefix) = @_; + my $tg = term_generator($self); # man Search::Xapian::TermGenerator if ($self->{indexlevel} eq 'full') { - $tg->index_text($field, $n, $text); + $tg->index_text($text, $wdf_inc, $prefix); $tg->increase_termpos; } else { - $tg->index_text_without_positions($field, $n, $text); + $tg->index_text_without_positions($text, $wdf_inc, $prefix); } } @@ -153,18 +162,18 @@ sub index_users ($$) { my $to = $smsg->to; my $cc = $smsg->cc; - $self->index_text($from, 1, 'A'); # A - author - $self->index_text($to, 1, 'XTO') if $to ne ''; - $self->index_text($cc, 1, 'XCC') if $cc ne ''; + index_text($self, $from, 1, 'A'); # A - author + index_text($self, $to, 1, 'XTO') if $to ne ''; + index_text($self, $cc, 1, 'XCC') if $cc ne ''; } sub index_diff_inc ($$$$) { my ($self, $text, $pfx, $xnq) = @_; if (@$xnq) { - $self->index_text(join("\n", @$xnq), 1, 'XNQ'); + index_text($self, join("\n", @$xnq), 1, 'XNQ'); @$xnq = (); } - $self->index_text($text, 1, $pfx); + index_text($self, $text, 1, $pfx); } sub index_old_diff_fn { @@ -179,7 +188,7 @@ sub index_old_diff_fn { $fb = join('/', @fb); if ($fa eq $fb) { unless ($seen->{$fa}++) { - $self->index_diff_inc($fa, 'XDFN', $xnq); + index_diff_inc($self, $fa, 'XDFN', $xnq); } return 1; } @@ -190,22 +199,22 @@ sub index_old_diff_fn { } sub index_diff ($$$) { - my ($self, $lines, $doc) = @_; + my ($self, $txt, $doc) = @_; my %seen; my $in_diff; my @xnq; my $xnq = \@xnq; - foreach (@$lines) { + foreach (split(/\n/, $txt)) { if ($in_diff && s/^ //) { # diff context - $self->index_diff_inc($_, 'XDFCTX', $xnq); + index_diff_inc($self, $_, 'XDFCTX', $xnq); } elsif (/^-- $/) { # email signature begins $in_diff = undef; } elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) { my ($fa, $fb) = ($1, $2); my $fn = (split('/', git_unquote($fa), 2))[1]; - $seen{$fn}++ or $self->index_diff_inc($fn, 'XDFN', $xnq); + $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $fn = (split('/', git_unquote($fb), 2))[1]; - $seen{$fn}++ or $self->index_diff_inc($fn, 'XDFN', $xnq); + $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; # traditional diff: } elsif (m/^diff -(.+) (\S+) (\S+)$/) { @@ -213,28 +222,28 @@ sub index_diff ($$$) { push @xnq, $_; # only support unified: next unless $opt =~ /[uU]/; - $in_diff = $self->index_old_diff_fn(\%seen, $fa, $fb, + $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb, $xnq); } elsif (m!^--- ("?a/.+)!) { my $fn = $1; $fn = (split('/', git_unquote($fn), 2))[1]; - $seen{$fn}++ or $self->index_diff_inc($fn, 'XDFN', $xnq); + $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (m!^\+\+\+ ("?b/.+)!) { my $fn = $1; $fn = (split('/', git_unquote($fn), 2))[1]; - $seen{$fn}++ or $self->index_diff_inc($fn, 'XDFN', $xnq); + $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (/^--- (\S+)/) { $in_diff = $1; push @xnq, $_; } elsif (defined $in_diff && /^\+\+\+ (\S+)/) { - $in_diff = $self->index_old_diff_fn(\%seen, $in_diff, $1, - $xnq); + $in_diff = index_old_diff_fn($self, \%seen, $in_diff, + $1, $xnq); } elsif ($in_diff && s/^\+//) { # diff added - $self->index_diff_inc($_, 'XDFB', $xnq); + index_diff_inc($self, $_, 'XDFB', $xnq); } elsif ($in_diff && s/^-//) { # diff removed - $self->index_diff_inc($_, 'XDFA', $xnq); + index_diff_inc($self, $_, 'XDFA', $xnq); } elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) { my ($ba, $bb) = ($1, $2); index_git_blob_id($doc, 'XDFPRE', $ba); @@ -244,7 +253,7 @@ sub index_diff ($$$) { # traditional diff w/o -p } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) { # hunk header context - $self->index_diff_inc($1, 'XDFHH', $xnq); + index_diff_inc($self, $1, 'XDFHH', $xnq); # ignore the following lines: } elsif (/^(?:dis)similarity index/ || /^(?:old|new) mode/ || @@ -255,7 +264,9 @@ sub index_diff ($$$) { /^Binary files .* differ/) { push @xnq, $_; } elsif ($_ eq '') { - $in_diff = undef; + # possible to be in diff context, some mail may be + # stripped by MUA or even GNU diff(1). "git apply" + # treats a bare "\n" as diff context, too } else { push @xnq, $_; warn "non-diff line: $_\n" if DEBUG && $_ ne ''; @@ -263,27 +274,42 @@ sub index_diff ($$$) { } } - $self->index_text(join("\n", @xnq), 1, 'XNQ'); + index_text($self, join("\n", @xnq), 1, 'XNQ'); } sub index_body ($$$) { - my ($self, $lines, $doc) = @_; - my $txt = join("\n", @$lines); + my ($self, $txt, $doc) = @_; if ($doc) { # does it look like a diff? if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - $txt = undef; - $self->index_diff($lines, $doc); + index_diff($self, $txt, $doc); } else { - $self->index_text($txt, 1, 'XNQ'); + index_text($self, $txt, 1, 'XNQ'); } } else { - $self->index_text($txt, 0, 'XQUOT'); + index_text($self, $txt, 0, 'XQUOT'); + } +} + +sub index_xapian { # msg_iter callback + my ($part, $depth, @idx) = @{$_[0]}; + my ($self, $doc) = @{$_[1]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + index_text($self, $fn, 1, 'XFN'); } - @$lines = (); + + my ($s, undef) = msg_part_text($part, $ct); + defined $s or return; + + # split off quoted and unquoted blocks: + my @sections = split(/((?:^>[^\n]*\n)+)/sm, $s); + $part = $s = undef; + index_body($self, $_, /\A>/ ? 0 : $doc) for @sections; } -sub add_xapian ($$$$$) { +sub add_xapian ($$$$$$) { my ($self, $mime, $num, $oid, $mids, $mid0) = @_; my $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $X->{Document}->new; @@ -295,46 +321,21 @@ sub add_xapian ($$$$$) { my $dt = strftime('%Y%m%d%H%M%S', @ds); add_val($doc, PublicInbox::Search::DT(), $dt); - my $tg = $self->term_generator; + my $tg = term_generator($self); $tg->set_document($doc); - $self->index_text($subj, 1, 'S') if $subj; - $self->index_users($smsg); - - msg_iter($mime, sub { - my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || 'text/plain'; - my $fn = $part->filename; - if (defined $fn && $fn ne '') { - $self->index_text($fn, 1, 'XFN'); - } - - my ($s, undef) = msg_part_text($part, $ct); - defined $s or return; - - my (@orig, @quot); - my @lines = split(/\n/, $s); - while (defined(my $l = shift @lines)) { - if ($l =~ /^>/) { - $self->index_body(\@orig, $doc) if @orig; - push @quot, $l; - } else { - $self->index_body(\@quot, 0) if @quot; - push @orig, $l; - } - } - $self->index_body(\@quot, 0) if @quot; - $self->index_body(\@orig, $doc) if @orig; - }); + index_text($self, $subj, 1, 'S') if $subj; + index_users($self, $smsg); + msg_iter($mime, \&index_xapian, [ $self, $doc ]); foreach my $mid (@$mids) { - $self->index_text($mid, 1, 'XM'); + index_text($self, $mid, 1, 'XM'); # because too many Message-IDs are prefixed with # "Pine.LNX."... if ($mid =~ /\w{12,}/) { my @long = ($mid =~ /(\w{3,}+)/g); - $self->index_text(join(' ', @long), 1, 'XM'); + index_text($self, join(' ', @long), 1, 'XM'); } } $smsg->{to} = $smsg->{cc} = ''; @@ -355,18 +356,27 @@ sub add_xapian ($$$$$) { $self->{xdb}->replace_document($num, $doc); } +sub _msgmap_init ($) { + my ($self) = @_; + die "BUG: _msgmap_init is only for v1\n" if $self->{version} != 1; + $self->{mm} //= eval { + require PublicInbox::Msgmap; + PublicInbox::Msgmap->new($self->{inboxdir}, 1); + }; +} + sub add_message { # mime = Email::MIME object my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; my $mids = mids_for_index($mime->header_obj); - $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility - unless (defined $num) { # v1 - $self->_msgmap_init; - $num = index_mm($self, $mime); - } + $mid0 //= $mids->[0]; # v1 compatibility + $num //= do { # v1 + _msgmap_init($self); + index_mm($self, $mime); + }; eval { if (need_xapian($self)) { - $self->add_xapian($mime, $num, $oid, $mids, $mid0) + add_xapian($self, $mime, $num, $oid, $mids, $mid0); } if (my $over = $self->{over}) { $over->add_overview($mime, $bytes, $num, $oid, $mid0); @@ -423,7 +433,7 @@ sub remove_message { batch_do($self, 'Q' . $mid, sub { my ($ids) = @_; $db->delete_document($_) for @$ids; - $nr = scalar @$ids; + $nr += scalar @$ids; }); }; if ($@) { @@ -464,18 +474,6 @@ sub remove_by_oid { scalar(@delete); } -sub term_generator { # write-only - my ($self) = @_; - - my $tg = $self->{term_generator}; - return $tg if $tg; - - $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); - - $self->{term_generator} = $tg; -} - sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; @@ -613,15 +611,6 @@ sub read_log { $batch_cb->($nr, $latest, $newest); } -sub _msgmap_init { - my ($self) = @_; - die "BUG: _msgmap_init is only for v1\n" if $self->{version} != 1; - $self->{mm} ||= eval { - require PublicInbox::Msgmap; - PublicInbox::Msgmap->new($self->{inboxdir}, 1); - }; -} - sub _git_log { my ($self, $opts, $range) = @_; my $git = $self->{git}; @@ -677,7 +666,6 @@ sub is_ancestor ($$$) { my $cmd = [ 'git', "--git-dir=$git->{git_dir}", qw(merge-base --is-ancestor), $cur, $tip ]; my $pid = spawn($cmd); - defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; $? == 0; }