X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=d1290dc208e37eb7689deff009da165fa5234ed1;hb=b15ca9a77bff088a3f5f0b8955de8b6a60565b04;hp=5ca819c383525de90c72eae9bc5c3f31efea76ce;hpb=1218a4126807951a0f47286338dc04d7f197bb78;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5ca819c3..d1290dc2 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -58,6 +58,7 @@ sub new { ibx_ver => $version, indexlevel => $indexlevel, }, $class; + $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; @@ -274,22 +275,8 @@ sub index_diff ($$$) { index_text($self, join("\n", @xnq), 1, 'XNQ'); } -sub index_body ($$$) { - my ($self, $txt, $doc) = @_; - if ($doc) { - # does it look like a diff? - if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - index_diff($self, $txt, $doc); - } else { - index_text($self, $txt, 1, 'XNQ'); - } - } else { - index_text($self, $txt, 0, 'XQUOT'); - } -} - sub index_xapian { # msg_iter callback - my ($part, $depth, @idx) = @{$_[0]}; + my $part = $_[0]->[0]; # ignore $depth and @idx my ($self, $doc) = @{$_[1]}; my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; @@ -299,19 +286,30 @@ sub index_xapian { # msg_iter callback my ($s, undef) = msg_part_text($part, $ct); defined $s or return; + $_[0]->[0] = $part = undef; # free memory # split off quoted and unquoted blocks: - my @sections = split(/((?:^>[^\n]*\n)+)/sm, $s); - $part = $s = undef; - index_body($self, $_, /\A>/ ? 0 : $doc) for @sections; + my @sections = PublicInbox::MsgIter::split_quotes($s); + undef $s; # free memory + for my $txt (@sections) { + if ($txt =~ /\A>/) { + index_text($self, $txt, 0, 'XQUOT'); + } else { + # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + index_diff($self, $txt, $doc); + } else { + index_text($self, $txt, 1, 'XNQ'); + } + } + undef $txt; # free memory + } } sub add_xapian ($$$$) { my ($self, $mime, $smsg, $mids) = @_; $smsg->{mime} = $mime; # XXX dangerous my $hdr = $mime->header_obj; - $smsg->{ds} = msg_datestamp($hdr, $self->{autime}); - $smsg->{ts} = msg_timestamp($hdr, $self->{cotime}); my $doc = $X->{Document}->new; my $subj = $smsg->subject; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); @@ -368,13 +366,19 @@ sub _msgmap_init ($) { sub add_message { # mime = Email::MIME object my ($self, $mime, $smsg) = @_; - my $mids = mids_for_index($mime->header_obj); + my $hdr = $mime->header_obj; + my $mids = mids_for_index($hdr); $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat $smsg->{mid} //= $mids->[0]; # v1 compatibility $smsg->{num} //= do { # v1 _msgmap_init($self); index_mm($self, $mime); }; + + # v1 and tests only: + $smsg->{ds} //= msg_datestamp($hdr, $self->{autime}); + $smsg->{ts} //= msg_timestamp($hdr, $self->{cotime}); + eval { # order matters, overview stores every possible piece of # data in doc_data (deflated). Xapian only stores a subset @@ -382,7 +386,7 @@ sub add_message { # storing doc_data in Xapian sometime after we get multi-inbox # search working. if (my $over = $self->{over}) { # v1 only - $over->add_overview($mime, $smsg, $self); + $over->add_overview($mime, $smsg); } if (need_xapian($self)) { add_xapian($self, $mime, $smsg, $mids); @@ -492,13 +496,13 @@ sub index_git_blob_id { sub unindex_blob { my ($self, $mime) = @_; - my $mid = eval { mid_clean(mid_mime($mime)) }; + my $mid = eval { mid_mime($mime) }; $self->remove_message($mid) if defined $mid; } sub index_mm { my ($self, $mime) = @_; - my $mid = mid_clean(mid_mime($mime)); + my $mid = mid_mime($mime); my $mm = $self->{mm}; my $num; @@ -529,7 +533,7 @@ sub index_mm { sub unindex_mm { my ($self, $mime) = @_; - $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); + $self->{mm}->mid_delete(mid_mime($mime)); } sub index_both { @@ -547,13 +551,11 @@ sub unindex_both { sub do_cat_mail { my ($git, $blob, $sizeref) = @_; - my $mime = eval { - my $str = $git->cat_file($blob, $sizeref); - # fixup bugs from import: - $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - PublicInbox::MIME->new($str); - }; - $@ ? undef : $mime; + my $str = $git->cat_file($blob, $sizeref) or + die "BUG: $blob not found in $git->{git_dir}"; + # fixup bugs from import: + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::MIME->new($str); } # called by public-inbox-index @@ -598,7 +600,7 @@ sub read_log { } next; } - my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mime = do_cat_mail($git, $blob, \$bytes); my $smsg = bless {}, 'PublicInbox::Smsg'; batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); $smsg->{blob} = $blob; @@ -611,15 +613,15 @@ sub read_log { $latest = $1; $newest ||= $latest; } elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{over}->{autime} = $self->{autime} = $1; + $self->{autime} = $1; } elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{over}->{cotime} = $self->{cotime} = $1; + $self->{cotime} = $1; } } close($log) or die "git log failed: \$?=$?"; # get the leftovers foreach my $blob (keys %D) { - my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mime = do_cat_mail($git, $blob, \$bytes); $del_cb->($self, $mime); } $batch_cb->($nr, $latest, $newest); @@ -630,7 +632,7 @@ sub _git_log { my $git = $self->{git}; if (index($range, '..') < 0) { - # don't show annoying git errrors to users who run -index + # don't show annoying git errors to users who run -index # on empty inboxes $git->qx(qw(rev-parse -q --verify), "$range^0"); if ($?) { @@ -838,20 +840,27 @@ sub begin_txn_lazy { }); } +# store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) +# This metadata is read by Admin::detect_indexlevel: +sub set_indexlevel { + my ($self) = @_; + + if (!$self->{shard} && # undef or 0, not >0 + delete($self->{-set_indexlevel_once})) { + my $xdb = $self->{xdb}; + my $level = $xdb->get_metadata('indexlevel'); + if (!$level || $level ne 'medium') { + $xdb->set_metadata('indexlevel', 'medium'); + } + } +} + sub commit_txn_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{-inbox}->with_umask(sub { if (my $xdb = $self->{xdb}) { - - # store 'indexlevel=medium' in v2 shard=0 and - # v1 (only one shard) - # This metadata is read by Admin::detect_indexlevel: - if (!$self->{shard} # undef or 0, not >0 - && $self->{indexlevel} eq 'medium') { - $xdb->set_metadata('indexlevel', 'medium'); - } - + set_indexlevel($self); $xdb->commit_transaction; } $self->{over}->commit_lazy if $self->{over};