X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=4bdd69f540b58e769499e5abd72f8349293712ed;hb=46742d95647c7a80cb2f60d5c134717dd91e22e2;hp=9a5484e30b81b8ac29bb9bbfa43816ac6ee04210;hpb=fec19e492eacb10f990091592f423542ab4249bd;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 9a5484e3..4bdd69f5 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -10,7 +10,7 @@ package PublicInbox::SearchIdx; use strict; use warnings; use base qw(PublicInbox::Search PublicInbox::Lock); -use PublicInbox::MIME; +use PublicInbox::Eml; use PublicInbox::InboxWritable; use PublicInbox::MID qw/mid_clean mid_mime mids_for_index/; use PublicInbox::MsgIter; @@ -64,6 +64,7 @@ sub new { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); + $self->{index_max_size} = $ibx->{index_max_size}; } elsif ($version == 2) { defined $shard or die "shard is required for v2\n"; # shard is a number @@ -276,7 +277,7 @@ sub index_diff ($$$) { } sub index_xapian { # msg_iter callback - my $part = $_[0]->[0]; # ignore $depth and @idx + my $part = $_[0]->[0]; # ignore $depth and $idx my ($self, $doc) = @{$_[1]}; my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; @@ -351,6 +352,12 @@ sub add_xapian ($$$$) { } } $doc->add_boolean_term('Q' . $_) foreach @$mids; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } $self->{xdb}->replace_document($smsg->{num}, $doc); } @@ -364,7 +371,7 @@ sub _msgmap_init ($) { } sub add_message { - # mime = Email::MIME object + # mime = PublicInbox::Eml or Email::MIME object my ($self, $mime, $smsg) = @_; my $hdr = $mime->header_obj; my $mids = mids_for_index($hdr); @@ -551,13 +558,9 @@ sub unindex_both { sub do_cat_mail { my ($git, $blob, $sizeref) = @_; - my $mime = eval { - my $str = $git->cat_file($blob, $sizeref); - # fixup bugs from import: - $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - PublicInbox::MIME->new($str); - }; - $@ ? undef : $mime; + my $str = $git->cat_file($blob, $sizeref) or + die "BUG: $blob not found in $git->{git_dir}"; + PublicInbox::Eml->new($str); } # called by public-inbox-index @@ -576,6 +579,16 @@ sub batch_adjust ($$$$$) { } } +sub too_big ($$$) { + my ($self, $git, $oid) = @_; + my $max_size = $self->{index_max_size} or return; + my (undef, undef, $size) = $git->check($oid); + die "E: bad $oid in $git->{git_dir}\n" if !defined($size); + return if $size <= $max_size; + warn "W: skipping $oid ($size > $max_size)\n"; + 1; +} + # only for v1 sub read_log { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; @@ -602,7 +615,8 @@ sub read_log { } next; } - my $mime = do_cat_mail($git, $blob, \$bytes) or next; + next if too_big($self, $git, $blob); + my $mime = do_cat_mail($git, $blob, \$bytes); my $smsg = bless {}, 'PublicInbox::Smsg'; batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); $smsg->{blob} = $blob; @@ -610,7 +624,7 @@ sub read_log { $add_cb->($self, $mime, $smsg); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - $D{$blob} = 1; + $D{$blob} = 1 unless too_big($self, $git, $blob); } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; $newest ||= $latest; @@ -623,7 +637,7 @@ sub read_log { close($log) or die "git log failed: \$?=$?"; # get the leftovers foreach my $blob (keys %D) { - my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $mime = do_cat_mail($git, $blob, \$bytes); $del_cb->($self, $mime); } $batch_cb->($nr, $latest, $newest); @@ -634,7 +648,7 @@ sub _git_log { my $git = $self->{git}; if (index($range, '..') < 0) { - # don't show annoying git errrors to users who run -index + # don't show annoying git errors to users who run -index # on empty inboxes $git->qx(qw(rev-parse -q --verify), "$range^0"); if ($?) {