use warnings;
use PublicInbox::SearchMsg;
use Search::Xapian qw/:standard/;
-require PublicInbox::View;
use Email::MIME;
use PublicInbox::MID qw/mid_clean mid_compressed/;
+# This is English-only: reply prefixes from other languages are non-standard
+# and may be confused with a prefix common in patch emails
+our $REPLY_RE = qr/^re:\s+/i;
+our $LANG = 'english'; # language name handed to Search::Xapian::Stem->new in stemmer()
+
use constant {
TS => 0, # value slot number passed to set_sort_by_relevance_then_value
# SCHEMA_VERSION history
# 1 - subject_path is lower-cased
# 2 - subject_path is mid_compressed in the index, only
# 3 - message-ID is compressed if it includes '%' (hack!)
- SCHEMA_VERSION => 3,
- LANG => 'english',
+ # 4 - change "Re: " normalization, avoid circular Reference ghosts
+ # 5 - subject_path drops trailing '.'
+ SCHEMA_VERSION => 5,
QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, # flags passed to parse_query
};
my $mid = mid_compressed($mid_orig);
my $was_ghost = 0;
my $ct_msg = $mime->header('Content-Type') || 'text/plain';
- my $enc_msg = PublicInbox::View::enc_for($ct_msg);
eval {
my $smsg = $self->lookup_message($mid);
# account for filter bugs...
$ct =~ m!\btext/plain\b!i or return;
- my $enc = PublicInbox::View::enc_for($ct, $enc_msg);
my (@orig, @quot);
- foreach my $l (split(/\n/, $enc->decode($part->body))) {
+ my $body = $part->body;
+ $part->body_set('');
+ my @lines = split(/\n/, $body);
+ while (defined(my $l = shift @lines)) {
if ($l =~ /^\s*>/) {
push @quot, $l;
} else {
}
if (@quot) {
$tg->index_text(join("\n", @quot), 0);
+ @quot = ();
$tg->increase_termpos;
}
if (@orig) {
$tg->index_text(join("\n", @orig));
+ @orig = ();
$tg->increase_termpos;
}
});
my ($self, $query_string, $opts) = @_;
my $query = $self->qp->parse_query($query_string, QP_FLAGS);
- $query = Search::Xapian::Query->new(OP_AND, $mail_query, $query);
$self->do_enquire($query, $opts);
}
my ($self, $query, $opts) = @_;
my $enquire = $self->enquire;
+ $query = Search::Xapian::Query->new(OP_AND, $query, $mail_query);
$enquire->set_query($query);
$enquire->set_sort_by_relevance_then_value(TS, 0);
$opts ||= {};
}
# read-write
-sub stemmer { Search::Xapian::Stem->new(LANG) }
+# Build a new Search::Xapian::Stem for the language named by $LANG.
+sub stemmer {
+    return Search::Xapian::Stem->new($LANG);
+}
# read-only
sub qp {
# normalize subjects so they are suitable as pathnames for URLs
# The subject is taken from the last argument; the return value is the
# normalized, lower-cased string.
sub subject_path {
my $subj = pop; # last argument is the subject
-
- $subj =~ s/\A\s+//;
- $subj =~ s/\s+\z//;
- $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
+ $subj = subject_normalized($subj);
$subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; # each run of other characters becomes a single '_'
lc($subj); # last expression is the return value
}
+# Normalize a subject line.
+#
+# The subject is taken from the last argument.  Returns the subject with
+# leading/trailing whitespace removed, internal whitespace runs collapsed
+# to a single space, trailing '.' characters dropped, and a leading reply
+# prefix matching $REPLY_RE removed.
+sub subject_normalized {
+    my $subj = pop;
+    $subj =~ s/\A\s+//s; # no leading space
+    $subj =~ s/\s+\z//s; # no trailing space
+    $subj =~ s/\s+/ /gs; # no redundant spaces
+    # NOTE(review): a subject such as "foo ." still ends in a space after
+    # this step because whitespace was trimmed first -- confirm whether
+    # that matters to callers.
+    $subj =~ s/\.+\z//; # no trailing '.'
+    # No /o here: $REPLY_RE is a package variable, and /o would make perl
+    # interpolate it only once, silently ignoring any later change to it.
+    $subj =~ s/$REPLY_RE//ig; # remove reply prefix
+    return $subj;
+}
+
sub do_cat_mail {
my ($git, $blob) = @_;
my $mime = eval {