X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearch.pm;h=2b33b395e6340544fc0fa1ff142e9587f89235e1;hb=f76f265a851944b5dedcc3be5f3b5224b6ebda89;hp=d780878975d9af03e2562cfd6249c1585eed68a9;hpb=04939d5db13168e127c6b18dd366c21c16cf170a;p=public-inbox.git

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index d7808789..2b33b395 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -1,13 +1,14 @@
-# Copyright (C) 2015, all contributors
+# Copyright (C) 2015 all contributors
 # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
 # based on notmuch, but with no concept of folders, files or flags
 package PublicInbox::Search;
 use strict;
 use warnings;
-use PublicInbox::SearchMsg;
+use constant TS => 0;
 use Search::Xapian qw/:standard/;
+use PublicInbox::SearchMsg;
 use Email::MIME;
-use PublicInbox::MID qw/mid_clean mid_compressed/;
+use PublicInbox::MID qw/mid_clean mid_compress/;
 
 # This is English-only, everything else is non-standard and may be confused as
 # a prefix common in patch emails
@@ -15,34 +16,39 @@ our $REPLY_RE = qr/^re:\s+/i;
 our $LANG = 'english';
 
 use constant {
-	TS => 0,
 	# SCHEMA_VERSION history
 	# 0 - initial
 	# 1 - subject_path is lower-cased
-	# 2 - subject_path is mid_compressed in the index, only
+	# 2 - subject_path is mid_compress in the index, only
 	# 3 - message-ID is compressed if it includes '%' (hack!)
 	# 4 - change "Re: " normalization, avoid circular Reference ghosts
 	# 5 - subject_path drops trailing '.'
 	# 6 - preserve References: order in document data
-	SCHEMA_VERSION => 6,
+	# 7 - remove references and inreplyto terms
+	# 8 - remove redundant/unneeded document data
+	# 9 - disable Message-ID compression
+	SCHEMA_VERSION => 9,
+
+	# n.b. FLAG_PURE_NOT is expensive not suitable for a public website
+	# as it could become a denial-of-service vector
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
 # setup prefixes
 my %bool_pfx_internal = (
 	type => 'T', # "mail" or "ghost"
-	mid => 'Q', # uniQue id (Message-ID or mid_compressed)
+	thread => 'G', # newsGroup (or similar entity - e.g. a web forum name)
 );
 
 my %bool_pfx_external = (
 	path => 'XPATH',
-	thread => 'G', # newsGroup (or similar entity - e.g. a web forum name)
-	references => 'XREFS',
-	inreplyto => 'XIRT',
+	mid => 'Q', # uniQue id (Message-ID)
 );
 
 my %prob_prefix = (
 	subject => 'S',
+	s => 'S', # for mairix compatibility
+	m => 'Q', # 'mid' is exact, 'm' can do partial
 );
 
 my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
@@ -50,7 +56,7 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
 sub xpfx { $all_pfx{$_[0]} }
 
 our %PFX2TERM_RMAP;
-my %meta_pfx = (mid => 1, thread => 1, path => 1, type => 1);
+my %meta_pfx = (mid => 1, thread => 1, path => 1);
 while (my ($k, $v) = each %all_pfx) {
 	$PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k};
 }
@@ -74,29 +80,21 @@ sub reopen { $_[0]->{xdb}->reopen }
 # read-only
 sub query {
 	my ($self, $query_string, $opts) = @_;
-	my $query = $self->qp->parse_query($query_string, QP_FLAGS);
+	my $query;
 
 	$opts ||= {};
-	$opts->{relevance} = 1;
+	unless ($query_string eq '') {
+		$query = $self->qp->parse_query($query_string, QP_FLAGS);
+		$opts->{relevance} = 1 unless exists $opts->{relevance};
+	}
+
 	$self->do_enquire($query, $opts);
 }
 
 sub get_subject_path {
 	my ($self, $path, $opts) = @_;
-	my $query = $self->qp->parse_query("path:".mid_compressed($path), 0);
-	$self->do_enquire($query, $opts);
-}
-
-# given a message ID, get followups to a message
-sub get_followups {
-	my ($self, $mid, $opts) = @_;
-	$mid = mid_clean($mid);
-	$mid = mid_compressed($mid);
-	my $qp = $self->qp;
-	my $irt = $qp->parse_query("inreplyto:$mid", 0);
-	my $ref = $qp->parse_query("references:$mid", 0);
-	my $query = Search::Xapian::Query->new(OP_OR, $irt, $ref);
-	$self->do_enquire($query, $opts);
+	my $q = Search::Xapian::Query->new(xpfx("path").mid_compress($path));
+	$self->do_enquire($q, $opts);
 }
 
 sub get_thread {
@@ -104,9 +102,9 @@ sub get_thread {
 	my $smsg = eval { $self->lookup_message($mid) };
 
 	return { total => 0, msgs => [] } unless $smsg;
-	my $qp = $self->qp;
-	my $qtid = $qp->parse_query('thread:'.$smsg->thread_id, 0);
-	my $qsub = $qp->parse_query('path:'.mid_compressed($smsg->path), 0);
+	my $qtid = Search::Xapian::Query->new(xpfx('thread').$smsg->thread_id);
+	my $path = mid_compress($smsg->path);
+	my $qsub = Search::Xapian::Query->new(xpfx('path').$path);
 	my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub);
 	$self->do_enquire($query, $opts);
 }
@@ -116,18 +114,22 @@ sub get_thread {
 sub do_enquire {
 	my ($self, $query, $opts) = @_;
 	my $enquire = $self->enquire;
-
-	$query = Search::Xapian::Query->new(OP_AND, $query, $mail_query);
+	if (defined $query) {
+		$query = Search::Xapian::Query->new(OP_AND,$query,$mail_query);
+	} else {
+		$query = $mail_query;
+	}
 	$enquire->set_query($query);
 	if ($opts->{relevance}) {
-		$enquire->set_sort_by_relevance_then_value(TS, 0);
+		$enquire->set_sort_by_relevance_then_value(TS, 1);
 	} else {
-		$enquire->set_sort_by_value(TS, 0);
+		$enquire->set_sort_by_value_then_relevance(TS, 1);
 	}
 	$opts ||= {};
 	my $offset = $opts->{offset} || 0;
 	my $limit = $opts->{limit} || 50;
 	my $mset = $enquire->get_mset($offset, $limit);
+	return $mset if $opts->{mset};
 	my @msgs = map {
 		PublicInbox::SearchMsg->load_doc($_->get_document);
 	} $mset->items;
@@ -176,7 +178,6 @@ sub date_range_processor {
 sub lookup_message {
 	my ($self, $mid) = @_;
 	$mid = mid_clean($mid);
-	$mid = mid_compressed($mid);
 
 	my $doc_id = $self->find_unique_doc_id('mid', $mid);
 	my $smsg;
@@ -237,9 +238,44 @@ sub subject_normalized {
 	$subj;
 }
 
+# for doc data
+sub subject_summary {
+	my $subj = pop;
+	my $max = 68;
+	if (length($subj) > $max) {
+		my @subj = split(/\s+/, $subj);
+		$subj = '';
+		my $l;
+
+		while ($l = shift @subj) {
+			my $new = $subj . $l . ' ';
+			last if length($new) >= $max;
+			$subj = $new;
+		}
+		if ($subj ne '') {
+			my $r = scalar @subj ? ' ...' : '';
+			$subj =~ s/ \z/$r/s;
+		} else {
+			# subject has one REALLY long word, and NOT spam? wtf
+			@subj = ($l =~ /\A(.{1,72})/);
+			$subj = $subj[0] . ' ...';
+		}
+	}
+	$subj;
+}
+
 sub enquire {
 	my ($self) = @_;
 	$self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb});
 }
 
+sub mid_prefix {
+	my ($self, $mpfx) = @_;
+	my $query = eval { $self->qp->parse_query("m:$mpfx", FLAG_PARTIAL) };
+	return if $@;
+	my $res = $self->do_enquire($query, { relevance => 1 });
+	return unless $res->{total};
+	[ map { $_->mid } @{$res->{msgs}} ];
+}
+
 1;