X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=blobdiff_plain;f=lib%2FPublicInbox%2FSearch.pm;h=bc2b69854a8765add7c4956f0f7a3c23d9edf0a0;hp=c8e297f42634a842c3b9b3ba033d21cd71353d9e;hb=de243560e2caa1d19bcbf518edfaf8b016161245;hpb=21ab8f3cc530d9483091f32c0865ba1ce867cef8 diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index c8e297f4..bc2b6985 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -16,7 +16,7 @@ use constant YYYYMMDD => 4; # for searching in the WWW UI use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; -use Email::MIME; +use PublicInbox::MIME; use PublicInbox::MID qw/mid_clean id_compress/; # This is English-only, everything else is non-standard and may be confused as @@ -38,7 +38,10 @@ use constant { # 9 - disable Message-ID compression (SHA-1) # 10 - optimize doc for NNTP overviews # 11 - merge threads when vivifying ghosts - SCHEMA_VERSION => 11, + # 12 - change YYYYMMDD value column to numeric + # 13 - fix threading for empty References/In-Reply-To + # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0) + SCHEMA_VERSION => 13, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector @@ -66,39 +69,40 @@ my %prob_prefix = ( tc => 'XTO XCC', c => 'XCC', tcf => 'XTO XCC A', + a => 'XTO XCC A', b => 'XNQ XQUOT', bs => 'XNQ XQUOT S', + n => 'XFN', - # n.b.: leaving out "a:" alias for "tcf:" even though - # mairix supports it. It is only mentioned in passing in mairix(1) - # and the extra two letters are not significantly longer. q => 'XQUOT', nq => 'XNQ', # default: - '' => 'XMID S A XNQ XQUOT', + '' => 'XMID S A XNQ XQUOT XFN', ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian our @HELP = ( - 's:' => < 'match within Subject e.g. s:"a quick brown fox"', 'd:' => < 'match within message body, including text attachments', + 'nq:' => 'match non-quoted text within message body', + 'q:' => 'match quoted text within message body', + 'n:' => 'match filename of attachment(s)', + 't:' => 'match within the To header', + 'c:' => 'match within the Cc header', + 'f:' => 'match within the From header', + 'a:' => 'match within the To, Cc, and From headers', + 'tc:' => 'match within the To and Cc headers', + 'bs:' => 'match within the Subject and body', ); -# TODO: (from mairix, some of these are maybe) -# b (body), f (From:), c (Cc:), n (attachment), t (To:) -# tc (To:+Cc:), bs (body + Subject), tcf (To: +Cc: +From:) -# -# Non-mairix: +chomp @HELP; +# TODO: # df (filenames from diff) -# nq (non-quoted body) # da (diff a/ removed lines) # db (diff b/ added lines) @@ -106,12 +110,6 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); sub xpfx { $all_pfx{$_[0]} } -our %PFX2TERM_RMAP; -my %meta_pfx = (mid => 1, thread => 1, path => 1); -while (my ($k, $v) = each %all_pfx) { - $PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k}; -} - my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail'); sub xdir { @@ -156,25 +154,35 @@ sub get_thread { } $opts ||= {}; $opts->{limit} ||= 1000; + + # always sort threads by timestamp, this makes life easier + # for the threading algorithm (in SearchThread.pm) + $opts->{asc} = 1; + _do_enquire($self, $qtid, $opts); } -sub _do_enquire { - my ($self, $query, $opts) = @_; +sub retry_reopen { + my ($self, $cb) = @_; my $ret; for (1..10) { - eval { $ret = _enquire_once($self, $query, $opts) }; + eval { $ret = $cb->() }; return $ret unless $@; # Exception: The revision being read has been discarded - # you should call Xapian::Database::reopen() - if (index($@, 'Xapian::Database::reopen') >= 0) { + if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { reopen($self); } else { - die $@; + die; } } } +sub _do_enquire { + my ($self, $query, $opts) = @_; + retry_reopen($self, sub { _enquire_once($self, $query, $opts) }); +} + sub _enquire_once { my ($self, $query, $opts) = @_; my $enquire = $self->enquire; @@ -221,7 +229,7 @@ sub qp { $qp->set_stemmer($self->stemmer); $qp->set_stemming_strategy(STEM_SOME); $qp->add_valuerangeprocessor( - Search::Xapian::StringValueRangeProcessor->new(YYYYMMDD, 'd:')); + Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $prefix); @@ -236,11 +244,12 @@ sub qp { /\Aserial:(\w+):/ or next; my $pfx = $1; push @$user_pfx, "$pfx:", < XGMANE $qp->add_boolean_prefix($pfx, 'X'.uc($pfx)); } + chomp @$user_pfx; } while (my ($name, $prefix) = each %prob_prefix) { @@ -275,15 +284,17 @@ sub lookup_message { # raises on error: my $doc = $self->{xdb}->get_document($doc_id); $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $smsg->doc_id($doc_id); + $smsg->{doc_id} = $doc_id; } $smsg; } sub lookup_mail { # no ghosts! my ($self, $mid) = @_; - my $smsg = lookup_message($self, $mid) or return; - PublicInbox::SearchMsg->load_doc($smsg->{doc}); + retry_reopen($self, sub { + my $smsg = lookup_message($self, $mid) or return; + PublicInbox::SearchMsg->load_doc($smsg->{doc}); + }); } sub find_unique_doc_id { @@ -317,6 +328,7 @@ sub find_doc_ids_for_term { } # normalize subjects so they are suitable as pathnames for URLs +# XXX: consider for removal sub subject_path { my $subj = pop; $subj = subject_normalized($subj); @@ -334,32 +346,6 @@ sub subject_normalized { $subj; } -# for doc data -sub subject_summary { - my $subj = pop; - my $max = 68; - if (length($subj) > $max) { - my @subj = split(/\s+/, $subj); - $subj = ''; - my $l; - - while ($l = shift @subj) { - my $new = $subj . $l . ' '; - last if length($new) >= $max; - $subj = $new; - } - if ($subj ne '') { - my $r = scalar @subj ? ' ...' : ''; - $subj =~ s/ \z/$r/s; - } else { - # subject has one REALLY long word, and NOT spam? wtf - @subj = ($l =~ /\A(.{1,72})/); - $subj = $subj[0] . ' ...'; - } - } - $subj; -} - sub enquire { my ($self) = @_; $self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb});