From: Eric Wong (Contractor, The Linux Foundation) Date: Thu, 5 Apr 2018 21:45:28 +0000 (+0000) Subject: search: index and allow searching by date-time X-Git-Tag: v1.1.0-pre1~49 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=936670cb83971bf250571a7dd9b0a0a0b33babd8 search: index and allow searching by date-time Dscho found this useful for finding matching git commits based on AuthorDate in git. Add it to the overview DB format, too; so in the future we can support v2 repos without Xapian. https://public-inbox.org/git/nycvar.QRO.7.76.6.1804041821420.55@ZVAVAG-6OXH6DA.rhebcr.pbec.zvpebfbsg.pbz https://public-inbox.org/git/alpine.DEB.2.20.1702041206130.3496@virtualbox/ --- diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 5c20f1f0..28e4aa9c 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -202,7 +202,7 @@ sub link_refs { sub add_over { my ($self, $values) = @_; - my ($ts, $num, $mids, $refs, $xpath, $ddd) = @$values; + my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; my $old_tid; my $vivified = 0; @@ -232,11 +232,11 @@ sub add_over { my $sid = $self->sid($xpath); my $dbh = $self->{dbh}; my $sth = $dbh->prepare_cached(<<''); -INSERT INTO over (num, tid, sid, ts, ddd) -VALUES (?,?,?,?,?) +INSERT INTO over (num, tid, sid, ts, ds, ddd) +VALUES (?,?,?,?,?,?) 
my $n = 0; - my @v = ($num, $tid, $sid, $ts); + my @v = ($num, $tid, $sid, $ts, $ds); foreach (@v) { $sth->bind_param(++$n, $_) } $sth->bind_param(++$n, $ddd, SQL_BLOB); $sth->execute; @@ -274,6 +274,7 @@ CREATE TABLE IF NOT EXISTS over ( tid INTEGER NOT NULL, sid INTEGER, ts INTEGER, + ds INTEGER, ddd VARBINARY, /* doc-data-deflated */ UNIQUE (num) ) @@ -281,6 +282,7 @@ CREATE TABLE IF NOT EXISTS over ( $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); $dbh->do('CREATE INDEX IF NOT EXISTS idx_sid ON over (sid)'); $dbh->do('CREATE INDEX IF NOT EXISTS idx_ts ON over (ts)'); + $dbh->do('CREATE INDEX IF NOT EXISTS idx_ds ON over (ds)'); $dbh->do(<<''); CREATE TABLE IF NOT EXISTS counter ( diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 9eb07284..34ebd1a6 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -9,7 +9,8 @@ use warnings; # values for searching use constant TS => 0; # Received: header in Unix time -use constant YYYYMMDD => 1; # for searching in the WWW UI +use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI +use constant DT => 2; # Date: YYYYMMDDHHMMSS use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; @@ -88,6 +89,9 @@ our @HELP = ( date range as YYYYMMDD e.g. d:19931002..20101002 Open-ended ranges such as d:19931002.. 
and d:..20101002 are also supported +EOF + 'dt:' => <<EOF, +date-time range as YYYYMMDDhhmmss EOF 'b:' => 'match within message body, including text attachments', 'nq:' => 'match non-quoted text within message body', @@ -258,6 +262,8 @@ sub qp { $qp->set_stemming_strategy(STEM_SOME); $qp->add_valuerangeprocessor( Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); + $qp->add_valuerangeprocessor( + Search::Xapian::NumberValueRangeProcessor->new(DT, 'dt:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $prefix); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 06bce70a..42562631 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -282,9 +282,14 @@ sub add_message { $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!; defined $bytes or $bytes = length($mime->as_string); $smsg->{bytes} = $bytes; + add_val($doc, PublicInbox::Search::TS(), $smsg->ts); - my $yyyymmdd = strftime('%Y%m%d', gmtime($smsg->ds)); - add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); + my @ds = gmtime($smsg->ds); + my $yyyymmdd = strftime('%Y%m%d', @ds); + add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); + my $dt = strftime('%Y%m%d%H%M%S', @ds); + add_val($doc, PublicInbox::Search::DT(), $dt); + my @vals = ($smsg->{ts}, $smsg->{ds}); my $tg = $self->term_generator; @@ -355,7 +360,7 @@ sub add_message { utf8::encode($data); $data = compress($data); - my @vals = ($smsg->ts, $num, $mids, $refs, $xpath, $data); + push @vals, $num, $mids, $refs, $xpath, $data; $self->{over}->add_over(\@vals); $doc->add_boolean_term('Q' . $_) foreach @$mids; $doc->add_boolean_term('XNUM' . 
$num) if defined $num; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index d43853a0..3278802b 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -9,6 +9,7 @@ use warnings; use PublicInbox::MID qw/mid_clean mid_mime/; use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +use Time::Local qw(timegm); sub new { my ($class, $mime) = @_; @@ -44,7 +45,6 @@ sub to_doc_data { $self->cc, $oid, $mid0, - $self->ds, $self->{bytes}, $self->{lines} ); @@ -65,7 +65,6 @@ sub load_from_data ($$) { $self->{blob}, $self->{mid}, - $self->{ds}, $self->{bytes}, $self->{lines} ) = split(/\n/, $_[1]); @@ -75,7 +74,10 @@ sub load_expand { my ($self) = @_; my $doc = $self->{doc}; my $data = $doc->get_data or return; - $self->{ts} = get_val($doc, &PublicInbox::Search::TS); + $self->{ts} = get_val($doc, PublicInbox::Search::TS()); + my $dt = get_val($doc, PublicInbox::Search::DT()); + my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); + $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); utf8::decode($data); load_from_data($self, $data); $self; diff --git a/t/over.t b/t/over.t index 2a7e8d1d..c0d9d5e5 100644 --- a/t/over.t +++ b/t/over.t @@ -38,21 +38,21 @@ is($y, $x + 1, 'integer tid for ghost increases'); my $ddd = compress(''); foreach my $s ('', undef) { - $over->add_over([0, 98, [ 'a' ], [], $s, $ddd]); - $over->add_over([0, 99, [ 'b' ], [], $s, $ddd]); + $over->add_over([0, 0, 98, [ 'a' ], [], $s, $ddd]); + $over->add_over([0, 0, 99, [ 'b' ], [], $s, $ddd]); my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; is_deeply([98], $msgs, 'messages not linked by empty subject'); } -$over->add_over([0, 98, [ 'a' ], [], 's', $ddd]); -$over->add_over([0, 99, [ 'b' ], [], 's', $ddd]); +$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); +$over->add_over([0, 0, 99, [ 'b' ], [], 's', $ddd]); foreach my $mid (qw(a b)) { my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; 
is_deeply([98, 99], $msgs, 'linked messages by subject'); } -$over->add_over([0, 98, [ 'a' ], [], 's', $ddd]); -$over->add_over([0, 99, [ 'b' ], ['a'], 'diff', $ddd]); +$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); +$over->add_over([0, 0, 99, [ 'b' ], ['a'], 'diff', $ddd]); foreach my $mid (qw(a b)) { my $msgs = [ map { $_->{num} } @{$over->get_thread($mid)} ]; is_deeply([98, 99], $msgs, "linked messages by Message-ID: <$mid>"); diff --git a/t/search.t b/t/search.t index c9bef718..2f7b795e 100644 --- a/t/search.t +++ b/t/search.t @@ -170,6 +170,13 @@ sub filter_mids { # body $res = $ro->query('goodbye'); is($res->[0]->mid, 'last@s', 'got goodbye message body'); + + # datestamp + $res = $ro->query('dt:20101002000001..20101002000001'); + @res = filter_mids($res); + is_deeply(\@res, ['ghost-message@s'], 'exact Date: match works'); + $res = $ro->query('dt:20101002000002..20101002000002'); + is_deeply($res, [], 'exact Date: match down to the second'); } # long message-id