X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSearchIdx.pm;h=30d3fe926a14cef02834bfe3511ac37022e14bee;hb=970eb1fd83b93c790d2faed6bf64a97d6d5fe126;hp=a259d86f2cff7a5a17c67fe47b80db0bb596e582;hpb=221d916965ccd55e0aa3e83819f8a05df1a5dcb6;p=public-inbox.git diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index a259d86f..30d3fe92 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,158 +4,232 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; -use Email::MIME; +use Fcntl qw(:flock :DEFAULT); +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); use PublicInbox::MID qw/mid_clean id_compress mid_mime/; use PublicInbox::MsgIter; +use Carp qw(croak); +use POSIX qw(strftime); require PublicInbox::Git; -*xpfx = *PublicInbox::Search::xpfx; -use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian use constant { + MAX_MID_SIZE => 244, # max term size - 1 in Xapian PERM_UMASK => 0, OLD_PERM_GROUP => 1, OLD_PERM_EVERYBODY => 2, PERM_GROUP => 0660, PERM_EVERYBODY => 0664, + BATCH_BYTES => 1_000_000, }; sub new { - my ($class, $git_dir, $writable) = @_; - my $dir = $class->xdir($git_dir); + my ($class, $inbox, $creat) = @_; + my $git_dir = $inbox; + my $altid; + if (ref $inbox) { + $git_dir = $inbox->{mainrepo}; + $altid = $inbox->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { + PublicInbox::AltId->new($inbox, $_); + } @$altid ]; + } + } require Search::Xapian::WritableDatabase; - my $flag = Search::Xapian::DB_OPEN; - my $self = bless { git_dir => $git_dir }, $class; + my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; my $perm = $self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; - $self->{xdb} = $self->with_umask(sub { - if ($writable == 1) { - require File::Path; - File::Path::mkpath($dir); - $flag = Search::Xapian::DB_CREATE_OR_OPEN; - } - Search::Xapian::WritableDatabase->new($dir, $flag); - }); + $self->{lock_path} = "$git_dir/ssoma.lock"; + $self->{git} = PublicInbox::Git->new($git_dir); + $self->{creat} = ($creat || 0) == 1; $self; } -sub add_val { +sub _xdb_release { + my ($self) = @_; + my $xdb = delete $self->{xdb} or croak 'not acquired'; + $xdb->close; + _lock_release($self) if $self->{creat}; + undef; +} + +sub _xdb_acquire { + my ($self) = @_; + croak 'already acquired' if $self->{xdb}; + my $dir = PublicInbox::Search->xdir($self->{git_dir}); + my $flag = Search::Xapian::DB_OPEN; + if ($self->{creat}) { + require File::Path; + _lock_acquire($self); + File::Path::mkpath($dir); + $flag = Search::Xapian::DB_CREATE_OR_OPEN; + } + $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); +} + +# we only acquire the flock if creating or reindexing; +# PublicInbox::Import already has the lock on its own. +sub _lock_acquire { + my ($self) = @_; + croak 'already locked' if $self->{lockfh}; + sysopen(my $lockfh, $self->{lock_path}, O_WRONLY|O_CREAT) or + die "failed to open lock $self->{lock_path}: $!\n"; + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + $self->{lockfh} = $lockfh; +} + +sub _lock_release { + my ($self) = @_; + my $lockfh = delete $self->{lockfh} or croak 'not locked'; + flock($lockfh, LOCK_UN) or die "unlock failed: $!\n"; + close $lockfh or die "close failed: $!\n"; +} + +sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } +sub add_values ($$$) { + my ($smsg, $bytes, $num) = @_; + + my $ts = $smsg->ts; + my $doc = $smsg->{doc}; + add_val($doc, &PublicInbox::Search::TS, $ts); + + defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $smsg->{mime}->body_raw =~ tr!\n!\n!); + + my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); +} + +sub index_users ($$) { + my ($tg, $smsg) = @_; + + my $from = $smsg->from; + my $to = $smsg->to; + my $cc = $smsg->cc; + + $tg->index_text($from, 1, 'A'); # A - author + $tg->increase_termpos; + $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->increase_termpos; + $tg->index_text($cc, 1, 'XCC') if $cc ne ''; + $tg->increase_termpos; +} + +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); + $tg->increase_termpos; +} + sub add_message { - my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; - my $doc_id; + my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $was_ghost = 0; - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; my $smsg = $self->lookup_message($mid); - my $doc; - if ($smsg) { - $smsg->ensure_metadata; # convert a ghost to a regular message # it will also clobber any existing regular message - $smsg->mime($mime); - $doc = $smsg->{doc}; - - my $type = xpfx('type'); - eval { - $doc->remove_term($type . 'ghost'); - $was_ghost = 1; - }; - - # probably does not exist: - eval { $doc->remove_term($type . 'mail') }; - $doc->add_term($type . 'mail'); - } else { - $smsg = PublicInbox::SearchMsg->new($mime); - $doc = $smsg->{doc}; - $doc->add_term(xpfx('mid') . $mid); + $doc_id = $smsg->{doc_id}; + $old_tid = $smsg->thread_id; } + $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + $doc->add_term('Q' . $mid); my $subj = $smsg->subject; - if ($subj ne '') { my $path = $self->subject_path($subj); - $doc->add_term(xpfx('path') . id_compress($path)); + $doc->add_term('XPATH' . id_compress($path)); } - add_val($doc, &PublicInbox::Search::TS, $smsg->ts); - - defined($num) and - add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and - add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $mime->body_raw =~ tr!\n!\n!); + add_values($smsg, $bytes, $num); my $tg = $self->term_generator; $tg->set_document($doc); $tg->index_text($subj, 1, 'S') if $subj; $tg->increase_termpos; - $tg->index_text($subj) if $subj; - $tg->increase_termpos; - $tg->index_text($smsg->from); - $tg->increase_termpos; + index_users($tg, $smsg); msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || $ct_msg; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { - if ($l =~ /^\s*>/) { + if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - $tg->index_text(join("\n", @quot), 0); - @quot = (); - $tg->increase_termpos; - } - if (@orig) { - $tg->index_text(join("\n", @orig)); - @orig = (); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); - if ($was_ghost) { - $doc_id = $smsg->doc_id; - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); + link_message($self, $smsg, $old_tid); + $tg->index_text($mid, 1, 'XMID'); + $doc->set_data($smsg->to_doc_data($blob)); + + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . $id); + } + } + if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } }; @@ -211,28 +285,17 @@ sub next_thread_id { } sub link_message { - my ($self, $smsg, $is_ghost) = @_; - - if ($is_ghost) { - $smsg->ensure_metadata; - } else { - $self->link_message_to_parents($smsg); - } -} - -sub link_message_to_parents { - my ($self, $smsg) = @_; + my ($self, $smsg, $old_tid) = @_; my $doc = $smsg->{doc}; my $mid = $smsg->mid; - my $mime = $smsg->mime; + my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; - my $refs = $hdr->header_raw('References'); - my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - if (my $irt = $hdr->header_raw('In-Reply-To')) { - # last References should be $irt - # we will de-dupe later - push @refs, mid_clean($irt); - } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + my @refs = ($hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + @refs = ((join(' ', @refs)) =~ /<([^>]+)>/g); my $tid; if (@refs) { @@ -250,6 +313,7 @@ sub link_message_to_parents { push @refs, $ref; } } + if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; @@ -257,151 +321,217 @@ sub link_message_to_parents { # but we can never trust clients to do the right thing my $ref = shift @refs; $tid = $self->_resolve_mid_to_tid($ref); + $self->merge_threads($tid, $old_tid) if defined $old_tid; # the rest of the refs should point to this tid: foreach $ref (@refs) { my $ptid = $self->_resolve_mid_to_tid($ref); - if ($tid ne $ptid) { - $self->merge_threads($tid, $ptid); - } + merge_threads($self, $tid, $ptid); } } else { - $tid = $self->next_thread_id; + $tid = defined $old_tid ? $old_tid : $self->next_thread_id; } - $doc->add_term(xpfx('thread') . $tid); + $doc->add_term('G' . $tid); } sub index_blob { - my ($self, $git, $mime, $bytes, $num) = @_; - $self->add_message($mime, $bytes, $num); + my ($self, $mime, $bytes, $num, $blob) = @_; + $self->add_message($mime, $bytes, $num, $blob); } sub unindex_blob { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; my $mid = eval { mid_clean(mid_mime($mime)) }; $self->remove_message($mid) if defined $mid; } sub index_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_insert(mid_clean(mid_mime($mime))); } sub unindex_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); } sub index_mm2 { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $mime, $bytes, $blob) = @_; my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_mm2 { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); - unindex_blob($self, $git, $mime); + unindex_blob($self, $mime); } sub index_both { - my ($self, $git, $mime, $bytes) = @_; - my $num = index_mm($self, $git, $mime); - index_blob($self, $git, $mime, $bytes, $num); + my ($self, $mime, $bytes, $blob) = @_; + my $num = index_mm($self, $mime); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_both { - my ($self, $git, $mime) = @_; - unindex_blob($self, $git, $mime); - unindex_mm($self, $git, $mime); + my ($self, $mime) = @_; + unindex_blob($self, $mime); + unindex_mm($self, $mime); } sub do_cat_mail { my ($git, $blob, $sizeref) = @_; my $mime = eval { my $str = $git->cat_file($blob, $sizeref); - Email::MIME->new($str); + # fixup bugs from import: + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::MIME->new($str); }; $@ ? undef : $mime; } sub index_sync { - my ($self, $head) = @_; - $self->with_umask(sub { $self->_index_sync($head) }); + my ($self, $opts) = @_; + with_umask($self, sub { $self->_index_sync($opts) }); +} + +sub batch_adjust ($$$$) { + my ($max, $bytes, $batch_cb, $latest) = @_; + $$max -= $bytes; + if ($$max <= 0) { + $$max = BATCH_BYTES; + $batch_cb->($latest, 1); + } } sub rlog { - my ($self, $range, $add_cb, $del_cb) = @_; + my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; - my $git = PublicInbox::Git->new($self->{git_dir}); - my $log = $git->popen(qw/log --reverse --no-notes --no-color - --raw -r --no-abbrev/, $range); + my $git = $self->{git}; my $latest; my $bytes; + my $max = BATCH_BYTES; local $/ = "\n"; - while (defined(my $line = <$log>)) { + my $line; + while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1, \$bytes) or next; - $add_cb->($self, $git, $mime, $bytes); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); + $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { - my $mime = do_cat_mail($git, $1) or next; - $del_cb->($self, $git, $mime); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); + $del_cb->($self, $mime); } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; } } - $latest; + $batch_cb->($latest, 0); } -# indexes all unindexed messages -sub _index_sync { - my ($self, $head) = @_; - my $db = $self->{xdb}; - $head ||= 'HEAD'; - my $mm = $self->{mm} = eval { +sub _msgmap_init { + my ($self) = @_; + $self->{mm} = eval { require PublicInbox::Msgmap; PublicInbox::Msgmap->new($self->{git_dir}, 1); }; +} + +sub _git_log { + my ($self, $range) = @_; + $self->{git}->popen(qw/log --reverse --no-notes --no-color + --raw -r --no-abbrev/, $range); +} + +# indexes all unindexed messages +sub _index_sync { + my ($self, $opts) = @_; + my $tip = $opts->{ref} || 'HEAD'; + my $reindex = $opts->{reindex}; + my ($mkey, $last_commit, $lx, $xlog); + $self->{git}->batch_prepare; + my $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + do { + $xlog = undef; + $mkey = 'last_commit'; + $last_commit = $xdb->get_metadata('last_commit'); + $lx = $last_commit; + if ($reindex) { + $lx = ''; + $mkey = undef if $last_commit ne ''; + } + $xdb->cancel_transaction; + $xdb = _xdb_release($self); + + # ensure we leak no FDs to "git log" + my $range = $lx eq '' ? $tip : "$lx..$tip"; + $xlog = _git_log($self, $range); + + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } while ($xdb->get_metadata('last_commit') ne $last_commit); + + my $mm = _msgmap_init($self); + my $dbh = $mm->{dbh} if $mm; + my $mm_only; + my $cb = sub { + my ($commit, $more) = @_; + if ($dbh) { + $mm->last_commit($commit) if $commit; + $dbh->commit; + } + if (!$mm_only) { + $xdb->set_metadata($mkey, $commit) if $mkey && $commit; + $xdb->commit_transaction; + $xdb = _xdb_release($self); + } + # let another process do some work... < + if ($more) { + if (!$mm_only) { + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } + $dbh->begin_work if $dbh; + } + }; - $db->begin_transaction; - my $lx = $db->get_metadata('last_commit'); - my $range = $lx eq '' ? $head : "$lx..$head"; if ($mm) { - $mm->{dbh}->begin_work; + $dbh->begin_work; my $lm = $mm->last_commit || ''; if ($lm eq $lx) { # Common case is the indexes are synced, # we only need to run git-log once: - $lx = $self->rlog($range, *index_both, *unindex_both); - $mm->{dbh}->commit; - if (defined $lx) { - $db->set_metadata('last_commit', $lx); - $mm->last_commit($lx); - } + rlog($self, $xlog, *index_both, *unindex_both, $cb); } else { - # dumb case, msgmap and xapian are out-of-sync - # do not care for performance: - my $r = $lm eq '' ? $head : "$lm..$head"; - $lm = $self->rlog($r, *index_mm, *unindex_mm); - $mm->{dbh}->commit; - $mm->last_commit($lm) if defined $lm; - - $lx = $self->rlog($range, *index_mm2, *unindex_mm2); - $db->set_metadata('last_commit', $lx) if defined $lx; + # Uncommon case, msgmap and xapian are out-of-sync + # do not care for performance (but git is fast :>) + # This happens if we have to reindex Xapian since + # msgmap is a frozen format and our Xapian format + # is evolving. + my $r = $lm eq '' ? $tip : "$lm..$tip"; + + # first, ensure msgmap is up-to-date: + my $mkey_prev = $mkey; + $mkey = undef; # ignore xapian, for now + my $mlog = _git_log($self, $r); + $mm_only = 1; + rlog($self, $mlog, *index_mm, *unindex_mm, $cb); + $mm_only = $mlog = undef; + + # now deal with Xapian + $mkey = $mkey_prev; + $dbh = undef; + rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); } } else { # user didn't install DBD::SQLite and DBI - $lx = $self->rlog($range, *index_blob, *unindex_blob); - $db->set_metadata('last_commit', $lx) if defined $lx; - } - if ($@) { - $db->cancel_transaction; - $mm->{dbh}->rollback if $mm; - } else { - $db->commit_transaction; + rlog($self, $xlog, *index_blob, *unindex_blob, $cb); } } @@ -414,17 +544,15 @@ sub _resolve_mid_to_tid { } sub create_ghost { - my ($self, $mid, $tid) = @_; - - $tid = $self->next_thread_id unless defined $tid; + my ($self, $mid) = @_; + my $tid = $self->next_thread_id; my $doc = Search::Xapian::Document->new; - $doc->add_term(xpfx('mid') . $mid); - $doc->add_term(xpfx('thread') . $tid); - $doc->add_term(xpfx('type') . 'ghost'); + $doc->add_term('Q' . $mid); + $doc->add_term('G' . $tid); + $doc->add_term('T' . 'ghost'); my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $self->link_message($smsg, 1); $self->{xdb}->add_document($doc); $smsg; @@ -432,15 +560,15 @@ sub create_ghost { sub merge_threads { my ($self, $winner_tid, $loser_tid) = @_; - my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); - my $thread_pfx = xpfx('thread'); + return if $winner_tid == $loser_tid; + my ($head, $tail) = $self->find_doc_ids('G' . $loser_tid); my $db = $self->{xdb}; for (; $head != $tail; $head->inc) { my $docid = $head->get_docid; my $doc = $db->get_document($docid); - $doc->remove_term($thread_pfx . $loser_tid); - $doc->add_term($thread_pfx . $winner_tid); + $doc->remove_term('G' . $loser_tid); + $doc->add_term('G' . $winner_tid); $db->replace_document($docid, $doc); } } @@ -497,8 +625,14 @@ sub with_umask { my $rv = eval { $cb->() }; my $err = $@; umask $old; - die $err if $@; + die $err if $err; $rv; } +sub DESTROY { + # order matters for unlocking + $_[0]->{xdb} = undef; + $_[0]->{lockfh} = undef; +} + 1;