use Fcntl qw(:flock :DEFAULT);
use PublicInbox::MIME;
use base qw(PublicInbox::Search);
-use PublicInbox::MID qw/mid_clean id_compress mid_mime/;
+use PublicInbox::MID qw/mid_clean id_compress mid_mime mids references/;
use PublicInbox::MsgIter;
use Carp qw(croak);
use POSIX qw(strftime);
require PublicInbox::Git;
use constant {
- MAX_MID_SIZE => 244, # max term size - 1 in Xapian
+ MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q')
PERM_UMASK => 0,
OLD_PERM_GROUP => 1,
OLD_PERM_EVERYBODY => 2,
$tg->increase_termpos;
}
-sub index_text_inc ($$$) {
- my ($tg, $text, $pfx) = @_;
+# Index one piece of diff text ($text) under the term prefix $pfx.
+#
+# $xnq is an array ref of pending lines that were not recognized as
+# prefixed diff content. If it is non-empty, those lines are joined
+# with "\n", indexed under 'XNQ' first, and the array is emptied, so
+# the pending text is indexed ahead of $text.
+# increase_termpos is called after each index_text call.
+sub index_diff_inc ($$$$) {
+ my ($tg, $text, $pfx, $xnq) = @_;
+ if (@$xnq) {
+ $tg->index_text(join("\n", @$xnq), 1, 'XNQ');
+ $tg->increase_termpos;
+ @$xnq = ();
+ }
$tg->index_text($text, 1, $pfx);
$tg->increase_termpos;
}
sub index_old_diff_fn {
- my ($tg, $seen, $fa, $fb) = @_;
+ my ($tg, $seen, $fa, $fb, $xnq) = @_;
# no renames or space support for traditional diffs,
# find the number of leading common paths to strip:
$fa = join('/', @fa);
$fb = join('/', @fb);
if ($fa eq $fb) {
- index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++;
+ unless ($seen->{$fa}++) {
+ index_diff_inc($tg, $fa, 'XDFN', $xnq);
+ }
return 1;
}
shift @fa;
my ($tg, $lines, $doc) = @_;
my %seen;
my $in_diff;
+ my @xnq;
+ my $xnq = \@xnq;
foreach (@$lines) {
if ($in_diff && s/^ //) { # diff context
- index_text_inc($tg, $_, 'XDFCTX');
+ index_diff_inc($tg, $_, 'XDFCTX', $xnq);
} elsif (/^-- $/) { # email signature begins
$in_diff = undef;
} elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) {
my ($fa, $fb) = ($1, $2);
my $fn = (split('/', git_unquote($fa), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$fn = (split('/', git_unquote($fb), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
# traditional diff:
} elsif (m/^diff -(.+) (\S+) (\S+)$/) {
my ($opt, $fa, $fb) = ($1, $2, $3);
+ push @xnq, $_;
# only support unified:
next unless $opt =~ /[uU]/;
- $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb);
+ $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb,
+ $xnq);
} elsif (m!^--- ("?a/.+)!) {
my $fn = (split('/', git_unquote($1), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (m!^\+\+\+ ("?b/.+)!) {
my $fn = (split('/', git_unquote($1), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (/^--- (\S+)/) {
$in_diff = $1;
+ push @xnq, $_;
} elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
- $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1);
+ $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1,
+ $xnq);
} elsif ($in_diff && s/^\+//) { # diff added
- index_text_inc($tg, $_, 'XDFB');
+ index_diff_inc($tg, $_, 'XDFB', $xnq);
} elsif ($in_diff && s/^-//) { # diff removed
- index_text_inc($tg, $_, 'XDFA');
+ index_diff_inc($tg, $_, 'XDFA', $xnq);
} elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) {
my ($ba, $bb) = ($1, $2);
index_git_blob_id($doc, 'XDFPRE', $ba);
# traditional diff w/o -p
} elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) {
# hunk header context
- index_text_inc($tg, $1, 'XDFHH');
+ index_diff_inc($tg, $1, 'XDFHH', $xnq);
# ignore the following lines:
- } elsif (/^(?:dis)similarity index/) {
- } elsif (/^(?:old|new) mode/) {
- } elsif (/^(?:deleted|new) file mode/) {
- } elsif (/^(?:copy|rename) (?:from|to) /) {
- } elsif (/^(?:dis)?similarity index /) {
- } elsif (/^\\ No newline at end of file/) {
- } elsif (/^Binary files .* differ/) {
+ } elsif (/^(?:dis)similarity index/ ||
+ /^(?:old|new) mode/ ||
+ /^(?:deleted|new) file mode/ ||
+ /^(?:copy|rename) (?:from|to) / ||
+ /^(?:dis)?similarity index / ||
+ /^\\ No newline at end of file/ ||
+ /^Binary files .* differ/) {
+ push @xnq, $_;
} elsif ($_ eq '') {
$in_diff = undef;
} else {
+ push @xnq, $_;
warn "non-diff line: $_\n" if DEBUG && $_ ne '';
$in_diff = undef;
}
}
+
+ $tg->index_text(join("\n", @xnq), 1, 'XNQ');
+ $tg->increase_termpos;
}
+# Index a run of body lines (@$lines) and then empty the array.
+#
+# When $doc is true: if any line starts with "diff ", "--- " or "+++ "
+# the lines are handed to index_diff; otherwise the joined text is
+# indexed under 'XNQ' with 1 as the second index_text argument.
+# When $doc is false the joined text is indexed under 'XQUOT' with 0
+# as the second argument (presumably the wdf increment of a Xapian
+# TermGenerator -- confirm against the type of $tg).
sub index_body ($$$) {
my ($tg, $lines, $doc) = @_;
my $txt = join("\n", @$lines);
- $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT');
- $tg->increase_termpos;
- # does it look like a diff?
- if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) {
- $txt = undef;
- index_diff($tg, $lines, $doc);
+ if ($doc) {
+ # does it look like a diff?
+ if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
+ # the joined copy is not needed; index_diff works on $lines
+ $txt = undef;
+ index_diff($tg, $lines, $doc);
+ } else {
+ $tg->index_text($txt, 1, 'XNQ');
+ }
+ } else {
+ $tg->index_text($txt, 0, 'XQUOT');
}
+ # runs once for every branch above
+ $tg->increase_termpos;
@$lines = ();
}
sub add_message {
my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object
- my $db = $self->{xdb};
-
- my ($doc_id, $old_tid);
- my $mid = mid_clean(mid_mime($mime));
+ my $doc_id;
+ my $mids = mids($mime->header_obj);
my $skel = $self->{skeleton};
eval {
- die 'Message-ID too long' if length($mid) > MAX_MID_SIZE;
- my $smsg = $self->lookup_message($mid);
- if ($smsg) {
- # convert a ghost to a regular message
- # it will also clobber any existing regular message
- $doc_id = $smsg->{doc_id};
- $old_tid = $smsg->thread_id unless $skel;
- }
- $smsg = PublicInbox::SearchMsg->new($mime);
+ my $smsg = PublicInbox::SearchMsg->new($mime);
my $doc = $smsg->{doc};
- $doc->add_term('XMID' . $mid);
-
+ foreach my $mid (@$mids) {
+ # FIXME: may be abused to prevent archival
+ length($mid) > MAX_MID_SIZE and
+ die 'Message-ID too long';
+ $doc->add_term('Q' . $mid);
+ }
my $subj = $smsg->subject;
my $xpath;
if ($subj ne '') {
$xpath = $self->subject_path($subj);
$xpath = id_compress($xpath);
- $doc->add_term('XPATH' . $xpath);
+ $doc->add_boolean_term('XPATH' . $xpath);
}
my $lines = $mime->body_raw =~ tr!\n!\n!;
# populates smsg->references for smsg->to_doc_data
my $refs = parse_references($smsg);
my $data = $smsg->to_doc_data($blob);
- if ($skel) {
- push @values, $mid, $xpath, $data;
- $skel->index_skeleton(\@values);
- } else {
- link_message($self, $smsg, $refs, $old_tid);
+ foreach my $mid (@$mids) {
+ $tg->index_text($mid, 1, 'XM');
}
- $tg->index_text($mid, 1, 'XM');
$doc->set_data($data);
-
if (my $altid = $self->{-altid}) {
foreach my $alt (@$altid) {
- my $id = $alt->mid2alt($mid);
- next unless defined $id;
- $doc->add_term($alt->{xprefix} . $id);
+ my $pfx = $alt->{xprefix};
+ foreach my $mid (@$mids) {
+ my $id = $alt->mid2alt($mid);
+ next unless defined $id;
+ $doc->add_boolean_term($pfx . $id);
+ }
}
}
- if (defined $doc_id) {
- $db->replace_document($doc_id, $doc);
+ if ($skel) {
+ push @values, $mids, $xpath, $data;
+ $skel->index_skeleton(\@values);
+ $doc_id = $self->{xdb}->add_document($doc);
} else {
- $doc_id = $db->add_document($doc);
+ $doc_id = link_and_save($self, $doc, $mids, $refs);
}
};
if ($@) {
- warn "failed to index message <$mid>: $@\n";
+ warn "failed to index message <".join('> <',@$mids).">: $@\n";
return undef;
}
$doc_id;
$mid = mid_clean($mid);
eval {
- my ($head, $tail) = $self->find_doc_ids('XMID' . $mid);
+ my ($head, $tail) = $self->find_doc_ids('Q' . $mid);
if ($head->equal($tail)) {
warn "cannot remove non-existent <$mid>\n";
}
+# Collect the Message-IDs this message refers to.
+#
+# Takes the list produced by references() for the message header and
+# drops (a) any ID longer than MAX_MID_SIZE and (b) any ID that is also
+# one of the message's own IDs as reported by mids(), which prevents a
+# message from referencing itself.
+#
+# Side effect: when at least one ID is kept, $smsg->{references} is set
+# to the kept IDs formatted as "<a> <b> ...".
+# Returns an array ref of the kept IDs (the empty array ref from
+# references() itself when there is nothing to filter).
sub parse_references ($) {
my ($smsg) = @_;
- my $doc = $smsg->{doc};
- my $mid = $smsg->mid;
my $mime = $smsg->{mime};
my $hdr = $mime->header_obj;
-
- # last References should be IRT, but some mail clients do things
- # out of order, so trust IRT over References iff IRT exists
- my @refs = (($hdr->header_raw('References') || '') =~ /<([^>]+)>/g);
- push(@refs, (($hdr->header_raw('In-Reply-To') || '') =~ /<([^>]+)>/g));
-
- if (@refs) {
- my %uniq = ($mid => 1);
- my @orig_refs = @refs;
- @refs = ();
-
- # prevent circular references via References: here:
- foreach my $ref (@orig_refs) {
- if (length($ref) > MAX_MID_SIZE) {
- warn "References: <$ref> too long, ignoring\n";
- }
- next if $uniq{$ref};
- $uniq{$ref} = 1;
- push @refs, $ref;
+ my $refs = references($hdr);
+ return $refs if scalar(@$refs) == 0;
+
+ # prevent circular references via References here:
+ my %mids = map { $_ => 1 } @{mids($hdr)};
+ my @keep;
+ foreach my $ref (@$refs) {
+ # An over-long ID must really be dropped, as the warning says:
+ # the kept refs are later turned into 'Q'-prefixed terms for
+ # thread linking, and MAX_MID_SIZE is the largest ID that fits.
+ if (length($ref) > MAX_MID_SIZE) {
+ warn "References: <$ref> too long, ignoring\n";
+ next;
}
+ next if $mids{$ref};
+ push @keep, $ref;
}
- $smsg->{references} = '<'.join('> <', @refs).'>' if @refs;
- \@refs
+ $smsg->{references} = '<'.join('> <', @keep).'>' if @keep;
+ \@keep;
}
-sub link_message {
- my ($self, $smsg, $refs, $old_tid) = @_;
+# Decide which thread $doc belongs to and tag it with a 'G<tid>'
+# boolean term.
+#
+# With references: the thread of the first reference becomes the
+# thread id, $old_tid (if defined) is merged into it, and the threads
+# of all remaining references are merged into it as well.
+# Without references: $old_tid is reused if defined, otherwise a new
+# id is taken from next_thread_id.
+#
+# Returns the chosen thread id.
+# NOTE(review): the first element is shifted off @$refs, so the
+# caller's array is modified; a second call with the same array ref
+# starts from what was the second reference -- confirm this is
+# intended.
+sub link_doc {
+ my ($self, $doc, $refs, $old_tid) = @_;
my $tid;
if (@$refs) {
-
# first ref *should* be the thread root,
# but we can never trust clients to do the right thing
my $ref = shift @$refs;
- $tid = $self->_resolve_mid_to_tid($ref);
- $self->merge_threads($tid, $old_tid) if defined $old_tid;
+ $tid = resolve_mid_to_tid($self, $ref);
+ merge_threads($self, $tid, $old_tid) if defined $old_tid;
# the rest of the refs should point to this tid:
foreach $ref (@$refs) {
- my $ptid = $self->_resolve_mid_to_tid($ref);
+ my $ptid = resolve_mid_to_tid($self, $ref);
merge_threads($self, $tid, $ptid);
}
} else {
$tid = defined $old_tid ? $old_tid : $self->next_thread_id;
}
- $smsg->{doc}->add_term('G' . $tid);
+ $doc->add_boolean_term('G' . $tid);
+ $tid;
+}
+
+# Thread-link $doc and store it in the Xapian database.
+#
+# For every Message-ID in @$mids, each existing entry found by
+# each_smsg_by_mid is inspected:
+#  - type 'mail':  the existing document is left alone and only its
+#                  thread is merged with the first thread id seen;
+#  - type 'ghost': $doc is linked via link_doc and written over the
+#                  ghost with replace_document;
+#  - anything else dies with a "bad type" message.
+# If no ghost was replaced, $doc is linked and added as a new document.
+#
+# Returns the doc id of the last ghost that was replaced, otherwise
+# the return value of add_document.
+sub link_and_save {
+ my ($self, $doc, $mids, $refs) = @_;
+ # NOTE(review): $db is assigned here but the code below goes through
+ # $self->{xdb} directly -- confirm whether $db is still wanted.
+ my $db = $self->{xdb};
+ my $old_tid;
+ my $doc_id;
+ # NOTE(review): $vivified is only incremented, never read, in this sub
+ my $vivified = 0;
+ foreach my $mid (@$mids) {
+ $self->each_smsg_by_mid($mid, sub {
+ my ($cur) = @_;
+ my $type = $cur->type;
+ my $cur_tid = $cur->thread_id;
+ # the first thread id encountered is the one later merges target
+ $old_tid = $cur_tid unless defined $old_tid;
+ if ($type eq 'mail') {
+ # do not break existing mail messages,
+ # just merge the threads
+ merge_threads($self, $old_tid, $cur_tid);
+ return 1;
+ }
+ if ($type ne 'ghost') {
+ die "<$mid> has a bad type: $type\n";
+ }
+ my $tid = link_doc($self, $doc, $refs, $old_tid);
+ $old_tid = $tid unless defined $old_tid;
+ $doc_id = $cur->{doc_id};
+ $self->{xdb}->replace_document($doc_id, $doc);
+ ++$vivified;
+ 1;
+ });
+ }
+ # not really important, but we return any vivified ghost docid, here:
+ return $doc_id if defined $doc_id;
+ link_doc($self, $doc, $refs, $old_tid);
+ $self->{xdb}->add_document($doc);
}
sub index_git_blob_id {
}
+# Return the number associated with the Message-ID of $mime in
+# $self->{mm}: the value mid_insert returns when it is defined,
+# otherwise whatever num_for reports for the same Message-ID.
sub index_mm {
- my ($self, $mime, $warn_existing) = @_;
+ my ($self, $mime) = @_;
my $mid = mid_clean(mid_mime($mime));
my $mm = $self->{mm};
my $num = $mm->mid_insert($mid);
return $num if defined $num;
- warn "<$mid> reused\n" if $warn_existing;
# fallback to num_for since filters like RubyLang set the number
$mm->num_for($mid);
}
}
# this will create a ghost as necessary
-sub _resolve_mid_to_tid {
+# Returns the thread id for $mid. Every existing entry for $mid is
+# visited: the first one supplies the thread id, and the thread of
+# each further entry is merged into it. When no entry yields a thread
+# id, a ghost is created and its thread id is returned.
+sub resolve_mid_to_tid {
my ($self, $mid) = @_;
+ my $tid;
+ $self->each_smsg_by_mid($mid, sub {
+ my ($smsg) = @_;
+ my $cur_tid = $smsg->thread_id;
+ if (defined $tid) {
+ merge_threads($self, $tid, $cur_tid);
+ } else {
+ # reuse the value fetched above instead of asking again
+ $tid = $cur_tid;
+ }
+ 1;
+ });
+ return $tid if defined $tid;
- my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
- $smsg->thread_id;
+ $self->create_ghost($mid)->thread_id;
}
sub create_ghost {
my $tid = $self->next_thread_id;
my $doc = Search::Xapian::Document->new;
- $doc->add_term('XMID' . $mid);
- $doc->add_term('G' . $tid);
- $doc->add_term('T' . 'ghost');
+ $doc->add_boolean_term('Q' . $mid);
+ $doc->add_boolean_term('G' . $tid);
+ $doc->add_boolean_term('T' . 'ghost');
my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
$self->{xdb}->add_document($doc);
foreach my $docid (@ids) {
my $doc = $db->get_document($docid);
$doc->remove_term('G' . $loser_tid);
- $doc->add_term('G' . $winner_tid);
+ $doc->add_boolean_term('G' . $winner_tid);
$db->replace_document($docid, $doc);
}
}