Sergey Matveev's repositories - public-inbox.git/commitdiff
search: do not index base-85 binary patches
author: Eric Wong <e@80x24.org>
Mon, 20 Jun 2022 19:27:30 +0000 (19:27 +0000)
committer: Eric Wong <e@80x24.org>
Tue, 21 Jun 2022 10:39:11 +0000 (10:39 +0000)
Base-85 binary patches generated by git lead to many false
positives, so skip over gibberish words which may occur in them.
To avoid regressions in search results, continue to allow
searching for exact size matches (via "literal $SIZE") and the
phrase "GIT binary patch" for the mere presence of a binary
patch.

MANIFEST
TODO
lib/PublicInbox/SearchIdx.pm
t/data/binary.patch [new file with mode: 0644]
t/search.t

index ce2cf4a583f6bc804cb36f76416ee3fddf81a7e0..607a4c5bce13905155bc8b2cced2b6089162ae08 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -397,6 +397,7 @@ t/content_hash.t
 t/convert-compact.t
 t/data-gen/.gitignore
 t/data/0001.patch
+t/data/binary.patch
 t/data/message_embed.eml
 t/dir_idle.t
 t/ds-kqxs.t
diff --git a/TODO b/TODO
index 43eee0638f658aa1ce752b474461ba910c3fc53f..7a27fdd2f716e80d1542c94a00662e42c5b90ba3 100644 (file)
--- a/TODO
+++ b/TODO
@@ -153,8 +153,6 @@ all need to be considered for everything we introduce)
 
 * support UUCP addresses for legacy archives
 
-* decode (skip indexing of) base-85 binary patches to avoid false-positives
-
 * support pipelining as an IMAP/NNTP client for -watch + lei
 
 * auto-detect and reload on TLS cert+key changes in daemons
index 53ec23a5c50c4facfcf1977ea04f2fa4498d8805..cbfe7816044535c398b3dbe3acb7907ebe19a447 100644 (file)
@@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
        # assume a typical 64-bit system has 8x more RAM than a
        # typical 32-bit system:
        (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-
 use constant DEBUG => !!$ENV{DEBUG};
-
+my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -258,21 +257,42 @@ sub index_diff ($$$) {
        my ($self, $txt, $doc) = @_;
        my %seen;
        my $in_diff;
-       my @xnq;
-       my $xnq = \@xnq;
-       foreach (split(/\n/, $txt)) {
-               if ($in_diff && s/^ //) { # diff context
+       my $xnq = [];
+       my @l = split(/\n/, $$txt);
+       undef $$txt;
+       while (defined($_ = shift @l)) {
+               if ($in_diff && /^GIT binary patch/) {
+                       push @$xnq, $_;
+                       while (@l && $l[0] =~ /^literal /) {
+                               # TODO allow searching by size range?
+                               # allows searching by exact size via:
+                               # "literal $SIZE"
+                               push @$xnq, shift(@l);
+
+                               # skip base85 and empty lines
+                               while (@l && ($l[0] =~ /$BASE85/o ||
+                                               $l[0] !~ /\S/)) {
+                                       shift @l;
+                               }
+                               # loop hits trailing "literal 0\nHcmV?d00001\n"
+                       }
+               } elsif ($in_diff && s/^ //) { # diff context
                        index_diff_inc($self, $_, 'XDFCTX', $xnq);
                } elsif (/^-- $/) { # email signature begins
                        $in_diff = undef;
-               } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
-                       # wait until "---" and "+++" to capture filenames
+               } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) {
+                       # capture filenames here for binary diffs:
+                       my ($fa, $fb) = ($1, $2);
+                       push @$xnq, $_;
                        $in_diff = 1;
-                       push @xnq, $_;
+                       $fa = (split(m'/', git_unquote($fa), 2))[1];
+                       $fb = (split(m'/', git_unquote($fb), 2))[1];
+                       $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq);
+                       $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq);
                # traditional diff:
                } elsif (m/^diff -(.+) (\S+) (\S+)$/) {
                        my ($opt, $fa, $fb) = ($1, $2, $3);
-                       push @xnq, $_;
+                       push @$xnq, $_;
                        # only support unified:
                        next unless $opt =~ /[uU]/;
                        $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb,
@@ -288,8 +308,8 @@ sub index_diff ($$$) {
                        $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
                        $in_diff = 1;
                } elsif (/^--- (\S+)/) {
-                       $in_diff = $1;
-                       push @xnq, $_;
+                       $in_diff = $1; # old diff filename
+                       push @$xnq, $_;
                } elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
                        $in_diff = index_old_diff_fn($self, \%seen, $in_diff,
                                                        $1, $xnq);
@@ -315,19 +335,19 @@ sub index_diff ($$$) {
                                /^(?:dis)?similarity index / ||
                                /^\\ No newline at end of file/ ||
                                /^Binary files .* differ/) {
-                       push @xnq, $_;
+                       push @$xnq, $_;
                } elsif ($_ eq '') {
                        # possible to be in diff context, some mail may be
                        # stripped by MUA or even GNU diff(1).  "git apply"
                        # treats a bare "\n" as diff context, too
                } else {
-                       push @xnq, $_;
+                       push @$xnq, $_;
                        warn "non-diff line: $_\n" if DEBUG && $_ ne '';
                        $in_diff = undef;
                }
        }
 
-       index_text($self, join("\n", @xnq), 1, 'XNQ');
+       index_text($self, join("\n", @$xnq), 1, 'XNQ');
 }
 
 sub index_xapian { # msg_iter callback
@@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback
                } else {
                        # does it look like a diff?
                        if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
-                               index_diff($self, $txt, $doc);
+                               index_diff($self, \$txt, $doc);
                        } else {
                                index_text($self, $txt, 1, 'XNQ');
                        }
diff --git a/t/data/binary.patch b/t/data/binary.patch
new file mode 100644 (file)
index 0000000..58717ab
--- /dev/null
@@ -0,0 +1,20 @@
+From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001
+From: BOFH <bofh@example.com>
+Date: Fri, 13 May 2022 23:04:14 +0000
+Subject: [PATCH] binary patch test
+Message-ID: <binary-patch-test@example>
+
+---
+ zero | Bin 0 -> 1 bytes
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 zero
+
+diff --git a/zero b/zero
+new file mode 100644
+index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d
+GIT binary patch
+literal 1
+IcmZPo000310RR91
+
+literal 0
+HcmV?d00001
index 47a67f7fb5e94eace65f3bf3d30a68232da7c2fa..13210ff5a4af56e518490fe282c18bc2e8f08af4 100644 (file)
@@ -533,6 +533,21 @@ $ibx->with_umask(sub {
        is($query->('s:"mail header experiments"')->[0]->{mid},
                '20200418222508.GA13918@dcvr',
                'Subject search reaches inside message/rfc822');
+
+       $doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+       $rw->commit_txn_lazy;
+       $ibx->search->reopen;
+       my $res = $query->('HcmV');
+       is_deeply($res, [], 'no results against trailer');
+       $res = $query->('IcmZPo000310RR91');
+       is_deeply($res, [], 'no results against 1-byte binary patch');
+       $res = $query->('"GIT binary patch"');
+       is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+       is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+       my $s = $query->('"literal 1"');
+       is_deeply($s, $res, 'got binary result from exact literal size');
+       $s = $query->('"literal 2"');
+       is_deeply($s, [], 'no results for wrong size');
 });
 
 SKIP: {