1 # Copyright (C) 2021 all contributors <meta@public-inbox.org>
2 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
4 # front-end for the "lei patch-to-query" sub-command
5 package PublicInbox::LeiP2q;
8 use parent qw(PublicInbox::IPC);
10 use PublicInbox::Smsg;
11 use PublicInbox::MsgIter qw(msg_part_text);
12 use PublicInbox::Git qw(git_unquote);
13 use PublicInbox::Spawn qw(popen_rd);
14 use URI::Escape qw(uri_escape_utf8);
# $FN: capturing pattern for a filename as it appears in diff header lines.
# It matches either the literal /dev/null, or an optional leading double
# quote followed by a first path component (no slash, no newline), a slash,
# and the remainder up to (not including) the next CR or LF.
# It is interpolated into the "diff --git" and "---"/"+++" matches in
# extract_terms.
15 my $FN = qr!((?:"?[^/\n]+/[^\r\n]+)|/dev/null)!;
# NOTE(review): the enclosing sub header, the unpacking of $s, and the
# opening of the block that is closed on the last line below are not visible
# in this view.  Other code in this file calls xphrase(...), so this is
# presumably its body -- confirm against the full file.
#
# Visible behaviour: yields an empty list when $s contains no non-whitespace
# character.  Otherwise the final pattern is applied globally to $s; each
# match starts with a word character and continues with one or more word
# characters, whitespace, or any of | = > < , . / : \ @ -
# For each match, a value containing any character other than word
# characters and . / : \ @ - is wrapped in double quotes; any other value is
# passed through as-is.
19 return () unless $s =~ /\S/;
20 # cf. xapian-core/queryparser/queryparser.lemony
21 # [\./:\\\@] - is_phrase_generator (implicit phrase search)
22 # FIXME not really sure about these..., we basically want to
23 # extract the longest phrase possible that Xapian can handle
27 m![^\./:\\\@\-\w]! ? qq("$_") : $_ ;
28 } ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g);
# Callback passed to $eml->each_part by do_p2q.  Converts one MIME part to
# text with msg_part_text (content type falls back to text/plain), then
# scans the text line by line and appends search terms to
# $lei->{qterms}->{PREFIX}:
#   dfctx  - diff context lines (leading space removed)
#   dfpre  - pre-image blob ID from an "index OLD..NEW" line
#   dfpost - post-image blob ID from the same line
#   dfn    - filename from a "---" or "+++" line
#   dfb    - lines introduced with "+" (marker removed)
#   dfa    - lines introduced with "-" (marker removed)
#   dfhh   - text following the closing "@@" of a hunk header
# Everything except the blob IDs goes through xphrase() first.
# NOTE(review): argument unpacking, the statements that set and clear
# $in_diff, and the closing braces are not visible in this view.
31 sub extract_terms { # eml->each_part callback
33 my $part = $p->[0]; # ignore $depth and @idx;
34 my $ct = $part->content_type || 'text/plain';
35 my ($s, undef) = msg_part_text($part, $ct);
39 for (split(/\n/, $s)) {
40 if ($in_diff && s/^ //) { # diff context
41 push @{$lei->{qterms}->{dfctx}}, xphrase($_);
42 } elsif (/^-- $/) { # email signature begins
44 } elsif (m!^diff --git $FN $FN!) {
45 # wait until "---" and "+++" to capture filenames
47 } elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) {
48 my ($oa, $ob) = ($1, $2);
49 push @{$lei->{qterms}->{dfpre}}, $oa;
50 push @{$lei->{qterms}->{dfpost}}, $ob;
52 } elsif (m!^(?:---|\+{3}) ($FN)!) {
53 next if $1 eq '/dev/null';
# $1.'' hands git_unquote a fresh string instead of the capture variable
# (presumably so the capture itself is never modified -- confirm).
# The two-way split on "/" keeps only what follows the first slash, i.e.
# the first path component is discarded.
54 my $fn = (split(m!/!, git_unquote($1.''), 2))[1];
55 push @{$lei->{qterms}->{dfn}}, xphrase($fn);
56 } elsif ($in_diff && s/^\+//) { # diff added
57 push @{$lei->{qterms}->{dfb}}, xphrase($_);
58 } elsif ($in_diff && s/^-//) { # diff removed
59 push @{$lei->{qterms}->{dfa}}, xphrase($_);
60 } elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) {
61 # traditional diff w/o -p
62 } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) {
63 push @{$lei->{qterms}->{dfhh}}, xphrase($1);
64 } elsif (/^(?:dis)similarity index/ ||
65 /^(?:old|new) mode/ ||
66 /^(?:deleted|new) file mode/ ||
67 /^(?:copy|rename) (?:from|to) / ||
68 /^(?:dis)?similarity index / ||
69 /^\\ No newline at end of file/ ||
70 /^Binary files .* differ/) {
# Extended header lines and markers: no term is extracted from them.
# NOTE(review): the first alternative above only matches
# "dissimilarity index"; the later "(?:dis)?similarity index " alternative
# already covers both spellings when a space follows, so the first one
# looks redundant.
#
72 # possible to be in diff context, some mail may be
73 # stripped by MUA or even GNU diff(1). "git apply"
74 # treats a bare "\n" as diff context, too
# NOTE(review): the opening of this table (and possibly some entries) is not
# visible in this view.  It is presumably %pfx2smsg, which do_p2q walks as
# prefix => [ field names ] and reads each named field from the Smsg hash
# -- confirm against the full file.
# Each visible entry maps a search prefix to the field names whose values
# supply the terms for that prefix.
86 tcf => [ qw(to cc from) ],
87 a => [ qw(to cc from) ],
89 bs => [ qw(subject) ], # body handled elsewhere
90 d => [ qw(ds) ], # nonsense?
91 dt => [ qw(ds) ], # ditto...
92 rt => [ qw(ts) ], # ditto...
# Job queued by lei_p2q through wq_io_do('do_p2q', []).
# Steps visible here:
#   1. read the "want" option (default: dfpost7) and split it on commas
#      and spaces into individual tokens;
#   2. obtain an input handle, either via $lei->fopen on $self->{input} or
#      from the output of "git format-patch --stdout -1 $input", and slurp
#      it into a PublicInbox::Eml;
#   3. populate a PublicInbox::Smsg from the message and collect terms
#      into $lei->{qterms}, from Smsg fields and from extract_terms;
#   4. turn the wanted tokens into query pieces and join them into a single
#      string, with "+" and URI escaping when the uri option is set, or
#      with spaces otherwise.
# NOTE(review): a number of lines are not visible in this view (unpacking
# of $self, the loop header around the time-range parsing, the conditions
# choosing the input source, declarations of $in, @q and %seen, several
# branch bodies and the final output), so the notes below only describe
# what can be seen.
95 sub do_p2q { # via wq_do
97 my $lei = $self->{lei};
98 my $want = $lei->{opt}->{want} // [ qw(dfpost7) ];
99 my @want = split(/[, ]+/, "@$want");
# Time-range tokens: optional "d:", "dt:" or "rt:" prefix, a number, and
# an optional ".unit" suffix; the number is scaled to seconds.
# NOTE(review): "(?:day|weeks)s?" accepts ".day", ".days", ".weeks" and
# ".weekss" but rejects ".week"; "(?:day|week)s?" was probably intended.
# NOTE(review): $unit is undef when the suffix is absent, so matching it
# against /week/i warns if warnings are enabled; ($unit // '') avoids that.
# $pfx is likewise undef when the prefix is absent.
101 /\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|weeks)s?)?\z/ or next;
102 my ($pfx, $n, $unit) = ($1, $2, $3);
103 $n *= 86400 * ($unit =~ /week/i ? 7 : 1);
106 my $smsg = bless {}, 'PublicInbox::Smsg';
109 my $input = $self->{input};
111 $in = $lei->fopen('<', $input) or
112 return $lei->fail("open < $input: $!");
# NOTE(review): $input is appended to the git argument list as-is; a value
# starting with "-" would be read by git as an option -- confirm whether
# callers validate it.
114 my @cmd = (qw(git format-patch --stdout -1), $input);
115 $in = popen_rd(\@cmd, undef, { 2 => $lei->{2} });
# Slurp the whole input (local $/ disables line splitting) and parse it.
118 my $eml = PublicInbox::Eml->new(\(do { local $/; <$in> }));
# Keys are the wanted tokens exactly as split above, so the lookup in the
# loop below only succeeds for a token that is literally a table key.
119 $lei->{diff_want} = +{ map { $_ => 1 } @want };
120 $smsg->populate($eml);
121 while (my ($pfx, $fields) = each %pfx2smsg) {
122 next unless $lei->{diff_want}->{$pfx};
123 for my $f (@$fields) {
124 my $v = $smsg->{$f} // next;
125 push @{$lei->{qterms}->{$pfx}}, xphrase($v);
128 $eml->each_part(\&extract_terms, $lei, 1);
# With the debug option, dump the collected terms as pretty-printed,
# key-sorted JSON through $lei->err.
129 if ($lei->{opt}->{debug}) {
130 my $json = ref(PublicInbox::Config->json)->new;
131 $json->utf8->canonical->pretty;
132 $lei->err($json->encode($lei->{qterms}));
# Build the query from the wanted tokens: array refs (time ranges) are
# still a TODO, boolean/proximity operators are recognised by name, and
# anything else is treated as PREFIX with an optional leading +/- and an
# optional trailing run of digits and "*".
135 for my $pfx (@want) {
136 if (ref($pfx) eq 'ARRAY') {
137 my ($p, $t_range) = @$pfx; # TODO
139 } elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! ||
140 $pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) {
143 my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : '';
144 my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : '';
# delete() consumes the terms, so a prefix listed twice is emitted once.
145 my $x = delete($lei->{qterms}->{$pfx}) or next;
# tr/*//d removes any "*" from $end and reports whether one was present;
# what is left of $end is the minimum term length (0 when empty).
146 my $star = $end =~ tr/*//d ? '*' : '';
147 my $min_len = ($end || 0) + 0;
149 # no wildcards for bool_pfx_external
150 $star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/;
151 $pfx = "$plusminus$pfx:";
# Two term-building forms follow: one producing an OR-chain per term while
# the term is longer than $min_len, and one emitting each distinct
# prefix:term once via %seen.
# NOTE(review): the statement that shortens the term inside the while
# loop and the condition selecting between the two forms are not visible.
154 my @t = ($pfx.$_.$star);
155 while (length > $min_len) {
157 push @t, 'OR', $pfx.$_.$star;
163 my $k = $pfx.$_.$star;
164 $seen{$k}++ ? () : $k
169 if ($lei->{opt}->{uri}) {
170 @q = (join('+', map { uri_escape_utf8($_) } @q));
172 @q = (join(' ', @q));
# Entry point for the "lei patch-to-query" sub-command.
#   $lei   - the lei object; the new worker object is stored in $lei->{p2q}
#   $input - stored in $self->{input} for do_p2q to read
# With the stdin option, $lei->{0} is moved onto the worker object instead.
# The sub then starts workers through $lei->workers_start, queues do_p2q,
# and calls $op->event_step for as long as $op has a sock.
# NOTE(review): the line between the two branches (presumably "} else {")
# and at least one statement after wq_io_do are not visible in this view.
177 sub lei_p2q { # the "lei patch-to-query" entry point
178 my ($lei, $input) = @_;
179 my $self = $lei->{p2q} = bless {}, __PACKAGE__;
180 if ($lei->{opt}->{stdin}) {
181 $self->{0} = delete $lei->{0}; # guard from lei_atfork_child
183 $self->{input} = $input;
185 my $op = $lei->workers_start($self, 'lei patch2query', 1);
186 $self->wq_io_do('do_p2q', []);
188 while ($op && $op->{sock}) { $op->event_step }
# Override of the parent class ipc_atfork_child (presumably invoked in the
# worker process after fork -- confirm against PublicInbox::IPC).
# Runs $lei->lei_atfork_child, installs the handler returned by
# PublicInbox::Eml::warn_ignore_cb() as the __WARN__ hook, then chains to
# the parent implementation.
# NOTE(review): the unpacking of $self is not visible in this view.
191 sub ipc_atfork_child {
193 my $lei = $self->{lei};
194 $lei->lei_atfork_child;
195 $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb();
196 $self->SUPER::ipc_atfork_child;