]> Sergey Matveev's repositories - public-inbox.git/blob - lib/PublicInbox/LeiP2q.pm
imap+nntp: share COMPRESS implementation
[public-inbox.git] / lib / PublicInbox / LeiP2q.pm
1 # Copyright (C) all contributors <meta@public-inbox.org>
2 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
3
4 # front-end for the "lei patch-to-query" sub-command
5 package PublicInbox::LeiP2q;
6 use strict;
7 use v5.10.1;
8 use parent qw(PublicInbox::IPC PublicInbox::LeiInput);
9 use PublicInbox::Eml;
10 use PublicInbox::Smsg;
11 use PublicInbox::MsgIter qw(msg_part_text);
12 use PublicInbox::Git qw(git_unquote);
13 use PublicInbox::OnDestroy;
14 use URI::Escape qw(uri_escape_utf8);
15 my $FN = qr!((?:"?[^/\n]+/[^\r\n]+)|/dev/null)!;
16
17 sub xphrase ($) {
18         my ($s) = @_;
19         return () unless $s =~ /\S/;
20         # cf. xapian-core/queryparser/queryparser.lemony
21         # [\./:\\\@] - is_phrase_generator (implicit phrase search)
22         # FIXME not really sure about these..., we basically want to
23         # extract the longest phrase possible that Xapian can handle
24         map {
25                 s/\A\s*//;
26                 s/\s+\z//;
27                 m![^\./:\\\@\-\w]! ? qq("$_") : $_ ;
28         } ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g);
29 }
30
31 sub add_qterm ($$@) {
32         my ($self, $p, @v) = @_;
33         for (@v) {
34                 $self->{qseen}->{"$p\0$_"} //=
35                         push(@{$self->{qterms}->{$p}}, $_);
36         }
37 }
38
39 sub extract_terms { # eml->each_part callback
40         my ($p, $self) = @_;
41         my $part = $p->[0]; # ignore $depth and @idx;
42         my $ct = $part->content_type || 'text/plain';
43         my ($s, undef) = msg_part_text($part, $ct);
44         defined $s or return;
45         my $in_diff;
46         # TODO: b: nq: q:
47         for (split(/\n/, $s)) {
48                 if ($in_diff && s/^ //) { # diff context
49                         add_qterm($self, 'dfctx', xphrase($_));
50                 } elsif (/^-- $/) { # email signature begins
51                         $in_diff = undef;
52                 } elsif (m!^diff --git $FN $FN!) {
53                         # wait until "---" and "+++" to capture filenames
54                         $in_diff = 1;
55                 } elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) {
56                         my ($oa, $ob) = ($1, $2);
57                         add_qterm($self, 'dfpre', $oa);
58                         add_qterm($self, 'dfpost', $ob);
59                         # who uses dfblob?
60                 } elsif (m!^(?:---|\+{3}) ($FN)!) {
61                         next if $1 eq '/dev/null';
62                         my $fn = (split(m!/!, git_unquote($1.''), 2))[1];
63                         add_qterm($self, 'dfn', xphrase($fn));
64                 } elsif ($in_diff && s/^\+//) { # diff added
65                         add_qterm($self, 'dfb', xphrase($_));
66                 } elsif ($in_diff && s/^-//) { # diff removed
67                         add_qterm($self, 'dfa', xphrase($_));
68                 } elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) {
69                         # traditional diff w/o -p
70                 } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) {
71                         add_qterm($self, 'dfhh', xphrase($1));
72                 } elsif (/^(?:dis)similarity index/ ||
73                                 /^(?:old|new) mode/ ||
74                                 /^(?:deleted|new) file mode/ ||
75                                 /^(?:copy|rename) (?:from|to) / ||
76                                 /^(?:dis)?similarity index / ||
77                                 /^\\ No newline at end of file/ ||
78                                 /^Binary files .* differ/) {
79                 } elsif ($_ eq '') {
80                         # possible to be in diff context, some mail may be
81                         # stripped by MUA or even GNU diff(1).  "git apply"
82                         # treats a bare "\n" as diff context, too
83                 } else {
84                         $in_diff = undef;
85                 }
86         }
87 }
88
89 my %pfx2smsg = (
90         t => [ qw(to) ],
91         c => [ qw(cc) ],
92         f => [ qw(from) ],
93         tc => [ qw(to cc) ],
94         tcf => [ qw(to cc from) ],
95         a => [ qw(to cc from) ],
96         s => [ qw(subject) ],
97         bs => [ qw(subject) ], # body handled elsewhere
98         d => [ qw(ds) ], # nonsense?
99         dt => [ qw(ds) ], # ditto...
100         rt => [ qw(ts) ], # ditto...
101 );
102
103 sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh
104         my ($self, $eml) = @_;
105         my $diff_want = $self->{diff_want} // do {
106                 my $want = $self->{lei}->{opt}->{want} // [ qw(dfpost7) ];
107                 my @want = split(/[, ]+/, "@$want");
108                 for (@want) {
109                         /\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|weeks)s?)?\z/
110                                 or next;
111                         my ($pfx, $n, $unit) = ($1, $2, $3);
112                         $n *= 86400 * ($unit =~ /week/i ? 7 : 1);
113                         $_ = [ $pfx, $n ];
114                 }
115                 $self->{want_order} = \@want;
116                 $self->{diff_want} = +{ map { $_ => 1 } @want };
117         };
118         my $smsg = bless {}, 'PublicInbox::Smsg';
119         $smsg->populate($eml);
120         while (my ($pfx, $fields) = each %pfx2smsg) {
121                 next unless $diff_want->{$pfx};
122                 for my $f (@$fields) {
123                         my $v = $smsg->{$f} // next;
124                         add_qterm($self, $pfx, xphrase($v));
125                 }
126         }
127         $eml->each_part(\&extract_terms, $self, 1);
128 }
129
130 sub emit_query {
131         my ($self) = @_;
132         my $lei = $self->{lei};
133         if ($lei->{opt}->{debug}) {
134                 my $json = ref(PublicInbox::Config->json)->new;
135                 $json->utf8->canonical->pretty;
136                 print { $lei->{2} } $json->encode($self->{qterms});
137         }
138         my (@q, %seen);
139         for my $pfx (@{$self->{want_order}}) {
140                 if (ref($pfx) eq 'ARRAY') {
141                         my ($p, $t_range) = @$pfx; # TODO
142
143                 } elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! ||
144                                 $pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) {
145                         push @q, $pfx;
146                 } else {
147                         my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : '';
148                         my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : '';
149                         my $x = delete($self->{qterms}->{$pfx}) or next;
150                         my $star = $end =~ tr/*//d ? '*' : '';
151                         my $min_len = ($end || 0) + 0;
152
153                         # no wildcards for bool_pfx_external
154                         $star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/;
155                         $pfx = "$plusminus$pfx:";
156                         if ($min_len) {
157                                 push @q, map {
158                                         my @t = ($pfx.$_.$star);
159                                         while (length > $min_len) {
160                                                 chop $_;
161                                                 push @t, 'OR', $pfx.$_.$star;
162                                         }
163                                         @t;
164                                 } @$x;
165                         } else {
166                                 push @q, map {
167                                         my $k = $pfx.$_.$star;
168                                         $seen{$k}++ ? () : $k
169                                 } @$x;
170                         }
171                 }
172         }
173         if ($lei->{opt}->{uri}) {
174                 @q = (join('+', map { uri_escape_utf8($_) } @q));
175         } else {
176                 @q = (join(' ', @q));
177         }
178         $lei->out(@q, "\n");
179 }
180
181 sub lei_p2q { # the "lei patch-to-query" entry point
182         my ($lei, @inputs) = @_;
183         $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin};
184         my $self = bless { missing_ok => 1 }, __PACKAGE__;
185         $self->prepare_inputs($lei, \@inputs) or return;
186         $lei->wq1_start($self);
187 }
188
189 sub ipc_atfork_child {
190         my ($self) = @_;
191         PublicInbox::LeiInput::input_only_atfork_child($self);
192         PublicInbox::OnDestroy->new($$, \&emit_query, $self);
193 }
194
195 no warnings 'once';
196 *net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done;
197
198 1;