1 # Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
2 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
3 package PublicInbox::LeiDedupe;
6 use PublicInbox::ContentHash qw(content_hash git_sha);
# n.b. mutt sets most of these headers; not sure about Bytes
10 our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes);
12 # best-effort regeneration of OID when augmenting existing results
15 my @stash; # stash away headers we shouldn't have in git
16 for my $k (@OID_IGNORE) {
17 my @v = $eml->header_raw($k) or next;
18 push @stash, [ $k, \@v ];
19 $eml->header_set($k); # restore below
21 my $dig = git_sha(1, $eml);
22 for my $kv (@stash) { # restore stashed headers
24 $eml->header_set($k, @v);
# Convert a hexadecimal object ID string into its packed binary form.
# An undefined OID is passed through as undef so callers can chain
# with the defined-or operator.
sub _oidbin ($) {
	my ($oidhex) = @_;
	return undef unless defined $oidhex;
	return pack('H*', $oidhex);
}
33 my $dig = Digest::SHA->new(256);
34 my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)});
43 (sub { # may be called in a child process
44 my ($eml, $oidhex) = @_;
45 $skv->set_maybe(_oidbin($oidhex) // _regen_oid($eml), '');
48 $skv->set_maybe(_oidbin($smsg->{blob}), '');
# dangerous if there are duplicate messages with different Message-IDs
55 (sub { # may be called in a child process
56 my ($eml, $oidhex) = @_;
57 # lei supports non-public drafts w/o Message-ID
58 my $mid = $eml->header_raw('Message-ID') // _oidbin($oidhex) //
60 $skv->set_maybe($mid, '');
63 my $mid = $smsg->{mid};
64 $mid = undef if $mid eq '';
65 $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob});
66 $skv->set_maybe($mid, '');
70 # our default deduplication strategy (used by v2, also)
71 sub dedupe_content ($) {
73 (sub { # may be called in a child process
74 my ($eml) = @_; # $oidhex = $_[1], ignored
75 $skv->set_maybe(content_hash($eml), '');
78 $skv->set_maybe(smsg_hash($smsg), '');
82 # no deduplication at all
# Strategy constructor for "no deduplication": hands back a pair of
# callbacks (one for the $eml path, one for the $smsg path), both of
# which are references to the same `true' sub.  The argument is unused.
sub dedupe_none ($) {
	my $always = \&true;
	return ($always, $always);
}
88 my $dd = $lei->{opt}->{dedupe} // 'content';
89 my $dst = $lei->{ovv}->{dst};
91 # allow "none" to bypass Eml->new if writing to directory:
92 return if ($dd eq 'none' && substr($dst // '', -1) eq '/');
94 $cls->can($m) or die "unsupported dedupe strategy: $dd\n";
97 require PublicInbox::SharedKV;
98 $skv = PublicInbox::SharedKV->new;
100 # [ $skv, $eml_cb, $smsg_cb, "dedupe_$dd" ]
101 bless [ $skv, undef, undef, $m ], $cls;
104 # returns true on seen messages according to the deduplication strategy,
105 # returns false if unseen
107 my ($self, $eml, $smsg) = @_;
108 !$self->[1]->($eml, $smsg ? $smsg->{blob} : undef);
112 my ($self, $smsg) = @_;
113 !$self->[2]->($smsg);
118 my $skv = $self->[0];
119 $self->[1] or @$self[1,2] = $self->can($self->[3])->($skv);
120 $skv ? $skv->dbh : undef;
125 my $skv = $self->[0] or return;
127 delete($skv->{dbh}) if $skv;
131 my $skv = $_[0]->[0] or return undef;