X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FLeiDedupe.pm;h=32f99cd0a30c3eafeed64940c7a0bd21940861ad;hb=23af251dd607c4e75ab1e68063f2c885c48cc035;hp=34d8bc2784805fb232112f5949614dc90ec7a0ca;hpb=abd0a85b212ce1467ddc94d523152d9a65028960;p=public-inbox.git diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index 34d8bc27..32f99cd0 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2020 all contributors +# Copyright (C) 2020-2021 all contributors # License: AGPL-3.0+ package PublicInbox::LeiDedupe; use strict; use v5.10.1; -use PublicInbox::SharedKV; -use PublicInbox::ContentHash qw(content_hash); +use PublicInbox::ContentHash qw(content_hash git_sha); +use Digest::SHA (); # n.b. mutt sets most of these headers not sure about Bytes our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); @@ -18,12 +18,7 @@ sub _regen_oid ($) { push @stash, [ $k, \@v ]; $eml->header_set($k); # restore below } - my $dig = Digest::SHA->new(1); # XXX SHA256 later - my $buf = $eml->as_string; - $dig->add('blob '.length($buf)."\0"); - $dig->add($buf); - undef $buf; - + my $dig = git_sha(1, $eml); for my $kv (@stash) { # restore stashed headers my ($k, @v) = @$kv; $eml->header_set($k, @v); @@ -33,68 +28,108 @@ sub _regen_oid ($) { sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } +sub smsg_hash ($) { + my ($smsg) = @_; + my $dig = Digest::SHA->new(256); + my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); + utf8::encode($x); + $dig->add($x); + $dig->digest; +} + # the paranoid option -sub dedupe_oid () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process - my ($eml, $oid) = @_; - $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); +sub dedupe_oid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oidhex) = @_; + $skv->set_maybe(_oidbin($oidhex) // _regen_oid($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(_oidbin($smsg->{blob}), ''); }); } # dangerous if there's duplicate messages with different Message-IDs -sub dedupe_mid () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process - my ($eml, $oid) = @_; - # TODO: lei will support non-public messages w/o Message-ID - my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // +sub dedupe_mid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oidhex) = @_; + # lei supports non-public drafts w/o Message-ID + my $mid = $eml->header_raw('Message-ID') // _oidbin($oidhex) // content_hash($eml); $skv->set_maybe($mid, ''); + }, sub { + my ($smsg) = @_; + my $mid = $smsg->{mid}; + $mid = undef if $mid eq ''; + $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob}); + $skv->set_maybe($mid, ''); }); } # our default deduplication strategy (used by v2, also) -sub dedupe_content () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process - my ($eml) = @_; # oid = $_[1], ignored +sub dedupe_content ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml) = @_; # $oidhex = $_[1], ignored $skv->set_maybe(content_hash($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(smsg_hash($smsg), ''); }); } # no deduplication at all -sub dedupe_none () { (undef, sub { 1 }) } +sub true { 1 } +sub dedupe_none ($) { (\&true, \&true) } sub new { - my ($cls, $lei, $dst) = @_; + my ($cls, $lei) = @_; my $dd = $lei->{opt}->{dedupe} // 'content'; + my $dst = $lei->{ovv}->{dst}; # allow "none" to bypass Eml->new if writing to directory: return if ($dd eq 'none' && substr($dst // '', -1) eq '/'); - - my $dd_new = $cls->can("dedupe_$dd") // - die "unsupported dedupe strategy: $dd\n"; - bless [ $dd_new->() ], $cls; # [ $skv, $cb ] + my $m = "dedupe_$dd"; + $cls->can($m) or die "unsupported dedupe strategy: $dd\n"; + my $skv; + if ($dd ne 'none') { + require PublicInbox::SharedKV; + $skv = PublicInbox::SharedKV->new; + } + # [ $skv, $eml_cb, $smsg_cb, "dedupe_$dd" ] + bless [ $skv, undef, undef, $m ], $cls; } -# returns true on unseen messages according to the deduplication strategy, -# returns false if seen +# returns true on seen messages according to the deduplication strategy, +# returns false if unseen sub is_dup { - my ($self, $eml, $oid) = @_; - !$self->[1]->($eml, $oid); + my ($self, $eml, $smsg) = @_; + !$self->[1]->($eml, $smsg ? $smsg->{blob} : undef); +} + +sub is_smsg_dup { + my ($self, $smsg) = @_; + !$self->[2]->($smsg); } sub prepare_dedupe { my ($self) = @_; my $skv = $self->[0]; + $self->[1] or @$self[1,2] = $self->can($self->[3])->($skv); $skv ? $skv->dbh : undef; } sub pause_dedupe { my ($self) = @_; - my $skv = $self->[0]; + my $skv = $self->[0] or return; + $skv->dbh_release; delete($skv->{dbh}) if $skv; } +sub has_entries { + my $skv = $_[0]->[0] or return undef; + $skv->has_entries; +} + 1;