Sergey Matveev's repositories - public-inbox.git/commitdiff
lei: track seen messages to note duplicates
author Eric Wong <e@80x24.org>
Thu, 7 Jul 2022 09:40:30 +0000 (09:40 +0000)
committer Eric Wong <e@80x24.org>
Thu, 7 Jul 2022 09:47:58 +0000 (09:47 +0000)
This may help track down deduplication or other bugs in lei
which lead to occasionally missing messages.

Link: https://public-inbox.org/meta/CAL_JsqJH8xx_2NyZffNsRXbGXiv3kjmCETvKXt3Yfb0uToLm9Q@mail.gmail.com/
lib/PublicInbox/LeiConvert.pm
lib/PublicInbox/LeiToMail.pm
lib/PublicInbox/LeiXSearch.pm

index 906f3026834481f8c98cd03e2a62f6ffede6c3e3..59af40dea1b11a64ca8e074412c978ab1e98020d 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # front-end for the "lei convert" sub-command
@@ -35,8 +35,10 @@ sub process_inputs { # via wq_do
        my $lei = $self->{lei};
        delete $lei->{1};
        delete $self->{wcb}; # commit
-       my $nr = delete($lei->{-nr_write}) // 0;
-       $lei->qerr("# converted $nr messages");
+       my $nr_w = delete($lei->{-nr_write}) // 0;
+       my $d = (delete($lei->{-nr_seen}) // 0) - $nr_w;
+       $d = $d ? " ($d duplicates)" : '';
+       $lei->qerr("# converted $nr_w messages$d");
 }
 
 sub lei_convert { # the main "lei convert" method
index 3c5e7e59e8eedf3b2108a70a076e0ea83d18509c..2aa3977e8034d01f744c05a3a7c5598c9e97890e 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir
@@ -197,6 +197,7 @@ sub _mbox_write_cb ($$) {
        sub { # for git_to_mail
                my ($buf, $smsg, $eml) = @_;
                $eml //= PublicInbox::Eml->new($buf);
+               ++$lei->{-nr_seen};
                return if $dedupe->is_dup($eml, $smsg);
                $lse->xsmsg_vmd($smsg) if $lse;
                $smsg->{-recent} = 1 if $set_recent;
@@ -291,6 +292,8 @@ sub _maildir_write_cb ($$) {
        sub { # for git_to_mail
                my ($bref, $smsg, $eml) = @_;
                $dst // return $lei->fail; # dst may be undef-ed in last run
+
+               ++$lei->{-nr_seen};
                return if $dedupe && $dedupe->is_dup($eml //
                                                PublicInbox::Eml->new($$bref),
                                                $smsg);
@@ -317,6 +320,8 @@ sub _imap_write_cb ($$) {
        sub { # for git_to_mail
                my ($bref, $smsg, $eml) = @_;
                $mic // return $lei->fail; # mic may be undef-ed in last run
+
+               ++$lei->{-nr_seen};
                return if $dedupe && $dedupe->is_dup($eml //
                                                PublicInbox::Eml->new($$bref),
                                                $smsg);
@@ -360,6 +365,7 @@ sub _v2_write_cb ($$) {
        sub { # for git_to_mail
                my ($bref, $smsg, $eml) = @_;
                $eml //= PublicInbox::Eml->new($bref);
+               ++$lei->{-nr_seen};
                return if $dedupe && $dedupe->is_dup($eml, $smsg);
                $lei->{v2w}->wq_do('add', $eml); # V2Writable->add
                ++$lei->{-nr_write};
@@ -792,9 +798,10 @@ sub wq_atexit_child {
        my $lei = $self->{lei};
        delete $self->{wcb};
        $lei->{ale}->git->async_wait_all;
-       my $nr = delete($lei->{-nr_write}) or return;
+       my ($nr_w, $nr_s) = delete(@$lei{qw(-nr_write -nr_seen)});
+       $nr_s or return;
        return if $lei->{early_mua} || !$lei->{-progress} || !$lei->{pkt_op_p};
-       $lei->{pkt_op_p}->pkt_do('l2m_progress', $nr);
+       $lei->{pkt_op_p}->pkt_do('l2m_progress', $nr_w, $nr_s);
 }
 
 # runs on a 1s timer in lei-daemon
index 41e798562d424b097ec611ef5f283f2251d9563b..6f8770191ee6a28b36b1f7f631cbb9ebc7b26e86 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Combine any combination of PublicInbox::Search,
@@ -163,8 +163,9 @@ sub mset_progress {
 }
 
 sub l2m_progress {
-       my ($lei, $nr) = @_;
-       $lei->{-nr_write} += $nr;
+       my ($lei, $nr_write, $nr_seen) = @_;
+       $lei->{-nr_write} += $nr_write;
+       $lei->{-nr_seen} += $nr_seen;
 }
 
 sub query_one_mset { # for --threads and l2m w/o sort
@@ -447,13 +448,16 @@ Error closing $lei->{ovv}->{dst}: \$!=$! \$?=$?
                }
                if ($lei->{-progress}) {
                        my $tot = $lei->{-mset_total} // 0;
-                       my $nr = $lei->{-nr_write} // 0;
+                       my $nr_w = $lei->{-nr_write} // 0;
+                       my $d = ($lei->{-nr_seen} // 0) - $nr_w;
+                       my $x = "$tot matches";
+                       $x .= ", $d duplicates" if $d;
                        if ($l2m) {
-                               my $m = "# $nr written to " .
-                                       "$lei->{ovv}->{dst} ($tot matches)";
-                               $nr ? $lei->qfin($m) : $lei->qerr($m);
+                               my $m = "# $nr_w written to " .
+                                       "$lei->{ovv}->{dst} ($x)";
+                               $nr_w ? $lei->qfin($m) : $lei->qerr($m);
                        } else {
-                               $lei->qerr("# $tot matches");
+                               $lei->qerr("# $x");
                        }
                }
                $lei->start_mua if $l2m && !$l2m->lock_free;