#!/usr/bin/perl -w
# Copyright (C) 2018 all contributors
# License: AGPL-3.0+
#
# ad-hoc tool for finding duplicates, unstable!
use strict;
use warnings;
use PublicInbox::Inbox;
use PublicInbox::Over;
use PublicInbox::Search;
use PublicInbox::Config;
# Resolve the inbox named by the single command-line argument, which is
# interpreted as one of:
#   - an address (contains '@' after the first character), resolved with
#     PublicInbox::Config->lookup
#   - an existing directory, wrapped in an ad-hoc PublicInbox::Inbox using
#     a placeholder address
#   - anything else, resolved with PublicInbox::Config->lookup_name
my $repo = shift;

# Without an argument, index() and -d below would operate on undef and
# only produce warnings followed by a vague "No inbox"; fail early instead.
defined($repo) && $repo ne ''
    or die "Usage: $0 <ADDRESS|INBOX_DIR|INBOX_NAME>\n";

my $ibx;
if (index($repo, '@') > 0) {
    $ibx = PublicInbox::Config->new->lookup($repo);
} elsif (-d $repo) {
    $ibx = { mainrepo => $repo, address => 'unnamed@example.com' };
    $ibx = PublicInbox::Inbox->new($ibx);
} else {
    $ibx = PublicInbox::Config->new->lookup_name($repo);
}
$ibx or die "No inbox";
$ibx->search or die "search not available for inbox";

# $dbh is the handle used for the id2num query; $over is opened on the
# same SQLite file and is used by emit() to look up articles by number.
my $dbh = $ibx->search->{over_ro}->connect;
my $over = PublicInbox::Over->new($dbh->sqlite_db_filename);
# emit(\@nums): dump every article whose number is listed in @$nums.
#
# For each number, a "NUM BLOB MID" summary line is written to STDERR and
# the message is written to STDOUT after a "From " separator line.  Lines
# in the message that start with zero or more ">" followed by "From " get
# one extra ">" prepended (mboxrd-style quoting) so they cannot be mistaken
# for separators.  Numbers for which $over->get_art or $ibx->msg_by_smsg
# returns a false value are skipped.
sub emit ($) {
    my ($art_nums) = @_;
    for my $art_num (@{$art_nums}) {
        my $smsg = $over->get_art($art_num);
        next unless $smsg;
        print STDERR "$art_num $smsg->{blob} $smsg->{mid}\n";

        # msg_by_smsg hands back a reference to the raw message text
        my $raw = $ibx->msg_by_smsg($smsg);
        next unless $raw;

        print "From $smsg->{blob}\@$art_num Thu Jan 1 00:00:00 1970\n";
        ${$raw} =~ s/^(>*From )/>$1/gm;
        print ${$raw}, "\n";
    }
}
# Walk the id2num table ordered by id and collect the article numbers that
# share an id.  Whenever an id maps to more than one number, the whole
# group is handed to emit().
#
# The statement is a plain string rather than a <<'' heredoc: an
# empty-terminator heredoc only ends at an empty line, which is easy to
# lose when the file is reformatted.
my $sth = $dbh->prepare(
    'SELECT id,num FROM id2num WHERE num > 0 ORDER BY id'
);
$sth->execute;
my $prev_id = -1; # sentinel; assumes real ids are never -1 -- TODO confirm
my ($id, $num, @nums);
while (1) {
    ($id, $num) = $sth->fetchrow_array;
    defined $id or last;
    if ($prev_id != $id) {
        # id changed: report the finished group if it held duplicates
        emit(\@nums) if scalar(@nums) > 1;
        @nums = ();
    }
    $prev_id = $id;
    push @nums, $num;
}

# The loop only reports a group when the *next* id shows up, so the group
# belonging to the last id must be flushed here or it would be missed.
emit(\@nums) if scalar(@nums) > 1;