m1 += public-inbox-httpd
m1 += public-inbox-index
m1 += public-inbox-init
+m1 += public-inbox-learn
m1 += public-inbox-mda
m1 += public-inbox-nntpd
m1 += public-inbox-watch
--- /dev/null
+=head1 NAME
+
+public-inbox-learn - spam trainer and remover for public-inbox
+
+=head1 SYNOPSIS
+
+B<public-inbox-learn> <spam|ham|rm> E<lt>MESSAGE
+
+=head1 DESCRIPTION
+
+public-inbox-learn can remove spam or inject ham messages into
+an inbox while training a SpamAssassin instance.
+
+It is intended for users of L<public-inbox-mda(1)> or
+L<public-inbox-watch(1)>, but not users relying on
+L<git-fetch(1)> to mirror inboxes.
+
+It reads one message from standard input and operates on it
+depending on the command given:
+
+=head1 COMMANDS
+
+public-inbox-learn takes one of the following commands as its
+first and only argument:
+
+=over 8
+
+=item spam
+
+Treat the message as spam. This will mark the message as
+removed so it becomes inaccessible via NNTP or WWW endpoints
+for all configured inboxes.
+
+The message remains accessible in git history.
+
+It will also be fed to L<spamc(1)> for training purposes unless
+C<publicinboxmda.spamcheck> is C<none> in L<public-inbox-config(5)>.
+
+=item ham
+
+Treat standard input as ham. This is useful for manually injecting
+messages into the archives which failed the spam check run by
+L<public-inbox-mda(1)> or L<public-inbox-watch(1)>.
+
+It relies on the C<To:>, C<Cc:>, and C<List-ID:> headers
+to match configured inbox addresses and C<listid> directives.
+
+It will also be fed to L<spamc(1)> for training purposes unless
+C<publicinboxmda.spamcheck> is C<none> in L<public-inbox-config(5)>.
+
+=item rm
+
+This is identical to the C<spam> command above, but does
+not feed the message to L<spamc(1)>
+
+=back
+
+=head1 ENVIRONMENT
+
+=over 8
+
+=item PI_CONFIG
+
+Per-user config file parseable by L<git-config(1)>.
+See L<public-inbox-config(5)>.
+
+Default: ~/.public-inbox/config
+
+=back
+
+=head1 CONTACT
+
+Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org>
+
+The mail archives are hosted at L<https://public-inbox.org/meta/>
+and L<http://hjrcffqmbrq6wope.onion/meta/>
+
+=head1 COPYRIGHT
+
+Copyright 2019 all contributors L<mailto:meta@public-inbox.org>
+
+License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
+
+=head1 SEE ALSO
+
+L<spamc(1)>, L<public-inbox-mda(1)>, L<public-inbox-watch(1)>
Documentation/public-inbox-httpd.pod
Documentation/public-inbox-index.pod
Documentation/public-inbox-init.pod
+Documentation/public-inbox-learn.pod
Documentation/public-inbox-mda.pod
Documentation/public-inbox-nntpd.pod
Documentation/public-inbox-overview.pod
use strict;
use warnings;
use PublicInbox::MsgIter;
-use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian
sub No ($) { "*** We only accept plain-text mail, No $_[0] ***" }
$self;
}
+sub assert_usable_dir {
+ my ($self) = @_;
+ my $dir = $self->{inboxdir};
+ return $dir if defined($dir) && $dir ne '';
+ die "no inboxdir defined for $self->{name}\n";
+}
+
sub init_inbox {
my ($self, $shards, $skip_epoch, $skip_artnum) = @_;
# TODO: honor skip_artnum
my $v = $self->{version} || 1;
if ($v == 1) {
- my $dir = $self->{inboxdir} or die "no inboxdir in inbox\n";
+ my $dir = assert_usable_dir($self);
PublicInbox::Import::init_bare($dir);
} else {
my $v2w = importer($self);
}
}
+sub inboxes_for_list_id ($$) {
+ my ($klass, $config, $simple) = @_;
+
+ # newer Email::Simple allows header_raw, as does Email::MIME:
+ my @list_ids = $simple->can('header_raw') ?
+ $simple->header_raw('List-Id') :
+ $simple->header('List-Id');
+ my @dests;
+ for my $list_id (@list_ids) {
+ $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next;
+ if (my $ibx = $config->lookup_list_id($1)) {
+ push @dests, $ibx;
+ }
+ }
+ if (scalar(@list_ids) > 1) {
+ warn "W: multiple List-IDs in message:\n";
+ warn "W: List-ID: $_\n" for @list_ids
+ }
+ \@dests;
+}
+
1;
# $creat may be any true value, or 0/undef. A hashref is true,
# and $creat->{nproc} may be set to an integer
my ($class, $v2ibx, $creat) = @_;
- my $dir = $v2ibx->{inboxdir} or die "no inboxdir in inbox\n";
+ $v2ibx = PublicInbox::InboxWritable->new($v2ibx);
+ my $dir = $v2ibx->assert_usable_dir;
unless (-d $dir) {
if ($creat) {
require File::Path;
die "$dir does not exist\n";
}
}
-
- $v2ibx = PublicInbox::InboxWritable->new($v2ibx);
$v2ibx->umask_prepare;
my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION;
#
# Used for training spam (via SpamAssassin) and removing messages from a
# public-inbox
-my $usage = "$0 (spam|ham) < /path/to/message";
+my $usage = "$0 <spam|ham|rm> </path/to/message";
use strict;
use warnings;
use PublicInbox::Config;
$data
});
-# get all recipients
-my %dests;
-foreach my $h (qw(Cc To)) {
- my $val = $mime->header($h) or next;
- foreach my $email (PublicInbox::Address::emails($val)) {
- $dests{lc($email)} = 1;
- }
-}
-
-if ($train eq 'spam') {
- $pi_config->each_inbox(sub {
- my ($ibx) = @_;
- $ibx = PublicInbox::InboxWritable->new($ibx);
- my $im = $ibx->importer(0);
- $im->remove($mime, 'spam');
- $im->done;
- });
-}
-
-require PublicInbox::MDA if $train eq "ham";
+sub remove_or_add ($$$) {
+ my ($ibx, $train, $addr) = @_;
-# n.b. message may be cross-posted to multiple public-inboxes
-foreach my $recipient (keys %dests) {
- my $dst = $pi_config->lookup($recipient) or next;
# We do not touch GIT_COMMITTER_* env here so we can track
# who trained the message.
- $dst->{name} = $ENV{GIT_COMMITTER_NAME} || $dst->{name};
- $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} || $recipient;
- $dst = PublicInbox::InboxWritable->new($dst);
- my $im = $dst->importer(0);
+ $ibx->{name} = $ENV{GIT_COMMITTER_NAME} // $ibx->{name};
+ $ibx->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} // $addr;
+ $ibx = PublicInbox::InboxWritable->new($ibx);
+ my $im = $ibx->importer(0);
- if ($train eq "spam" || $train eq "rm") {
+ if ($train eq "rm") {
# This needs to be idempotent, as my inotify trainer
# may train for each cross-posted message, and this
# script already learns for every list in
# ~/.public-inbox/config
$im->remove($mime, $train);
- } else { # $train eq "ham"
+ } elsif ($train eq "ham") {
# no checking for spam here, we assume the message has
# been reviewed by a human at this point:
- PublicInbox::MDA->set_list_headers($mime, $dst);
+ PublicInbox::MDA->set_list_headers($mime, $ibx);
# Ham messages are trained when they're marked into
# a SEEN state, so this is idempotent:
$im->done;
}
+# spam is removed from all known inboxes since it is often Bcc:-ed
+if ($train eq 'spam') {
+ $pi_config->each_inbox(sub {
+ my ($ibx) = @_;
+ $ibx = PublicInbox::InboxWritable->new($ibx);
+ my $im = $ibx->importer(0);
+ $im->remove($mime, 'spam');
+ $im->done;
+ });
+} else {
+ require PublicInbox::MDA;
+
+ # get all recipients
+ my %dests; # address => <PublicInbox::Inbox|0(false)>
+ for ($mime->header('Cc'), $mime->header('To')) {
+ foreach my $addr (PublicInbox::Address::emails($_)) {
+ $addr = lc($addr);
+ $dests{$addr} //= $pi_config->lookup($addr) // 0;
+ }
+ }
+
+ # n.b. message may be cross-posted to multiple public-inboxes
+ my %seen;
+ while (my ($addr, $ibx) = each %dests) {
+ next unless ref($ibx); # $ibx may be 0
+ next if $seen{"$ibx"}++;
+ remove_or_add($ibx, $train, $addr);
+ }
+ my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime);
+ for my $ibx (@$dests) {
+ next if !$seen{"$ibx"}++;
+ remove_or_add($ibx, $train, $ibx->{-primary_address});
+ }
+}
+
if ($err) {
warn $err;
exit 1;
my $key = 'publicinboxmda.spamcheck';
my $default = 'PublicInbox::Spamcheck::Spamc';
my $spamc = PublicInbox::Spamcheck::get($config, $key, $default);
-my $dst;
+my $dests = [];
my $recipient = $ENV{ORIGINAL_RECIPIENT};
if (defined $recipient) {
- $dst = $config->lookup($recipient); # first check
+ my $ibx = $config->lookup($recipient); # first check
+ push @$dests, $ibx if $ibx;
}
-if (!defined $dst) {
- my $list_id = $simple->header('List-Id');
- if (defined $list_id && $list_id =~ /<[ \t]*(.+)?[ \t]*>/) {
- $dst = $config->lookup_list_id($1);
- }
- if (!defined $dst && !defined $recipient) {
+if (!scalar(@$dests)) {
+ $dests = PublicInbox::MDA->inboxes_for_list_id($config, $simple);
+ if (!scalar(@$dests) && !defined($recipient)) {
die "ORIGINAL_RECIPIENT not defined in ENV\n";
}
- defined $dst or do_exit(67); # EX_NOUSER 5.1.1 user unknown
+ scalar(@$dests) or do_exit(67); # EX_NOUSER 5.1.1 user unknown
}
-$dst->{inboxdir} or do_exit(67);
-$dst = PublicInbox::InboxWritable->new($dst);
-# pre-check, MDA has stricter rules than an importer might;
-if ($precheck && !PublicInbox::MDA->precheck($simple, $dst->{address})) {
- do_exit(0);
-}
+my $err;
+@$dests = grep {
+ my $ibx = PublicInbox::InboxWritable->new($_);
+ eval { $ibx->assert_usable_dir };
+ if ($@) {
+ warn $@;
+ $err = 1;
+ 0;
+ # pre-check, MDA has stricter rules than an importer might;
+ } elsif ($precheck) {
+ !!PublicInbox::MDA->precheck($simple, $ibx->{address});
+ } else {
+ 1;
+ }
+} @$dests;
+
+do_exit(67) if $err && scalar(@$dests) == 0;
+
$simple = undef;
my $spam_ok;
if ($spamc) {
my $fh = $emm->fh;
read($fh, $str, -s $fh);
}
-
-my $mime = PublicInbox::MIME->new(\$str);
do_exit(0) unless $spam_ok;
-my $fcfg = $dst->{filter} || '';
-# -mda defaults to the strict base filter
-if ($fcfg eq '') {
- $dst->{filter} = 'PublicInbox::Filter::Base';
-} elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove?
- $dst->{filter} = 'PublicInbox::Filter::Mirror';
+# -mda defaults to the strict base filter which we won't use anywhere else
+sub mda_filter_adjust ($) {
+ my ($ibx) = @_;
+ my $fcfg = $ibx->{filter} || '';
+ if ($fcfg eq '') {
+ $ibx->{filter} = 'PublicInbox::Filter::Base';
+ } elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove?
+ $ibx->{filter} = 'PublicInbox::Filter::Mirror';
+ }
+}
+
+my @rejects;
+for my $ibx (@$dests) {
+ mda_filter_adjust($ibx);
+ my $filter = $ibx->filter;
+ my $mime = PublicInbox::MIME->new($str);
+ my $ret = $filter->delivery($mime);
+ if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message
+ $mime = $ret;
+ } elsif ($ret == PublicInbox::Filter::Base::IGNORE) {
+ next; # nothing, keep looping
+ } elsif ($ret == PublicInbox::Filter::Base::REJECT) {
+ push @rejects, $filter->err;
+ next;
+ }
+
+ PublicInbox::MDA->set_list_headers($mime, $ibx);
+ my $im = $ibx->importer(0);
+ if (defined $im->add($mime)) {
+ # ->abort is idempotent, no emergency if a single
+ # destination succeeds
+ $emm->abort;
+ } else { # v1-only
+ my $mid = $mime->header_obj->header_raw('Message-ID');
+ # this message is similar to what ssoma-mda shows:
+ print STDERR "CONFLICT: Message-ID: $mid exists\n";
+ }
+ $im->done;
}
-my $filter = $dst->filter;
-my $ret = $filter->delivery($mime);
-if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message
- $mime = $ret;
-} elsif ($ret == PublicInbox::Filter::Base::IGNORE) {
- do_exit(0); # chuck it to emergency
-} elsif ($ret == PublicInbox::Filter::Base::REJECT) {
- $! = 65; # EX_DATAERR 5.6.0 data format error
- die $filter->err, "\n";
-} # else { accept
-$filter = undef;
-PublicInbox::MDA->set_list_headers($mime, $dst);
-my $im = $dst->importer(0);
-if (defined $im->add($mime)) {
- $emm = $emm->abort;
-} else {
- # this message is similar to what ssoma-mda shows:
- print STDERR "CONFLICT: Message-ID: ",
- $mime->header_obj->header_raw('Message-ID'),
- " exists\n";
+if (scalar(@rejects) && scalar(@rejects) == scalar(@$dests)) {
+ $! = 65; # EX_DATAERR 5.6.0 data format error
+ die join("\n", @rejects, '');
}
-$im->done;
do_exit(0);
$im->done;
is(undef, $im->checkpoint, 'checkpoint works after ->done');
$im->checkpoint;
+
+my $nogit = PublicInbox::Git->new("$dir/non-existent/dir");
+eval {
+ my $nope = PublicInbox::Import->new($nogit, 'nope', 'no@example.com');
+ $nope->add($mime);
+};
+ok($@, 'Import->add fails on non-existent dir');
+
done_testing();
my $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`;
like($cur, qr/this message would not be accepted without --no-precheck/,
'--no-precheck delivered message anyways');
+
+ # try a message with multiple List-ID headers
+ $in = <<EOF;
+List-ID: <foo.bar>
+List-ID: <$list_id>
+Message-ID: <2lids\@example>
+Subject: two List-IDs
+From: user <user\@example.com>
+To: $addr
+Date: Fri, 02 Oct 1993 00:00:00 +0000
+
+EOF
+ ($out, $err) = ('', '');
+ IPC::Run::run([$mda], \$in, \$out, \$err);
+ is($?, 0, 'mda OK with multiple List-Id matches');
+ $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`;
+ like($cur, qr/Message-ID: <2lids\@example>/,
+ 'multi List-ID match delivered');
+ like($err, qr/multiple List-ID/, 'warned about multiple List-ID');
}
done_testing();
$im->done;
}
+my $tmp = {
+ inboxdir => "$inboxdir/non-existent/subdir",
+ name => 'nope',
+ version => 2,
+ -primary_address => 'test@example.com',
+};
+eval {
+ my $nope = PublicInbox::V2Writable->new($tmp);
+ $nope->add($mime);
+};
+ok($@, 'V2Writable fails on non-existent dir');
+
done_testing();