NNTP and IMAP both require CRLF conversions on the wire.
They're also the only components which care about
$smsg->{bytes}, so store the CRLF-adjusted value in over.sqlite3
and Xapian DBs.
This will allow us to optimize the RFC822.SIZE fetch item in IMAP
without triggering the size mismatch errors that occur in the default
configurations of some clients (e.g. Mail::IMAPClient), though not
in most others.
It could also fix hypothetical problems with NNTP clients that
report discrepancies between overview and article data.
# v2: we need this for Xapian
if ($smsg) {
$smsg->{blob} = $self->get_mark(":$blob");
- $smsg->{bytes} = $n;
+ $smsg->{raw_bytes} = $n;
$smsg->{-raw_email} = \$raw_email;
}
my $ref = $self->{ref};
$self->{mm}->mid_delete(mid_mime($mime));
}
+# returns the number of "\r" bytes to add so every "\n" in the arg becomes "\r\n"
+sub crlf_adjust ($) {
+ if (index($_[0], "\r\n") < 0) { # no CRLF pair anywhere in the arg
+ # common case is LF-only, every \n needs an \r;
+ # so favor a cheap tr// over an expensive m//g
+ $_[0] =~ tr/\n/\n/; # maps \n to itself; the count of \n is the implicit return value
+ } else { # count number of '\n' w/o '\r', expensive:
+ scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g)); # list assignment in scalar context yields the match count
+ }
+}
+
sub index_both { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
+ $size += crlf_adjust($$bref);
my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
my $self = $sync->{sidx};
my $eml = PublicInbox::Eml->new($bref);
} else {
chomp $line;
# n.b. $mid may contain spaces(!)
- my ($bytes, $num, $blob, $ds, $ts, $mid) =
- split(/ /, $line, 6);
+ my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) =
+ split(/ /, $line, 7);
$self->begin_txn_lazy;
- my $n = read($r, my $msg, $bytes) or die "read: $!\n";
- $n == $bytes or die "short read: $n != $bytes\n";
+ my $n = read($r, my $msg, $to_read) or die "read: $!\n";
+ $n == $to_read or die "short read: $n != $to_read\n";
my $mime = PublicInbox::Eml->new(\$msg);
my $smsg = bless {
bytes => $bytes,
my ($self, $msgref, $mime, $smsg) = @_;
if (my $w = $self->{w}) {
# mid must be last, it can contain spaces (but not LF)
- print $w join(' ', @$smsg{qw(bytes num blob ds ts mid)}),
+ print $w join(' ', @$smsg{qw(raw_bytes bytes
+ num blob ds ts mid)}),
"\n", $$msgref or die "failed to write shard $!\n";
} else {
$$msgref = undef;
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
+ $smsg->{bytes} = $smsg->{raw_bytes} +
+ PublicInbox::SearchIdx::crlf_adjust($$msgref); # {bytes} = raw size plus one "\r" per bare "\n"
$self->{over}->add_overview($mime, $smsg);
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
- my $n = $self->{transact_bytes} += $smsg->{bytes};
+ my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; # batch accounting uses the unadjusted size
$n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
}
for my $smsg (@$need_reindex) {
my $new_smsg = bless {
blob => $blob,
- bytes => $bytes,
+ raw_bytes => $bytes,
num => $smsg->{num},
mid => $smsg->{mid},
}, 'PublicInbox::Smsg';
}
$sync->{nr}++;
my $smsg = bless {
- bytes => $len,
+ raw_bytes => $len,
num => $num,
blob => $oid,
mid => $mid0,
die "failed to delete <$mid0> for article #$num\n";
$sync->{nr}++;
my $smsg = bless {
- bytes => $len,
+ raw_bytes => $len,
num => $num,
blob => $oid,
mid => $mid0,
if ($v2) {
like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
- is($mime->as_string, ${$smsg->{-raw_email}}, 'string matches');
- is($smsg->{bytes}, length(${$smsg->{-raw_email}}), 'length matches');
+ my $raw_email = $smsg->{-raw_email};
+ is($mime->as_string, $$raw_email, 'string matches');
+ is($smsg->{raw_bytes}, length($$raw_email), 'length matches');
my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
my $in = tempfile();
print $in $mime->as_string or die "write failed: $!";
my $list_id = $addr;
$list_id =~ s/@/./;
$mime->header_set('List-Id', "<$list_id>");
- $len = length($mime->as_string);
+ my $str = $mime->as_string;
+ $str =~ s/(?<!\r)\n/\r\n/sg;
+ $len = length($str);
+ undef $str;
$im->add($mime);
$im->done;
if ($version == 1) {
}
}
+{ # crlf_adjust must return the number of "\n" not already preceded by "\r"
+ my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust;
+ is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed');
+ is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR');
+ is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');
+ is($crlf_adjust->("hi\nworld\r\n"), 1, 'CRLF/LF-mix 2 counts 1 CR');
+}
+
$ibx->with_umask(sub {
my $root = PublicInbox::Eml->new(<<'EOF');
Date: Fri, 02 Oct 1993 00:00:00 +0000