lib/PublicInbox/ExtSearchIdx.pm | 11 ++++------- lib/PublicInbox/Import.pm | 4 +--- lib/PublicInbox/LeiStore.pm | 4 ---- lib/PublicInbox/SearchIdx.pm | 17 +++-------------- lib/PublicInbox/Smsg.pm | 13 +++++++++++++ lib/PublicInbox/V2Writable.pm | 23 ++++++++++------------- t/import.t | 12 ++---------- t/search.t | 2 +- diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index d55d3db9d31dd935a37eb9762a0d1b32295fcc7f..e6c21866b084fd4aa915cd6ac4c13c1194f8c6d1 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -21,8 +21,7 @@ use Carp qw(croak carp); use Sys::Hostname qw(hostname); use POSIX qw(strftime); use PublicInbox::Search; -use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor - is_bad_blob); +use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); use PublicInbox::OverIdx; use PublicInbox::MiscIdx; use PublicInbox::MID qw(mids); @@ -82,8 +81,6 @@ sub check_batch_limit ($) { my ($req) = @_; my $self = $req->{self}; my $new_smsg = $req->{new_smsg}; - - # {raw_bytes} may be unset, so just use {bytes} my $n = $self->{transact_bytes} += $new_smsg->{bytes}; # set flag for PublicInbox::V2Writable::index_todo: @@ -239,7 +236,7 @@ return if is_bad_blob($oid, $type, $size, $req->{oid}); my $new_smsg = $req->{new_smsg} = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return; ++${$req->{nr}}; do_step($req); @@ -496,7 +493,7 @@ } my $ci = $self->{current_info}; local $self->{current_info} = "$ci #$docid $oid"; my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; - $re_smsg->{bytes} = $size + crlf_adjust($$bref); + $re_smsg->set_bytes($$bref, $size); my $eml = PublicInbox::Eml->new($bref); $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, cotime => $orig_smsg->{ts} }); @@ -676,7 +673,7 @@ return if is_bad_blob($oid, $type, $size, $req->{oid}); my $self = $req->{self} // die 'BUG: {self} unset'; local $self->{current_info} = "$self->{current_info} $oid"; my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); my $eml = $req->{eml} = PublicInbox::Eml->new($bref); $req->{new_smsg} = $new_smsg; $req->{chash} = content_hash($eml); diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 052b145b2606123599b0aa451ae971c5ae84d1e2..8a06a661654b861975efbd88cc0df7fdcb010122 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -412,12 +412,10 @@ # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{raw_bytes} = $n; + $smsg->set_bytes($raw_email, $n); if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore return if $oidx->blob_exists($smsg->{blob}); } - # XXX do we need this? it's in git at this point - $smsg->{-raw_email} = \$raw_email; } my $ref = $self->{ref}; my $commit = $self->{mark}++; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 4f77e8fad9d318fe0ce354c3536b5cdd176f8224..7cda7e449e2f8faa69e46c2eee60156631510778 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -10,7 +10,6 @@ package PublicInbox::LeiStore; use strict; use v5.10.1; use parent qw(PublicInbox::Lock PublicInbox::IPC); -use PublicInbox::SearchIdx qw(crlf_adjust); use PublicInbox::ExtSearchIdx; use PublicInbox::Import; use PublicInbox::InboxWritable; @@ -197,9 +196,6 @@ my $oidx = $eidx->{oidx}; my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; my $im = $self->importer; $im->add($eml, undef, $smsg) or return; # duplicate returns undef - my $msgref = delete $smsg->{-raw_email}; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); - undef $msgref; local $self->{current_info} = $smsg->{blob}; if (my @docids = _docids_for($self, $eml)) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index da3ac2e3abbb19e8471feb270b2a5c151b931a49..a7005051d9deef241fab065dd05072348e80d8ce 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,7 +22,7 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack +our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; our ($DB_CREATE_OR_OPEN, $DB_OPEN); @@ -613,17 +613,6 @@ $mm->mid_insert($mids->[0]) // $mm->num_for($mids->[0]); } } -# returns the number of bytes to add if given a non-CRLF arg -sub crlf_adjust ($) { - if (index($_[0], "\r\n") < 0) { - # common case is LF-only, every \n needs an \r; - # so favor a cheap tr// over an expensive m//g - $_[0] =~ tr/\n/\n/; - } else { # count number of '\n' w/o '\r', expensive: - scalar(my @n = ($_[0] =~ m/(?{oid}); my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; - $size += crlf_adjust($$bref); - my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $smsg->set_bytes($$bref, $size); my $self = $sync->{sidx}; local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 571cbb6faca9417d804b1771b0bb94fdac5e6987..c6ff7f52b0ac90a2aad2326f80c4b862b4cc982b 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -135,4 +135,17 @@ $subj =~ s/$REPLY_RE//igo; # remove reply prefix $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?{bytes} = $_[2] + crlf_adjust($_[1]) } + 1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 7b6b93a0d124baf8635cf13fc8d6ecc08221980d..c4efbdd203ab71497af35f99fdcc4fd3d9b90ffa 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -17,8 +17,7 @@ use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd run_die); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size - is_bad_blob); +use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush use File::Temp (); @@ -139,13 +138,12 @@ $self->{idx_shards}->[$num % scalar(@{$self->{idx_shards}})]; } # indexes a message, returns true if checkpointing is needed -sub do_idx ($$$$) { - my ($self, $msgref, $eml, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); +sub do_idx ($$$) { + my ($self, $eml, $smsg) = @_; $self->{oidx}->add_overview($eml, $smsg); my $idx = idx_shard($self, $smsg->{num}); $idx->index_eml($eml, $smsg); - my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; + my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= $self->{batch_bytes}; } @@ -173,7 +171,7 @@ my $cmt = $im->add($mime, undef, $smsg); # sets $smsg->{ds|ts|blob} $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) { + if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } @@ -536,13 +534,13 @@ # reindex modified messages: for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; $new_smsg->populate($new_mime, $sync); - do_idx($self, \$raw, $new_mime, $new_smsg); + $new_smsg->set_bytes($raw, $bytes); + do_idx($self, $new_mime, $new_smsg); } $rewritten->{rewrites}; } @@ -937,13 +935,13 @@ return; } ++${$arg->{nr}}; my $smsg = bless { - raw_bytes => $size, num => $num, blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; $smsg->populate($eml, $arg); - if (do_idx($self, $bref, $eml, $smsg)) { + $smsg->set_bytes($$bref, $size); + if (do_idx($self, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } index_finalize($arg, 1); @@ -1217,9 +1215,8 @@ sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{self}; my $idx = idx_shard($self, $smsg->{num}); - $smsg->{raw_bytes} = $size; $idx->index_eml(PublicInbox::Eml->new($bref), $smsg); - $self->{transact_bytes} += $size; + $self->{transact_bytes} += $smsg->{bytes}; } sub index_xap_step ($$$;$) { diff --git a/t/import.t b/t/import.t index 855b5d7cff13207ce0e5847c9f76d7b2222be588..ae76858ba2ab7827bc7aaa1d45db377a0954ee64 100644 --- a/t/import.t +++ b/t/import.t @@ -7,7 +7,6 @@ use PublicInbox::Eml; use PublicInbox::Smsg; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::Spawn qw(spawn); use Fcntl qw(:DEFAULT SEEK_SET); use PublicInbox::TestCommon; use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2 @@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) { like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id'); - my $raw_email = $smsg->{-raw_email}; - is($mime->as_string, $$raw_email, 'string matches'); - is($smsg->{raw_bytes}, length($$raw_email), 'length matches'); my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); open my $in, '+<', undef or BAIL_OUT "open(+<): $!"; print $in $mime->as_string or die "write failed: $!"; $in->flush or die "flush failed: $!"; - seek($in, 0, SEEK_SET); - open my $out, '+<', undef or BAIL_OUT "open(+<): $!"; - my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out }); - is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object'); + seek($in, 0, SEEK_SET) or die "seek: $!"; + chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in })); is($?, 0, 'hash-object'); - seek($out, 0, SEEK_SET); - chomp(my $hashed_obj = <$out>); is($hashed_obj, $smsg->{blob}, "blob object_id matches exp"); } diff --git a/t/search.t b/t/search.t index 7495233ed20a881ceb8fa5f9deb077b516ef638a..b2958c002c2a85a1a52f26b9e64e7115a6aad3c0 100644 --- a/t/search.t +++ b/t/search.t @@ -60,7 +60,7 @@ } } { - my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust; + my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust; is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed'); is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR'); is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');