Sergey Matveev's repositories - public-inbox.git/commitdiff
searchidx: switch to accounting by message bytes
author Eric Wong <e@80x24.org>
Wed, 14 Jun 2017 00:14:47 +0000 (00:14 +0000)
committer Eric Wong <e@80x24.org>
Wed, 14 Jun 2017 00:15:45 +0000 (00:15 +0000)
Xapian memory usage is tied to the size of the indexed
text, so take the raw message size into account when
deciding when to flush Xapian data.

More importantly, we now flush Xapian before we have it
buffer beyond our maximum; and we do it unconditionally
to prevent even high priority processes from OOM-ing.

lib/PublicInbox/SearchIdx.pm

index 316111bf0be4d8a8227be046a33867f6b8939fcf..30d3fe926a14cef02834bfe3511ac37022e14bee 100644 (file)
@@ -20,13 +20,14 @@ use Carp qw(croak);
 use POSIX qw(strftime);
 require PublicInbox::Git;
 
-use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian
 use constant {
+       MAX_MID_SIZE => 244, # max term size - 1 in Xapian
        PERM_UMASK => 0,
        OLD_PERM_GROUP => 1,
        OLD_PERM_EVERYBODY => 2,
        PERM_GROUP => 0660,
        PERM_EVERYBODY => 0664,
+       BATCH_BYTES => 1_000_000,
 };
 
 sub new {
@@ -71,7 +72,6 @@ sub _xdb_acquire {
                require File::Path;
                _lock_acquire($self);
                File::Path::mkpath($dir);
-               $self->{batch_size} = 100;
                $flag = Search::Xapian::DB_CREATE_OR_OPEN;
        }
        $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag);
@@ -395,6 +395,15 @@ sub index_sync {
        with_umask($self, sub { $self->_index_sync($opts) });
 }
 
+sub batch_adjust ($$$$) {
+       my ($max, $bytes, $batch_cb, $latest) = @_;
+       $$max -= $bytes;
+       if ($$max <= 0) {
+               $$max = BATCH_BYTES;
+               $batch_cb->($latest, 1);
+       }
+}
+
 sub rlog {
        my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
        my $hex = '[a-f0-9]';
@@ -404,23 +413,21 @@ sub rlog {
        my $git = $self->{git};
        my $latest;
        my $bytes;
-       my $max = $self->{batch_size}; # may be undef
+       my $max = BATCH_BYTES;
        local $/ = "\n";
        my $line;
        while (defined($line = <$log>)) {
                if ($line =~ /$addmsg/o) {
                        my $blob = $1;
                        my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+                       batch_adjust(\$max, $bytes, $batch_cb, $latest);
                        $add_cb->($self, $mime, $bytes, $blob);
                } elsif ($line =~ /$delmsg/o) {
                        my $blob = $1;
-                       my $mime = do_cat_mail($git, $blob) or next;
+                       my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+                       batch_adjust(\$max, $bytes, $batch_cb, $latest);
                        $del_cb->($self, $mime);
                } elsif ($line =~ /^commit ($h40)/o) {
-                       if (defined $max && --$max <= 0) {
-                               $max = $self->{batch_size};
-                               $batch_cb->($latest, 1);
-                       }
                        $latest = $1;
                }
        }