From: Eric Wong (Contractor, The Linux Foundation) Date: Sat, 17 Feb 2018 10:19:47 +0000 (+0000) Subject: v2writable: initial cut for repo-rotation X-Git-Tag: v1.1.0-pre1~233 X-Git-Url: http://www.git.stargrave.org/?p=public-inbox.git;a=commitdiff_plain;h=ff25a60b1b85c92a7e76a735a2c79c0205233a27 v2writable: initial cut for repo-rotation Wrap the old Import package to enable creating new repos based on size thresholds. This is better than relying on time-based rotation as LKML traffic seems to be increasing. --- diff --git a/MANIFEST b/MANIFEST index 1df27f26..4b51b543 100644 --- a/MANIFEST +++ b/MANIFEST @@ -91,6 +91,7 @@ lib/PublicInbox/Spamcheck/Spamc.pm lib/PublicInbox/Spawn.pm lib/PublicInbox/SpawnPP.pm lib/PublicInbox/Unsubscribe.pm +lib/PublicInbox/V2Writable.pm lib/PublicInbox/View.pm lib/PublicInbox/WWW.pm lib/PublicInbox/WWW.pod diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index ea2b814e..6437643d 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -139,6 +139,18 @@ sub cleanup { _destroy($self, qw(in_c out_c pid_c)); } +# assuming a well-maintained repo, this should be a somewhat +# accurate estimation of its size +# TODO: show this in the WWW UI as a hint to potential cloners +sub packed_bytes { + my ($self) = @_; + my $n = 0; + foreach my $p (glob("$self->{git_dir}/objects/pack/*.pack")) { + $n += -s $p; + } + $n +} + sub DESTROY { cleanup(@_) } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 1f831a7b..364ab602 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -31,6 +31,7 @@ sub new { inbox => $ibx, path_type => '2/38', # or 'v2' ssoma_lock => 1, # disable for v2 + bytes_added => 0, }, $class } @@ -275,7 +276,9 @@ sub add { my $blob = $self->{mark}++; my $str = $mime->as_string; - print $w "blob\nmark :$blob\ndata ", length($str), "\n" or wfail; + my $n = length($str); + $self->{bytes_added} += $n; + print $w "blob\nmark :$blob\ndata ", $n, "\n" or wfail; print $w $str, "\n" or wfail; $str = undef; @@ -325,7 +328,7 @@ sub add { $self->{tip} = ":$commit"; } -sub run_die ($$) { +sub run_die ($;$) { my ($cmd, $env) = @_; my $pid = spawn($cmd, $env, undef); defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; @@ -354,7 +357,7 @@ sub done { } if ($nchg) { run_die([@cmd, 'update-server-info'], undef); - eval { + ($self->{path_type} eq '2/38') and eval { require PublicInbox::SearchIdx; my $inbox = $self->{inbox} || $git_dir; my $s = PublicInbox::SearchIdx->new($inbox); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm new file mode 100644 index 00000000..9b68e9b1 --- /dev/null +++ b/lib/PublicInbox/V2Writable.pm @@ -0,0 +1,180 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# This interface wraps and mimics PublicInbox::Import +package PublicInbox::V2Writable; +use strict; +use warnings; +use Fcntl qw(:flock :DEFAULT); +use PublicInbox::SearchIdx; +use PublicInbox::MIME; +use PublicInbox::Git; +use PublicInbox::Import; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; + +# an estimate of the post-packed size to the raw uncompressed size +my $PACKING_FACTOR = 0.4; + +sub new { + my ($class, $v2ibx, $creat) = @_; + my $dir = $v2ibx->{mainrepo} or die "no mainrepo in inbox\n"; + unless (-d $dir) { + if ($creat) { + require File::Path; + File::Path::mkpath($dir); + } else { + die "$dir does not exist\n"; + } + } + my $self = { + -inbox => $v2ibx, + im => undef, # PublicInbox::Import + xap_rw => undef, # PublicInbox::V2SearchIdx + xap_ro => undef, + + # limit each repo to 1GB or so + rotate_bytes => int((100 * 1024 * 1024) / $PACKING_FACTOR), + }; + bless $self, $class +} + +# returns undef on duplicate or spam +# mimics Import::add and wraps it for v2 +sub add { + my ($self, $mime, $check_cb) = @_; + my $existing = $self->lookup_content($mime); + + if ($existing) { + return undef if $existing->type eq 'mail'; # duplicate + } + + my $im = $self->importer; + + # im->add returns undef if check_cb fails + my $cmt = $im->add($mime, $check_cb) or return; + $cmt = $im->get_mark($cmt); + my $oid = $im->{last_object_id}; + $self->index_msg($mime, $existing, $cmt, $oid); + $mime; +} + +sub index_msg { # TODO +} + +sub remove { + my ($self, $mime, $msg) = @_; + my $existing = $self->lookup_content($mime) or return; + + # don't touch ghosts or already junked messages + return unless $existing->type eq 'mail'; + + # always write removals to the current (latest) git repo since + # we process chronologically + my $im = $self->importer; + my ($cmt, undef) = $im->remove($mime, $msg); + $cmt = $im->get_mark($cmt); + $self->unindex_msg($existing, $cmt); +} + +sub done { + my ($self) = @_; + $self->{im}->done; # PublicInbox::Import::done +} + +sub checkpoint { + my ($self) = @_; + $self->{im}->checkpoint; # PublicInbox::Import::checkpoint +} + +sub git_init { + my ($self, $new) = @_; + my $pfx = "$self->{-inbox}->{mainrepo}/git"; + my $git_dir = "$pfx/$new.git"; + die "$git_dir exists\n" if -e $git_dir; + my @cmd = (qw(git init --bare -q), $git_dir); + PublicInbox::Import::run_die(\@cmd); + @cmd = (qw/git config/, "--file=$git_dir/config", + 'repack.writeBitmaps', 'true'); + PublicInbox::Import::run_die(\@cmd); + + my $all = "$self->{-inbox}->{mainrepo}/all.git"; + unless (-d $all) { + @cmd = (qw(git init --bare -q), $all); + PublicInbox::Import::run_die(\@cmd); + } + + my $alt = "$all/objects/info/alternates"; + my $new_obj_dir = "../../git/$new.git/objects"; + my %alts; + if (-e $alt) { + open(my $fh, '<', $alt) or die "open < $alt: $!\n"; + %alts = map { chomp; $_ => 1 } (<$fh>); + } + return $git_dir if $alts{$new_obj_dir}; + open my $fh, '>>', $alt or die "open >> $alt: $!\n"; + print $fh "$new_obj_dir\n" or die "print >> $alt: $!\n"; + close $fh or die "close $alt: $!\n"; + $git_dir +} + +sub importer { + my ($self) = @_; + my $im = $self->{im}; + if ($im) { + if ($im->{bytes_added} < $self->{rotate_bytes}) { + return $im; + } else { + $self->{im} = undef; + $im->done; + $im = undef; + my $git_dir = $self->git_init(++$self->{max_git}); + my $git = PublicInbox::Git->new($git_dir); + return $self->import_init($git, 0); + } + } + my $latest; + my $max = -1; + my $new = 0; + my $pfx = "$self->{-inbox}->{mainrepo}/git"; + if (-d $pfx) { + foreach my $git_dir (glob("$pfx/*.git")) { + $git_dir =~ m!/(\d+)\.git\z! or next; + my $n = $1; + if ($n > $max) { + $max = $n; + $latest = $git_dir; + } + } + } + if (defined $latest) { + my $git = PublicInbox::Git->new($latest); + my $packed_bytes = $git->packed_bytes; + if ($packed_bytes >= $self->{rotate_bytes}) { + $new = $max + 1; + } else { + $self->{max_git} = $max; + return $self->import_init($git, $packed_bytes); + } + } else { + warn "latest not found in $pfx\n"; + } + $self->{max_git} = $new; + $latest = $self->git_init($new); + $self->import_init(PublicInbox::Git->new($latest), 0); +} + +sub import_init { + my ($self, $git, $packed_bytes) = @_; + my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox}); + $im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR); + $im->{ssoma_lock} = 0; + $im->{path_type} = 'v2'; + $self->{im} = $im; +} + +sub lookup_content { + undef # TODO +} + +1; diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox index 6ea2ca5d..c45dc4ee 100644 --- a/scripts/import_vger_from_mbox +++ b/scripts/import_vger_from_mbox @@ -7,25 +7,24 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use Date::Parse qw/str2time/; use Email::MIME; $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect -use PublicInbox::Git; -use PublicInbox::Import; -my $usage = "usage: $0 NAME EMAIL \$dry_run ); GetOptions(%opts) or die $usage; -chomp(my $git_dir = `git rev-parse --git-dir`); -my $git = PublicInbox::Git->new($git_dir); my $name = shift or die $usage; # git my $email = shift or die $usage; # git@vger.kernel.org -my $im = $dry_run ? undef : PublicInbox::Import->new($git, $name, $email); +my $mainrepo = shift or die $usage; # /path/to/v2/repo +my $v2ibx = { + mainrepo => $mainrepo, + name => $name, + -primary_address => $email, +}; +my $im = $dry_run ? undef : PublicInbox::V2Writable->new($v2ibx, 1); binmode STDIN; my $msg = ''; use PublicInbox::Filter::Vger; my $vger = PublicInbox::Filter::Vger->new; -if ($im) { - $im->{ssoma_lock} = 0; - $im->{path_type} = 'v2'; -} sub do_add ($$) { my ($im, $msg) = @_; diff --git a/t/git.t b/t/git.t index 5efc18ab..ab588a15 100644 --- a/t/git.t +++ b/t/git.t @@ -137,6 +137,9 @@ if (1) { is($all, join('', @ref), 'qx returned array when wanted'); my $nl = scalar @ref; ok($nl > 1, "qx returned array length of $nl"); + + $gcf->qx(qw(repack -adbq)); + ok($gcf->packed_bytes > 0, 'packed size is positive'); } done_testing();