From: Sergey Matveev
Date: Sun, 20 Feb 2022 16:06:47 +0000 (+0300)
Subject: Compress WARCs
X-Git-Url: http://www.git.stargrave.org/?p=feeder.git;a=commitdiff_plain;h=ef71dced397f07faa38537b7a214897025ee86fa

Compress WARCs
---

diff --git a/cmd/env.rc b/cmd/env.rc
index 1982a94..7c5874c 100644
--- a/cmd/env.rc
+++ b/cmd/env.rc
@@ -12,3 +12,12 @@ FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0}
 command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary"
 
 #MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}"
+
+_feeder_warc_compress() {
+    $HOME/work/tofuproxy/warc-extract.cmd -for-enzstd "$1" |
+    $HOME/work/tofuproxy/cmd/zstd/enzstd > "$1".zst
+    rm "$1"
+}
+
+FEEDER_WARC_COMPRESS=_feeder_warc_compress
+FEEDER_WARC_COMPRESS=: # do not compress
diff --git a/cmd/warcs.zsh b/cmd/warcs.zsh
index 4e3efd6..2f662ab 100755
--- a/cmd/warcs.zsh
+++ b/cmd/warcs.zsh
@@ -33,6 +33,7 @@ for new (new/*(N)) {
         }
         fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
         ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
-        print $dst/$fn.warc
+        $FEEDER_WARC_COMPRESS $dst/$fn.warc
+        print $dst/$fn.warc*
     done < $new
 }