Sergey Matveev's repositories - feeder.git/commitdiff
Compress WARCs
author Sergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 16:06:47 +0000 (19:06 +0300)
committer Sergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 16:06:47 +0000 (19:06 +0300)
cmd/env.rc
cmd/warcs.zsh

index 1982a94bcf6e8588a8a655e61827843dcb932515..7c5874cfc1ec699530212e1127a17ef7d77c1101 100644 (file)
@@ -12,3 +12,12 @@ FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0}
 command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary"
 
 #MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}"
+
+_feeder_warc_compress() { # $1: path of a WARC; writes "$1".zst, then deletes "$1"
+    "$HOME"/work/tofuproxy/warc-extract.cmd -for-enzstd "$1" |
+    "$HOME"/work/tofuproxy/cmd/zstd/enzstd > "$1".zst && # status is enzstd's (no pipefail)
+    rm "$1" # keep the original unless compression succeeded
+}
+
+FEEDER_WARC_COMPRESS=_feeder_warc_compress # command run with the WARC path as its argument
+FEEDER_WARC_COMPRESS=: # overrides the line above: ":" is a no-op, so do not compress
index 4e3efd647f2334de34edc26fe325539d69b0b7f7..2f662abe2df420a7534a9bd2ff5137db8f6c3ca5 100755 (executable)
@@ -33,6 +33,7 @@ for new (new/*(N)) {
         }
         fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
         ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
-        print $dst/$fn.warc
+        $FEEDER_WARC_COMPRESS $dst/$fn.warc
+        print $dst/$fn.warc*
     done < $new
 }