]> Sergey Matveev's repositories - feeder.git/blobdiff - cmd/env.rc
Compress WARCs
[feeder.git] / cmd / env.rc
index 6bb0cd09affc21ae23d3dea7ea9739291f87c57b..7c5874cfc1ec699530212e1127a17ef7d77c1101 100644 (file)
@@ -1,9 +1,9 @@
 CURL="${CURL:-curl}"
 ZSTD="${ZSTD:-zstdmt -19}"
 WGET="${WGET:-wget}"
-PARALLEL="${PARALLEL:-parallel --bar}"
+PARALLEL="${PARALLEL:-parallel --bar --shuf}"
 
-FEEDER_USER_AGENT="${FEEDER_USER_AGENT:-go.stargrave.org-feeder/0.1.0}"
+FEEDER_USER_AGENT="${FEEDER_USER_AGENT:-stargrave.org-feeder/0.1.0}"
 #FEEDER_CURL_VERBOSE=1
 FEEDER_MAX_ITEMS=${FEEDER_MAX_ITEMS:-100}
 FEEDER_DOWNLOAD_JOBS=${FEEDER_DOWNLOAD_JOBS:-10}
@@ -12,3 +12,12 @@ FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0}
 command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary"
 
 #MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}"
+
+_feeder_warc_compress() {
+    $HOME/work/tofuproxy/warc-extract.cmd -for-enzstd "$1" |
+    $HOME/work/tofuproxy/cmd/zstd/enzstd > "$1".zst
+    rm "$1"
+}
+
+FEEDER_WARC_COMPRESS=_feeder_warc_compress # overridden by the assignment on the next line
+FEEDER_WARC_COMPRESS=: # do not compress