From: Sergey Matveev Date: Sat, 19 Feb 2022 09:05:40 +0000 (+0300) Subject: encs/warcs destination is optional X-Git-Url: http://www.git.stargrave.org/?p=feeder.git;a=commitdiff_plain;h=a10ac881c31515d23afad734c36392a9391e95b1 encs/warcs destination is optional --- diff --git a/cmd/download-encs.zsh b/cmd/download-encs.zsh index 9e62ffd..81ac0a6 100755 --- a/cmd/download-encs.zsh +++ b/cmd/download-encs.zsh @@ -3,6 +3,8 @@ set -e fpath=($0:h:a/functions.zsh $fpath) dst=$2:a cd $1 +[[ -n "$dst" ]] || { dst=encs ; dst=$dst:a } +mkdir -p $dst autoload url-to-filename zmodload -F zsh/datetime b:strftime setopt EXTENDED_GLOB @@ -14,7 +16,7 @@ for new (new/*(N)) { url=$cols[2] [[ -n "$url" ]] fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S) - wget --output-file=enc.log --output-document=$dst/$fn $url - print $fn + wget --output-document=$dst/$fn $url 2>&2 2>enc.log + print $dst/$fn done < $new } diff --git a/cmd/download-warcs.zsh b/cmd/download-warcs.zsh index ac9e589..0198ca2 100755 --- a/cmd/download-warcs.zsh +++ b/cmd/download-warcs.zsh @@ -3,6 +3,8 @@ set -e fpath=($0:h:a/functions.zsh $fpath) dst=$2:a cd $1 +[[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a } +mkdir -p $dst autoload url-to-filename zmodload -F zsh/datetime b:strftime setopt EXTENDED_GLOB @@ -27,7 +29,7 @@ for new (new/*(N)) { wget_opts=(--output-document=$tmp $wget_opts) } fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S) - wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn $url - print $fn + wget $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url + print $dst/$fn.warc done < $new } diff --git a/doc/usage.texi b/doc/usage.texi index b59afea..d7eb951 100644 --- a/doc/usage.texi +++ b/doc/usage.texi @@ -184,22 +184,31 @@ $ cmd/download-clean.sh feed/FEED Many feeds include links to so-called enclosures, like audio files for podcasts. 
While you mail is not processed by MUA, its @file{new/} messages still there, you can run enclosure downloading process, that -uses @url{https://www.gnu.org/software/wget/, GNU Wget}. Specify the -directory where your enclosures should be placed. Each enclosure's -filename is more or less filesystem-friendly with the current timestamp -in it. +uses @url{https://www.gnu.org/software/wget/, GNU Wget}. Each +enclosure's filename is more or less filesystem-friendly with the +current timestamp in it. @example -$ mkdir path/to/enclosures -$ ./feeds-encs.zsh path/to/enclosures +$ ./feeds-encs.zsh [...] -traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822 +monsterfeet.com_grue.rss/encs/20220218-152822-traffic.libsyn.com_monsterfeet_grue_018.mp3 +www.astronews.ru_astronews.xml/encs/20220219-115710-www.astronews.ru_news_2022_20220216125238.jpg [...] -$ file path/to/enclosures/traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822 -path/to/...: Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural +$ file feeds/**/encs/*/ +monsterfeet.com_grue.rss/encs/20220218-152822-traffic.libsyn.com_monsterfeet_grue_018.mp3: + Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural +www.astronews.ru_astronews.xml/encs/20220219-115710-www.astronews.ru_news_2022_20220216125238.jpg: + JPEG image data, JFIF standard 1.01, ... @end example -@command{feeds-encs.zsh} do not parallelize jobs, because enclosure are -often heavy enough to satiate your Internet link. +@command{feeds-encs.zsh} does not parallelize jobs, because enclosures are +often heavy enough to saturate your Internet link. @command{wget}'s +progress is also printed both to stderr and @file{feeds/FEED/encs.log}. 
+ +Of course you can also download only a single feed's enclosures: + +@example +$ cmd/download-encs.zsh path/to/FEED [optional overridden destination directory] +@end example @end table diff --git a/doc/warcs.texi b/doc/warcs.texi index 638ab0a..ca151f0 100644 --- a/doc/warcs.texi +++ b/doc/warcs.texi @@ -10,10 +10,9 @@ able to output the whole document in @url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} format. @example -$ mkdir path/to/warcs -$ ./feeds-warcs.zsh path/to/warcs +$ ./feeds-warcs.zsh [...] -www.darkside.ru_news_140480-20220218-145755.warc +www.darkside.ru_news_rss/warcs/20220218-145755-www.darkside.ru_news_140480.warc [...] @end example @@ -23,8 +22,14 @@ help as an option. After you get pile of various @file{*.warc} files, you can simply add them to running @command{tofuproxy}: @example -$ for w (path/to/warcs/*.warc) print $w > path/to/tofuproxy/fifos/add-warcs +$ for w (feeds/*/warcs/*.warc) print $w:a > path/to/tofuproxy/fifos/add-warcs @end example And then visit @url{http://warc/} URL (when @command{tofuproxy} already acts as a proxy) to view and visit existing URLs. + +Of course you can also download only a single feed's WARCs: + +@example +$ cmd/download-warcs.zsh path/to/FEED [optional overridden destination directory] +@end example diff --git a/feeds-encs.zsh b/feeds-encs.zsh index eb01aea..4511af0 100755 --- a/feeds-encs.zsh +++ b/feeds-encs.zsh @@ -1,9 +1,5 @@ #!/usr/bin/env zsh dst=$1 -[[ -n "$dst" ]] || { - print Usage: $0 dst-dir-for-enclosures >&2 - exit 1 -} cmds=$0:h:a/cmd fpath=($cmds/functions.zsh $fpath) autoload print-joblog-failed diff --git a/feeds-warcs.zsh b/feeds-warcs.zsh index 6758c08..17e5333 100755 --- a/feeds-warcs.zsh +++ b/feeds-warcs.zsh @@ -1,9 +1,5 @@ #!/usr/bin/env zsh dst=$1 -[[ -n "$dst" ]] || { - print Usage: $0 dst-dir-for-warcs >&2 - exit 1 -} cmds=$0:h:a/cmd fpath=($cmds/functions.zsh $fpath) autoload print-joblog-failed