fpath=($0:h:a/functions.zsh $fpath)
dst=$2:a
cd $1
+[[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a }
+mkdir -p $dst
autoload url-to-filename
zmodload -F zsh/datetime b:strftime
setopt EXTENDED_GLOB
wget_opts=(--output-document=$tmp $wget_opts)
}
fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
- wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn $url
- print $fn
+ wget $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
+ print $dst/$fn.warc
done < $new
}
Many feeds include links to so-called enclosures, like audio files for
podcasts. While your mail is not yet processed by the MUA and its
@file{new/} messages are still there, you can run the enclosure
downloading process, which
-uses @url{https://www.gnu.org/software/wget/, GNU Wget}. Specify the
-directory where your enclosures should be placed. Each enclosure's
-filename is more or less filesystem-friendly with the current timestamp
-in it.
+uses @url{https://www.gnu.org/software/wget/, GNU Wget}. Each
+enclosure's filename is more or less filesystem-friendly with the
+current timestamp in it.
@example
-$ mkdir path/to/enclosures
-$ ./feeds-encs.zsh path/to/enclosures
+$ ./feeds-encs.zsh
[...]
-traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
+monsterfeet.com_grue.rss/encs/20220218-152822-traffic.libsyn.com_monsterfeet_grue_018.mp3
+www.astronews.ru_astronews.xml/encs/20220219-115710-www.astronews.ru_news_2022_20220216125238.jpg
[...]
-$ file path/to/enclosures/traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
-path/to/...: Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural
+$ file feeds/**/encs/*/
+monsterfeet.com_grue.rss/encs/20220218-152822-traffic.libsyn.com_monsterfeet_grue_018.mp3:
+ Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural
+www.astronews.ru_astronews.xml/encs/20220219-115710-www.astronews.ru_news_2022_20220216125238.jpg:
+ JPEG image data, JFIF standard 1.01, ...
@end example
-@command{feeds-encs.zsh} do not parallelize jobs, because enclosure are
-often heavy enough to satiate your Internet link.
+@command{feeds-encs.zsh} does not parallelize jobs, because enclosures are
+often heavy enough to satiate your Internet link. @command{wget}'s
+progress is also printed both to stderr and @file{feeds/FEED/encs.log}.
+
+Of course you can also download only a single feed's enclosures:
+
+@example
+$ cmd/download-encs.sh path/to/FEED [optional overridden destination directory]
+@end example
@end table
@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} format.
@example
-$ mkdir path/to/warcs
-$ ./feeds-warcs.zsh path/to/warcs
+$ ./feeds-warcs.zsh
[...]
-www.darkside.ru_news_140480-20220218-145755.warc
+www.darkside.ru_news_rss/warcs/20220218-145755-www.darkside.ru_news_140480.warc
[...]
@end example
you can simply add them to running @command{tofuproxy}:
@example
-$ for w (path/to/warcs/*.warc) print $w > path/to/tofuproxy/fifos/add-warcs
+$ for w (feeds/*/warcs/*.warc) print $w:a > path/to/tofuproxy/fifos/add-warcs
@end example
And then visit @url{http://warc/} URL (when @command{tofuproxy} already
acts as a proxy) to view and visit existing URLs.
+
+Of course you can also download only a single feed's WARCs:
+
+@example
+$ cmd/download-warcs.sh path/to/FEED [optional overridden destination directory]
+@end example