Sergey Matveev's repositories - feeder.git/blob - cmd/download-warcs.zsh
Enclosures and WARCs downloader
[feeder.git] / cmd / download-warcs.zsh
#!/usr/bin/env zsh
# Download the URLs named in "X-URL:" header lines as WARC files.
#
# Usage: download-warcs.zsh DIR DST
#   DIR  directory containing a new/ subdirectory. Every file in new/ is
#        read up to its first empty line; each line whose first
#        space-separated field is "X-URL:" contributes the URL found in
#        its second field.
#   DST  directory the WARC files are written into.
#
# For every URL, wget is run with --warc-file=DST/<name>, where <name> is
# the output of url-to-filename followed by a -YYYYmmdd-HHMMSS timestamp.
# <name> is printed on stdout after each successful download. wget's log
# goes to warc.log inside DIR. Any failing command aborts the script
# (set -e).

set -e
(( $# >= 2 )) || {
    print -u2 -r -- "Usage: ${0:t} DIR DST"
    exit 1
}
fpath=($0:h:a/functions.zsh $fpath)
# Resolve DST before changing directory, so a relative path keeps its meaning.
dst=$2:a
cd $1
autoload url-to-filename
zmodload -F zsh/datetime b:strftime
setopt EXTENDED_GLOB
wget_opts=(
    --page-requisites
    --compression=auto
    --no-warc-keep-log
    --no-warc-digests
    --no-warc-compression
)
# Start from a known-empty value: the lazy creation below must not be
# skipped because of a "tmp" variable inherited from the environment.
tmp=
for new (new/*(N)) {
    # -r: keep backslashes in header lines literal.
    while read -r line ; do
        # An empty line ends the header section; the rest is not scanned.
        [[ "$line" != "" ]] || break
        cols=(${(s: :)line})
        [[ $cols[1] = "X-URL:" ]] || continue
        url=$cols[2]
        [[ -n "$url" ]] || {
            print -u2 -r -- "$new: X-URL: header without an URL"
            exit 1
        }
        [[ -n "$tmp" ]] || {
            # Lazy temporary file creation: only needed once the first URL
            # is found. It receives the downloaded documents themselves and
            # is removed on exit, leaving the WARC as the kept result.
            tmp=$(mktemp)
            trap "rm -f -- ${(q)tmp}" HUP PIPE INT QUIT TERM EXIT
            wget_opts=(--output-document=$tmp $wget_opts)
        }
        fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
        # "--" keeps an URL beginning with "-" from being parsed as an option.
        wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn -- $url
        # -r --: print the name verbatim, whatever characters it contains.
        print -r -- $fn
    done < $new
}