Sergey Matveev's repositories - feeder.git/blob - cmd/warcs
Use stdin for simplicity and flexibility
[feeder.git] / cmd / warcs
#!/usr/bin/env zsh
# Save the pages referenced by a feed's new messages as WARC files.
#
# Usage: warcs FEED-DIR [DST-DIR]
#
# For every file in FEED-DIR/new the header (all lines up to the first
# empty one) is scanned for "X-URL:" fields.  Each URL found is fetched
# with wget into DST-DIR (default: FEED-DIR/warcs), the resulting .warc
# is handed to $FEEDER_WARC_COMPRESS, and the path(s) of the produced
# file(s) are printed to stdout.
#
# Variables read: WGET, FEEDER_USER_AGENT, FEEDER_WARC_COMPRESS
# (presumably provided by env.rc, which is sourced below -- confirm there).

set -e
cmds=$0:h:a
. $cmds/env.rc
fpath=($cmds/functions.zsh $fpath)

# Without FEED-DIR the unquoted "cd $1" would silently switch to $HOME.
[[ -n "$1" ]] || {
    print -u2 "Usage: ${0:t} FEED-DIR [DST-DIR]"
    exit 1
}

# Resolve DST-DIR to an absolute path *before* changing directory, so a
# relative argument is interpreted against the caller's working directory.
dst=$2:a
cd $1
# No DST-DIR given: fall back to "warcs" inside the feed directory.
[[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a }
mkdir -p $dst

autoload url-to-filename
zmodload -F zsh/datetime b:strftime
setopt EXTENDED_GLOB

wget_opts=(
    --user-agent="$FEEDER_USER_AGENT"
    --page-requisites
    --compression=auto
    --no-warc-keep-log
    --no-warc-digests
    --no-warc-compression
)

# Start from a known-empty value: a "tmp" inherited from the environment
# must not suppress the lazy temporary file setup below.
tmp=

for new (new/*(N)) {
    # -r keeps backslashes in header lines literal.
    while read -r line ; do
        # An empty line terminates the header; nothing after it is inspected.
        [[ -n "$line" ]] || break
        cols=(${(s: :)line})
        [[ $cols[1] = "X-URL:" ]] || continue
        url=$cols[2]
        # An X-URL field without a value is fatal (as it was under set -e),
        # but now says why.
        [[ -n "$url" ]] || {
            print -u2 "${0:t}: empty X-URL in $new"
            exit 1
        }
        [[ -n "$tmp" ]] || {
            # Lazy temporary file creation: done once, on the first URL.
            tmp=$(mktemp)
            trap "rm -f -- ${(q)tmp}" HUP PIPE INT QUIT TERM EXIT
            # wget writes the fetched documents to the throwaway file.
            wget_opts=(--output-document=$tmp $wget_opts)
        }
        fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
        ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
        # Word-split the compressor like $WGET above, so a value carrying
        # arguments (e.g. "zstd --rm") works; when it is empty, skip the
        # step instead of trying to execute the .warc file itself.
        if [[ -n "$FEEDER_WARC_COMPRESS" ]] ; then
            ${=FEEDER_WARC_COMPRESS} $dst/$fn.warc
        fi
        # The glob also matches a compressed result such as .warc.zst.
        print -r -- $dst/$fn.warc*
    done < $new
}