]> Sergey Matveev's repositories - feeder.git/commitdiff
Various refactoring and parametrizing
authorSergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 11:37:32 +0000 (14:37 +0300)
committerSergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 12:39:58 +0000 (15:39 +0300)
* cmd/env.rc has all options you can override
* HTTP/HTTPS proxy for curl/wget is controlled with
  http_proxy/https_proxy environment variables
* User-Agent is not hard-coded, can be even empty
* download-n-parse helper
* Example mailcap

19 files changed:
cmd/clear.zsh [new file with mode: 0755]
cmd/do-in-parallel.zsh [new file with mode: 0755]
cmd/download.sh
cmd/encs.zsh [moved from cmd/download-encs.zsh with 74% similarity]
cmd/env.rc [new file with mode: 0644]
cmd/muttrc-gen.sh
cmd/parse.sh
cmd/warcs.zsh [moved from cmd/download-warcs.zsh with 82% similarity]
contrib/mailcap [new file with mode: 0644]
doc/usage.texi
doc/warcs.texi
feeds-browse.sh [new file with mode: 0755]
feeds-browse.zsh [deleted file]
feeds-clear.zsh
feeds-dnp.zsh [new file with mode: 0755]
feeds-download.zsh
feeds-encs.zsh
feeds-parse.zsh
feeds-warcs.zsh

diff --git a/cmd/clear.zsh b/cmd/clear.zsh
new file mode 100755 (executable)
index 0000000..63eb946
--- /dev/null
@@ -0,0 +1,8 @@
+#!/usr/bin/env zsh
+set -e
+cmds=$0:h:a
+. $cmds/env.rc
+setopt EXTENDED_GLOB
+[[ -s $1/max ]] && max=`cat $1/max` || max=$FEEDER_MAX_ITEMS
+(( max++ ))
+[[ $max -eq 1 ]] || rm -fv $1/cur/*(Nom[$max,-1])
diff --git a/cmd/do-in-parallel.zsh b/cmd/do-in-parallel.zsh
new file mode 100755 (executable)
index 0000000..6389b33
--- /dev/null
@@ -0,0 +1,9 @@
+#!/usr/bin/env zsh
+set -e
+cmds=$0:h:a
+. $cmds/env.rc
+log=$1.log
+${=PARALLEL} --jobs ${(P)2} --joblog $log $3 ::: feeds/*
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
+print-joblog-failed < $log
index a70c6d7039dd0ae1ceeeb450c75b48dd00289f0c..2b642f6b02b09a239d28367f8ba8fa1eb9412c75 100755 (executable)
@@ -1,28 +1,28 @@
 #!/bin/sh -e
 
-PROXY="--proxy http://localhost:8080/"
+cmds="$(dirname "$(realpath -- "$0")")"
+. "$cmds/env.rc"
 cd "$1"
 read url < url
 [ -s etag ] && etag_compare="--etag-compare etag" || :
 [ -r out ] && time_cond="--time-cond out" || :
 [ -z "$FEEDER_CURL_VERBOSE" ] && silent="--silent" || silent="--verbose"
-curl --fail \
-    --user-agent "go.stargrave.org-feeder/0.1.0" \
+${CURL:-curl} --fail \
+    --user-agent "$FEEDER_USER_AGENT" \
     --compressed \
     --location --max-redirs 2 \
     --dump-header hdr \
     --output out \
     --remote-time \
     --etag-save etag \
-    $PROXY \
     $etag_compare \
     $time_cond \
     $silent \
     "$url" >&2
 if [ -s out ] ; then
-    zstdmt -19 < out > feed.zst
+    $ZSTD < out > feed.zst
     touch -r out feed.zst
     truncate -s 0 out
     touch -r feed.zst out
 fi
-sha512 < feed.zst > download.hash
+$SHA512 < feed.zst > download.hash
similarity index 74%
rename from cmd/download-encs.zsh
rename to cmd/encs.zsh
index b612aef0df897632c28b46517c78cde1e575a798..a3642668ce45c1d31b6a09aeb1b3b33fe8b0b498 100755 (executable)
@@ -1,6 +1,8 @@
 #!/usr/bin/env zsh
 set -e
-fpath=($0:h:a/functions.zsh $fpath)
+cmds=$0:h:a
+. $cmds/env.rc
+fpath=($cmds/functions.zsh $fpath)
 dst=$2:a
 cd $1
 [[ -n "$dst" ]] || { dst=encs ; dst=$dst:a }
@@ -16,7 +18,8 @@ for new (new/*(N)) {
         url=$cols[2]
         [[ -n "$url" ]]
         fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
-        wget --timestamping --output-document=$dst/$fn $url 2>&2 2>enc.log
+        ${=WGET} --user-agent=$FEEDER_USER_AGENT \
+            --output-document=$dst/$fn $url 2>&2 2>encs.log
         print $dst/$fn
     done < $new
 }
diff --git a/cmd/env.rc b/cmd/env.rc
new file mode 100644 (file)
index 0000000..6bb0cd0
--- /dev/null
@@ -0,0 +1,14 @@
+CURL="${CURL:-curl}"
+ZSTD="${ZSTD:-zstdmt -19}"
+WGET="${WGET:-wget}"
+PARALLEL="${PARALLEL:-parallel --bar}"
+
+FEEDER_USER_AGENT="${FEEDER_USER_AGENT:-go.stargrave.org-feeder/0.1.0}"
+#FEEDER_CURL_VERBOSE=1
+FEEDER_MAX_ITEMS=${FEEDER_MAX_ITEMS:-100}
+FEEDER_DOWNLOAD_JOBS=${FEEDER_DOWNLOAD_JOBS:-10}
+FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0}
+
+command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary"
+
+#MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}"
index c33161e76f13847f225dbc0ee6b5ca1538102bec..692a5aa0c58133d53edc6648ee8ad36a95a12b19 100755 (executable)
@@ -24,6 +24,9 @@ folder-hook search "set index_format = \"%4C [%D] %s (%F)\""
 
 unignore X-Author X-URL X-Enclosure X-Categories
 
+alternative_order text/plain text/html
+auto_view text/html
+
 set folder = \`pwd\`
 unmailboxes *
 mailboxes search
index b1e4a0268b90843892e7aad5a0fd6c1dd8c49fdd..5b598932a1c12fb77015fe001b8fe889a94c199d 100755 (executable)
@@ -1,11 +1,12 @@
 #!/bin/sh -e
 
 cmds="$(dirname "$(realpath -- "$0")")"
+. "$cmds/env.rc"
 cd "$1"
-[ -s parse.hash ] && hash_our=`cat parse.hash` || :
-[ -s download.hash ] && hash_their=`cat download.hash` || :
+[ -s parse.hash ] && hash_our="`cat parse.hash`" || :
+[ -s download.hash ] && hash_their="`cat download.hash`" || :
 [ "$hash_our" != "$hash_their" ] || exit 0
-[ -s max ] && max=`cat max` || max=${FEEDER_MAX_ITEMS:-100}
-zstd -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp
+[ -s max ] && max=`cat max` || max=$FEEDER_MAX_ITEMS
+$ZSTD -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp
 mv title.tmp title
-echo $hash_their > parse.hash
+echo "$hash_their" > parse.hash
similarity index 82%
rename from cmd/download-warcs.zsh
rename to cmd/warcs.zsh
index 30d8202f4d536d6aacf6ac5eb0f25e4095ad3c47..4e3efd647f2334de34edc26fe325539d69b0b7f7 100755 (executable)
@@ -1,6 +1,8 @@
 #!/usr/bin/env zsh
 set -e
-fpath=($0:h:a/functions.zsh $fpath)
+cmds=$0:h:a
+. $cmds/env.rc
+fpath=($cmds/functions.zsh $fpath)
 dst=$2:a
 cd $1
 [[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a }
@@ -9,6 +11,7 @@ autoload url-to-filename
 zmodload -F zsh/datetime b:strftime
 setopt EXTENDED_GLOB
 wget_opts=(
+    --user-agent="$FEEDER_USER_AGENT"
     --page-requisites
     --compression=auto
     --no-warc-keep-log
@@ -29,7 +32,7 @@ for new (new/*(N)) {
             wget_opts=(--output-document=$tmp $wget_opts)
         }
         fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
-        wget $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
+        ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
         print $dst/$fn.warc
     done < $new
 }
diff --git a/contrib/mailcap b/contrib/mailcap
new file mode 100644 (file)
index 0000000..9fb0ecd
--- /dev/null
@@ -0,0 +1,2 @@
+# text/html; w3m -T text/html -I %{charset} -dump %s; copiousoutput; nametemplate=%s.html
+text/html; lynx -assume_charset=%{charset} -dump %s; copiousoutput; nametemplate=%s.html
index d7eb951665529e99ae35d01e1f3b921af5ed7ad9..328201a42f8cd1f0b2b23751d84a240d03bedbc6 100644 (file)
@@ -39,6 +39,13 @@ http://blog.stargrave.org/russian/feed.atom
 @command{urls2feeds.zsh} won't touch already existing directories and will
 warn if some of them disappeared from @file{urls}.
 
+@item Check configuration options
+
+@file{cmd/env.rc} contains a list of various options you can override by
+environment variables, like @command{curl}, @command{wget},
+@command{zstd}, @command{parallel} command invocations,
+@code{User-Agent}, number of download/parse jobs run in parallel and so on.
+
 @item Download your feed(s) data
 
 @example
@@ -58,6 +65,14 @@ $ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom
 $ ./feeds-parse.zsh # to parse all feeds in parallel
 @end example
 
+@item Download-n-parse
+
+You can also download and parse the feeds immediately:
+
+@example
+$ ./feeds-dnp.zsh
+@end example
+
 @item Quick overview of the news:
 
 @example
@@ -74,13 +89,20 @@ www.darkside.ru_news_rss: 5
 @item Run Mutt
 
 @example
-$ ./feeds-browse.zsh
+$ ./feeds-browse.sh
 @end example
 
 That will read all feeds titles and create @file{mutt.rc} sourceable
 configuration file with predefined helpers and @code{mailboxes}
-commands. Mutt will be started in mailboxes browser mode (I will skip
-many entries):
+commands.
+
+That configuration contains @code{auto_view text/html}, which expects a
+proper @file{mailcap} configuration file with a @code{text/html} entry to
+exist. Mutt has some built-in default search paths for it, but you can
+override them with the @env{$MAILCAPS} environment variable. There is an
+example in @file{contrib/mailcap}.
+
+Mutt will be started in mailboxes browser mode (I will skip many entries):
 
 @verbatim
   1   N [  1|101] 2021-02-17 20:41 Cryptology ePrint Archive/
@@ -162,6 +184,7 @@ Parser only appends them, but does not remove obsolete ones.
 
 @example
 $ ./feeds-clear.zsh
+$ cmd/clear.zsh feeds/FEED # to clear single feed
 @end example
 
 will clear everything exceeding the quantity limit. You can set that
@@ -208,7 +231,7 @@ progress is also printed both to stderr and @file{feeds/FEED/encs.log}.
 Of course you can also download only single feed's enclosures:
 
 @example
-$ cmd/download-encs.sh path/to/FEED [optional overriden destination directory]
+$ cmd/encs.zsh path/to/FEED [optional overridden destination directory]
 @end example
 
 @end table
index ca151f05fae15fd21d957d1a65fd656ed6601675..0c179f5af6c513db2fdf76d42bf1ffc2c1cf2d62 100644 (file)
@@ -31,5 +31,5 @@ acts as a proxy) to view and visit existing URLs.
 Of course you can also download only single feed's enclosures:
 
 @example
-$ cmd/download-warcs.sh path/to/FEED [optional overriden destination directory]
+$ cmd/warcs.zsh path/to/FEED [optional overridden destination directory]
 @end example
diff --git a/feeds-browse.sh b/feeds-browse.sh
new file mode 100755 (executable)
index 0000000..c728c0c
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+cmds="$(dirname "$(realpath -- "$0")")"/cmd
+muttrc_their="$($cmds/muttrc-gen.sh)"
+[ -r mutt.rc ] && muttrc_our="$(cat mutt.rc)" || :
+[ "$muttrc_our" = "$muttrc_their" ] || cat > mutt.rc <<EOF
+$muttrc_their
+EOF
+mutt -e "source mutt.rc" -y
diff --git a/feeds-browse.zsh b/feeds-browse.zsh
deleted file mode 100755 (executable)
index 80d18b2..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env zsh
-set -e
-muttrc_their=`$0:h/cmd/muttrc-gen.sh`
-[[ -r mutt.rc ]] && muttrc_our=`cat mutt.rc` || :
-[[ "$muttrc_our" = "$muttrc_their" ]] || cat > mutt.rc <<EOF
-$muttrc_their
-EOF
-mutt -e "source mutt.rc" -y
index c1a55aad69a220aae9ad8c8af7c7db218ee4a132..e9424ca7554ed296fdba5695e071deed99cf4f73 100755 (executable)
@@ -1,8 +1,4 @@
 #!/usr/bin/env zsh
 set -e
-setopt EXTENDED_GLOB
-for f (feeds/*) {
-    [[ -s $f/max ]] && max=`cat $f/max` || max=${FEEDER_MAX_ITEMS:-100}
-    (( max++ ))
-    [[ $max -eq 1 ]] || rm -fv $f/cur/*(Nom[$max,-1])
-}
+cmds=$0:h:a/cmd
+for f (feeds/*) $cmds/clear.zsh $f
diff --git a/feeds-dnp.zsh b/feeds-dnp.zsh
new file mode 100755 (executable)
index 0000000..487ec2a
--- /dev/null
@@ -0,0 +1,3 @@
+#!/usr/bin/env zsh
+cmds=$0:h:a/cmd
+exec $cmds/do-in-parallel.zsh dnp FEEDER_DOWNLOAD_JOBS "$cmds/download.sh {} ; $cmds/parse.sh {}"
index 6ddfb63040690a9b952a5b897850df9f0cef812f..06ba96c650f219062c77a16bacc6c9a9f3fba597 100755 (executable)
@@ -1,6 +1,3 @@
 #!/usr/bin/env zsh
 cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/*
-print-joblog-failed < download.log
+exec $cmds/do-in-parallel.zsh download FEEDER_DOWNLOAD_JOBS "$cmds/download.sh {}"
index 4511af015ff127dd43a44651c88271a60fee603e..fb6e4e173f6efe2dbeeed5424522ac222e20c4fe 100755 (executable)
@@ -1,7 +1,3 @@
 #!/usr/bin/env zsh
-dst=$1
 cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --jobs 1 --joblog encs.log "$cmds/download-encs.zsh {} $dst" ::: feeds/*
-print-joblog-failed < encs.log
+JOBS=1 exec $cmds/do-in-parallel.zsh encs JOBS "$cmds/encs.zsh {} $1"
index 4dcc61e80bd7313f09dc51d4a551ca0102d2c66f..f5ca7c1c258171507402d5710f9a22128d7998a4 100755 (executable)
@@ -1,6 +1,3 @@
 #!/usr/bin/env zsh
 cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/*
-print-joblog-failed < parse.log
+exec $cmds/do-in-parallel.zsh parse FEEDER_PARSE_JOBS "$cmds/parse.sh {}"
index 17e5333ae1ec4a6eed855a5e6703da8dc88bc404..8a24dab2ec389cb2bdeeb0149ec42edab04df512 100755 (executable)
@@ -1,7 +1,3 @@
 #!/usr/bin/env zsh
-dst=$1
 cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog warcs.log "$cmds/download-warcs.zsh {} $dst" ::: feeds/*
-print-joblog-failed < warcs.log
+exec $cmds/do-in-parallel.zsh warcs FEEDER_DOWNLOAD_JOBS "$cmds/warcs.zsh {} $1"