From: Sergey Matveev Date: Sun, 20 Feb 2022 11:37:32 +0000 (+0300) Subject: Various refactoring and parametrizing X-Git-Url: http://www.git.stargrave.org/?p=feeder.git;a=commitdiff_plain;h=9f2a1d215e7ad30b1f211785fe3384fea33ca56e Various refactoring and parametrizing * cmd/env.rc has all options you can override * HTTP/HTTP proxy for curl/wget is controlled with http_proxy/https_proxy environment variable * User-Agent is not hard-coded, can be even empty * download-n-parse helper * Example mailcap --- diff --git a/cmd/clear.zsh b/cmd/clear.zsh new file mode 100755 index 0000000..63eb946 --- /dev/null +++ b/cmd/clear.zsh @@ -0,0 +1,8 @@ +#!/usr/bin/env zsh +set -e +cmds=$0:h:a +. $cmds/env.rc +setopt EXTENDED_GLOB +[[ -s $1/max ]] && max=`cat $1/max` || max=$FEEDER_MAX_ITEMS +(( max++ )) +[[ $max -eq 1 ]] || rm -fv $1/cur/*(Nom[$max,-1]) diff --git a/cmd/do-in-parallel.zsh b/cmd/do-in-parallel.zsh new file mode 100755 index 0000000..6389b33 --- /dev/null +++ b/cmd/do-in-parallel.zsh @@ -0,0 +1,9 @@ +#!/usr/bin/env zsh +set -e +cmds=$0:h:a +. $cmds/env.rc +log=$1.log +${=PARALLEL} --jobs ${(P)2} --joblog $log $3 ::: feeds/* +fpath=($cmds/functions.zsh $fpath) +autoload print-joblog-failed +print-joblog-failed < $log diff --git a/cmd/download.sh b/cmd/download.sh index a70c6d7..2b642f6 100755 --- a/cmd/download.sh +++ b/cmd/download.sh @@ -1,28 +1,28 @@ #!/bin/sh -e -PROXY="--proxy http://localhost:8080/" +cmds="$(dirname "$(realpath -- "$0")")" +. 
"$cmds/env.rc" cd "$1" read url < url [ -s etag ] && etag_compare="--etag-compare etag" || : [ -r out ] && time_cond="--time-cond out" || : [ -z "$FEEDER_CURL_VERBOSE" ] && silent="--silent" || silent="--verbose" -curl --fail \ - --user-agent "go.stargrave.org-feeder/0.1.0" \ +${CURL:-curl} --fail \ + --user-agent "$FEEDER_USER_AGENT" \ --compressed \ --location --max-redirs 2 \ --dump-header hdr \ --output out \ --remote-time \ --etag-save etag \ - $PROXY \ $etag_compare \ $time_cond \ $silent \ "$url" >&2 if [ -s out ] ; then - zstdmt -19 < out > feed.zst + $ZSTD < out > feed.zst touch -r out feed.zst truncate -s 0 out touch -r feed.zst out fi -sha512 < feed.zst > download.hash +$SHA512 < feed.zst > download.hash diff --git a/cmd/download-encs.zsh b/cmd/encs.zsh similarity index 74% rename from cmd/download-encs.zsh rename to cmd/encs.zsh index b612aef..a364266 100755 --- a/cmd/download-encs.zsh +++ b/cmd/encs.zsh @@ -1,6 +1,8 @@ #!/usr/bin/env zsh set -e -fpath=($0:h:a/functions.zsh $fpath) +cmds=$0:h:a +. 
$cmds/env.rc +fpath=($cmds/functions.zsh $fpath) dst=$2:a cd $1 [[ -n "$dst" ]] || { dst=encs ; dst=$dst:a } @@ -16,7 +18,8 @@ for new (new/*(N)) { url=$cols[2] [[ -n "$url" ]] fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url) - wget --timestamping --output-document=$dst/$fn $url 2>&2 2>enc.log + ${=WGET} --user-agent=$FEEDER_USER_AGENT \ + --output-document=$dst/$fn $url 2>&2 2>encs.log print $dst/$fn done < $new } diff --git a/cmd/env.rc b/cmd/env.rc new file mode 100644 index 0000000..6bb0cd0 --- /dev/null +++ b/cmd/env.rc @@ -0,0 +1,14 @@ +CURL="${CURL:-curl}" +ZSTD="${ZSTD:-zstdmt -19}" +WGET="${WGET:-wget}" +PARALLEL="${PARALLEL:-parallel --bar}" + +FEEDER_USER_AGENT="${FEEDER_USER_AGENT:-go.stargrave.org-feeder/0.1.0}" +#FEEDER_CURL_VERBOSE=1 +FEEDER_MAX_ITEMS=${FEEDER_MAX_ITEMS:-100} +FEEDER_DOWNLOAD_JOBS=${FEEDER_DOWNLOAD_JOBS:-10} +FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0} + +command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary" + +#MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}" diff --git a/cmd/muttrc-gen.sh b/cmd/muttrc-gen.sh index c33161e..692a5aa 100755 --- a/cmd/muttrc-gen.sh +++ b/cmd/muttrc-gen.sh @@ -24,6 +24,9 @@ folder-hook search "set index_format = \"%4C [%D] %s (%F)\"" unignore X-Author X-URL X-Enclosure X-Categories +alternative_order text/plain text/html +auto_view text/html + set folder = \`pwd\` unmailboxes * mailboxes search diff --git a/cmd/parse.sh b/cmd/parse.sh index b1e4a02..5b59893 100755 --- a/cmd/parse.sh +++ b/cmd/parse.sh @@ -1,11 +1,12 @@ #!/bin/sh -e cmds="$(dirname "$(realpath -- "$0")")" +. 
"$cmds/env.rc" cd "$1" -[ -s parse.hash ] && hash_our=`cat parse.hash` || : -[ -s download.hash ] && hash_their=`cat download.hash` || : +[ -s parse.hash ] && hash_our="`cat parse.hash`" || : +[ -s download.hash ] && hash_their="`cat download.hash`" || : [ "$hash_our" != "$hash_their" ] || exit 0 -[ -s max ] && max=`cat max` || max=${FEEDER_MAX_ITEMS:-100} -zstd -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp +[ -s max ] && max=`cat max` || max=$FEEDER_MAX_ITEMS +$ZSTD -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp mv title.tmp title -echo $hash_their > parse.hash +echo "$hash_their" > parse.hash diff --git a/cmd/download-warcs.zsh b/cmd/warcs.zsh similarity index 82% rename from cmd/download-warcs.zsh rename to cmd/warcs.zsh index 30d8202..4e3efd6 100755 --- a/cmd/download-warcs.zsh +++ b/cmd/warcs.zsh @@ -1,6 +1,8 @@ #!/usr/bin/env zsh set -e -fpath=($0:h:a/functions.zsh $fpath) +cmds=$0:h:a +. $cmds/env.rc +fpath=($cmds/functions.zsh $fpath) dst=$2:a cd $1 [[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a } @@ -9,6 +11,7 @@ autoload url-to-filename zmodload -F zsh/datetime b:strftime setopt EXTENDED_GLOB wget_opts=( + --user-agent="$FEEDER_USER_AGENT" --page-requisites --compression=auto --no-warc-keep-log @@ -29,7 +32,7 @@ for new (new/*(N)) { wget_opts=(--output-document=$tmp $wget_opts) } fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url) - wget $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url + ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url print $dst/$fn.warc done < $new } diff --git a/contrib/mailcap b/contrib/mailcap new file mode 100644 index 0000000..9fb0ecd --- /dev/null +++ b/contrib/mailcap @@ -0,0 +1,2 @@ +# text/html; w3m -T text/html -I %{charset} -dump %s; copiousoutput; nametemplate=%s.html +text/html; lynx -assume_charset=%{charset} -dump %s; copiousoutput; nametemplate=%s.html diff --git a/doc/usage.texi b/doc/usage.texi index d7eb951..328201a 100644 --- 
a/doc/usage.texi +++ b/doc/usage.texi @@ -39,6 +39,13 @@ http://blog.stargrave.org/russian/feed.atom @command{urls2feeds.zsh} won't touch already existing directories and will warn if some of them disappeared from @file{urls}. +@item Check configuration options + +@file{cmd/env.rc} contains a list of various options you can override by +environment variables, like @command{curl}, @command{wget}, +@command{zstd}, @command{parallel} command invocations, +@code{User-Agent}, number of download/parse jobs run in parallel and so on. + @item Download your feed(s) data @example @@ -58,6 +65,14 @@ $ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom $ ./feeds-parse.zsh # to parse all feeds in parallel @end example +@item Download-n-parse + +You can also download and parse the feeds immediately: + +@example +$ ./feeds-dnp.zsh +@end example + @item Quick overview of the news: @example @@ -74,13 +89,20 @@ www.darkside.ru_news_rss: 5 @item Run Mutt @example -$ ./feeds-browse.zsh +$ ./feeds-browse.sh @end example That will read all feeds titles and create @file{mutt.rc} sourceable configuration file with predefined helpers and @code{mailboxes} -commands. Mutt will be started in mailboxes browser mode (I will skip -many entries): +commands. + +That configuration contains @code{auto_view text/html}, which expects +a proper @file{mailcap} configuration file with @code{text/html} entry to +exist. Mutt has some built-in default search paths for it, but you can +override them with @env{$MAILCAPS} environment variable. There is an +example @file{contrib/mailcap}. + +Mutt will be started in mailboxes browser mode (I will skip many entries): @verbatim 1 N [ 1|101] 2021-02-17 20:41 Cryptology ePrint Archive/ @@ -162,6 +184,7 @@ Parser only appends them, but does not remove obsolete ones. @example $ ./feeds-clear.zsh +$ cmd/clear.zsh feeds/FEED # to clear single feed @end example will clear everything exceeding the quantity limit. 
You can set that @@ -208,7 +231,7 @@ progress is also printed both to stderr and @file{feeds/FEED/encs.log}. Of course you can also download only single feed's enclosures: @example -$ cmd/download-encs.sh path/to/FEED [optional overriden destination directory] +$ cmd/encs.zsh path/to/FEED [optional overridden destination directory] @end example @end table diff --git a/doc/warcs.texi b/doc/warcs.texi index ca151f0..0c179f5 100644 --- a/doc/warcs.texi +++ b/doc/warcs.texi @@ -31,5 +31,5 @@ acts as a proxy) to view and visit existing URLs. Of course you can also download only single feed's enclosures: @example -$ cmd/download-warcs.sh path/to/FEED [optional overriden destination directory] +$ cmd/warcs.zsh path/to/FEED [optional overridden destination directory] @end example diff --git a/feeds-browse.sh b/feeds-browse.sh new file mode 100755 index 0000000..c728c0c --- /dev/null +++ b/feeds-browse.sh @@ -0,0 +1,8 @@ +#!/bin/sh -e +cmds="$(dirname "$(realpath -- "$0")")"/cmd +muttrc_their="$($cmds/muttrc-gen.sh)" +[ -r mutt.rc ] && muttrc_our="$(cat mutt.rc)" || : +[ "$muttrc_our" = "$muttrc_their" ] || cat > mutt.rc < mutt.rc <