Various refactoring and parametrizing

author Sergey Matveev <stargrave@stargrave.org>

Sun, 20 Feb 2022 11:37:32 +0000 (14:37 +0300)

committer Sergey Matveev <stargrave@stargrave.org>

Sun, 20 Feb 2022 12:39:58 +0000 (15:39 +0300)
author Sergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 11:37:32 +0000 (14:37 +0300)
committer Sergey Matveev <stargrave@stargrave.org>
Sun, 20 Feb 2022 12:39:58 +0000 (15:39 +0300)
diff --git a/cmd/clear.zsh b/cmd/clear.zsh

new file mode 100755 (executable)

index 0000000..63eb946
--- /dev/null
+++ b/cmd/clear.zsh
@@ -0,0 +1,8 @@
+#!/usr/bin/env zsh
+set -e
+cmds=$0:h:a
+. $cmds/env.rc
+setopt EXTENDED_GLOB
+[[ -s $1/max ]] && max=`cat $1/max` || max=$FEEDER_MAX_ITEMS
+(( max++ ))
+[[ $max -eq 1 ]] || rm -fv $1/cur/*(Nom[$max,-1])
diff --git a/cmd/do-in-parallel.zsh b/cmd/do-in-parallel.zsh

new file mode 100755 (executable)

index 0000000..6389b33
--- /dev/null
+++ b/cmd/do-in-parallel.zsh
@@ -0,0 +1,9 @@
+#!/usr/bin/env zsh
+set -e
+cmds=$0:h:a
+. $cmds/env.rc
+log=$1.log
+${=PARALLEL} --jobs ${(P)2} --joblog $log $3 ::: feeds/*
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
+print-joblog-failed < $log
diff --git a/cmd/download.sh b/cmd/download.sh

index a70c6d7039dd0ae1ceeeb450c75b48dd00289f0c..2b642f6b02b09a239d28367f8ba8fa1eb9412c75 100755 (executable)
--- a/cmd/download.sh
+++ b/cmd/download.sh
@@ -1,28 +1,28 @@
  #!/bin/sh -e
  
-PROXY="--proxy http://localhost:8080/"
+cmds="$(dirname "$(realpath -- "$0")")"
+. "$cmds/env.rc"
  cd "$1"
  read url < url
  [ -s etag ] && etag_compare="--etag-compare etag" || :
  [ -r out ] && time_cond="--time-cond out" || :
  [ -z "$FEEDER_CURL_VERBOSE" ] && silent="--silent" || silent="--verbose"
-curl --fail \
-    --user-agent "go.stargrave.org-feeder/0.1.0" \
+${CURL:-curl} --fail \
+    --user-agent "$FEEDER_USER_AGENT" \
      --compressed \
      --location --max-redirs 2 \
      --dump-header hdr \
      --output out \
      --remote-time \
      --etag-save etag \
-    $PROXY \
      $etag_compare \
      $time_cond \
      $silent \
      "$url" >&2
  if [ -s out ] ; then
-    zstdmt -19 < out > feed.zst
+    $ZSTD < out > feed.zst
      touch -r out feed.zst
      truncate -s 0 out
      touch -r feed.zst out
  fi
-sha512 < feed.zst > download.hash
+$SHA512 < feed.zst > download.hash
diff --git a/cmd/download-encs.zsh b/cmd/encs.zsh

similarity index 74%

rename from cmd/download-encs.zsh

rename to cmd/encs.zsh

index b612aef0df897632c28b46517c78cde1e575a798..a3642668ce45c1d31b6a09aeb1b3b33fe8b0b498 100755 (executable)
--- a/cmd/download-encs.zsh
+++ b/cmd/encs.zsh
@@ -1,6 +1,8 @@
  #!/usr/bin/env zsh
  set -e
-fpath=($0:h:a/functions.zsh $fpath)
+cmds=$0:h:a
+. $cmds/env.rc
+fpath=($cmds/functions.zsh $fpath)
  dst=$2:a
  cd $1
  [[ -n "$dst" ]] || { dst=encs ; dst=$dst:a }
@@ -16,7 +18,8 @@ for new (new/*(N)) {
          url=$cols[2]
          [[ -n "$url" ]]
          fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
-        wget --timestamping --output-document=$dst/$fn $url 2>&2 2>enc.log
+        ${=WGET} --user-agent=$FEEDER_USER_AGENT \
+            --output-document=$dst/$fn $url 2>&2 2>encs.log
          print $dst/$fn
      done < $new
  }
diff --git a/cmd/env.rc b/cmd/env.rc

new file mode 100644 (file)

index 0000000..6bb0cd0
--- /dev/null
+++ b/cmd/env.rc
@@ -0,0 +1,14 @@
+CURL="${CURL:-curl}"
+ZSTD="${ZSTD:-zstdmt -19}"
+WGET="${WGET:-wget}"
+PARALLEL="${PARALLEL:-parallel --bar}"
+
+FEEDER_USER_AGENT="${FEEDER_USER_AGENT:-go.stargrave.org-feeder/0.1.0}"
+#FEEDER_CURL_VERBOSE=1
+FEEDER_MAX_ITEMS=${FEEDER_MAX_ITEMS:-100}
+FEEDER_DOWNLOAD_JOBS=${FEEDER_DOWNLOAD_JOBS:-10}
+FEEDER_PARSE_JOBS=${FEEDER_PARSE_JOBS:-0}
+
+command -v sha512 >/dev/null && SHA512="sha512" || SHA512="sha512sum --binary"
+
+#MAILCAPS="${MAILCAPS:-$cmds/../contrib/mailcap}"
diff --git a/cmd/muttrc-gen.sh b/cmd/muttrc-gen.sh

index c33161e76f13847f225dbc0ee6b5ca1538102bec..692a5aa0c58133d53edc6648ee8ad36a95a12b19 100755 (executable)
--- a/cmd/muttrc-gen.sh
+++ b/cmd/muttrc-gen.sh
@@ -24,6 +24,9 @@ folder-hook search "set index_format = \"%4C [%D] %s (%F)\""
  
  unignore X-Author X-URL X-Enclosure X-Categories
  
+alternative_order text/plain text/html
+auto_view text/html
+
  set folder = \`pwd\`
  unmailboxes *
  mailboxes search
diff --git a/cmd/parse.sh b/cmd/parse.sh

index b1e4a0268b90843892e7aad5a0fd6c1dd8c49fdd..5b598932a1c12fb77015fe001b8fe889a94c199d 100755 (executable)
--- a/cmd/parse.sh
+++ b/cmd/parse.sh
@@ -1,11 +1,12 @@
  #!/bin/sh -e
  
  cmds="$(dirname "$(realpath -- "$0")")"
+. "$cmds/env.rc"
  cd "$1"
-[ -s parse.hash ] && hash_our=`cat parse.hash` || :
-[ -s download.hash ] && hash_their=`cat download.hash` || :
+[ -s parse.hash ] && hash_our="`cat parse.hash`" || :
+[ -s download.hash ] && hash_their="`cat download.hash`" || :
  [ "$hash_our" != "$hash_their" ] || exit 0
-[ -s max ] && max=`cat max` || max=${FEEDER_MAX_ITEMS:-100}
-zstd -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp
+[ -s max ] && max=`cat max` || max=$FEEDER_MAX_ITEMS
+$ZSTD -d < feed.zst | $cmds/feed2mdir/feed2mdir -max-entries $max . > title.tmp
  mv title.tmp title
-echo $hash_their > parse.hash
+echo "$hash_their" > parse.hash
diff --git a/cmd/download-warcs.zsh b/cmd/warcs.zsh

similarity index 82%

rename from cmd/download-warcs.zsh

rename to cmd/warcs.zsh

index 30d8202f4d536d6aacf6ac5eb0f25e4095ad3c47..4e3efd647f2334de34edc26fe325539d69b0b7f7 100755 (executable)
--- a/cmd/download-warcs.zsh
+++ b/cmd/warcs.zsh
@@ -1,6 +1,8 @@
  #!/usr/bin/env zsh
  set -e
-fpath=($0:h:a/functions.zsh $fpath)
+cmds=$0:h:a
+. $cmds/env.rc
+fpath=($cmds/functions.zsh $fpath)
  dst=$2:a
  cd $1
  [[ -n "$dst" ]] || { dst=warcs ; dst=$dst:a }
@@ -9,6 +11,7 @@ autoload url-to-filename
  zmodload -F zsh/datetime b:strftime
  setopt EXTENDED_GLOB
  wget_opts=(
+    --user-agent="$FEEDER_USER_AGENT"
      --page-requisites
      --compression=auto
      --no-warc-keep-log
@@ -29,7 +32,7 @@ for new (new/*(N)) {
              wget_opts=(--output-document=$tmp $wget_opts)
          }
          fn=$(strftime %Y%m%d-%H%M%S)-$(url-to-filename $url)
-        wget $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
+        ${=WGET} $wget_opts --output-file=warcs.log --warc-file=$dst/$fn $url
          print $dst/$fn.warc
      done < $new
  }
diff --git a/contrib/mailcap b/contrib/mailcap

new file mode 100644 (file)

index 0000000..9fb0ecd
--- /dev/null
+++ b/contrib/mailcap
@@ -0,0 +1,2 @@
+# text/html; w3m -T text/html -I %{charset} -dump %s; copiousoutput; nametemplate=%s.html
+text/html; lynx -assume_charset=%{charset} -dump %s; copiousoutput; nametemplate=%s.html
diff --git a/doc/usage.texi b/doc/usage.texi

index d7eb951665529e99ae35d01e1f3b921af5ed7ad9..328201a42f8cd1f0b2b23751d84a240d03bedbc6 100644 (file)
--- a/doc/usage.texi
+++ b/doc/usage.texi
@@ -39,6 +39,13 @@ http://blog.stargrave.org/russian/feed.atom
  @command{urls2feeds.zsh} won't touch already existing directories and will
  warn if some of them disappeared from @file{urls}.
  
+@item Check configuration options
+
+@file{cmd/env.rc} contains a list of various options you can override
+with environment variables, such as the @command{curl}, @command{wget},
+@command{zstd} and @command{parallel} command invocations, the
+@code{User-Agent}, the number of download/parse jobs run in parallel and so on.
+
  @item Download your feed(s) data
  
  @example
@@ -58,6 +65,14 @@ $ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom
  $ ./feeds-parse.zsh # to parse all feeds in parallel
  @end example
  
+@item Download-n-parse
+
+You can also download and parse the feeds immediately:
+
+@example
+$ ./feeds-dnp.zsh
+@end example
+
  @item Quick overview of the news:
  
  @example
@@ -74,13 +89,20 @@ www.darkside.ru_news_rss: 5
  @item Run Mutt
  
  @example
-$ ./feeds-browse.zsh
+$ ./feeds-browse.sh
  @end example
  
  That will read all feeds titles and create @file{mutt.rc} sourceable
  configuration file with predefined helpers and @code{mailboxes}
-commands. Mutt will be started in mailboxes browser mode (I will skip
-many entries):
+commands.
+
+That configuration contains @code{auto_view text/html}, which expects a
+proper @file{mailcap} configuration file with a @code{text/html} entry to
+exist. Mutt has some built-in default search paths for it, but you can
+override them with the @env{$MAILCAPS} environment variable. There is
+an example in @file{contrib/mailcap}.
+
+Mutt will be started in mailboxes browser mode (I will skip many entries):
  
  @verbatim
    1   N [  1|101] 2021-02-17 20:41 Cryptology ePrint Archive/
@@ -162,6 +184,7 @@ Parser only appends them, but does not remove obsolete ones.
  
  @example
  $ ./feeds-clear.zsh
+$ cmd/clear.zsh feeds/FEED # to clear a single feed
  @end example
  
  will clear everything exceeding the quantity limit. You can set that
@@ -208,7 +231,7 @@ progress is also printed both to stderr and @file{feeds/FEED/encs.log}.
  Of course you can also download only single feed's enclosures:
  
  @example
-$ cmd/download-encs.sh path/to/FEED [optional overriden destination directory]
+$ cmd/encs.zsh path/to/FEED [optional overridden destination directory]
  @end example
  
  @end table
diff --git a/doc/warcs.texi b/doc/warcs.texi

index ca151f05fae15fd21d957d1a65fd656ed6601675..0c179f5af6c513db2fdf76d42bf1ffc2c1cf2d62 100644 (file)
--- a/doc/warcs.texi
+++ b/doc/warcs.texi
@@ -31,5 +31,5 @@ acts as a proxy) to view and visit existing URLs.
  Of course you can also download only single feed's enclosures:
  
  @example
-$ cmd/download-warcs.sh path/to/FEED [optional overriden destination directory]
+$ cmd/warcs.zsh path/to/FEED [optional overridden destination directory]
  @end example
diff --git a/feeds-browse.sh b/feeds-browse.sh

new file mode 100755 (executable)

index 0000000..c728c0c
--- /dev/null
+++ b/feeds-browse.sh
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+cmds="$(dirname "$(realpath -- "$0")")"/cmd
+muttrc_their="$($cmds/muttrc-gen.sh)"
+[ -r mutt.rc ] && muttrc_our="$(cat mutt.rc)" || :
+[ "$muttrc_our" = "$muttrc_their" ] || cat > mutt.rc <<EOF
+$muttrc_their
+EOF
+mutt -e "source mutt.rc" -y
diff --git a/feeds-browse.zsh b/feeds-browse.zsh

deleted file mode 100755 (executable)

index 80d18b2..0000000
--- a/feeds-browse.zsh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env zsh
-set -e
-muttrc_their=`$0:h/cmd/muttrc-gen.sh`
-[[ -r mutt.rc ]] && muttrc_our=`cat mutt.rc` || :
-[[ "$muttrc_our" = "$muttrc_their" ]] || cat > mutt.rc <<EOF
-$muttrc_their
-EOF
-mutt -e "source mutt.rc" -y
diff --git a/feeds-clear.zsh b/feeds-clear.zsh

index c1a55aad69a220aae9ad8c8af7c7db218ee4a132..e9424ca7554ed296fdba5695e071deed99cf4f73 100755 (executable)
--- a/feeds-clear.zsh
+++ b/feeds-clear.zsh
@@ -1,8 +1,4 @@
  #!/usr/bin/env zsh
  set -e
-setopt EXTENDED_GLOB
-for f (feeds/*) {
-    [[ -s $f/max ]] && max=`cat $f/max` || max=${FEEDER_MAX_ITEMS:-100}
-    (( max++ ))
-    [[ $max -eq 1 ]] || rm -fv $f/cur/*(Nom[$max,-1])
-}
+cmds=$0:h:a/cmd
+for f (feeds/*) $cmds/clear.zsh $f
diff --git a/feeds-dnp.zsh b/feeds-dnp.zsh

new file mode 100755 (executable)

index 0000000..487ec2a
--- /dev/null
+++ b/feeds-dnp.zsh
@@ -0,0 +1,3 @@
+#!/usr/bin/env zsh
+cmds=$0:h:a/cmd
+exec $cmds/do-in-parallel.zsh dnp FEEDER_DOWNLOAD_JOBS "$cmds/download.sh {} ; $cmds/parse.sh {}"
diff --git a/feeds-download.zsh b/feeds-download.zsh

index 6ddfb63040690a9b952a5b897850df9f0cef812f..06ba96c650f219062c77a16bacc6c9a9f3fba597 100755 (executable)
--- a/feeds-download.zsh
+++ b/feeds-download.zsh
@@ -1,6 +1,3 @@
  #!/usr/bin/env zsh
  cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/*
-print-joblog-failed < download.log
+exec $cmds/do-in-parallel.zsh download FEEDER_DOWNLOAD_JOBS "$cmds/download.sh {}"
diff --git a/feeds-encs.zsh b/feeds-encs.zsh

index 4511af015ff127dd43a44651c88271a60fee603e..fb6e4e173f6efe2dbeeed5424522ac222e20c4fe 100755 (executable)
--- a/feeds-encs.zsh
+++ b/feeds-encs.zsh
@@ -1,7 +1,3 @@
  #!/usr/bin/env zsh
-dst=$1
  cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --jobs 1 --joblog encs.log "$cmds/download-encs.zsh {} $dst" ::: feeds/*
-print-joblog-failed < encs.log
+JOBS=1 exec $cmds/do-in-parallel.zsh encs JOBS "$cmds/encs.zsh {} $1"
diff --git a/feeds-parse.zsh b/feeds-parse.zsh

index 4dcc61e80bd7313f09dc51d4a551ca0102d2c66f..f5ca7c1c258171507402d5710f9a22128d7998a4 100755 (executable)
--- a/feeds-parse.zsh
+++ b/feeds-parse.zsh
@@ -1,6 +1,3 @@
  #!/usr/bin/env zsh
  cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/*
-print-joblog-failed < parse.log
+exec $cmds/do-in-parallel.zsh parse FEEDER_PARSE_JOBS "$cmds/parse.sh {}"
diff --git a/feeds-warcs.zsh b/feeds-warcs.zsh

index 17e5333ae1ec4a6eed855a5e6703da8dc88bc404..8a24dab2ec389cb2bdeeb0149ec42edab04df512 100755 (executable)
--- a/feeds-warcs.zsh
+++ b/feeds-warcs.zsh
@@ -1,7 +1,3 @@
  #!/usr/bin/env zsh
-dst=$1
  cmds=$0:h:a/cmd
-fpath=($cmds/functions.zsh $fpath)
-autoload print-joblog-failed
-parallel --joblog warcs.log "$cmds/download-warcs.zsh {} $dst" ::: feeds/*
-print-joblog-failed < warcs.log
+exec $cmds/do-in-parallel.zsh warcs FEEDER_DOWNLOAD_JOBS "$cmds/warcs.zsh {} $1"
author	Sergey Matveev <stargrave@stargrave.org>
	Sun, 20 Feb 2022 11:37:32 +0000 (14:37 +0300)
committer	Sergey Matveev <stargrave@stargrave.org>
	Sun, 20 Feb 2022 12:39:58 +0000 (15:39 +0300)
cmd/clear.zsh	[new file with mode: 0755]	patch \| blob
cmd/do-in-parallel.zsh	[new file with mode: 0755]	patch \| blob
cmd/download.sh		patch \| blob \| history
cmd/encs.zsh	[moved from cmd/download-encs.zsh with 74% similarity]	patch \| blob \| history
cmd/env.rc	[new file with mode: 0644]	patch \| blob
cmd/muttrc-gen.sh		patch \| blob \| history
cmd/parse.sh		patch \| blob \| history
cmd/warcs.zsh	[moved from cmd/download-warcs.zsh with 82% similarity]	patch \| blob \| history
contrib/mailcap	[new file with mode: 0644]	patch \| blob
doc/usage.texi		patch \| blob \| history
doc/warcs.texi		patch \| blob \| history
feeds-browse.sh	[new file with mode: 0755]	patch \| blob
feeds-browse.zsh	[deleted file]	patch \| blob \| history
feeds-clear.zsh		patch \| blob \| history
feeds-dnp.zsh	[new file with mode: 0755]	patch \| blob
feeds-download.zsh		patch \| blob \| history
feeds-encs.zsh		patch \| blob \| history
feeds-parse.zsh		patch \| blob \| history
feeds-warcs.zsh		patch \| blob \| history