From e18c7b0710b49655fd3b66896b8404381174e982 Mon Sep 17 00:00:00 2001 From: Sergey Matveev Date: Fri, 18 Feb 2022 11:20:51 +0300 Subject: [PATCH] Various refactoring * No excess redo * parallel is used, with failed jobs printing * All commands can be run outside the project * feed.zst -- it multiple times smaller * Various small fixes --- cmd/download-clean.sh | 4 ++++ cmd/download.sh | 28 +++++++++++++++++++++++++ mutt.rc.do => cmd/muttrc-gen.sh | 4 +++- cmd/parse.sh | 10 +++++++++ cmd/print-failed.zsh.rc | 11 ++++++++++ default.clean.do | 3 --- default.download.do | 29 -------------------------- default.parse.do | 3 --- doc/index.texi | 13 ++++++------ doc/storage.texi | 15 +++++++------ doc/usage.texi | 37 +++++++-------------------------- feeds-browse.sh | 4 ---- feeds-browse.zsh | 4 ++++ feeds-download.zsh | 6 ++++-- feeds-parse.zsh | 6 ++++-- 15 files changed, 90 insertions(+), 87 deletions(-) create mode 100755 cmd/download-clean.sh create mode 100755 cmd/download.sh rename mutt.rc.do => cmd/muttrc-gen.sh (94%) mode change 100644 => 100755 create mode 100755 cmd/parse.sh create mode 100644 cmd/print-failed.zsh.rc delete mode 100644 default.clean.do delete mode 100644 default.download.do delete mode 100644 default.parse.do delete mode 100755 feeds-browse.sh create mode 100755 feeds-browse.zsh diff --git a/cmd/download-clean.sh b/cmd/download-clean.sh new file mode 100755 index 0000000..084d5ee --- /dev/null +++ b/cmd/download-clean.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e + +cd "$1" +rm -fv download.hash etag feed hdr out diff --git a/cmd/download.sh b/cmd/download.sh new file mode 100755 index 0000000..b3a42a6 --- /dev/null +++ b/cmd/download.sh @@ -0,0 +1,28 @@ +#!/bin/sh -e + +PROXY="--proxy http://localhost:8080/" +cd "$1" +read url < url +[ -s etag ] && etag_compare="--etag-compare etag" || etag_compare="" +[ -r out ] && time_cond="--time-cond out" || time_cond="" +[ -z "$FEEDER_CURL_VERBOSE" ] && silent="--silent" || silent="--verbose" +curl --fail \ + 
--user-agent "go.stargrave.org-feeder/0.1.0" \ + --compressed \ + --location --max-redirs 2 \ + --dump-header hdr \ + --output out \ + --remote-time \ + --etag-save etag \ + $PROXY \ + $etag_compare \ + $time_cond \ + $silent \ + "$url" >&2 +if [ -s out ] ; then + zstdmt -19 < out > feed.zst + touch -r out feed.zst + truncate -s 0 out + touch -r feed.zst out +fi +sha512 < feed.zst > download.hash diff --git a/mutt.rc.do b/cmd/muttrc-gen.sh old mode 100644 new mode 100755 similarity index 94% rename from mutt.rc.do rename to cmd/muttrc-gen.sh index d980ab1..9c3c432 --- a/mutt.rc.do +++ b/cmd/muttrc-gen.sh @@ -1,4 +1,5 @@ -find feeds -name title | sort | xargs redo-ifchange +#!/bin/sh -e + cat <&2 diff --git a/cmd/parse.sh b/cmd/parse.sh new file mode 100755 index 0000000..ca3bf2a --- /dev/null +++ b/cmd/parse.sh @@ -0,0 +1,10 @@ +#!/bin/sh -e + +cmds="$(dirname "$(realpath "$0")")" +cd "$1" +[ -s parse.hash ] && hash_our=`cat parse.hash` || hash_our="" +[ -s download.hash ] && hash_their=`cat download.hash` || hash_their="" +[ "$hash_our" != "$hash_their" ] || exit 0 +zstd -d < feed.zst | $cmds/feed2mdir/feed2mdir . 
> title.tmp +mv title.tmp title +echo $hash_their > parse.hash diff --git a/cmd/print-failed.zsh.rc b/cmd/print-failed.zsh.rc new file mode 100644 index 0000000..ea52596 --- /dev/null +++ b/cmd/print-failed.zsh.rc @@ -0,0 +1,11 @@ +print-joglog-failed() { + local row + read row + local cols=(${(s: :)row}) + local exitvalI=${cols[(i)Exitval]} + while read row ; do + cols=(${(s: :)row}) + [[ ${cols[$exitvalI]} -ne 0 ]] || continue + print "Failed: ${cols[$#cols]}" + done +} diff --git a/default.clean.do b/default.clean.do deleted file mode 100644 index 931bb9a..0000000 --- a/default.clean.do +++ /dev/null @@ -1,3 +0,0 @@ -d=${1%/*} -cd $d -rm etag feed* hdr out diff --git a/default.download.do b/default.download.do deleted file mode 100644 index 6591f7d..0000000 --- a/default.download.do +++ /dev/null @@ -1,29 +0,0 @@ -PROXY="--proxy http://localhost:8080/" -d=${1%/*} -[ -z "$FEEDER_SKIP_DOWNLOAD" ] || { - echo temporarily skipping feed download >&2 - sha512 < $d/feed - exit -} -read url < $d/url -[ -s $d/etag ] && etag_compare="--etag-compare $d/etag" || etag_compare="" -[ -z "$FEEDER_CURL_VERBOSE" ] && silent="--silent" || silent="--verbose" -curl --fail \ - --user-agent "go.stargrave.org-feeder/0.1.0" \ - --compressed \ - --location --max-redirs 2 \ - --dump-header $d/hdr \ - --output $d/out \ - --remote-time \ - --time-cond $d/out \ - --etag-save $d/etag \ - $PROXY \ - $silent \ - $etag_compare \ - "$url" >&2 -if [ -s $d/out ] ; then - cp -a $d/out $d/feed - truncate -s 0 $d/out - touch -r $d/feed $d/out -fi -sha512 < $d/feed diff --git a/default.parse.do b/default.parse.do deleted file mode 100644 index 8c44c45..0000000 --- a/default.parse.do +++ /dev/null @@ -1,3 +0,0 @@ -d=${1%/*} -FEEDER_SKIP_DOWNLOAD=1 redo-ifchange $d/feed.download -cmd/feed2mdir/feed2mdir $d < $d/feed > $d/title diff --git a/doc/index.texi b/doc/index.texi index 808bd33..6a26bf9 100644 --- a/doc/index.texi +++ b/doc/index.texi @@ -1,6 +1,6 @@ \input texinfo @documentencoding UTF-8 
-@settitle feeder +@settitle go.stargrave.org/feeder @copying Copyright @copyright{} 2022 @email{stargrave@@stargrave.org, Sergey Matveev} @@ -33,17 +33,18 @@ Its architecture is very simple: @url{https://en.wikipedia.org/wiki/Maildir, Maildir}. @item Automatically generated @url{http://www.mutt.org/, Mutt} source file contains convenient options and human-readable mailboxes list. -@item @url{http://cr.yp.to/redo.html, redo} is used for parallel - invocation of download/parsing jobs and skipping parsing of - unchanged feeds. +@item @url{https://www.gnu.org/software/parallel/, GNU parallel} for + parallel invocation of download/parsing jobs. @item @url{https://www.djcbsoftware.nl/code/mu/, mu} utilities perform indexing and searching among the messages. Of course it would be trivial to use @url{https://www.lesbonscomptes.com/recoll/, recoll} or @url{https://notmuchmail.org/, notmuch} instead. If you need that. -@item Bunch of @url{https://www.zsh.org/, Zsh} helper scripts, that are - completely optional and pretty trivial. +@item Bunch of optional @url{https://www.zsh.org/, Zsh} helper scripts. @end itemize +But of course it has its price: original feed data, its Base64-encoded +copy in mailbox and another copy in @command{mu}'s Xapian database. + @insertcopying @include storage.texi diff --git a/doc/storage.texi b/doc/storage.texi index 8b9ad85..e015254 100644 --- a/doc/storage.texi +++ b/doc/storage.texi @@ -16,14 +16,13 @@ Those files are used by @command{curl} to keep the content, its proper @code{mtime} (for @code{If-Modified-Since} header generation), @code{ETag} and response headers for debugging. -@item feed -It contains the content itself. - -@item feed.download -Used as intermediate target for the @command{redo} build system and -contains SHA-512 hash of the @file{feed} file. It is used to nearly -completely skip file copying/moving on filesystem if feed was not -modified. +@item feed.zst +It contains the content itself. 
Compressed with +@url{https://facebook.github.io/zstd/, Zstandard}. + +@item download.hash, parse.hash +SHA-512 hash of the @file{feed.zst}, used to determine if feed was +updated and parser has to do the job. @item title Automatically generated file with the title of the feed. diff --git a/doc/usage.texi b/doc/usage.texi index c99e5a8..280cce0 100644 --- a/doc/usage.texi +++ b/doc/usage.texi @@ -38,42 +38,21 @@ http://blog.stargrave.org/russian/feed.atom @item Download your feed(s) data -Downloading is implemented in @command{redo}'s -@file{default.download.do} file. Probably you want to change its default -@env{$PROXY} value. It uses @command{curl}, that is aware of -@code{If-Modified-Since} and @code{ETag} headers, compressed content -encodings and HTTP redirections. - -You can invoke feed downloading like that: - @example -$ redo feeds/blog.stargrave.org_russian_feed.atom/feed.download +$ cmd/download.sh feeds/blog.stargrave.org_russian_feed.atom +$ ./feeds-download.zsh # to invoke parallel downloading of everything @end example +Probably you want to change its default @env{$PROXY} value. It uses +@command{curl}, that is aware of @code{If-Modified-Since} and +@code{ETag} headers, compressed content encodings and HTTP redirections. If you want to see verbose output, then set @env{FEEDER_CURL_VERBOSE=1}. -As a rule, you wish to run all feeds downloading in parallel. You can -use @file{feeds-download.zsh}, that just invokes @command{redo-ifchange} -with @option{-f} option (forceful rebuild). Why that? Because most -@command{redo} implementations (that forces target building) do not -parallelize specified targets build. But you can also use @command{parallel}: - -@example -$ parallel "redo @{@}/feed.download" ::: feeds/* -@end example - @item Parse your feeds -Parsing (and Maildir filling) is implemented in @command{redo}'s -@file{default.parse.do} file. 
It calls @command{cmd/feed2mdir/feed2mdir} -utility, that read @file{feeds/FEED/feed} from stdin, takes Maildir -directory as its first argument and prints feed's title. - -You can use @file{feeds-parse.zsh} helper or invoke @command{parallel} -as in example above, replacing @code{.download} with @code{.parse}. - @example -$ ./feeds-parse.zsh +$ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom +$ ./feeds-parse.zsh # to parse all feeds in parallel @end example @item Run Mutt @@ -170,7 +149,7 @@ workers. @command{cmd/feed2mdir/feed2mdir} command by default has @item If you want to clean download state @example -$ redo feeds/FEED/feed.clean +$ cmd/download-clean.sh feeds/FEED @end example @end table diff --git a/feeds-browse.sh b/feeds-browse.sh deleted file mode 100755 index dfc3f63..0000000 --- a/feeds-browse.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -e - -redo-ifchange mutt.rc -mutt -e "source mutt.rc" -y diff --git a/feeds-browse.zsh b/feeds-browse.zsh new file mode 100755 index 0000000..e1a6af7 --- /dev/null +++ b/feeds-browse.zsh @@ -0,0 +1,4 @@ +#!/usr/bin/env zsh +set -e +$0:h/cmd/muttrc-gen.sh > mutt.rc +mutt -e "source mutt.rc" -y diff --git a/feeds-download.zsh b/feeds-download.zsh index 5a9375b..d83f1bb 100755 --- a/feeds-download.zsh +++ b/feeds-download.zsh @@ -1,3 +1,5 @@ #!/usr/bin/env zsh -set -e -redo-ifchange -f -j 10 `for f (feeds/*) print $f/feed.download` +cmds=$0:h:a/cmd +parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/* +. $cmds/print-failed.zsh.rc +print-joglog-failed < download.log diff --git a/feeds-parse.zsh b/feeds-parse.zsh index 17fbec7..f5af82d 100755 --- a/feeds-parse.zsh +++ b/feeds-parse.zsh @@ -1,3 +1,5 @@ #!/usr/bin/env zsh -set -e -redo-ifchange -j 16 `for f (feeds/*) print $f/feed.parse` +cmds=$0:h:a/cmd +parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/* +. $cmds/print-failed.zsh.rc +print-joglog-failed < parse.log -- 2.44.0