From 388adb0a193d848b7408ab905b5ed5bfba319f9d Mon Sep 17 00:00:00 2001 From: Sergey Matveev Date: Fri, 18 Feb 2022 15:00:33 +0300 Subject: [PATCH] Enclosures and WARCs downloader --- cmd/download-encs.zsh | 20 ++++++++++++++ cmd/download-warcs.zsh | 33 ++++++++++++++++++++++ cmd/feed2mdir/main.go | 7 +++-- cmd/functions.zsh/print-joblog-failed | 9 ++++++ cmd/functions.zsh/url-to-filename | 5 ++++ cmd/muttrc-gen.sh | 2 +- cmd/print-failed.zsh.rc | 11 -------- doc/index.texi | 1 + doc/mail.texi | 1 + doc/usage.texi | 40 +++++++++++++++++++++++++++ doc/warcs.texi | 30 ++++++++++++++++++++ feeds-download.zsh | 5 ++-- feeds-encs.zsh | 11 ++++++++ feeds-news.zsh | 7 +++++ feeds-parse.zsh | 5 ++-- feeds-warcs.zsh | 11 ++++++++ urls2feeds.zsh | 8 ++---- 17 files changed, 183 insertions(+), 23 deletions(-) create mode 100755 cmd/download-encs.zsh create mode 100755 cmd/download-warcs.zsh create mode 100644 cmd/functions.zsh/print-joblog-failed create mode 100644 cmd/functions.zsh/url-to-filename delete mode 100644 cmd/print-failed.zsh.rc create mode 100644 doc/warcs.texi create mode 100755 feeds-encs.zsh create mode 100755 feeds-news.zsh create mode 100755 feeds-warcs.zsh diff --git a/cmd/download-encs.zsh b/cmd/download-encs.zsh new file mode 100755 index 0000000..9e62ffd --- /dev/null +++ b/cmd/download-encs.zsh @@ -0,0 +1,20 @@ +#!/usr/bin/env zsh +set -e +fpath=($0:h:a/functions.zsh $fpath) +dst=$2:a +cd $1 +autoload url-to-filename +zmodload -F zsh/datetime b:strftime +setopt EXTENDED_GLOB +for new (new/*(N)) { + while read line ; do + [[ "$line" != "" ]] || break + cols=(${(s: :)line}) + [[ $cols[1] = "X-Enclosure:" ]] || continue + url=$cols[2] + [[ -n "$url" ]] + fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S) + wget --output-file=enc.log --output-document=$dst/$fn $url + print $fn + done < $new +} diff --git a/cmd/download-warcs.zsh b/cmd/download-warcs.zsh new file mode 100755 index 0000000..ac9e589 --- /dev/null +++ b/cmd/download-warcs.zsh @@ -0,0 +1,33 @@ 
+#!/usr/bin/env zsh +set -e +fpath=($0:h:a/functions.zsh $fpath) +dst=$2:a +cd $1 +autoload url-to-filename +zmodload -F zsh/datetime b:strftime +setopt EXTENDED_GLOB +wget_opts=( + --page-requisites + --compression=auto + --no-warc-keep-log + --no-warc-digests + --no-warc-compression +) +for new (new/*(N)) { + while read line ; do + [[ "$line" != "" ]] || break + cols=(${(s: :)line}) + [[ $cols[1] = "X-URL:" ]] || continue + url=$cols[2] + [[ -n "$url" ]] + [[ -n "$tmp" ]] || { + # Lazy temporary file creation + tmp=`mktemp` + trap "rm -f $tmp" HUP PIPE INT QUIT TERM EXIT + wget_opts=(--output-document=$tmp $wget_opts) + } + fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S) + wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn $url + print $fn + done < $new +} diff --git a/cmd/feed2mdir/main.go b/cmd/feed2mdir/main.go index 70922b8..9fd657c 100644 --- a/cmd/feed2mdir/main.go +++ b/cmd/feed2mdir/main.go @@ -114,11 +114,14 @@ func main() { fd.WriteString("MIME-Version: 1.0\n") fd.WriteString("Content-Type: text/html; charset=utf-8\n") fd.WriteString("Content-Transfer-Encoding: base64\n") + for _, author := range item.Authors { + fd.WriteString("X-Author: " + author.Name + "\n") + } for _, link := range item.Links { fd.WriteString("X-URL: " + link + "\n") } - for _, author := range item.Authors { - fd.WriteString("X-Author: " + author.Name + "\n") + for _, enc := range item.Enclosures { + fd.WriteString("X-Enclosure: " + enc.URL + "\n") } if len(item.Categories) > 0 { fd.WriteString("X-Categories: " + strings.Join(item.Categories, ", ") + "\n") diff --git a/cmd/functions.zsh/print-joblog-failed b/cmd/functions.zsh/print-joblog-failed new file mode 100644 index 0000000..86b9f05 --- /dev/null +++ b/cmd/functions.zsh/print-joblog-failed @@ -0,0 +1,9 @@ +local row +read row +local cols=(${(s: :)row}) +local exitvalI=${cols[(i)Exitval]} +while read row ; do + cols=(${(s: :)row}) + [[ ${cols[$exitvalI]} -ne 0 ]] || continue + print failed: ${cols[$#cols]} +done 
diff --git a/cmd/functions.zsh/url-to-filename b/cmd/functions.zsh/url-to-filename new file mode 100644 index 0000000..a5540c5 --- /dev/null +++ b/cmd/functions.zsh/url-to-filename @@ -0,0 +1,5 @@ +autoload regexp-replace +local fn=${1:gs#/#_#} +regexp-replace fn "^.*__" "" +regexp-replace fn "_$" "" || : +print $fn diff --git a/cmd/muttrc-gen.sh b/cmd/muttrc-gen.sh index 9c3c432..81738e6 100755 --- a/cmd/muttrc-gen.sh +++ b/cmd/muttrc-gen.sh @@ -18,7 +18,7 @@ macro index "mu find --muhome mu --clearlinks --format=links macro index "search" "mu find results" folder-hook search "set index_format = \"%4C [%D] %s (%F)\"" -unignore X-URL X-Author X-Category +unignore X-Author X-URL X-Enclosure X-Categories set folder = \`pwd\` unmailboxes * diff --git a/cmd/print-failed.zsh.rc b/cmd/print-failed.zsh.rc deleted file mode 100644 index 9e7e665..0000000 --- a/cmd/print-failed.zsh.rc +++ /dev/null @@ -1,11 +0,0 @@ -print-joglog-failed() { - local row - read row - local cols=(${(s: :)row}) - local exitvalI=${cols[(i)Exitval]} - while read row ; do - cols=(${(s: :)row}) - [[ ${cols[$exitvalI]} -ne 0 ]] || continue - print failed: ${cols[$#cols]} - done -} diff --git a/doc/index.texi b/doc/index.texi index 6a26bf9..2848c57 100644 --- a/doc/index.texi +++ b/doc/index.texi @@ -50,5 +50,6 @@ copy in mailbox and another copy in @command{mu}'s Xapian database. 
@include storage.texi @include mail.texi @include usage.texi +@include warcs.texi @bye diff --git a/doc/mail.texi b/doc/mail.texi index 672a11f..b9353a1 100644 --- a/doc/mail.texi +++ b/doc/mail.texi @@ -9,6 +9,7 @@ Date: Item's updated/published date, now otherwise Subject: Item's subject Content-Type: text/html; charset=utf-8 [X-URL: link presented in the item] (maybe multiple) +[X-Enclosure: enclosure link presented in the item] (maybe multiple) [X-Author: author's name] (maybe multiple) [X-Categories: item's comma separated categories] diff --git a/doc/usage.texi b/doc/usage.texi index 280cce0..b709310 100644 --- a/doc/usage.texi +++ b/doc/usage.texi @@ -36,6 +36,9 @@ $ cat feeds/blog.stargrave.org_russian_feed.atom/url http://blog.stargrave.org/russian/feed.atom @end example +@file{urls2feeds.zsh} won't touch already existing directories and will +warn if some of them disappeared from @file{urls}. + @item Download your feed(s) data @example @@ -55,6 +58,19 @@ $ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom $ ./feeds-parse.zsh # to parse all feeds in parallel @end example +@item Quick overview of the news: + +@example +$ ./feeds-news.zsh +habr.com_ru_rss_interesting: 7 +habr.com_ru_rss_news: 3 +lobste.rs_rss: 3 +naked-science.ru_?yandex_feed=news: 1 +planet.fsfe.org_atom.xml: 1 +www.astronews.ru_astronews.xml: 1 +www.darkside.ru_news_rss: 5 +@end example + +@item Run Mutt @example @@ -152,4 +168,28 @@ workers. @command{cmd/feed2mdir/feed2mdir} command by default has $ cmd/download-clean.sh feed/FEED @end example +@anchor{Enclosures} +@item Download enclosures + +Many feeds include links to so-called enclosures, like audio files for +podcasts. While your mail is not processed by the MUA, its @file{new/} +messages are still there, so you can run the enclosure downloading process, which +uses @url{https://www.gnu.org/software/wget/, GNU Wget}. Specify the +directory where your enclosures should be placed.
Each enclosure's +filename is more or less filesystem-friendly with the current timestamp +in it. + +@example +$ mkdir path/to/enclosures +$ ./feeds-encs.zsh path/to/enclosures +[...] +traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822 +[...] +$ file path/to/enclosures/traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822 +path/to/...: Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural +@end example + +@command{feeds-encs.zsh} does not parallelize jobs, because enclosures are +often heavy enough to satiate your Internet link. + @end table diff --git a/doc/warcs.texi b/doc/warcs.texi new file mode 100644 index 0000000..638ab0a --- /dev/null +++ b/doc/warcs.texi @@ -0,0 +1,30 @@ +@node WARCs +@unnumbered WARCs + +Similarly to @ref{Enclosures, enclosures} downloading, you may run +downloading of @code{X-URL} URLs, pointing to the article itself. If it +is an HTML document, then it can depend on various other resources, like +images and stylesheets. @url{https://www.gnu.org/software/wget/, GNU Wget} +has the ability to download it with all required requisites. Moreover it is +able to output the whole document in +@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} format. + +@example +$ mkdir path/to/warcs +$ ./feeds-warcs.zsh path/to/warcs +[...] +www.darkside.ru_news_140480-20220218-145755.warc +[...] +@end example + +It is not compressed by default. You can both view and compress them +with @url{https://www.tofuproxy.stargrave.org/WARCs.html, tofuproxy}'s +help as an option. After you get a pile of various @file{*.warc} files, +you can simply add them to a running @command{tofuproxy}: + +@example +$ for w (path/to/warcs/*.warc) print $w > path/to/tofuproxy/fifos/add-warcs +@end example + +And then visit the @url{http://warc/} URL (when @command{tofuproxy} already +acts as a proxy) to view and visit existing URLs.
diff --git a/feeds-download.zsh b/feeds-download.zsh index d83f1bb..6ddfb63 100755 --- a/feeds-download.zsh +++ b/feeds-download.zsh @@ -1,5 +1,6 @@ #!/usr/bin/env zsh cmds=$0:h:a/cmd +fpath=($cmds/functions.zsh $fpath) +autoload print-joblog-failed parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/* -. $cmds/print-failed.zsh.rc -print-joglog-failed < download.log +print-joblog-failed < download.log diff --git a/feeds-encs.zsh b/feeds-encs.zsh new file mode 100755 index 0000000..eb01aea --- /dev/null +++ b/feeds-encs.zsh @@ -0,0 +1,11 @@ +#!/usr/bin/env zsh +dst=$1 +[[ -n "$dst" ]] || { + print Usage: $0 dst-dir-for-enclosures >&2 + exit 1 +} +cmds=$0:h:a/cmd +fpath=($cmds/functions.zsh $fpath) +autoload print-joblog-failed +parallel --jobs 1 --joblog encs.log "$cmds/download-encs.zsh {} $dst" ::: feeds/* +print-joblog-failed < encs.log diff --git a/feeds-news.zsh b/feeds-news.zsh new file mode 100755 index 0000000..3d45202 --- /dev/null +++ b/feeds-news.zsh @@ -0,0 +1,7 @@ +#!/usr/bin/env zsh +set -e +setopt EXTENDED_GLOB +for f (feeds/*(/on)) { + news=($f/new/*(N)) + [[ $#news -eq 0 ]] || print "$f:t": $#news +} diff --git a/feeds-parse.zsh b/feeds-parse.zsh index f5af82d..4dcc61e 100755 --- a/feeds-parse.zsh +++ b/feeds-parse.zsh @@ -1,5 +1,6 @@ #!/usr/bin/env zsh cmds=$0:h:a/cmd +fpath=($cmds/functions.zsh $fpath) +autoload print-joblog-failed parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/* -. 
$cmds/print-failed.zsh.rc -print-joglog-failed < parse.log +print-joblog-failed < parse.log diff --git a/feeds-warcs.zsh b/feeds-warcs.zsh new file mode 100755 index 0000000..6758c08 --- /dev/null +++ b/feeds-warcs.zsh @@ -0,0 +1,11 @@ +#!/usr/bin/env zsh +dst=$1 +[[ -n "$dst" ]] || { + print Usage: $0 dst-dir-for-warcs >&2 + exit 1 +} +cmds=$0:h:a/cmd +fpath=($cmds/functions.zsh $fpath) +autoload print-joblog-failed +parallel --joblog warcs.log "$cmds/download-warcs.zsh {} $dst" ::: feeds/* +print-joblog-failed < warcs.log diff --git a/urls2feeds.zsh b/urls2feeds.zsh index e878ece..c7e97f1 100755 --- a/urls2feeds.zsh +++ b/urls2feeds.zsh @@ -1,14 +1,12 @@ #!/usr/bin/env zsh set -e -autoload regexp-replace +fpath=($0:h:a/cmd/functions.zsh $fpath) +autoload url-to-filename typeset -A seen while read url ; do url="$url " # to be sure that next line will work url=${${=url}[1]} - dir=${url:gs#/#_#} - regexp-replace dir "^.*__" "" - regexp-replace dir "_$" "" || : - dir=feeds/$dir + dir=feeds/$(url-to-filename $url) seen[$dir]=1 [[ -e $dir ]] && continue || : mkdir -p $dir/{cur,new,tmp} # make it maildir -- 2.44.0