--- /dev/null
+#!/usr/bin/env zsh
+set -e
+fpath=($0:h:a/functions.zsh $fpath)
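+# make dst absolute before cd'ing into the feed's state directory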
+dst=$2:a
+cd $1
+autoload url-to-filename
+zmodload -F zsh/datetime b:strftime
+setopt EXTENDED_GLOB
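+# Every not-yet-read message resides in new/. Scan each message's
+# headers (they end at the first empty line) for X-Enclosure ones and
+# download every enclosure URL under a filesystem-friendly name with
+# the current timestamp appended.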
+for new (new/*(N)) {
+ while read line ; do
+ [[ "$line" != "" ]] || break
+ cols=(${(s: :)line})
+ [[ $cols[1] = "X-Enclosure:" ]] || continue
+ url=$cols[2]
+ [[ -n "$url" ]]
+ fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
+ wget --output-file=enc.log --output-document=$dst/$fn $url
+ print $fn
+ done < $new
+}
--- /dev/null
+#!/usr/bin/env zsh
+set -e
+fpath=($0:h:a/functions.zsh $fpath)
+dst=$2:a
+cd $1
+autoload url-to-filename
+zmodload -F zsh/datetime b:strftime
+setopt EXTENDED_GLOB
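+# GNU Wget options: fetch all page requisites (images, stylesheets,
+# ...), allow compressed transfer, and keep the WARC output plain:
+# no stored log record, no digests, no compression.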
+wget_opts=(
+ --page-requisites
+ --compression=auto
+ --no-warc-keep-log
+ --no-warc-digests
+ --no-warc-compression
+)
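+# Scan the headers of every not-yet-read message in new/ for X-URL
+# ones and mirror each URL into a timestamped WARC file under $dst.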
+for new (new/*(N)) {
+ while read line ; do
+ [[ "$line" != "" ]] || break
+ cols=(${(s: :)line})
+ [[ $cols[1] = "X-URL:" ]] || continue
+ url=$cols[2]
+ [[ -n "$url" ]]
+ [[ -n "$tmp" ]] || {
+ # Lazy temporary file creation
+ tmp=$(mktemp)
+ trap "rm -f $tmp" HUP PIPE INT QUIT TERM EXIT
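+ # the fetched payload itself goes to a throwaway file: the WARC file is the real output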
+ wget_opts=(--output-document=$tmp $wget_opts)
+ }
+ fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
+ wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn $url
+ print $fn
+ done < $new
+}
fd.WriteString("MIME-Version: 1.0\n")
fd.WriteString("Content-Type: text/html; charset=utf-8\n")
fd.WriteString("Content-Transfer-Encoding: base64\n")
+ for _, author := range item.Authors {
+ fd.WriteString("X-Author: " + author.Name + "\n")
+ }
for _, link := range item.Links {
fd.WriteString("X-URL: " + link + "\n")
}
- for _, author := range item.Authors {
- fd.WriteString("X-Author: " + author.Name + "\n")
+ for _, enc := range item.Enclosures {
+ fd.WriteString("X-Enclosure: " + enc.URL + "\n")
}
if len(item.Categories) > 0 {
fd.WriteString("X-Categories: " + strings.Join(item.Categories, ", ") + "\n")
--- /dev/null
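+# Print the failed jobs from GNU parallel's --joblog output fed on
+# stdin. The first line is the header and gives the index of the
+# Exitval column; every following row with a non-zero Exitval has the
+# last field of its command line printed.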
+local row
+read row
+local cols=(${(s: :)row})
+local exitvalI=${cols[(i)Exitval]}
+while read row ; do
+ cols=(${(s: :)row})
+ [[ ${cols[$exitvalI]} -ne 0 ]] || continue
+ print failed: ${cols[$#cols]}
+done
--- /dev/null
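+# Convert an URL to a filesystem-friendly filename: replace every "/"
+# with "_", then strip the scheme (everything up to the last "__") and
+# a possible trailing "_". For example:
+# http://blog.stargrave.org/russian/feed.atom -> blog.stargrave.org_russian_feed.atom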
+autoload regexp-replace
+local fn=${1:gs#/#_#}
+regexp-replace fn "^.*__" ""
+regexp-replace fn "_$" "" || :
+print $fn
macro index <F9> "<change-folder-readonly>search<enter>" "mu find results"
folder-hook search "set index_format = \"%4C [%D] %s (%F)\""
-unignore X-URL X-Author X-Category
+unignore X-Author X-URL X-Enclosure X-Categories
set folder = \`pwd\`
unmailboxes *
+++ /dev/null
-print-joglog-failed() {
- local row
- read row
- local cols=(${(s: :)row})
- local exitvalI=${cols[(i)Exitval]}
- while read row ; do
- cols=(${(s: :)row})
- [[ ${cols[$exitvalI]} -ne 0 ]] || continue
- print failed: ${cols[$#cols]}
- done
-}
@include storage.texi
@include mail.texi
@include usage.texi
+@include warcs.texi
@bye
Subject: Item's subject
Content-Type: text/html; charset=utf-8
[X-URL: link presented in the item] (maybe multiple)
+[X-Enclosure: enclosure link presented in the item] (maybe multiple)
[X-Author: author's name] (maybe multiple)
[X-Categories: item's comma separated categories]
http://blog.stargrave.org/russian/feed.atom
@end example
+@file{urls2feeds.zsh} won't touch already existing directories and will
+warn if any of them have disappeared from @file{urls}.
+
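+The script reads the list of feed URLs on its standard input:
+
+@example
+$ ./urls2feeds.zsh < urls
+@end example
+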
@item Download your feed(s) data
@example
$ ./feeds-parse.zsh # to parse all feeds in parallel
@end example
+@item Quick overview of the news
+
+@example
+$ ./feeds-news.zsh
+habr.com_ru_rss_interesting: 7
+habr.com_ru_rss_news: 3
+lobste.rs_rss: 3
+naked-science.ru_?yandex_feed=news: 1
+planet.fsfe.org_atom.xml: 1
+www.astronews.ru_astronews.xml: 1
+www.darkside.ru_news_rss: 5
+@end example
+
@item Run Mutt
@example
$ cmd/download-clean.sh feed/FEED
@end example
+@anchor{Enclosures}
+@item Download enclosures
+
+Many feeds include links to so-called enclosures, such as audio files
+for podcasts. While your mail is not yet processed by the MUA and its
+messages still sit in the @file{new/} directories, you can run the
+enclosure downloading process, which uses
+@url{https://www.gnu.org/software/wget/, GNU Wget}. Specify the
+directory where your enclosures should be placed. Each enclosure's
+filename is more or less filesystem-friendly and carries the current
+timestamp.
+
+@example
+$ mkdir path/to/enclosures
+$ ./feeds-encs.zsh path/to/enclosures
+[...]
+traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
+[...]
+$ file path/to/enclosures/traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
+path/to/...: Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1, 96 kbps, 44.1 kHz, Monaural
+@end example
+
+@command{feeds-encs.zsh} does not parallelize jobs, because enclosures
+are often heavy enough to saturate your Internet link.
+
@end table
--- /dev/null
+@node WARCs
+@unnumbered WARCs
+
+Similarly to @ref{Enclosures, enclosures} downloading, you may download
+the @code{X-URL} URLs, which point to the article itself. If it is an
+HTML document, then it can depend on various other resources, like
+images and stylesheets.
+@url{https://www.gnu.org/software/wget/, GNU Wget} is able to download
+it together with all the required page requisites. Moreover, it can
+save the whole download in
+@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} format.
+
+@example
+$ mkdir path/to/warcs
+$ ./feeds-warcs.zsh path/to/warcs
+[...]
+www.darkside.ru_news_140480-20220218-145755.warc
+[...]
+@end example
+
+WARC files are not compressed by default. Optionally you can both view
+and compress them with
+@url{https://www.tofuproxy.stargrave.org/WARCs.html, tofuproxy}'s help.
+After you have got a pile of various @file{*.warc} files, you can
+simply add them to a running @command{tofuproxy}:
+
+@example
+$ for w (path/to/warcs/*.warc) print $w > path/to/tofuproxy/fifos/add-warcs
+@end example
+
+Then visit the @url{http://warc/} URL (with @command{tofuproxy} already
+acting as your proxy) to view and browse the downloaded URLs.
#!/usr/bin/env zsh
cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/*
-. $cmds/print-failed.zsh.rc
-print-joglog-failed < download.log
+print-joblog-failed < download.log
--- /dev/null
+#!/usr/bin/env zsh
+dst=$1
+[[ -n "$dst" ]] || {
+ print Usage: $0 dst-dir-for-enclosures >&2
+ exit 1
+}
+cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
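+# --jobs 1: enclosures are heavy, download them sequentially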
+parallel --jobs 1 --joblog encs.log "$cmds/download-encs.zsh {} $dst" ::: feeds/*
+print-joblog-failed < encs.log
--- /dev/null
+#!/usr/bin/env zsh
+set -e
+setopt EXTENDED_GLOB
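+# print the number of not-yet-read (new/) messages for every feed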
+for f (feeds/*(/on)) {
+ news=($f/new/*(N))
+ [[ $#news -eq 0 ]] || print "$f:t": $#news
+}
#!/usr/bin/env zsh
cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/*
-. $cmds/print-failed.zsh.rc
-print-joglog-failed < parse.log
+print-joblog-failed < parse.log
--- /dev/null
+#!/usr/bin/env zsh
+dst=$1
+[[ -n "$dst" ]] || {
+ print Usage: $0 dst-dir-for-warcs >&2
+ exit 1
+}
+cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
+parallel --joblog warcs.log "$cmds/download-warcs.zsh {} $dst" ::: feeds/*
+print-joblog-failed < warcs.log
#!/usr/bin/env zsh
set -e
-autoload regexp-replace
+fpath=($0:h:a/cmd/functions.zsh $fpath)
+autoload url-to-filename
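+# remember each feed directory belonging to a listed URL, to be able
+# to warn about directories that disappeared from urls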
typeset -A seen
while read url ; do
url="$url " # to be sure that next line will work
url=${${=url}[1]}
- dir=${url:gs#/#_#}
- regexp-replace dir "^.*__" ""
- regexp-replace dir "_$" "" || :
- dir=feeds/$dir
+ dir=feeds/$(url-to-filename $url)
seen[$dir]=1
[[ -e $dir ]] && continue || :
mkdir -p $dir/{cur,new,tmp} # make it maildir