]> Sergey Matveev's repositories - feeder.git/commitdiff
Enclosures and WARCs downloader
authorSergey Matveev <stargrave@stargrave.org>
Fri, 18 Feb 2022 12:00:33 +0000 (15:00 +0300)
committerSergey Matveev <stargrave@stargrave.org>
Fri, 18 Feb 2022 12:52:28 +0000 (15:52 +0300)
17 files changed:
cmd/download-encs.zsh [new file with mode: 0755]
cmd/download-warcs.zsh [new file with mode: 0755]
cmd/feed2mdir/main.go
cmd/functions.zsh/print-joblog-failed [new file with mode: 0644]
cmd/functions.zsh/url-to-filename [new file with mode: 0644]
cmd/muttrc-gen.sh
cmd/print-failed.zsh.rc [deleted file]
doc/index.texi
doc/mail.texi
doc/usage.texi
doc/warcs.texi [new file with mode: 0644]
feeds-download.zsh
feeds-encs.zsh [new file with mode: 0755]
feeds-news.zsh [new file with mode: 0755]
feeds-parse.zsh
feeds-warcs.zsh [new file with mode: 0755]
urls2feeds.zsh

diff --git a/cmd/download-encs.zsh b/cmd/download-encs.zsh
new file mode 100755 (executable)
index 0000000..9e62ffd
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/bin/env zsh
+set -e
+fpath=($0:h:a/functions.zsh $fpath)
+dst=$2:a
+cd $1
+autoload url-to-filename
+zmodload -F zsh/datetime b:strftime
+setopt EXTENDED_GLOB
+for new (new/*(N)) {
+    while read line ; do
+        [[ "$line" != "" ]] || break
+        cols=(${(s: :)line})
+        [[ $cols[1] = "X-Enclosure:" ]] || continue
+        url=$cols[2]
+        [[ -n "$url" ]]
+        fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
+        wget --output-file=enc.log --output-document=$dst/$fn $url
+        print $fn
+    done < $new
+}
diff --git a/cmd/download-warcs.zsh b/cmd/download-warcs.zsh
new file mode 100755 (executable)
index 0000000..ac9e589
--- /dev/null
@@ -0,0 +1,33 @@
+#!/usr/bin/env zsh
+set -e
+fpath=($0:h:a/functions.zsh $fpath)
+dst=$2:a
+cd $1
+autoload url-to-filename
+zmodload -F zsh/datetime b:strftime
+setopt EXTENDED_GLOB
+wget_opts=(
+    --page-requisites
+    --compression=auto
+    --no-warc-keep-log
+    --no-warc-digests
+    --no-warc-compression
+)
+for new (new/*(N)) {
+    while read line ; do
+        [[ "$line" != "" ]] || break
+        cols=(${(s: :)line})
+        [[ $cols[1] = "X-URL:" ]] || continue
+        url=$cols[2]
+        [[ -n "$url" ]]
+        [[ -n "$tmp" ]] || {
+            # Lazy temporary file creation
+            tmp=`mktemp`
+            trap "rm -f $tmp" HUP PIPE INT QUIT TERM EXIT
+            wget_opts=(--output-document=$tmp $wget_opts)
+        }
+        fn=$(url-to-filename $url)-$(strftime %Y%m%d-%H%M%S)
+        wget $wget_opts --output-file=warc.log --warc-file=$dst/$fn $url
+        print $fn
+    done < $new
+}
index 70922b871fefaf19827cbafbbad0b7aedf2b923b..9fd657ce4e14a3d12a304b5271bdec61116d29c7 100644 (file)
@@ -114,11 +114,14 @@ func main() {
                fd.WriteString("MIME-Version: 1.0\n")
                fd.WriteString("Content-Type: text/html; charset=utf-8\n")
                fd.WriteString("Content-Transfer-Encoding: base64\n")
+               for _, author := range item.Authors {
+                       fd.WriteString("X-Author: " + author.Name + "\n")
+               }
                for _, link := range item.Links {
                        fd.WriteString("X-URL: " + link + "\n")
                }
-               for _, author := range item.Authors {
-                       fd.WriteString("X-Author: " + author.Name + "\n")
+               for _, enc := range item.Enclosures {
+                       fd.WriteString("X-Enclosure: " + enc.URL + "\n")
                }
                if len(item.Categories) > 0 {
                        fd.WriteString("X-Categories: " + strings.Join(item.Categories, ", ") + "\n")
diff --git a/cmd/functions.zsh/print-joblog-failed b/cmd/functions.zsh/print-joblog-failed
new file mode 100644 (file)
index 0000000..86b9f05
--- /dev/null
@@ -0,0 +1,9 @@
+local row
+read row
+local cols=(${(s:      :)row})
+local exitvalI=${cols[(i)Exitval]}
+while read row ; do
+    cols=(${(s:        :)row})
+    [[ ${cols[$exitvalI]} -ne 0 ]] || continue
+    print failed: ${cols[$#cols]}
+done
diff --git a/cmd/functions.zsh/url-to-filename b/cmd/functions.zsh/url-to-filename
new file mode 100644 (file)
index 0000000..a5540c5
--- /dev/null
@@ -0,0 +1,5 @@
+autoload regexp-replace
+local fn=${1:gs#/#_#}
+regexp-replace fn "^.*__" ""
+regexp-replace fn "_$" "" || :
+print $fn
index 9c3c432879061a8a30dfcaadb2f015123cd97f9d..81738e60eecd463329bb88ca32e91e4d03020590 100755 (executable)
@@ -18,7 +18,7 @@ macro index <F8> "<shell-escape>mu find --muhome mu --clearlinks --format=links
 macro index <F9> "<change-folder-readonly>search<enter>" "mu find results"
 folder-hook search "set index_format = \"%4C [%D] %s (%F)\""
 
-unignore X-URL X-Author X-Category
+unignore X-Author X-URL X-Enclosure X-Categories
 
 set folder = \`pwd\`
 unmailboxes *
diff --git a/cmd/print-failed.zsh.rc b/cmd/print-failed.zsh.rc
deleted file mode 100644 (file)
index 9e7e665..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-print-joglog-failed() {
-    local row
-    read row
-    local cols=(${(s:  :)row})
-    local exitvalI=${cols[(i)Exitval]}
-    while read row ; do
-        cols=(${(s:    :)row})
-        [[ ${cols[$exitvalI]} -ne 0 ]] || continue
-        print failed: ${cols[$#cols]}
-    done
-}
index 6a26bf9c07eaf33ff90c4f1f3145de20b57f3dd2..2848c571099e5cb8fd2dcba2cc7ba1e230921d6f 100644 (file)
@@ -50,5 +50,6 @@ copy in mailbox and another copy in @command{mu}'s Xapian database.
 @include storage.texi
 @include mail.texi
 @include usage.texi
+@include warcs.texi
 
 @bye
index 672a11f1c752109d5310708366b67afcf96de1df..b9353a1278597d4851be3500598adb4a3eba1cb6 100644 (file)
@@ -9,6 +9,7 @@ Date: Item's updated/published date, now otherwise
 Subject: Item's subject
 Content-Type: text/html; charset=utf-8
 [X-URL: link presented in the item] (maybe multiple)
+[X-Enclosure: enclosure link presented in the item] (maybe multiple)
 [X-Author: author's name] (maybe multiple)
 [X-Categories: item's comma separated categories]
 
index 280cce0e80e82491046af4fb150c27b6fd89fe93..b7093104240788e2ea013c3e650d6e1002ecf420 100644 (file)
@@ -36,6 +36,9 @@ $ cat feeds/blog.stargrave.org_russian_feed.atom/url
 http://blog.stargrave.org/russian/feed.atom
 @end example
 
+@file{urls2feeds.zsh} won't touch already existing directories and will
+warn if some of them disappeared from @file{urls}.
+
 @item Download your feed(s) data
 
 @example
@@ -55,6 +58,19 @@ $ cmd/parse.sh feeds/blog.stargrave.org_russian_feed.atom
 $ ./feeds-parse.zsh # to parse all feeds in parallel
 @end example
 
+@item Quick overview of the news:
+
+@example
+$ ./feeds-news.zsh
+habr.com_ru_rss_interesting: 7
+habr.com_ru_rss_news: 3
+lobste.rs_rss: 3
+naked-science.ru_?yandex_feed=news: 1
+planet.fsfe.org_atom.xml: 1
+www.astronews.ru_astronews.xml: 1
+www.darkside.ru_news_rss: 5
+@end example
+
 @item Run Mutt
 
 @example
@@ -152,4 +168,28 @@ workers. @command{cmd/feed2mdir/feed2mdir} command by default has
 $ cmd/download-clean.sh feed/FEED
 @end example
 
+@anchor{Enclosures}
+@item Download enclosures
+
+Many feeds include links to so-called enclosures, like audio files for
+podcasts. While your mail is not yet processed by the MUA and its
+@file{new/} messages are still there, you can run the enclosure
+downloading process, which uses
+@url{https://www.gnu.org/software/wget/, GNU Wget}. Specify the
+directory where your enclosures should be placed. Each enclosure's
+filename is more or less filesystem-friendly, with the current timestamp in it.
+
+@example
+$ mkdir path/to/enclosures
+$ ./feeds-encs.zsh path/to/enclosures
+[...]
+traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
+[...]
+$ file path/to/enclosures/traffic.libsyn.com_monsterfeet_grue_018.mp3-20220218-152822
+path/to/...: Audio file with ID3 version 2.2.0, contains:MPEG ADTS, layer III, v1,  96 kbps, 44.1 kHz, Monaural
+@end example
+
+@command{feeds-encs.zsh} does not parallelize jobs, because enclosures
+are often heavy enough to saturate your Internet link.
+
 @end table
diff --git a/doc/warcs.texi b/doc/warcs.texi
new file mode 100644 (file)
index 0000000..638ab0a
--- /dev/null
@@ -0,0 +1,30 @@
+@node WARCs
+@unnumbered WARCs
+
+Similarly to @ref{Enclosures, enclosures} downloading, you may download
+the @code{X-URL} URLs, which point to the article itself. If it is an
+HTML document, then it can depend on various other resources, like
+images and stylesheets. @url{https://www.gnu.org/software/wget/, GNU Wget}
+has the ability to download it with all required requisites. Moreover it
+is able to output the whole document in
+@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} format.
+
+@example
+$ mkdir path/to/warcs
+$ ./feeds-warcs.zsh path/to/warcs
+[...]
+www.darkside.ru_news_140480-20220218-145755.warc
+[...]
+@end example
+
+WARCs are not compressed by default. You can both view and compress them
+with @url{https://www.tofuproxy.stargrave.org/WARCs.html, tofuproxy}'s
+help as an option. After you get a pile of various @file{*.warc} files,
+you can simply add them to a running @command{tofuproxy}:
+
+@example
+$ for w (path/to/warcs/*.warc) print $w > path/to/tofuproxy/fifos/add-warcs
+@end example
+
+And then visit the @url{http://warc/} URL (when @command{tofuproxy}
+already acts as a proxy) to view and visit the existing URLs.
index d83f1bb5fc87f3558b7a789573f784aea060aed4..6ddfb63040690a9b952a5b897850df9f0cef812f 100755 (executable)
@@ -1,5 +1,6 @@
 #!/usr/bin/env zsh
 cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
 parallel --joblog download.log "$cmds/download.sh {}" ::: feeds/*
-. $cmds/print-failed.zsh.rc
-print-joglog-failed < download.log
+print-joblog-failed < download.log
diff --git a/feeds-encs.zsh b/feeds-encs.zsh
new file mode 100755 (executable)
index 0000000..eb01aea
--- /dev/null
@@ -0,0 +1,11 @@
+#!/usr/bin/env zsh
+dst=$1
+[[ -n "$dst" ]] || {
+    print Usage: $0 dst-dir-for-enclosures >&2
+    exit 1
+}
+cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
+parallel --jobs 1 --joblog encs.log "$cmds/download-encs.zsh {} $dst" ::: feeds/*
+print-joblog-failed < encs.log
diff --git a/feeds-news.zsh b/feeds-news.zsh
new file mode 100755 (executable)
index 0000000..3d45202
--- /dev/null
@@ -0,0 +1,7 @@
+#!/usr/bin/env zsh
+set -e
+setopt EXTENDED_GLOB
+for f (feeds/*(/on)) {
+    news=($f/new/*(N))
+    [[ $#news -eq 0 ]] || print "$f:t": $#news
+}
index f5af82dccddfb8787287c3d29c1032e6d40cbd0e..4dcc61e80bd7313f09dc51d4a551ca0102d2c66f 100755 (executable)
@@ -1,5 +1,6 @@
 #!/usr/bin/env zsh
 cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
 parallel --joblog parse.log "$cmds/parse.sh {}" ::: feeds/*
-. $cmds/print-failed.zsh.rc
-print-joglog-failed < parse.log
+print-joblog-failed < parse.log
diff --git a/feeds-warcs.zsh b/feeds-warcs.zsh
new file mode 100755 (executable)
index 0000000..6758c08
--- /dev/null
@@ -0,0 +1,11 @@
+#!/usr/bin/env zsh
+dst=$1
+[[ -n "$dst" ]] || {
+    print Usage: $0 dst-dir-for-warcs >&2
+    exit 1
+}
+cmds=$0:h:a/cmd
+fpath=($cmds/functions.zsh $fpath)
+autoload print-joblog-failed
+parallel --joblog warcs.log "$cmds/download-warcs.zsh {} $dst" ::: feeds/*
+print-joblog-failed < warcs.log
index e878ece04d71ffacd30453e99debd51c4e3207fb..c7e97f1248abf308d8d060c842ab7da2f733e670 100755 (executable)
@@ -1,14 +1,12 @@
 #!/usr/bin/env zsh
 set -e
-autoload regexp-replace
+fpath=($0:h:a/cmd/functions.zsh $fpath)
+autoload url-to-filename
 typeset -A seen
 while read url ; do
     url="$url " # to be sure that next line will work
     url=${${=url}[1]}
-    dir=${url:gs#/#_#}
-    regexp-replace dir "^.*__" ""
-    regexp-replace dir "_$" "" || :
-    dir=feeds/$dir
+    dir=feeds/$(url-to-filename $url)
     seen[$dir]=1
     [[ -e $dir ]] && continue || :
     mkdir -p $dir/{cur,new,tmp} # make it maildir