/certs
/prv.pem
/tofuproxy.cmd
+/warc-extract.cmd
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Home page: http://www.tofuproxy.stargrave.org/
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
"go.cypherpunks.ru/ucspi"
"go.stargrave.org/tofuproxy"
"go.stargrave.org/tofuproxy/fifos"
+ "go.stargrave.org/tofuproxy/rounds"
)
func main() {
dnsSrv := flag.String("dns", "[::1]:53", "DNS server")
fifosDir := flag.String("fifos", "fifos", "Directory with FIFOs")
notai := flag.Bool("notai", false, "Do not prepend TAI64N to logs")
+ warcOnly := flag.Bool("warc-only", false, "Server only WARC URIs")
flag.Parse()
log.SetFlags(log.Lshortfile)
tofuproxy.DNSSrv = *dnsSrv
tofuproxy.CACert = caCert
tofuproxy.CAPrv = caPrv
+ rounds.WARCOnly = *warcOnly
ln, err := net.Listen("tcp", *bind)
if err != nil {
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+
+ "github.com/dustin/go-humanize"
+ "go.stargrave.org/tofuproxy/warc"
+)
+
+// main loads every WARC file named on the command line (optionally saving
+// the indexes after each one when -idx is given). Without -uri it then
+// prints a "path<TAB>uri<TAB>size" line for every known URI; with -uri it
+// copies each matching record to stdout.
+func main() {
+	uri := flag.String("uri", "", "URI to extract, if specified")
+	hdr := flag.Bool("hdr", false, "Also extract WARC's header")
+	idx := flag.Bool("idx", false, "Save WARC indexes")
+	flag.Parse()
+	log.SetFlags(log.Lshortfile)
+
+	for _, p := range flag.Args() {
+		log.Println("adding", p)
+		if err := warc.Add(p); err != nil {
+			log.Fatalln(err)
+		}
+		log.Println("added", p, len(warc.WARCs[p]), "URIs")
+		if *idx {
+			if err := warc.SaveIndexes(); err != nil {
+				log.Fatalln(err)
+			}
+		}
+	}
+
+	if *uri == "" {
+		// Listing mode. The loop variable is deliberately not called "uri"
+		// so that it does not shadow the flag pointer above.
+		for warcPath, uris := range warc.WARCs {
+			for u, rec := range uris {
+				fmt.Printf(
+					"%s\t%s\t%s\n",
+					warcPath, u,
+					humanize.IBytes(uint64(rec.TotalSize())),
+				)
+			}
+		}
+		return
+	}
+
+	// Extraction mode: dump the record from every WARC that knows the URI.
+	for _, uris := range warc.WARCs {
+		rec := uris[*uri]
+		if rec == nil {
+			continue
+		}
+		// The argument is the negated -hdr flag; presumably it tells the
+		// reader to skip the WARC header -- confirm against Record.Reader.
+		r, err := rec.Reader(!*hdr)
+		if err != nil {
+			log.Fatalln(err)
+		}
+		// A failed or short copy must not go unnoticed: the output would be
+		// a silently truncated record.
+		if _, err = io.Copy(os.Stdout, r); err != nil {
+			log.Fatalln(err)
+		}
+	}
+}
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
-redo-ifchange *.go cmd/*/*.go fifos/*.go rounds/*.go
+redo-ifchange *.go cmd/*/*.go fifos/*.go rounds/*.go warc/*.go
GO_LDFLAGS="${GO_LDFLAGS:--ldflags=-s}"
${GO:-go} build -o $3 $GO_LDFLAGS ./cmd/${1%.cmd}
--- /dev/null
+@node CertTrust
+@section Certificate trust management
+
+When you encounter something requiring your attention and decision, you
+will be shown Tk-dialog through the @command{wish} invocation. GnuTLS'es
+@command{certtool} is used for certificate information printing.
+
+@image{dialog,,,Example dialog,.webp}
+
+Certificate trust decision dialog (like above one) has multiple hotkeys:
+
+@itemize
+@item @code{a} -- accept and save certificate chain to disk
+@item @code{o} -- accept once per session (@command{tofuproxy} running)
+@item @code{r} -- reject certificate
+@item @code{q} -- reject certificate really once, same as closing the window
+@item @code{n} -- next page of "their" certificate chain
+@item @code{p} -- previous page of "their" certificate chain
+@item @code{N} -- next page of "our" certificate chain
+@item @code{P} -- previous page of "our" certificate chain
+@end itemize
+
+To list currently accepted, rejected, HTTP authorized, TLS client
+authenticated hosts:
+
+@example
+$ cat fifos/list-@{accepted,rejected,http-auth,tls-auth@}
+@end example
+
+To remove knowledge of the host from any of the states mentioned above:
+
+@example
+$ echo www.example.com > fifos/del-tls-auth
+@end example
@item And wonderful @url{http://jpegxl.info/, JPEG XL} image format is
not supported by most browsers. Even pretty old
-@url{https://developers.google.com/speed/webp, WebP} is not supported
-everywhere. @url{https://aomediacodec.github.io/av1-avif/, AVIF} would
-be useful too.
+@url{https://developers.google.com/speed/webp, WebP}, that has highest
+compression ratio for lossless screenshots, is not supported everywhere.
+@url{https://aomediacodec.github.io/av1-avif/, AVIF} could be useful too.
+
+@item None of the web browsers supports the ability to view web archives
+(@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC}s). And most of
+WARC-related software is written in Python, which nowadays is close to
+impossible to install and use with all its broken dependencies system.
@end itemize
TLS @strong{session resumption} and @strong{keep-alives} are also supported.
@item
-And Go itself tries also to act as a @url{https://http2.github.io/,
-HTTP/2} client too.
+And Go itself tries also to act as a @url{https://http2.github.io/, HTTP/2}
+client too.
+
+@item
+Ability to load @url{https://en.wikipedia.org/wiki/Web_ARChive, WARC}
+files, possibly compressed, possibly continued and replace responses.
+
+@item
+
@end itemize
--- /dev/null
+@node Spies
+@section Spies
+
+You can reject requests to the whole domains. As a rule some spying ones
+(for advertisements and better user experience, they say).
+
+@example
+$ tee fifos/add-spies < spies.txt
+ads.google.com
+advertising.yandex.ru
+[...]
+@end example
+
+All subdomains will be rejected too.
@node Usage
@unnumbered Usage
-@itemize
+Currently @command{tofuproxy} uses:
+GnuTLS'es @url{https://www.gnutls.org/manual/html_node/certtool-Invocation.html, certtool},
+@url{http://cr.yp.to/redo.html, redo} build system,
+@url{https://www.tcl.tk/, Tcl/Tk}'s @command{wish} shell for GUI dialogues,
+@command{dwebp}, @command{djxl}, @command{avifdec} for images transcoding,
+@url{https://github.com/halturin/multitail, multitail} for logs viewing.
-@item Currently @command{tofuproxy} uses:
- GnuTLS'es @url{https://www.gnutls.org/manual/html_node/certtool-Invocation.html, certtool},
- @url{http://cr.yp.to/redo.html, redo} build system,
- @url{https://www.tcl.tk/, Tcl/Tk}'s @command{wish} shell for GUI dialogues,
- @command{dwebp}, @command{djxl}, @command{avifdec} for images transcoding,
- @url{https://github.com/halturin/multitail, multitail} for logs viewing.
+@itemize
@item Download and build @command{tofuproxy}:
If you want to use TLS client certificates, then place them to
@file{-ccerts} directory.
-@item
-Load spying domains to reject to with:
-
-@example
-$ cat spies.txt > fifos/add-spies
-@end example
-
@item Watch logs:
@example
@image{logs,,,Example logs,.webp}
-@item
-When you encounter something requiring your attention and decision, you
-will be shown Tk-dialog through the @command{wish} invocation. GnuTLS'es
-@command{certtool} is used for certificate information printing.
-
-@image{dialog,,,Example dialog,.webp}
-
-@item
-Certificate trust decision dialog (like above one) has multiple hotkeys:
-
- @itemize
- @item @code{a} -- accept and save certificate chain to disk
- @item @code{o} -- accept once per session (@command{tofuproxy} running)
- @item @code{r} -- reject certificate
- @item @code{q} -- reject certificate really once, same as closing the window
- @item @code{n} -- next page of "their" certificate chain
- @item @code{p} -- previous page of "their" certificate chain
- @item @code{N} -- next page of "our" certificate chain
- @item @code{P} -- previous page of "our" certificate chain
- @end itemize
-
-@item
-To list currently accepted, rejected, HTTP authorized, TLS client
-authenticated hosts:
-
-@example
-$ cat fifos/list-@{accepted,rejected,http-auth,tls-auth@}
-@end example
-
-@item
-To remove knowledge of the host from any of the states mentioned above:
-
-@example
-$ echo www.example.com > fifos/del-tls-auth
-@end example
-
@end itemize
+
+@include spies.texi
+@include certs.texi
+@include warcs.texi
--- /dev/null
+@node WARCs
+@section WARCs management
+
+To view WARC files, you have to load them into the daemon. Responses
+for the corresponding URIs will be transparently replaced with the ones
+stored in those WARCs.
+
+There is no strict validation or checking of WARCs correctness at all!
+But built-in WARC support seems to be good enough for various sources.
+Uncompressed, @command{gzip} (multiple streams and single stream are
+supported) and @command{zstd} compressed ones are supported.
+
+Searching in compressed files is @strong{slow} -- every request will
+lead to decompression of the file from the very beginning, so keeping
+uncompressed WARCs on compressed ZFS dataset is much more preferable.
+@command{tofuproxy} does not take advantage of multistream gzip files.
+
+@itemize
+
+@item
+Load WARCs:
+
+@example
+$ tee fifos/add-warcs < warcs.txt
+smth.warc-00000.warc.gz
+smth.warc-00001.warc.gz
+smth.warc-00002.warc.gz
+another.warc
+@end example
+
+@item
+Visit the URI you know, that exists in those WARCs, or go to
+@url{http://warc/}, to view full list of known loaded URIs from
+those WARCs.
+
+@item
+Pay attention that order of WARCs loading is important! WARC can be
+segmented and single response can be split on multiple WARC files.
+Each subsequent WARC file will overwrite possibly already existing URIs.
+
+@item
+To list and delete loaded known WARCs:
+
+@example
+$ cat fifos/list-warcs
+smth.warc-00000.warc.gz 154
+smth.warc-00001.warc.gz 13
+smth.warc-00002.warc.gz 0
+another.warc 123
+$ echo another.warc > fifos/del-warcs
+@end example
+
+One possible reason why @file{smth.warc-00002.warc.gz} has no URIs is
+that it contains continuation records of segmented responses.
+
+@end itemize
+
+Loading a WARC involves reading it entirely and remembering where each
+URI's response is located. You can @code{echo SAVE > fifos/add-warcs} to
+save in-memory index to the disk as @file{....warc.idx.gob} file. During
+the next load, if that file exists, it is used as index immediately,
+without expensive WARC reading.
+
+@code{redo warc-extract.cmd} builds @command{warc-extract.cmd} utility,
+that uses exactly the same code for parsing WARCs. It can be used to
+check if WARCs can be successfully loaded, to list all URIs after, to
+extract some specified URI and to pre-generate @file{.idx.gob} indexes.
+
+@example
+$ warc-extract.cmd -idx \
+ smth.warc-00000.warc.gz \
+ smth.warc-00001.warc.gz \
+ smth.warc-00002.warc.gz
+$ warc-extract.cmd -uri http://some/uri \
+ smth.warc-00000.warc.gz \
+ smth.warc-00001.warc.gz \
+ smth.warc-00002.warc.gz
+@end example
+
+@url{https://www.gnu.org/software/wget/, GNU Wget} can be easily used to
+create WARCs:
+
+@example
+$ wget ... [--page-requisites] [--recursive] \
+ --no-warc-keep-log --no-warc-digests [--warc-max-size=XXX] \
+ --warc-file smth.warc ...
+@end example
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
-for f in cert dane err http-auth non-ok ok redir req tls tls-auth various ; do
+for f in cert dane err http-auth non-ok ok redir req tls tls-auth various warc ; do
[ -p log-$f ] || mkfifo log-$f
done
-for f in accepted http-auth rejected spies tls-auth ; do
+for f in accepted http-auth rejected spies tls-auth warcs ; do
[ -p list-$f ] || mkfifo list-$f
[ -p del-$f ] || mkfifo del-$f
done
[ -p add-spies ] || mkfifo add-spies
+[ -p add-warcs ] || mkfifo add-warcs
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
LogTLS = make(chan string)
LogTLSAuth = make(chan string)
LogVarious = make(chan string)
+ LogWARC = make(chan string)
)
func logger(c chan string, p string) {
-t "Redirect " -ci cyan --label "R " -L "while :; do tai64nlocal < log-redir ; done" \
-t "Request" -ci blue --label "> " -L "while :; do tai64nlocal < log-req ; done" \
-t "TLS connection" -ci yellow --label "S " -L "while :; do tai64nlocal < log-tls ; done" \
- -t "Various" -ci yellow -L "while :; do tai64nlocal < log-various ; done"
+ -t "Various" -ci yellow -L "while :; do tai64nlocal < log-various ; done" \
+ -t "WARC" -ci green --label "W " -L "while :; do tai64nlocal < log-warc ; done"
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
package fifos
import (
"path/filepath"
"go.stargrave.org/tofuproxy/caches"
+ "go.stargrave.org/tofuproxy/warc"
)
func Start(fifos string) {
go logger(LogTLS, filepath.Join(fifos, "log-tls"))
go logger(LogTLSAuth, filepath.Join(fifos, "log-tls-auth"))
go logger(LogVarious, filepath.Join(fifos, "log-various"))
+ go logger(LogWARC, filepath.Join(fifos, "log-warc"))
go listAccepted(filepath.Join(fifos, "list-accepted"))
go listHTTPAuth(filepath.Join(fifos, "list-http-auth"))
go listRejected(filepath.Join(fifos, "list-rejected"))
go listSpies(filepath.Join(fifos, "list-spies"))
go listTLSAuth(filepath.Join(fifos, "list-tls-auth"))
+ go listWARCs(filepath.Join(fifos, "list-warcs"))
go del(
&caches.AcceptedM, func(host string) { delete(caches.Accepted, host) },
},
filepath.Join(fifos, "del-spies"),
)
+
+ go addWARC(filepath.Join(fifos, "add-warcs"))
+ go del(
+ &warc.WARCsM, func(warcPath string) { delete(warc.WARCs, warcPath) },
+ filepath.Join(fifos, "del-warcs"),
+ )
}
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package fifos
+
+import (
+ "bufio"
+ "fmt"
+ "log"
+ "os"
+
+ "go.stargrave.org/tofuproxy/warc"
+)
+
+func listWARCs(p string) {
+ for {
+ fd, err := os.OpenFile(p, os.O_WRONLY|os.O_APPEND, os.FileMode(0666))
+ if err != nil {
+ log.Fatalln(err)
+ }
+ warc.WARCsM.RLock()
+ for warcPath, uris := range warc.WARCs {
+ fmt.Fprintf(fd, "%s\t%d\n", warcPath, len(uris))
+ }
+ warc.WARCsM.RUnlock()
+ fd.Close()
+ }
+}
+
+// addWARC serves the FIFO at p forever. Every non-empty line read from p is
+// either the literal command "SAVE", which saves the WARC indexes, or a path
+// of a WARC file to load. Paths that are already loaded are skipped. The
+// first WARC that fails to load aborts the rest of the current batch.
+// Failure to open p terminates the process.
+func addWARC(p string) {
+	for {
+		fd, err := os.OpenFile(p, os.O_RDONLY, os.FileMode(0666))
+		if err != nil {
+			log.Fatalln(err)
+		}
+		var warcPaths []string
+		scanner := bufio.NewScanner(fd)
+		for scanner.Scan() {
+			if t := scanner.Text(); len(t) > 0 {
+				warcPaths = append(warcPaths, t)
+			}
+		}
+		// Scan() returning false also covers read errors and over-long
+		// lines; report them instead of silently dropping input.
+		if err = scanner.Err(); err != nil {
+			log.Printf("%s: can not read: %+v\n", p, err)
+		}
+		fd.Close()
+		for _, warcPath := range warcPaths {
+			if warcPath == "SAVE" {
+				if err = warc.SaveIndexes(); err != nil {
+					log.Printf("%s: can not save index %s: %+v\n", p, warcPath, err)
+				}
+				continue
+			}
+			// warc.WARCs is shared with the list/del FIFO goroutines and
+			// the request handlers, so even plain reads need the lock.
+			warc.WARCsM.RLock()
+			_, exists := warc.WARCs[warcPath]
+			warc.WARCsM.RUnlock()
+			if exists {
+				continue
+			}
+			log.Printf("%s: adding WARC %s\n", p, warcPath)
+			// The lock is released before calling warc.Add, which is
+			// expected to do its own locking -- TODO confirm.
+			if err = warc.Add(warcPath); err != nil {
+				log.Printf("%s: can not open %s: %+v\n", p, warcPath, err)
+				break
+			}
+			warc.WARCsM.RLock()
+			added := len(warc.WARCs[warcPath])
+			warc.WARCsM.RUnlock()
+			log.Printf("%s: %s: added %d URIs\n", p, warcPath, added)
+		}
+	}
+}
require (
github.com/dustin/go-humanize v1.0.0
+ github.com/klauspost/compress v1.13.6
github.com/miekg/dns v1.1.29
go.cypherpunks.ru/tai64n/v2 v2.0.0
go.cypherpunks.ru/ucspi v0.0.0-20210908140534-cfdc20a8225f
github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc=
+github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/miekg/dns v1.1.29 h1:xHBEhR+t5RzcFJjBLJlax2daXOrTYtr9z4WdKEfWFzg=
github.com/miekg/dns v1.1.29/go.mod h1:KNUDUusw/aVsxyTYZM1oqvCicbwhgbNgztCETuNZ7xM=
go.cypherpunks.ru/tai64n/v2 v2.0.0 h1:AlohA1/zRqInhIGK7CVnn7tC5/vt1TaOAEyBgeu5Ruo=
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <title>WARC URIs</title>
+</head>
+<body>
+<table border=1>
+{{range $idx, $entry := .Entries}}
+<tr><th>{{$idx}}</th><td><tt>{{.WARC}}</tt></td><td>{{.Size}}</td></tr>
+<tr><td colspan=3><a href="{{.URI}}">{{.URI}}</a></td></tr>
+{{end}}
+</table>
+</body>
+</html>
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package rounds
+
+import (
+ _ "embed"
+ "fmt"
+ "html/template"
+ "io"
+ "log"
+ "net/http"
+ "path"
+ "sort"
+ "strings"
+
+ "github.com/dustin/go-humanize"
+ "go.stargrave.org/tofuproxy/fifos"
+ "go.stargrave.org/tofuproxy/warc"
+)
+
+const WARCEntrypoint = "http://warc/"
+
+var (
+ WARCOnly bool
+
+ //go:embed warc-index.tmpl
+ TmplWARCIndexRaw string
+ TmplWARCIndex = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw))
+)
+
+// WARCEntry is a single row of the WARC index page.
+type WARCEntry struct {
+	WARC string // WARC file name shown in the row
+	URI  string // URI stored in that WARC
+	Size string // human readable size of the record
+}
+
+// ByDepth implements sort.Interface for the WARC index page. Entries are
+// ordered by the length of the WARC name, then by the number of slashes in
+// the URI (ignoring a trailing one), then by the URI length. Remaining ties
+// are broken lexicographically by URI and WARC name, so that the result does
+// not depend on the order in which the entries were collected.
+type ByDepth []*WARCEntry
+
+// Len returns the number of entries.
+func (a ByDepth) Len() int {
+	return len(a)
+}
+
+// Swap exchanges the entries at positions i and j.
+func (a ByDepth) Swap(i, j int) {
+	a[i], a[j] = a[j], a[i]
+}
+
+// Less reports whether entry i must be listed before entry j.
+func (a ByDepth) Less(i, j int) bool {
+	if li, lj := len(a[i].WARC), len(a[j].WARC); li != lj {
+		return li < lj
+	}
+	uriI := strings.TrimSuffix(a[i].URI, "/")
+	uriJ := strings.TrimSuffix(a[j].URI, "/")
+	if di, dj := strings.Count(uriI, "/"), strings.Count(uriJ, "/"); di != dj {
+		return di < dj
+	}
+	if len(uriI) != len(uriJ) {
+		return len(uriI) < len(uriJ)
+	}
+	// Deterministic tie-breakers: entries are gathered from maps, whose
+	// iteration order is random, and sort.Sort is not stable.
+	if a[i].URI != a[j].URI {
+		return a[i].URI < a[j].URI
+	}
+	return a[i].WARC < a[j].WARC
+}
+
+// RoundWARC answers requests from the loaded WARC files.
+//
+// A request for WARCEntrypoint is answered with an HTML index of every known
+// URI. Any other request is looked up by its full URL (with a ":443" suffix
+// stripped from the host of https URLs) in all loaded WARCs; the first
+// matching record is written verbatim to the hijacked client connection,
+// which is then closed.
+//
+// The host and resp parameters are not used. The function returns
+// (true, nil) only when the URL is in no WARC and WARCOnly is not set; when
+// WARCOnly is set such requests get a 404 reply. Every other path returns
+// false together with the error encountered, if any. Presumably true tells
+// the caller to proceed with the next round -- confirm against the caller.
+func RoundWARC(
+	host string,
+	resp *http.Response,
+	w http.ResponseWriter,
+	req *http.Request,
+) (bool, error) {
+	if req.URL.String() == WARCEntrypoint {
+		var entries []*WARCEntry
+		warc.WARCsM.RLock()
+		for warcPath, uris := range warc.WARCs {
+			for uri, rec := range uris {
+				entries = append(entries, &WARCEntry{
+					WARC: path.Base(warcPath),
+					URI:  uri,
+					Size: humanize.IBytes(uint64(rec.TotalSize())),
+				})
+			}
+		}
+		warc.WARCsM.RUnlock()
+		sort.Sort(ByDepth(entries))
+		err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries})
+		if err != nil {
+			log.Printf("WARC: error during %s: %+v\n", req.URL, err)
+		}
+		return false, err
+	}
+
+	// Build the lookup key once, outside of the lock. The host is altered
+	// only for the duration of the String() call and restored right after.
+	hostOrig := req.URL.Host
+	if req.URL.Scheme == "https" {
+		req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443")
+	}
+	key := req.URL.String()
+	req.URL.Host = hostOrig
+
+	var rec *warc.Record
+	var warcPath string
+	warc.WARCsM.RLock()
+	for p, uris := range warc.WARCs {
+		if found := uris[key]; found != nil {
+			rec, warcPath = found, p
+			break
+		}
+	}
+	warc.WARCsM.RUnlock()
+	if rec == nil {
+		if WARCOnly {
+			http.NotFound(w, req)
+			fifos.LogNonOK <- fmt.Sprintf("%s %s\tnot in WARC", req.Method, req.URL)
+			return false, nil
+		}
+		return true, nil
+	}
+
+	wr, err := rec.Reader(true)
+	if err != nil {
+		log.Printf("WARC: error during %s: %+v\n", req.URL, err)
+		return false, err
+	}
+	// NOTE(review): wr is not released on the early returns below; whether
+	// it needs an explicit Close is not visible from here.
+	hj, ok := w.(http.Hijacker)
+	if !ok {
+		http.Error(w, "can not hijack", http.StatusInternalServerError)
+		// The client has already been told; no error is propagated.
+		return false, nil
+	}
+	conn, _, err := hj.Hijack()
+	if err != nil {
+		// Report the failure like every other error path instead of
+		// panicking inside the request handler.
+		log.Printf("WARC: error during %s: %+v\n", req.URL, err)
+		return false, err
+	}
+	_, err = io.Copy(conn, wr)
+	conn.Close()
+	fifos.LogWARC <- fmt.Sprintf(
+		"%s %s\t%s\t%s\t%s",
+		req.Method, req.URL,
+		strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"),
+		warcPath,
+		humanize.IBytes(uint64(rec.TotalSize())),
+	)
+	return false, err
+}
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
+++ /dev/null
-redo-ifchange *.go cmd/tofuproxy/*.go fifos/*.go rounds/*.go
-GO_LDFLAGS="${GO_LDFLAGS:--ldflags=-s}"
-${GO:-go} build -o $3 $GO_LDFLAGS ./cmd/tofuproxy
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
host := strings.TrimSuffix(req.URL.Host, ":443")
for _, round := range []Round{
rounds.RoundNoHead,
+ rounds.RoundWARC,
rounds.RoundDenySpy,
rounds.RoundRedditOld,
rounds.RoundHabrImage,
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import (
+ "compress/gzip"
+ "fmt"
+ "io"
+ "os"
+ "path"
+
+ "github.com/klauspost/compress/zstd"
+)
+
+// Compressed adapts a forward-only decompressing reader to the
+// io.ReadSeekCloser interface. Seeking is emulated by reading and discarding
+// decompressed data, so only forward seeks from the start are possible.
+type Compressed struct {
+	r      io.ReadCloser // decompressor reading from fd
+	fd     *os.File      // underlying compressed file
+	offset int64         // number of decompressed bytes consumed so far
+}
+
+// Read reads decompressed data and advances the tracked offset.
+func (c *Compressed) Read(p []byte) (int, error) {
+	n, err := c.r.Read(p)
+	c.offset += int64(n)
+	return n, err
+}
+
+// Close closes both the decompressor and the underlying file. The file is
+// always closed; the first error encountered is returned.
+func (c *Compressed) Close() error {
+	err := c.r.Close()
+	if errFd := c.fd.Close(); err == nil {
+		err = errFd
+	}
+	return err
+}
+
+// Seek moves forward to the absolute decompressed offset. Only io.SeekStart
+// is supported and the target must not lie before the current position; both
+// violations are reported as errors, because the stream can not be rewound.
+func (c *Compressed) Seek(offset int64, whence int) (int64, error) {
+	if whence != io.SeekStart {
+		return 0, fmt.Errorf("unsupported whence %d: can only seek from the start", whence)
+	}
+	if offset < c.offset {
+		return 0, fmt.Errorf(
+			"can not seek backwards: at %d, requested %d", c.offset, offset,
+		)
+	}
+	// Read (called by CopyN) already advances c.offset.
+	if _, err := io.CopyN(io.Discard, c, offset-c.offset); err != nil {
+		return 0, err
+	}
+	return c.offset, nil
+}
+
+// Open opens the WARC file at warcPath and selects the decoder by file
+// extension: ".warc" is read as is, ".gz" through a multistream gzip reader
+// and ".zst" through a zstd reader. Any other extension is an error. The
+// file is closed again if the decoder can not be initialised.
+func Open(warcPath string) (io.ReadSeekCloser, error) {
+	ext := path.Ext(warcPath)
+	switch ext {
+	case ".warc":
+		fd, err := os.Open(warcPath)
+		if err != nil {
+			// Return an untyped nil, not a nil *os.File wrapped in a
+			// non-nil interface value.
+			return nil, err
+		}
+		return fd, nil
+	case ".gz":
+		fd, err := os.Open(warcPath)
+		if err != nil {
+			return nil, err
+		}
+		gzr, err := gzip.NewReader(fd)
+		if err != nil {
+			fd.Close()
+			return nil, err
+		}
+		gzr.Multistream(true)
+		return &Compressed{r: gzr, fd: fd}, nil
+	case ".zst":
+		fd, err := os.Open(warcPath)
+		if err != nil {
+			return nil, err
+		}
+		zstdr, err := zstd.NewReader(fd)
+		if err != nil {
+			fd.Close()
+			return nil, err
+		}
+		return &Compressed{r: zstdr.IOReadCloser(), fd: fd}, nil
+	}
+	return nil, fmt.Errorf("unknown extensions: %s", ext)
+}
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import "strings"
+
+// Header holds header fields of a record. Keys are stored lowercased, so
+// lookups through the methods are case-insensitive.
+type Header map[string]string
+
+// splitKeyValue cuts line at its first colon and returns the part before it
+// and the whitespace-trimmed part after it. Two empty strings are returned
+// when line has no colon.
+func splitKeyValue(line string) (string, string) {
+	sep := strings.IndexByte(line, ':')
+	if sep < 0 {
+		return "", ""
+	}
+	return line[:sep], strings.TrimSpace(line[sep+1:])
+}
+
+// NewHeader returns an empty, ready to use Header.
+func NewHeader() Header {
+	return Header{}
+}
+
+// Set stores value under the lowercased key.
+func (h Header) Set(key, value string) {
+	lowered := strings.ToLower(key)
+	h[lowered] = value
+}
+
+// Get returns the value stored under the lowercased key, or an empty string.
+func (h Header) Get(key string) string {
+	lowered := strings.ToLower(key)
+	return h[lowered]
+}
+
+// Del removes the lowercased key.
+func (h Header) Del(key string) {
+	lowered := strings.ToLower(key)
+	delete(h, lowered)
+}
+
+// AddLine parses a "Key: value" line and stores it. Lines without a colon
+// are ignored.
+func (h Header) AddLine(line string) {
+	sep := strings.IndexByte(line, ':')
+	if sep < 0 {
+		return
+	}
+	h.Set(line[:sep], strings.TrimSpace(line[sep+1:]))
+}
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+)
+
+const CRLF = "\r\n"
+
+// Reader sequentially parses the records of a single WARC file.
+type Reader struct {
+ Path string // path the WARC was opened from; copied into every Record
+ r *bufio.Reader // buffered reader over rsc, used for all parsing
+ rsc io.ReadSeekCloser // underlying stream as returned by Open
+ offset int64 // running byte offset, advanced by next
+ prevRec *Record // record returned by the previous ReadRecord call
+}
+
+// NewReader opens the WARC file at warcPath through Open and returns a
+// Reader that parses it through a buffered reader.
+func NewReader(warcPath string) (*Reader, error) {
+	src, err := Open(warcPath)
+	if err != nil {
+		return nil, err
+	}
+	rd := new(Reader)
+	rd.Path = warcPath
+	rd.rsc = src
+	rd.r = bufio.NewReader(src)
+	return rd, nil
+}
+
+// next skips the body of the previously returned record and the two CRLF
+// lines that follow it, advancing r.offset accordingly. It is a no-op before
+// the first record has been read. An error is returned when the stream ends
+// early or when something other than CRLF follows the body.
+func (r *Reader) next() error {
+	if r.prevRec == nil {
+		return nil
+	}
+	// io.CopyN takes an int64, unlike bufio.Reader.Discard whose int
+	// argument would truncate huge record sizes on 32-bit platforms.
+	if _, err := io.CopyN(io.Discard, r.r, r.prevRec.Size); err != nil {
+		return err
+	}
+	r.offset += int64(r.prevRec.HdrLen) + r.prevRec.Size
+	for i := 0; i < 2; i++ {
+		line, err := r.r.ReadString('\n')
+		if err != nil {
+			return err
+		}
+		r.offset += int64(len(line))
+		if line != CRLF {
+			return fmt.Errorf("non-CRLF: %q", line)
+		}
+	}
+	return nil
+}
+
+// ReadRecord skips whatever is left of the previous record and parses the
+// header of the next one. The record body is not read; the returned Record
+// only remembers its offset, header length and the size taken from the
+// Content-Length header. The error of the underlying reader (io.EOF at the
+// end of the stream) is returned as is.
+func (r *Reader) ReadRecord() (*Record, error) {
+	// A failed skip leaves the stream misaligned, so nothing sensible can
+	// be parsed after it.
+	if err := r.next(); err != nil {
+		return nil, err
+	}
+	line, err := r.r.ReadString('\n')
+	if err != nil {
+		return nil, err
+	}
+	if !strings.HasPrefix(line, "WARC/") {
+		return nil, fmt.Errorf("non-WARC header: %q", line)
+	}
+	hdrLen := len(line)
+	hdr := NewHeader()
+	for {
+		line, err := r.r.ReadString('\n')
+		if err != nil {
+			return nil, err
+		}
+		hdrLen += len(line)
+		// An empty line terminates the header; it is counted in hdrLen.
+		if line == CRLF {
+			break
+		}
+		hdr.AddLine(line)
+	}
+	size, err := strconv.ParseUint(hdr.Get("Content-Length"), 10, 64)
+	if err != nil {
+		return nil, err
+	}
+	rec := &Record{
+		WARCPath: r.Path,
+		Offset:   r.offset,
+		Hdr:      hdr,
+		HdrLen:   hdrLen,
+		Size:     int64(size),
+	}
+	r.prevRec = rec
+	return rec, nil
+}
+
+// Close closes the underlying handle obtained from Open and returns its
+// error.
+func (r *Reader) Close() error {
+ return r.rsc.Close()
+}
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import (
+ "io"
+ "strings"
+)
+
+// Record describes where a single WARC record lives and what its header
+// says; the body itself is read on demand through Reader.
+type Record struct {
+ // WARCPath is the path of the WARC file holding the record.
+ WARCPath string
+ // Offset is the position of the record's first header byte, as
+ // counted by the Reader that produced it.
+ Offset int64
+ // Hdr holds the parsed header lines.
+ Hdr Header
+ // HdrLen is the byte length of the version line, the header lines and
+ // the blank line that terminates them.
+ HdrLen int
+ // Size is the body length taken from the Content-Length header.
+ Size int64
+
+ // Continuations lists the follow-up segments of a segmented record,
+ // in segment order.
+ Continuations []*Record
+}
+
+// URI returns the record's WARC-Target-URI header value with any
+// surrounding angle brackets stripped. It is empty when the header is
+// absent.
+func (rec *Record) URI() string {
+	target := rec.Hdr.Get("WARC-Target-URI")
+	return strings.Trim(target, "<>")
+}
+
+// TotalSize returns the combined body size of the record and all of its
+// continuation segments. Header lengths are not included.
+func (rec *Record) TotalSize() int64 {
+	total := rec.Size
+	for i := range rec.Continuations {
+		total += rec.Continuations[i].Size
+	}
+	return total
+}
+
+// SelfRecordReader reads the bytes of one record segment from its own
+// private handle on the WARC file.
+type SelfRecordReader struct {
+ // r limits reads from rsc to the segment's length.
+ r *io.LimitedReader
+ // rsc is the handle that Close closes.
+ rsc io.ReadSeekCloser
+}
+
+// Read implements io.Reader on top of the length-limited view of the
+// segment. On any error, io.EOF included, the underlying handle is
+// closed before the result is returned, so callers that read to the end
+// do not have to call Close themselves.
+func (srr *SelfRecordReader) Read(p []byte) (n int, err error) {
+	if n, err = srr.r.Read(p); err == nil {
+		return n, nil
+	}
+	srr.Close()
+	return n, err
+}
+
+// Close closes the underlying WARC handle and returns its error.
+func (srr *SelfRecordReader) Close() error {
+ return srr.rsc.Close()
+}
+
+// selfReader opens the record's WARC file and returns a reader limited
+// to this single segment (continuations are not followed).
+//
+// With noHdr set the reader starts right after the header and yields the
+// Size bytes of the body. Otherwise it starts at the record's first byte
+// and yields the header followed by the whole body.
+//
+// Errors from Open or Seek are returned as is; the handle is closed when
+// seeking fails.
+func (rec *Record) selfReader(noHdr bool) (*SelfRecordReader, error) {
+	rsc, err := Open(rec.WARCPath)
+	if err != nil {
+		return nil, err
+	}
+	offset, length := rec.Offset, rec.Size
+	if noHdr {
+		offset += int64(rec.HdrLen)
+	} else {
+		// The header is part of the output, so the limit has to cover it
+		// too; limiting to Size alone would cut off the end of the body.
+		length += int64(rec.HdrLen)
+	}
+	if _, err = rsc.Seek(offset, io.SeekStart); err != nil {
+		rsc.Close()
+		return nil, err
+	}
+	return &SelfRecordReader{
+		r:   &io.LimitedReader{R: rsc, N: length},
+		rsc: rsc,
+	}, nil
+}
+
+// RecordReader reads a record together with all of its continuation
+// segments as one stream.
+type RecordReader struct {
+ // r is the concatenation of all segment readers.
+ r io.Reader
+ // srrs are the per-segment readers, kept so that Close can close them.
+ srrs []*SelfRecordReader
+}
+
+// Reader returns a stream over the record followed by each of its
+// continuation segments. noHdr controls whether the first segment's
+// header is skipped; continuation segments are always read without
+// their headers.
+//
+// If any segment cannot be opened, the segments opened so far are closed
+// and the error is returned.
+func (rec *Record) Reader(noHdr bool) (*RecordReader, error) {
+	segments := make([]*Record, 0, 1+len(rec.Continuations))
+	segments = append(segments, rec)
+	segments = append(segments, rec.Continuations...)
+
+	opened := make([]*SelfRecordReader, 0, len(segments))
+	readers := make([]io.Reader, 0, len(segments))
+	for idx, seg := range segments {
+		// Only the very first segment may keep its header.
+		srr, err := seg.selfReader(noHdr || idx > 0)
+		if err != nil {
+			for _, prev := range opened {
+				prev.Close()
+			}
+			return nil, err
+		}
+		opened = append(opened, srr)
+		readers = append(readers, srr)
+	}
+	return &RecordReader{r: io.MultiReader(readers...), srrs: opened}, nil
+}
+
+// Read implements io.Reader over the concatenated segments. On any
+// error, io.EOF included, every segment reader is closed before the
+// result is returned.
+func (rr *RecordReader) Read(p []byte) (n int, err error) {
+	if n, err = rr.r.Read(p); err == nil {
+		return n, nil
+	}
+	rr.Close()
+	return n, err
+}
+
+// Close closes every per-segment reader. Individual close errors are
+// ignored and nil is always returned.
+func (rr *RecordReader) Close() error {
+	for i := range rr.srrs {
+		rr.srrs[i].Close()
+	}
+	return nil
+}
--- /dev/null
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import (
+ "encoding/gob"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strconv"
+ "sync"
+)
+
+// IndexExt is the suffix appended to a WARC path to name its gob-encoded
+// index file.
+const IndexExt = ".idx.gob"
+
+var (
+ // WARCs maps a WARC file path to its index of target URI -> record.
+ WARCs = map[string]map[string]*Record{}
+ // WARCsM guards WARCs.
+ WARCsM sync.RWMutex
+
+ // Incomplete holds first segments of segmented responses, keyed by
+ // WARC-Record-ID, until their final continuation has been seen.
+ // NOTE(review): accessed without WARCsM in Add — confirm Add is never
+ // called concurrently.
+ Incomplete = map[string]*Record{}
+)
+
+// Add makes the records of the WARC file at warcPath available through
+// the global WARCs index.
+//
+// If an index file (warcPath+IndexExt) exists, it is gob-decoded and
+// used as is. Otherwise the WARC is scanned record by record:
+//
+//   - "response" records with a target URI are indexed by that URI,
+//     except first segments (WARC-Segment-Number "1"), which are parked
+//     in Incomplete under their WARC-Record-ID;
+//   - "continuation" records are appended to their origin record found
+//     in Incomplete; once a continuation carrying
+//     WARC-Segment-Total-Length arrives, the origin is indexed and
+//     removed from Incomplete.
+//
+// An error is returned if the index or the WARC cannot be read, if a
+// continuation refers to an unknown origin or has an unexpected segment
+// number, or if the origin belongs to a WARC that is not indexed.
+func Add(warcPath string) error {
+	fd, err := os.Open(warcPath + IndexExt)
+	if err == nil {
+		defer fd.Close()
+		var uris map[string]*Record
+		if err := gob.NewDecoder(fd).Decode(&uris); err != nil {
+			return err
+		}
+		WARCsM.Lock()
+		WARCs[warcPath] = uris
+		WARCsM.Unlock()
+		return nil
+	}
+	if !os.IsNotExist(err) {
+		return err
+	}
+	r, err := NewReader(warcPath)
+	if err != nil {
+		return err
+	}
+	defer r.Close()
+	uris := map[string]*Record{}
+	for {
+		rec, err := r.ReadRecord()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+		segNum := rec.Hdr.Get("WARC-Segment-Number")
+		switch rec.Hdr.Get("WARC-Type") {
+		case "response":
+			uri := rec.URI()
+			if uri == "" {
+				continue
+			}
+			if segNum == "1" {
+				Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec
+				continue
+			}
+			uris[uri] = rec
+		case "continuation":
+			originID := rec.Hdr.Get("WARC-Segment-Origin-ID")
+			incomplete := Incomplete[originID]
+			if incomplete == nil {
+				return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID)
+			}
+			// The origin is segment 1, so the next expected number is
+			// the count of continuations seen so far plus two.
+			segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1)
+			if segNum != segNumExpected {
+				return fmt.Errorf(
+					"unexpected WARC-Segment-Number %s != %s",
+					segNum, segNumExpected,
+				)
+			}
+			incomplete.Continuations = append(incomplete.Continuations, rec)
+			if rec.Hdr.Get("WARC-Segment-Total-Length") != "" {
+				if incomplete.WARCPath == warcPath {
+					// The origin lives in the file being indexed right
+					// now: its map is not in WARCs yet (and would be
+					// overwritten below anyway), so use the local one.
+					uris[incomplete.URI()] = incomplete
+				} else {
+					WARCsM.Lock()
+					origin := WARCs[incomplete.WARCPath]
+					if origin != nil {
+						origin[incomplete.URI()] = incomplete
+					}
+					WARCsM.Unlock()
+					if origin == nil {
+						return fmt.Errorf(
+							"origin WARC is not indexed: %q",
+							incomplete.WARCPath,
+						)
+					}
+				}
+				delete(Incomplete, originID)
+			}
+		}
+	}
+	WARCsM.Lock()
+	WARCs[warcPath] = uris
+	WARCsM.Unlock()
+	return nil
+}
+
+// SaveIndexes writes a gob-encoded index file (path+IndexExt) for every
+// WARC in WARCs that does not have one yet. Each index is first written
+// to a ".tmp" sibling and then renamed into place, and a log line is
+// printed for every index saved.
+//
+// It stops at the first failure and returns that error; the temporary
+// file of the failed index is removed so that a later call can retry.
+func SaveIndexes() error {
+	WARCsM.RLock()
+	defer WARCsM.RUnlock()
+	for warcPath, uris := range WARCs {
+		p := warcPath + IndexExt
+		if _, err := os.Stat(p); err == nil {
+			continue
+		}
+		tmp := p + ".tmp"
+		fd, err := os.OpenFile(
+			tmp,
+			os.O_CREATE|os.O_WRONLY|os.O_EXCL,
+			os.FileMode(0666),
+		)
+		if err != nil {
+			return err
+		}
+		err = gob.NewEncoder(fd).Encode(&uris)
+		// A failed Close may mean the data never reached the disk, so it
+		// must prevent the rename just like an encoding failure does.
+		if errClose := fd.Close(); err == nil {
+			err = errClose
+		}
+		if err == nil {
+			err = os.Rename(tmp, p)
+		}
+		if err != nil {
+			// Best-effort cleanup: a leftover temporary file would make
+			// every later attempt fail on O_EXCL. The original error is
+			// the one worth reporting.
+			os.Remove(tmp)
+			return err
+		}
+		log.Println("saved:", p)
+	}
+	return nil
+}
/*
-tofuproxy -- HTTP proxy with TLS certificates management
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify