From 0c0a261a6ef4fddfc34a9150005f7964cc69c420 Mon Sep 17 00:00:00 2001 From: Sergey Matveev Date: Wed, 13 Oct 2021 21:35:38 +0300 Subject: [PATCH] WARC --- .gitignore | 1 + README | 2 +- cmd/certgen/main.go | 2 +- cmd/tofuproxy/main.go | 5 +- cmd/warc-extract/main.go | 74 +++++++++++++++++++ conn.go | 2 +- dane.go | 2 +- default.cmd.do | 2 +- doc/certs.texi | 34 +++++++++ doc/index.texi | 22 ++++-- doc/spies.texi | 14 ++++ doc/usage.texi | 61 +++------------ doc/warcs.texi | 86 +++++++++++++++++++++ fifos/del.go | 2 +- fifos/ensure.do | 5 +- fifos/list.go | 2 +- fifos/log.go | 3 +- fifos/multitail.sh | 3 +- fifos/spies.go | 2 +- fifos/start.go | 26 +++++++ fifos/warcs.go | 78 ++++++++++++++++++++ go.mod | 1 + go.sum | 2 + httpauth.go | 2 +- rounds/denyFonts.go | 2 +- rounds/habrImage.go | 2 +- rounds/noHead.go | 2 +- rounds/reddit.go | 2 +- rounds/redirectHTML.go | 2 +- rounds/spy.go | 2 +- rounds/transcodeAVIF.go | 2 +- rounds/transcodeJXL.go | 2 +- rounds/transcodeWebP.go | 2 +- rounds/warc-index.tmpl | 16 ++++ rounds/warc.go | 156 +++++++++++++++++++++++++++++++++++++++ tls.go | 2 +- tlsauth.go | 2 +- tofuproxy.do | 3 - trip.go | 3 +- verify.go | 2 +- warc/compressed.go | 86 +++++++++++++++++++++ warc/header.go | 54 ++++++++++++++ warc/reader.go | 110 +++++++++++++++++++++++++++ warc/record.go | 118 +++++++++++++++++++++++++++++ warc/uris.go | 136 ++++++++++++++++++++++++++++++++++ x509.go | 2 +- 46 files changed, 1055 insertions(+), 86 deletions(-) create mode 100644 cmd/warc-extract/main.go create mode 100644 doc/certs.texi create mode 100644 doc/spies.texi create mode 100644 doc/warcs.texi create mode 100644 fifos/warcs.go create mode 100644 rounds/warc-index.tmpl create mode 100644 rounds/warc.go delete mode 100644 tofuproxy.do create mode 100644 warc/compressed.go create mode 100644 warc/header.go create mode 100644 warc/reader.go create mode 100644 warc/record.go create mode 100644 warc/uris.go diff --git a/.gitignore b/.gitignore index 1da43c8..087a229 
100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /certs /prv.pem /tofuproxy.cmd +/warc-extract.cmd diff --git a/README b/README index a8f0bf3..1f14d6e 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Home page: http://www.tofuproxy.stargrave.org/ diff --git a/cmd/certgen/main.go b/cmd/certgen/main.go index 1db79de..ab2b9f6 100644 --- a/cmd/certgen/main.go +++ b/cmd/certgen/main.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/cmd/tofuproxy/main.go b/cmd/tofuproxy/main.go index 96dbd32..d552339 100644 --- a/cmd/tofuproxy/main.go +++ b/cmd/tofuproxy/main.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify @@ -26,6 +26,7 @@ import ( "go.cypherpunks.ru/ucspi" "go.stargrave.org/tofuproxy" "go.stargrave.org/tofuproxy/fifos" + "go.stargrave.org/tofuproxy/rounds" ) func main() { @@ -37,6 +38,7 @@ func main() { dnsSrv := flag.String("dns", "[::1]:53", "DNS server") fifosDir := flag.String("fifos", "fifos", "Directory with FIFOs") notai := flag.Bool("notai", false, "Do not prepend TAI64N to logs") + warcOnly := flag.Bool("warc-only", false, "Server only WARC URIs") flag.Parse() log.SetFlags(log.Lshortfile) @@ -57,6 +59,7 @@ func main() { tofuproxy.DNSSrv = *dnsSrv tofuproxy.CACert = caCert tofuproxy.CAPrv = caPrv + rounds.WARCOnly = *warcOnly ln, err := net.Listen("tcp", *bind) if err != nil { diff --git a/cmd/warc-extract/main.go b/cmd/warc-extract/main.go new file mode 100644 index 0000000..8b1aa2a --- /dev/null +++ 
b/cmd/warc-extract/main.go @@ -0,0 +1,74 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package main + +import ( + "flag" + "fmt" + "io" + "log" + "os" + + "github.com/dustin/go-humanize" + "go.stargrave.org/tofuproxy/warc" +) + +func main() { + uri := flag.String("uri", "", "URI to extract, if specified") + hdr := flag.Bool("hdr", false, "Also extract WARC's header") + idx := flag.Bool("idx", false, "Save WARC indexes") + flag.Parse() + log.SetFlags(log.Lshortfile) + + for _, p := range flag.Args() { + log.Println("adding", p) + if err := warc.Add(p); err != nil { + log.Fatalln(err) + } + log.Println("added", p, len(warc.WARCs[p]), "URIs") + if *idx { + if err := warc.SaveIndexes(); err != nil { + log.Fatalln(err) + } + } + } + if *uri == "" { + for warcPath, uris := range warc.WARCs { + for uri, rec := range uris { + fmt.Printf( + "%s\t%s\t%s\n", + warcPath, uri, + humanize.IBytes(uint64(rec.TotalSize())), + ) + } + } + } else { + for _, uris := range warc.WARCs { + rec := uris[*uri] + if rec == nil { + continue + } + r, err := rec.Reader(!*hdr) + if err != nil { + log.Fatalln(err) + } + io.Copy(os.Stdout, r) + } + } + return +} diff --git a/conn.go b/conn.go index baa7c8d..4b88f13 100644 --- a/conn.go +++ b/conn.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates 
management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/dane.go b/dane.go index d3218dd..9ab9504 100644 --- a/dane.go +++ b/dane.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/default.cmd.do b/default.cmd.do index d15196b..9473531 100644 --- a/default.cmd.do +++ b/default.cmd.do @@ -1,3 +1,3 @@ -redo-ifchange *.go cmd/*/*.go fifos/*.go rounds/*.go +redo-ifchange *.go cmd/*/*.go fifos/*.go rounds/*.go warc/*.go GO_LDFLAGS="${GO_LDFLAGS:--ldflags=-s}" ${GO:-go} build -o $3 $GO_LDFLAGS ./cmd/${1%.cmd} diff --git a/doc/certs.texi b/doc/certs.texi new file mode 100644 index 0000000..b109dc8 --- /dev/null +++ b/doc/certs.texi @@ -0,0 +1,34 @@ +@node CertTrust +@section Certificate trust management + +When you encounter something requiring your attention and decision, you +will be shown a Tk dialog through the @command{wish} invocation. GnuTLS'es +@command{certtool} is used for certificate information printing. 
+ +@image{dialog,,,Example dialog,.webp} + +The certificate trust decision dialog (like the one above) has multiple hotkeys: + +@itemize +@item @code{a} -- accept and save certificate chain to disk +@item @code{o} -- accept once per session (@command{tofuproxy} running) +@item @code{r} -- reject certificate +@item @code{q} -- reject certificate really once, same as closing the window +@item @code{n} -- next page of "their" certificate chain +@item @code{p} -- previous page of "their" certificate chain +@item @code{N} -- next page of "our" certificate chain +@item @code{P} -- previous page of "our" certificate chain +@end itemize + +To list currently accepted, rejected, HTTP authorized, TLS client +authenticated hosts: + +@example +$ cat fifos/list-@{accepted,rejected,http-auth,tls-auth@} +@end example + +To remove knowledge of the host from any of the states mentioned above: + +@example +$ echo www.example.com > fifos/del-tls-auth +@end example diff --git a/doc/index.texi b/doc/index.texi index 8118303..b97500d 100644 --- a/doc/index.texi +++ b/doc/index.texi @@ -45,9 +45,14 @@ Why the hell people just do not send PostScript documents instead!? @item And wonderful @url{http://jpegxl.info/, JPEG XL} image format is not supported by most browsers. Even pretty old -@url{https://developers.google.com/speed/webp, WebP} is not supported -everywhere. @url{https://aomediacodec.github.io/av1-avif/, AVIF} would -be useful too. +@url{https://developers.google.com/speed/webp, WebP}, which has the highest +compression ratio for lossless screenshots, is not supported everywhere. +@url{https://aomediacodec.github.io/av1-avif/, AVIF} could be useful too. + +@item No web browser is able to view web archives +(@url{https://en.wikipedia.org/wiki/Web_ARChive, WARC}s). And most +WARC-related software is written in Python, which nowadays is close to +impossible to install and use because of its broken dependency system. 
@end itemize @@ -120,8 +125,15 @@ Optional @strong{DANE-EE} check is also made for each domain you visit. TLS @strong{session resumption} and @strong{keep-alives} are also supported. @item -And Go itself tries also to act as a @url{https://http2.github.io/, -HTTP/2} client too. +And Go itself tries also to act as a @url{https://http2.github.io/, HTTP/2} +client too. + +@item +Ability to load @url{https://en.wikipedia.org/wiki/Web_ARChive, WARC} +files, possibly compressed, possibly continued and replace responses. + +@item + @end itemize diff --git a/doc/spies.texi b/doc/spies.texi new file mode 100644 index 0000000..afbb16a --- /dev/null +++ b/doc/spies.texi @@ -0,0 +1,14 @@ +@node Spies +@section Spies + +You can reject requests to the whole domains. As a rule some spying ones +(for advertisements and better user experience, they say). + +@example +$ tee fifos/add-spies < spies.txt +ads.google.com +advertising.yandex.ru +[...] +@end example + +All subdomains will be rejected too. diff --git a/doc/usage.texi b/doc/usage.texi index 7008456..c6f2611 100644 --- a/doc/usage.texi +++ b/doc/usage.texi @@ -1,14 +1,14 @@ @node Usage @unnumbered Usage -@itemize +Currently @command{tofuproxy} uses: +GnuTLS'es @url{https://www.gnutls.org/manual/html_node/certtool-Invocation.html, certtool}, +@url{http://cr.yp.to/redo.html, redo} build system, +@url{https://www.tcl.tk/, Tcl/Tk}'s @command{wish} shell for GUI dialogues, +@command{dwebp}, @command{djxl}, @command{avifdec} for images transcoding, +@url{https://github.com/halturin/multitail, multitail} for logs viewing. 
-@item Currently @command{tofuproxy} uses: - GnuTLS'es @url{https://www.gnutls.org/manual/html_node/certtool-Invocation.html, certtool}, - @url{http://cr.yp.to/redo.html, redo} build system, - @url{https://www.tcl.tk/, Tcl/Tk}'s @command{wish} shell for GUI dialogues, - @command{dwebp}, @command{djxl}, @command{avifdec} for images transcoding, - @url{https://github.com/halturin/multitail, multitail} for logs viewing. +@itemize @item Download and build @command{tofuproxy}: @@ -52,13 +52,6 @@ main.go:70: listening: [::1]:8080 dns: [::1]:53 certs: ./certs ccerts: ./ccerts If you want to use TLS client certificates, then place them to @file{-ccerts} directory. -@item -Load spying domains to reject to with: - -@example -$ cat spies.txt > fifos/add-spies -@end example - @item Watch logs: @example @@ -67,40 +60,8 @@ $ ( cd fifos ; ./multitail.sh ) @image{logs,,,Example logs,.webp} -@item -When you encounter something requiring your attention and decision, you -will be shown Tk-dialog through the @command{wish} invocation. GnuTLS'es -@command{certtool} is used for certificate information printing. 
- -@image{dialog,,,Example dialog,.webp} - -@item -Certificate trust decision dialog (like above one) has multiple hotkeys: - - @itemize - @item @code{a} -- accept and save certificate chain to disk - @item @code{o} -- accept once per session (@command{tofuproxy} running) - @item @code{r} -- reject certificate - @item @code{q} -- reject certificate really once, same as closing the window - @item @code{n} -- next page of "their" certificate chain - @item @code{p} -- previous page of "their" certificate chain - @item @code{N} -- next page of "our" certificate chain - @item @code{P} -- previous page of "our" certificate chain - @end itemize - -@item -To list currently accepted, rejected, HTTP authorized, TLS client -authenticated hosts: - -@example -$ cat fifos/list-@{accepted,rejected,http-auth,tls-auth@} -@end example - -@item -To remove knowledge of the host from any of the states mentioned above: - -@example -$ echo www.example.com > fifos/del-tls-auth -@end example - @end itemize + +@include spies.texi +@include certs.texi +@include warcs.texi diff --git a/doc/warcs.texi b/doc/warcs.texi new file mode 100644 index 0000000..9170553 --- /dev/null +++ b/doc/warcs.texi @@ -0,0 +1,86 @@ +@node WARCs +@section WARCs management + +To view WARC files, you have to load them in daemon. Responses will be +transparently replaced from those WARCs for corresponding URIs. + +There is no strict validation or checking of WARCs correctness at all! +But built-in WARC support seems to be good enough for various sources. +Uncompressed, @command{gzip} (multiple streams and single stream are +supported) and @command{zstd} compressed ones are supported. + +Searching in compressed files is @strong{slow} -- every request will +lead to decompression of the file from the very beginning, so keeping +uncompressed WARCs on compressed ZFS dataset is much more preferable. +@command{tofuproxy} does not take advantage of multistream gzip files. 
+ +@itemize + +@item +Load WARCs: + +@example +$ tee fifos/add-warcs < warcs.txt +smth.warc-00000.warc.gz +smth.warc-00001.warc.gz +smth.warc-00002.warc.gz +another.warc +@end example + +@item +Visit a URI that you know exists in those WARCs, or go to +@url{http://warc/} to view the full list of known URIs loaded from +those WARCs. + +@item +Pay attention: the order in which WARCs are loaded is important! A WARC can be +segmented and a single response can be split across multiple WARC files. +Each following WARC file will overwrite possibly already existing URIs. + +@item +To list and delete loaded known WARCs: + +@example +$ cat fifos/list-warcs +smth.warc-00000.warc.gz 154 +smth.warc-00001.warc.gz 13 +smth.warc-00002.warc.gz 0 +another.warc 123 +$ echo another.warc > fifos/del-warcs +@end example + +One possible reason that @file{smth.warc-00002.warc.gz} has no URIs is that +it contains continuation segments of segmented records. + +@end itemize + +Loading a WARC involves reading it in full and remembering where each +URI response is located. You can @code{echo SAVE > fifos/add-warcs} to +save the in-memory index to disk as a @file{....warc.idx.gob} file. During +the next load, if that file exists, it is used as the index immediately, +without the expensive WARC reading. + +@code{redo warc-extract.cmd} builds the @command{warc-extract.cmd} utility, +which uses exactly the same code for parsing WARCs. It can be used to +check if WARCs can be successfully loaded, to list all URIs afterwards, to +extract a specified URI and to pre-generate @file{.idx.gob} indexes. + +@example +$ warc-extract.cmd -idx \ + smth.warc-00000.warc.gz \ + smth.warc-00001.warc.gz \ + smth.warc-00002.warc.gz +$ warc-extract.cmd -uri http://some/uri \ + smth.warc-00000.warc.gz \ + smth.warc-00001.warc.gz \ + smth.warc-00002.warc.gz +@end example + +@url{https://www.gnu.org/software/wget/, GNU Wget} can be easily used to +create WARCs: + +@example +$ wget ... 
[--page-requisites] [--recursive] \ + --no-warc-keep-log --no-warc-digests [--warc-max-size=XXX] \ + --warc-file smth.warc ... +@end example diff --git a/fifos/del.go b/fifos/del.go index 2ab755d..8e5a184 100644 --- a/fifos/del.go +++ b/fifos/del.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/fifos/ensure.do b/fifos/ensure.do index 6af361f..85a674b 100644 --- a/fifos/ensure.do +++ b/fifos/ensure.do @@ -1,8 +1,9 @@ -for f in cert dane err http-auth non-ok ok redir req tls tls-auth various ; do +for f in cert dane err http-auth non-ok ok redir req tls tls-auth various warc ; do [ -p log-$f ] || mkfifo log-$f done -for f in accepted http-auth rejected spies tls-auth ; do +for f in accepted http-auth rejected spies tls-auth warcs ; do [ -p list-$f ] || mkfifo list-$f [ -p del-$f ] || mkfifo del-$f done [ -p add-spies ] || mkfifo add-spies +[ -p add-warcs ] || mkfifo add-warcs diff --git a/fifos/list.go b/fifos/list.go index b06a4cc..8625a81 100644 --- a/fifos/list.go +++ b/fifos/list.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/fifos/log.go b/fifos/log.go index 398aa25..0b717a1 100644 --- a/fifos/log.go +++ b/fifos/log.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify @@ -39,6 +39,7 @@ var ( LogTLS = make(chan string) LogTLSAuth = make(chan string) LogVarious = make(chan string) + LogWARC = make(chan string) ) func logger(c chan 
string, p string) { diff --git a/fifos/multitail.sh b/fifos/multitail.sh index 9f678a8..f249f0e 100755 --- a/fifos/multitail.sh +++ b/fifos/multitail.sh @@ -12,4 +12,5 @@ multitail \ -t "Redirect " -ci cyan --label "R " -L "while :; do tai64nlocal < log-redir ; done" \ -t "Request" -ci blue --label "> " -L "while :; do tai64nlocal < log-req ; done" \ -t "TLS connection" -ci yellow --label "S " -L "while :; do tai64nlocal < log-tls ; done" \ - -t "Various" -ci yellow -L "while :; do tai64nlocal < log-various ; done" + -t "Various" -ci yellow -L "while :; do tai64nlocal < log-various ; done" \ + -t "WARC" -ci green --label "W " -L "while :; do tai64nlocal < log-warc ; done" diff --git a/fifos/spies.go b/fifos/spies.go index 0b6aa57..4edc151 100644 --- a/fifos/spies.go +++ b/fifos/spies.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/fifos/start.go b/fifos/start.go index a005c64..528b2a9 100644 --- a/fifos/start.go +++ b/fifos/start.go @@ -1,9 +1,27 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + package fifos import ( "path/filepath" "go.stargrave.org/tofuproxy/caches" + "go.stargrave.org/tofuproxy/warc" ) func Start(fifos string) { @@ -18,12 +36,14 @@ func Start(fifos string) { go logger(LogTLS, filepath.Join(fifos, "log-tls")) go logger(LogTLSAuth, filepath.Join(fifos, "log-tls-auth")) go logger(LogVarious, filepath.Join(fifos, "log-various")) + go logger(LogWARC, filepath.Join(fifos, "log-warc")) go listAccepted(filepath.Join(fifos, "list-accepted")) go listHTTPAuth(filepath.Join(fifos, "list-http-auth")) go listRejected(filepath.Join(fifos, "list-rejected")) go listSpies(filepath.Join(fifos, "list-spies")) go listTLSAuth(filepath.Join(fifos, "list-tls-auth")) + go listWARCs(filepath.Join(fifos, "list-warcs")) go del( &caches.AcceptedM, func(host string) { delete(caches.Accepted, host) }, @@ -55,4 +75,10 @@ func Start(fifos string) { }, filepath.Join(fifos, "del-spies"), ) + + go addWARC(filepath.Join(fifos, "add-warcs")) + go del( + &warc.WARCsM, func(warcPath string) { delete(warc.WARCs, warcPath) }, + filepath.Join(fifos, "del-warcs"), + ) } diff --git a/fifos/warcs.go b/fifos/warcs.go new file mode 100644 index 0000000..6d38700 --- /dev/null +++ b/fifos/warcs.go @@ -0,0 +1,78 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +package fifos + +import ( + "bufio" + "fmt" + "log" + "os" + + "go.stargrave.org/tofuproxy/warc" +) + +func listWARCs(p string) { + for { + fd, err := os.OpenFile(p, os.O_WRONLY|os.O_APPEND, os.FileMode(0666)) + if err != nil { + log.Fatalln(err) + } + warc.WARCsM.RLock() + for warcPath, uris := range warc.WARCs { + fmt.Fprintf(fd, "%s\t%d\n", warcPath, len(uris)) + } + warc.WARCsM.RUnlock() + fd.Close() + } +} + +func addWARC(p string) { + for { + fd, err := os.OpenFile(p, os.O_RDONLY, os.FileMode(0666)) + if err != nil { + log.Fatalln(err) + } + var warcPaths []string + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + t := scanner.Text() + if len(t) > 0 { + warcPaths = append(warcPaths, t) + } + } + fd.Close() + for _, warcPath := range warcPaths { + if warcPath == "SAVE" { + if err = warc.SaveIndexes(); err != nil { + log.Printf("%s: can not save index %s: %+v\n", p, warcPath, err) + } + continue + } + if _, exists := warc.WARCs[warcPath]; exists { + continue + } + log.Printf("%s: adding WARC %s\n", p, warcPath) + err = warc.Add(warcPath) + if err != nil { + log.Printf("%s: can not open %s: %+v\n", p, warcPath, err) + break + } + log.Printf("%s: %s: added %d URIs\n", p, warcPath, len(warc.WARCs[warcPath])) + } + } +} diff --git a/go.mod b/go.mod index 779bef0..0dd2f43 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.17 require ( github.com/dustin/go-humanize v1.0.0 + github.com/klauspost/compress v1.13.6 github.com/miekg/dns v1.1.29 go.cypherpunks.ru/tai64n/v2 v2.0.0 go.cypherpunks.ru/ucspi v0.0.0-20210908140534-cfdc20a8225f diff --git a/go.sum b/go.sum index a3c1ed0..5b0005c 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc= +github.com/klauspost/compress v1.13.6/go.mod 
h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/miekg/dns v1.1.29 h1:xHBEhR+t5RzcFJjBLJlax2daXOrTYtr9z4WdKEfWFzg= github.com/miekg/dns v1.1.29/go.mod h1:KNUDUusw/aVsxyTYZM1oqvCicbwhgbNgztCETuNZ7xM= go.cypherpunks.ru/tai64n/v2 v2.0.0 h1:AlohA1/zRqInhIGK7CVnn7tC5/vt1TaOAEyBgeu5Ruo= diff --git a/httpauth.go b/httpauth.go index 79b51ec..ade7801 100644 --- a/httpauth.go +++ b/httpauth.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/denyFonts.go b/rounds/denyFonts.go index ad55310..efc3d4f 100644 --- a/rounds/denyFonts.go +++ b/rounds/denyFonts.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/habrImage.go b/rounds/habrImage.go index 86f21aa..941832e 100644 --- a/rounds/habrImage.go +++ b/rounds/habrImage.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/noHead.go b/rounds/noHead.go index e382432..c61af78 100644 --- a/rounds/noHead.go +++ b/rounds/noHead.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/reddit.go b/rounds/reddit.go index 7b5668e..fb126ec 100644 --- a/rounds/reddit.go +++ b/rounds/reddit.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates 
management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/redirectHTML.go b/rounds/redirectHTML.go index 2bf4ce2..d13047a 100644 --- a/rounds/redirectHTML.go +++ b/rounds/redirectHTML.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/spy.go b/rounds/spy.go index e830701..affc1cb 100644 --- a/rounds/spy.go +++ b/rounds/spy.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/transcodeAVIF.go b/rounds/transcodeAVIF.go index 06a0ffa..c315b19 100644 --- a/rounds/transcodeAVIF.go +++ b/rounds/transcodeAVIF.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/transcodeJXL.go b/rounds/transcodeJXL.go index e8a9628..94fa07a 100644 --- a/rounds/transcodeJXL.go +++ b/rounds/transcodeJXL.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/transcodeWebP.go b/rounds/transcodeWebP.go index a38b7f2..2939b08 100644 --- a/rounds/transcodeWebP.go +++ b/rounds/transcodeWebP.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy 
-- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/rounds/warc-index.tmpl b/rounds/warc-index.tmpl new file mode 100644 index 0000000..e43a287 --- /dev/null +++ b/rounds/warc-index.tmpl @@ -0,0 +1,16 @@ + + + + + WARC URIs + + + +{{range $idx, $entry := .Entries}} + + + +{{end}} +
{{$idx}}{{.WARC}}{{.Size}}
{{.URI}}
+ + diff --git a/rounds/warc.go b/rounds/warc.go new file mode 100644 index 0000000..d9ab78e --- /dev/null +++ b/rounds/warc.go @@ -0,0 +1,156 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package rounds + +import ( + _ "embed" + "fmt" + "html/template" + "io" + "log" + "net/http" + "path" + "sort" + "strings" + + "github.com/dustin/go-humanize" + "go.stargrave.org/tofuproxy/fifos" + "go.stargrave.org/tofuproxy/warc" +) + +const WARCEntrypoint = "http://warc/" + +var ( + WARCOnly bool + + //go:embed warc-index.tmpl + TmplWARCIndexRaw string + TmplWARCIndex = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw)) +) + +type WARCEntry struct { + WARC string + URI string + Size string +} + +type ByDepth []*WARCEntry + +func (a ByDepth) Len() int { + return len(a) +} + +func (a ByDepth) Swap(i, j int) { + a[i], a[j] = a[j], a[i] +} + +func (a ByDepth) Less(i, j int) bool { + ci := len(a[i].WARC) + cj := len(a[j].WARC) + if ci != cj { + return ci < cj + } + uriI := strings.TrimSuffix(a[i].URI, "/") + uriJ := strings.TrimSuffix(a[j].URI, "/") + ci = strings.Count(uriI, "/") + cj = strings.Count(uriJ, "/") + if ci != cj { + return ci < cj + } + return len(uriI) < len(uriJ) +} + +func RoundWARC( + host string, + resp *http.Response, + w http.ResponseWriter, + req *http.Request, +) (bool, error) { + if req.URL.String() == WARCEntrypoint { + var entries 
[]*WARCEntry + warc.WARCsM.RLock() + for warcPath, uris := range warc.WARCs { + for uri, rec := range uris { + entries = append(entries, &WARCEntry{ + path.Base(warcPath), + uri, + humanize.IBytes(uint64(rec.TotalSize())), + }) + } + } + warc.WARCsM.RUnlock() + sort.Sort(ByDepth(entries)) + err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries}) + if err == nil { + return false, nil + } else { + log.Printf("WARC: error during %s: %+v\n", req.URL, err) + return false, err + } + } + + var rec *warc.Record + var warcPath string + var uris map[string]*warc.Record + hostOrig := req.URL.Host + if req.URL.Scheme == "https" { + req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443") + } + warc.WARCsM.RLock() + for warcPath, uris = range warc.WARCs { + rec = uris[req.URL.String()] + if rec != nil { + break + } + } + warc.WARCsM.RUnlock() + req.URL.Host = hostOrig + if rec == nil { + if WARCOnly { + http.NotFound(w, req) + fifos.LogNonOK <- fmt.Sprintf("%s %s\tnot in WARC", req.Method, req.URL) + return false, nil + } + return true, nil + } + + wr, err := rec.Reader(true) + if err != nil { + log.Printf("WARC: error during %s: %+v\n", req.URL, err) + return false, err + } + hj, ok := w.(http.Hijacker) + if !ok { + http.Error(w, "can not hijack", http.StatusInternalServerError) + return false, err + } + conn, _, err := hj.Hijack() + if err != nil { + panic(err) + } + _, err = io.Copy(conn, wr) + conn.Close() + fifos.LogWARC <- fmt.Sprintf( + "%s %s\t%s\t%s\t%s", + req.Method, req.URL, + strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"), + warcPath, + humanize.IBytes(uint64(rec.TotalSize())), + ) + return false, err +} diff --git a/tls.go b/tls.go index 8a807d0..dc59862 100644 --- a/tls.go +++ b/tls.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or 
modify diff --git a/tlsauth.go b/tlsauth.go index bf87b2c..fd9839a 100644 --- a/tlsauth.go +++ b/tlsauth.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/tofuproxy.do b/tofuproxy.do deleted file mode 100644 index fdb2e6c..0000000 --- a/tofuproxy.do +++ /dev/null @@ -1,3 +0,0 @@ -redo-ifchange *.go cmd/tofuproxy/*.go fifos/*.go rounds/*.go -GO_LDFLAGS="${GO_LDFLAGS:--ldflags=-s}" -${GO:-go} build -o $3 $GO_LDFLAGS ./cmd/tofuproxy diff --git a/trip.go b/trip.go index 2d04a18..b217750 100644 --- a/trip.go +++ b/trip.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify @@ -64,6 +64,7 @@ func roundTrip(w http.ResponseWriter, req *http.Request) { host := strings.TrimSuffix(req.URL.Host, ":443") for _, round := range []Round{ rounds.RoundNoHead, + rounds.RoundWARC, rounds.RoundDenySpy, rounds.RoundRedditOld, rounds.RoundHabrImage, diff --git a/verify.go b/verify.go index 29dbbc0..dc852a8 100644 --- a/verify.go +++ b/verify.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify diff --git a/warc/compressed.go b/warc/compressed.go new file mode 100644 index 0000000..baa6830 --- /dev/null +++ b/warc/compressed.go @@ -0,0 +1,86 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as 
published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package warc + +import ( + "compress/gzip" + "fmt" + "io" + "os" + "path" + + "github.com/klauspost/compress/zstd" +) + +type Compressed struct { + r io.ReadCloser + fd *os.File + offset int64 +} + +func (c *Compressed) Read(p []byte) (int, error) { + n, err := c.r.Read(p) + c.offset += int64(n) + return n, err +} + +func (c *Compressed) Close() error { + c.r.Close() + return c.fd.Close() +} + +func (c *Compressed) Seek(offset int64, whence int) (int64, error) { + if whence != io.SeekStart { + panic("can only seek from the start") + } + if _, err := io.CopyN(io.Discard, c, offset-c.offset); err != nil { + return 0, err + } + c.offset = offset + return c.offset, nil +} + +func Open(warcPath string) (io.ReadSeekCloser, error) { + ext := path.Ext(warcPath) + switch ext { + case ".warc": + return os.Open(warcPath) + case ".gz": + fd, err := os.Open(warcPath) + if err != nil { + return nil, err + } + gzr, err := gzip.NewReader(fd) + if err != nil { + return nil, err + } + gzr.Multistream(true) + return &Compressed{r: gzr, fd: fd}, nil + case ".zst": + fd, err := os.Open(warcPath) + if err != nil { + return nil, err + } + zstdr, err := zstd.NewReader(fd) + if err != nil { + return nil, err + } + return &Compressed{r: zstdr.IOReadCloser(), fd: fd}, nil + } + return nil, fmt.Errorf("unknown extensions: %s", ext) +} diff --git a/warc/header.go b/warc/header.go new file mode 100644 index 0000000..cb1b7a9 --- /dev/null +++ b/warc/header.go @@ -0,0 +1,54 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey 
// From warc/header.go (package warc). Uses strings.

// Header holds the named fields of a WARC record header. Keys are stored
// lowercased, so lookups through Get, Set and Del are case-insensitive.
type Header map[string]string

// cutKeyValue splits a "Key: value" header line at the first colon. The
// value has surrounding whitespace (including a trailing CRLF) removed; the
// key is returned untouched. ok is false when the line contains no colon.
func cutKeyValue(line string) (key, value string, ok bool) {
	parts := strings.SplitN(line, ":", 2)
	if len(parts) != 2 {
		return "", "", false
	}
	return parts[0], strings.TrimSpace(parts[1]), true
}

// splitKeyValue returns the key and the whitespace-trimmed value of a
// "Key: value" line, or two empty strings when the line has no colon.
func splitKeyValue(line string) (string, string) {
	key, value, _ := cutKeyValue(line)
	return key, value
}

// NewHeader returns an empty, ready to use Header.
func NewHeader() Header {
	return make(Header)
}

// Set stores value under key, replacing any previous value. The key is
// matched case-insensitively.
func (h Header) Set(key, value string) {
	h[strings.ToLower(key)] = value
}

// Get returns the value stored under key (case-insensitive), or an empty
// string if the field is absent.
func (h Header) Get(key string) string {
	return h[strings.ToLower(key)]
}

// Del removes the field named key (case-insensitive), if present.
func (h Header) Del(key string) {
	delete(h, strings.ToLower(key))
}

// AddLine parses a raw "Key: value" header line and stores the field.
// Lines without a colon are silently ignored.
func (h Header) AddLine(line string) {
	if key, value, ok := cutKeyValue(line); ok {
		h.Set(key, value)
	}
}
+ +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package warc + +import ( + "bufio" + "fmt" + "io" + "strconv" + "strings" +) + +const CRLF = "\r\n" + +type Reader struct { + Path string + r *bufio.Reader + rsc io.ReadSeekCloser + offset int64 + prevRec *Record +} + +func NewReader(warcPath string) (*Reader, error) { + rsc, err := Open(warcPath) + if err != nil { + return nil, err + } + return &Reader{ + Path: warcPath, + rsc: rsc, + r: bufio.NewReader(rsc), + }, nil +} + +func (r *Reader) next() error { + if r.prevRec == nil { + return nil + } + if _, err := r.r.Discard(int(r.prevRec.Size)); err != nil { + return err + } + r.offset += int64(r.prevRec.HdrLen) + r.prevRec.Size + for i := 0; i < 2; i++ { + line, err := r.r.ReadString('\n') + if err != nil { + return err + } + r.offset += int64(len(line)) + if line != CRLF { + return fmt.Errorf("non-CRLF: %q", line) + } + } + return nil +} + +func (r *Reader) ReadRecord() (*Record, error) { + r.next() + line, err := r.r.ReadString('\n') + if err != nil { + return nil, err + } + if !strings.HasPrefix(line, "WARC/") { + return nil, fmt.Errorf("non-WARC header: %q", line) + } + hdrLen := len(line) + hdr := NewHeader() + for { + line, err := r.r.ReadString('\n') + if err != nil { + return nil, err + } + hdrLen += len(line) + if line == CRLF { + break + } + hdr.AddLine(line) + } + size, err := strconv.ParseUint(hdr.Get("Content-Length"), 10, 64) + if err != nil { + return nil, err + } + rec := &Record{ + WARCPath: r.Path, + Offset: r.offset, + Hdr: hdr, + HdrLen: hdrLen, + Size: int64(size), + } + r.prevRec = rec + return rec, nil +} + +func (r *Reader) Close() error { + return r.rsc.Close() +} diff --git a/warc/record.go b/warc/record.go new file mode 100644 index 0000000..2dd123d --- /dev/null +++ b/warc/record.go @@ -0,0 +1,118 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This 
program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package warc + +import ( + "io" + "strings" +) + +type Record struct { + WARCPath string + Offset int64 + Hdr Header + HdrLen int + Size int64 + + Continuations []*Record +} + +func (rec *Record) URI() string { + return strings.Trim(rec.Hdr.Get("WARC-Target-URI"), "<>") +} + +func (rec *Record) TotalSize() int64 { + s := rec.Size + for _, r := range rec.Continuations { + s += r.Size + } + return s +} + +type SelfRecordReader struct { + r *io.LimitedReader + rsc io.ReadSeekCloser +} + +func (srr *SelfRecordReader) Read(p []byte) (n int, err error) { + n, err = srr.r.Read(p) + if err != nil { + srr.Close() + } + return +} + +func (srr *SelfRecordReader) Close() error { + return srr.rsc.Close() +} + +func (rec *Record) selfReader(noHdr bool) (*SelfRecordReader, error) { + rsc, err := Open(rec.WARCPath) + if err != nil { + return nil, err + } + offset := rec.Offset + if noHdr { + offset += int64(rec.HdrLen) + } + if _, err = rsc.Seek(offset, io.SeekStart); err != nil { + rsc.Close() + return nil, err + } + return &SelfRecordReader{r: &io.LimitedReader{R: rsc, N: rec.Size}, rsc: rsc}, nil +} + +type RecordReader struct { + r io.Reader + srrs []*SelfRecordReader +} + +func (rec *Record) Reader(noHdr bool) (*RecordReader, error) { + srrs := make([]*SelfRecordReader, 0, 1+len(rec.Continuations)) + rs := make([]io.Reader, 0, 1+len(rec.Continuations)) + for i, r := range append([]*Record{rec}, rec.Continuations...) 
{ + if i > 0 { + noHdr = true + } + srr, err := r.selfReader(noHdr) + if err != nil { + for _, srr := range srrs { + srr.Close() + } + return nil, err + } + srrs = append(srrs, srr) + rs = append(rs, srr) + } + return &RecordReader{r: io.MultiReader(rs...), srrs: srrs}, nil +} + +func (rr *RecordReader) Read(p []byte) (n int, err error) { + n, err = rr.r.Read(p) + if err != nil { + rr.Close() + } + return +} + +func (rr *RecordReader) Close() error { + for _, srr := range rr.srrs { + srr.Close() + } + return nil +} diff --git a/warc/uris.go b/warc/uris.go new file mode 100644 index 0000000..a971ff0 --- /dev/null +++ b/warc/uris.go @@ -0,0 +1,136 @@ +/* +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management +Copyright (C) 2021 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +package warc + +import ( + "encoding/gob" + "fmt" + "io" + "log" + "os" + "strconv" + "sync" +) + +const IndexExt = ".idx.gob" + +var ( + WARCs = map[string]map[string]*Record{} + WARCsM sync.RWMutex + + Incomplete = map[string]*Record{} +) + +func Add(warcPath string) error { + fd, err := os.Open(warcPath + IndexExt) + if err == nil { + defer fd.Close() + var uris map[string]*Record + if err := gob.NewDecoder(fd).Decode(&uris); err != nil { + return err + } + WARCsM.Lock() + WARCs[warcPath] = uris + WARCsM.Unlock() + return nil + } + if err != nil && !os.IsNotExist(err) { + return err + } + r, err := NewReader(warcPath) + if err != nil { + return err + } + defer r.Close() + uris := map[string]*Record{} + for { + rec, err := r.ReadRecord() + if err != nil { + if err == io.EOF { + break + } + return err + } + segNum := rec.Hdr.Get("WARC-Segment-Number") + switch rec.Hdr.Get("WARC-Type") { + case "response": + uri := rec.URI() + if uri == "" { + continue + } + if segNum == "1" { + Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec + continue + } + uris[uri] = rec + case "continuation": + originID := rec.Hdr.Get("WARC-Segment-Origin-ID") + incomplete := Incomplete[originID] + if incomplete == nil { + return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID) + } + segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1) + if segNum != segNumExpected { + return fmt.Errorf( + "unexpected WARC-Segment-Number %s != %s", + segNum, segNumExpected, + ) + } + incomplete.Continuations = append(incomplete.Continuations, rec) + if rec.Hdr.Get("WARC-Segment-Total-Length") != "" { + WARCsM.Lock() + WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete + WARCsM.Unlock() + delete(Incomplete, originID) + } + } + } + WARCsM.Lock() + WARCs[warcPath] = uris + WARCsM.Unlock() + return nil +} + +func SaveIndexes() error { + WARCsM.RLock() + defer WARCsM.RUnlock() + for warcPath, uris := range WARCs { + p := warcPath + IndexExt + if _, err := os.Stat(p); 
err == nil { + continue + } + fd, err := os.OpenFile( + p+".tmp", + os.O_CREATE|os.O_WRONLY|os.O_EXCL, + os.FileMode(0666), + ) + if err != nil { + return err + } + if err = gob.NewEncoder(fd).Encode(&uris); err != nil { + fd.Close() + return err + } + fd.Close() + if err = os.Rename(p+".tmp", p); err != nil { + return err + } + log.Println("saved:", p) + } + return nil +} diff --git a/x509.go b/x509.go index f818af2..2d27b1a 100644 --- a/x509.go +++ b/x509.go @@ -1,5 +1,5 @@ /* -tofuproxy -- HTTP proxy with TLS certificates management +tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify -- 2.44.0