]> Sergey Matveev's repositories - tofuproxy.git/blob - rounds/warc.go
2d0ea7fedb834ab1fda8be09ed521014c33da10c
[tofuproxy.git] / rounds / warc.go
1 /*
2 tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU
3              manager, WARC/geminispace browser
4 Copyright (C) 2021-2023 Sergey Matveev <stargrave@stargrave.org>
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, version 3 of the License.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 package rounds
20
21 import (
22         _ "embed"
23         "fmt"
24         "html/template"
25         "io"
26         "log"
27         "net/http"
28         "path"
29         "sort"
30         "strings"
31
32         "github.com/dustin/go-humanize"
33         "go.stargrave.org/tofuproxy/fifos"
34         "go.stargrave.org/tofuproxy/warc"
35 )
36
37 const WARCEntrypoint = "http://warc/"
38
39 var (
40         WARCOnly bool
41
42         //go:embed warc-index.tmpl
43         TmplWARCIndexRaw string
44         TmplWARCIndex    = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw))
45 )
46
// WARCEntry is a single row of the WARC index page.
//
// WARC is the base name of the WARC file holding the record, URI is the
// record's URI, and Size is the record's total size already formatted
// for humans. The field order matters to positional literals.
type WARCEntry struct {
	WARC, URI, Size string
}
52
53 type ByDepth []*WARCEntry
54
55 func (a ByDepth) Len() int {
56         return len(a)
57 }
58
59 func (a ByDepth) Swap(i, j int) {
60         a[i], a[j] = a[j], a[i]
61 }
62
63 func (a ByDepth) Less(i, j int) bool {
64         ci := len(a[i].WARC)
65         cj := len(a[j].WARC)
66         if ci != cj {
67                 return ci < cj
68         }
69         uriI := strings.TrimSuffix(a[i].URI, "/")
70         uriJ := strings.TrimSuffix(a[j].URI, "/")
71         ci = strings.Count(uriI, "/")
72         cj = strings.Count(uriJ, "/")
73         if ci != cj {
74                 return ci < cj
75         }
76         return len(uriI) < len(uriJ)
77 }
78
79 func RoundWARC(
80         host string,
81         resp *http.Response,
82         w http.ResponseWriter,
83         req *http.Request,
84 ) (bool, error) {
85         if req.URL.String() == WARCEntrypoint {
86                 var entries []*WARCEntry
87                 warc.WARCsM.RLock()
88                 for warcPath, uris := range warc.WARCs {
89                         for uri, rec := range uris {
90                                 entries = append(entries, &WARCEntry{
91                                         path.Base(warcPath),
92                                         uri,
93                                         humanize.IBytes(uint64(rec.TotalSize())),
94                                 })
95                         }
96                 }
97                 warc.WARCsM.RUnlock()
98                 sort.Sort(ByDepth(entries))
99                 err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries})
100                 if err == nil {
101                         return false, nil
102                 } else {
103                         log.Printf("WARC: error during %s: %+v\n", req.URL, err)
104                         return false, err
105                 }
106         }
107
108         var rec *warc.Record
109         var warcPath string
110         var uris map[string]*warc.Record
111         hostOrig := req.URL.Host
112         if req.URL.Scheme == "https" {
113                 req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443")
114         }
115         warc.WARCsM.RLock()
116         for warcPath, uris = range warc.WARCs {
117                 rec = uris[req.URL.String()]
118                 if rec != nil {
119                         break
120                 }
121         }
122         warc.WARCsM.RUnlock()
123         req.URL.Host = hostOrig
124         if rec == nil {
125                 if WARCOnly {
126                         http.NotFound(w, req)
127                         fifos.LogNonOK <- fmt.Sprintf(
128                                 "%s %s\tnot in WARC", req.Method, req.URL,
129                         )
130                         return false, nil
131                 }
132                 return true, nil
133         }
134
135         wr, err := rec.Reader(true, warc.WARCsOffsets)
136         if err != nil {
137                 log.Printf("WARC: error during %s: %+v\n", req.URL, err)
138                 return false, err
139         }
140         defer wr.Close()
141         hj, ok := w.(http.Hijacker)
142         if !ok {
143                 http.Error(w, "can not hijack", http.StatusInternalServerError)
144                 return false, err
145         }
146         conn, _, err := hj.Hijack()
147         if err != nil {
148                 panic(err)
149         }
150         _, err = io.Copy(conn, wr)
151         conn.Close()
152         fifos.LogWARC <- fmt.Sprintf(
153                 "%s %s\t%s\t%s\t%s",
154                 req.Method, req.URL,
155                 strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"),
156                 warcPath,
157                 humanize.IBytes(uint64(rec.TotalSize())),
158         )
159         return false, err
160 }