]> Sergey Matveev's repositories - tofuproxy.git/blob - rounds/warc.go
Unify copyright comment format
[tofuproxy.git] / rounds / warc.go
1 // tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU
2 //              manager, WARC/geminispace browser
3 // Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
4 //
5 // This program is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, version 3 of the License.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17 package rounds
18
19 import (
20         _ "embed"
21         "fmt"
22         "html/template"
23         "io"
24         "log"
25         "net/http"
26         "path"
27         "sort"
28         "strings"
29
30         "github.com/dustin/go-humanize"
31         "go.stargrave.org/tofuproxy/fifos"
32         "go.stargrave.org/tofuproxy/warc"
33 )
34
35 const WARCEntrypoint = "http://warc/"
36
37 var (
38         WARCOnly bool
39
40         //go:embed warc-index.tmpl
41         TmplWARCIndexRaw string
42         TmplWARCIndex    = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw))
43 )
44
// WARCEntry is a single row of the WARC index page.
type WARCEntry struct {
	// WARC is the base name of the WARC file that holds the record.
	WARC string
	// URI is the URI the record was stored under.
	URI string
	// Size is the human-readable size of the record.
	Size string
}

// ByDepth implements sort.Interface for WARC index entries.
//
// Entries are ordered by, in turn:
//
//  1. length of the WARC file name;
//  2. number of "/" in the URI (one trailing "/" is ignored);
//  3. length of the URI (one trailing "/" is ignored);
//  4. the WARC file name, lexicographically;
//  5. the URI, lexicographically.
//
// The last two keys are pure tie-breakers: they make the order total, so
// the result of sorting does not depend on the order of the input, which
// matters because entries are collected by iterating over maps and
// sort.Sort is not stable.
type ByDepth []*WARCEntry

// Len returns the number of entries.
func (a ByDepth) Len() int {
	return len(a)
}

// Swap exchanges the entries at indices i and j.
func (a ByDepth) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}

// Less reports whether entry i must be placed before entry j; see the
// ByDepth documentation for the list of sorting keys.
func (a ByDepth) Less(i, j int) bool {
	x, y := a[i], a[j]
	if len(x.WARC) != len(y.WARC) {
		return len(x.WARC) < len(y.WARC)
	}
	// A trailing slash must not make a URI look one level deeper.
	uriX := strings.TrimSuffix(x.URI, "/")
	uriY := strings.TrimSuffix(y.URI, "/")
	depthX := strings.Count(uriX, "/")
	depthY := strings.Count(uriY, "/")
	if depthX != depthY {
		return depthX < depthY
	}
	if len(uriX) != len(uriY) {
		return len(uriX) < len(uriY)
	}
	// Everything above is equal: fall back to plain string comparison so
	// that equal-ranked entries still get a reproducible order.
	if x.WARC != y.WARC {
		return x.WARC < y.WARC
	}
	return x.URI < y.URI
}
76
77 func RoundWARC(
78         host string,
79         resp *http.Response,
80         w http.ResponseWriter,
81         req *http.Request,
82 ) (bool, error) {
83         if req.URL.String() == WARCEntrypoint {
84                 var entries []*WARCEntry
85                 warc.WARCsM.RLock()
86                 for warcPath, uris := range warc.WARCs {
87                         for uri, rec := range uris {
88                                 entries = append(entries, &WARCEntry{
89                                         path.Base(warcPath),
90                                         uri,
91                                         humanize.IBytes(uint64(rec.TotalSize())),
92                                 })
93                         }
94                 }
95                 warc.WARCsM.RUnlock()
96                 sort.Sort(ByDepth(entries))
97                 err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries})
98                 if err == nil {
99                         return false, nil
100                 } else {
101                         log.Printf("WARC: error during %s: %+v\n", req.URL, err)
102                         return false, err
103                 }
104         }
105
106         var rec *warc.Record
107         var warcPath string
108         var uris map[string]*warc.Record
109         hostOrig := req.URL.Host
110         if req.URL.Scheme == "https" {
111                 req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443")
112         }
113         warc.WARCsM.RLock()
114         for warcPath, uris = range warc.WARCs {
115                 rec = uris[req.URL.String()]
116                 if rec != nil {
117                         break
118                 }
119         }
120         warc.WARCsM.RUnlock()
121         req.URL.Host = hostOrig
122         if rec == nil {
123                 if WARCOnly {
124                         http.NotFound(w, req)
125                         fifos.LogNonOK <- fmt.Sprintf(
126                                 "%s %s\tnot in WARC", req.Method, req.URL,
127                         )
128                         return false, nil
129                 }
130                 return true, nil
131         }
132
133         wr, err := rec.Reader(true, warc.WARCsOffsets)
134         if err != nil {
135                 log.Printf("WARC: error during %s: %+v\n", req.URL, err)
136                 return false, err
137         }
138         defer wr.Close()
139         hj, ok := w.(http.Hijacker)
140         if !ok {
141                 http.Error(w, "can not hijack", http.StatusInternalServerError)
142                 return false, err
143         }
144         conn, _, err := hj.Hijack()
145         if err != nil {
146                 panic(err)
147         }
148         _, err = io.Copy(conn, wr)
149         conn.Close()
150         fifos.LogWARC <- fmt.Sprintf(
151                 "%s %s\t%s\t%s\t%s",
152                 req.Method, req.URL,
153                 strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"),
154                 warcPath,
155                 humanize.IBytes(uint64(rec.TotalSize())),
156         )
157         return false, err
158 }