]> Sergey Matveev's repositories - tofuproxy.git/blob - rounds/warc.go
0ca47777a5f939055daf74fb348eca7e2a0567f7
[tofuproxy.git] / rounds / warc.go
1 /*
2 tofuproxy -- flexible HTTP proxy, TLS terminator, X.509 certificates
3              manager, WARC/Gemini browser
4 Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, version 3 of the License.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 package rounds
20
21 import (
22         _ "embed"
23         "fmt"
24         "html/template"
25         "io"
26         "log"
27         "net/http"
28         "path"
29         "sort"
30         "strings"
31
32         "github.com/dustin/go-humanize"
33         "go.stargrave.org/tofuproxy/fifos"
34         "go.stargrave.org/tofuproxy/warc"
35 )
36
37 const WARCEntrypoint = "http://warc/"
38
39 var (
40         WARCOnly bool
41
42         //go:embed warc-index.tmpl
43         TmplWARCIndexRaw string
44         TmplWARCIndex    = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw))
45 )
46
// WARCEntry is one row of the WARC index page rendered by RoundWARC.
type WARCEntry struct {
	// WARC is the base name of the WARC file holding the record.
	WARC string
	// URI is the key under which the record is stored for that WARC.
	URI string
	// Size is the human-readable total size of the record.
	Size string
}

// ByDepth implements sort.Interface over index entries. Entries are
// ordered by, in turn:
//
//  1. length of the WARC name;
//  2. number of "/" in the URI (one trailing "/" is ignored);
//  3. length of the URI (one trailing "/" is ignored);
//  4. WARC name, lexicographically;
//  5. URI, lexicographically.
//
// Keys 4 and 5 only break ties left by the first three, so that the
// resulting order does not depend on the order the entries were
// collected in.
type ByDepth []*WARCEntry

// Len returns the number of entries.
func (a ByDepth) Len() int {
	return len(a)
}

// Swap exchanges the entries at positions i and j.
func (a ByDepth) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}

// Less reports whether entry i must sort before entry j according to
// the key sequence documented on ByDepth.
func (a ByDepth) Less(i, j int) bool {
	x, y := a[i], a[j]
	if lx, ly := len(x.WARC), len(y.WARC); lx != ly {
		return lx < ly
	}
	// A single trailing slash must not make "http://host/dir/" look
	// deeper (or longer) than "http://host/dir".
	uriX := strings.TrimSuffix(x.URI, "/")
	uriY := strings.TrimSuffix(y.URI, "/")
	if dx, dy := strings.Count(uriX, "/"), strings.Count(uriY, "/"); dx != dy {
		return dx < dy
	}
	if len(uriX) != len(uriY) {
		return len(uriX) < len(uriY)
	}
	// Everything above is equal: fall back to lexicographic comparison
	// so that the ordering is total and the sort result deterministic.
	if x.WARC != y.WARC {
		return x.WARC < y.WARC
	}
	return x.URI < y.URI
}
78
79 func RoundWARC(
80         host string,
81         resp *http.Response,
82         w http.ResponseWriter,
83         req *http.Request,
84 ) (bool, error) {
85         if req.URL.String() == WARCEntrypoint {
86                 var entries []*WARCEntry
87                 warc.WARCsM.RLock()
88                 for warcPath, uris := range warc.WARCs {
89                         for uri, rec := range uris {
90                                 entries = append(entries, &WARCEntry{
91                                         path.Base(warcPath),
92                                         uri,
93                                         humanize.IBytes(uint64(rec.TotalSize())),
94                                 })
95                         }
96                 }
97                 warc.WARCsM.RUnlock()
98                 sort.Sort(ByDepth(entries))
99                 err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries})
100                 if err == nil {
101                         return false, nil
102                 } else {
103                         log.Printf("WARC: error during %s: %+v\n", req.URL, err)
104                         return false, err
105                 }
106         }
107
108         var rec *warc.Record
109         var warcPath string
110         var uris map[string]*warc.Record
111         hostOrig := req.URL.Host
112         if req.URL.Scheme == "https" {
113                 req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443")
114         }
115         warc.WARCsM.RLock()
116         for warcPath, uris = range warc.WARCs {
117                 rec = uris[req.URL.String()]
118                 if rec != nil {
119                         break
120                 }
121         }
122         warc.WARCsM.RUnlock()
123         req.URL.Host = hostOrig
124         if rec == nil {
125                 if WARCOnly {
126                         http.NotFound(w, req)
127                         fifos.LogNonOK <- fmt.Sprintf("%s %s\tnot in WARC", req.Method, req.URL)
128                         return false, nil
129                 }
130                 return true, nil
131         }
132
133         wr, err := rec.Reader(true, warc.WARCsOffsets)
134         if err != nil {
135                 log.Printf("WARC: error during %s: %+v\n", req.URL, err)
136                 return false, err
137         }
138         defer wr.Close()
139         hj, ok := w.(http.Hijacker)
140         if !ok {
141                 http.Error(w, "can not hijack", http.StatusInternalServerError)
142                 return false, err
143         }
144         conn, _, err := hj.Hijack()
145         if err != nil {
146                 panic(err)
147         }
148         _, err = io.Copy(conn, wr)
149         conn.Close()
150         fifos.LogWARC <- fmt.Sprintf(
151                 "%s %s\t%s\t%s\t%s",
152                 req.Method, req.URL,
153                 strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"),
154                 warcPath,
155                 humanize.IBytes(uint64(rec.TotalSize())),
156         )
157         return false, err
158 }