]> Sergey Matveev's repositories - tofuproxy.git/blob - rounds/warc.go
ddb566c7d40f44648d1caf90b4b25d8fa07b502e
[tofuproxy.git] / rounds / warc.go
1 /*
2 tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
3 Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 package rounds
19
20 import (
21         _ "embed"
22         "fmt"
23         "html/template"
24         "io"
25         "log"
26         "net/http"
27         "path"
28         "sort"
29         "strings"
30
31         "github.com/dustin/go-humanize"
32         "go.stargrave.org/tofuproxy/fifos"
33         "go.stargrave.org/tofuproxy/warc"
34 )
35
36 const WARCEntrypoint = "http://warc/"
37
38 var (
39         WARCOnly bool
40
41         //go:embed warc-index.tmpl
42         TmplWARCIndexRaw string
43         TmplWARCIndex    = template.Must(template.New("warc-index").Parse(TmplWARCIndexRaw))
44 )
45
// WARCEntry is one row of the WARC index page.
type WARCEntry struct {
	WARC string // base name of the WARC file the record was found in
	URI  string // URI of the record
	Size string // human-readable total size of the record
}

// ByDepth implements sort.Interface for a list of index entries.
//
// Entries are ordered by, in turn: length of the WARC file name, number
// of "/" in the URI (one trailing "/" is ignored), length of the URI,
// and finally the WARC name and the URI compared lexically. The last
// two keys exist only to break ties, so that the resulting order does
// not depend on the (random) map iteration order the entries were
// collected in.
type ByDepth []*WARCEntry

// Len returns the number of entries.
func (a ByDepth) Len() int {
	return len(a)
}

// Swap exchanges the entries at positions i and j.
func (a ByDepth) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}

// Less reports whether entry i must sort before entry j.
func (a ByDepth) Less(i, j int) bool {
	x, y := a[i], a[j]
	if lx, ly := len(x.WARC), len(y.WARC); lx != ly {
		return lx < ly
	}
	uriX := strings.TrimSuffix(x.URI, "/")
	uriY := strings.TrimSuffix(y.URI, "/")
	if dx, dy := strings.Count(uriX, "/"), strings.Count(uriY, "/"); dx != dy {
		return dx < dy
	}
	if len(uriX) != len(uriY) {
		return len(uriX) < len(uriY)
	}
	// Everything above is equal: fall back to a lexical comparison so
	// that the order is total and the rendered index is stable.
	if x.WARC != y.WARC {
		return x.WARC < y.WARC
	}
	return x.URI < y.URI
}
77
78 func RoundWARC(
79         host string,
80         resp *http.Response,
81         w http.ResponseWriter,
82         req *http.Request,
83 ) (bool, error) {
84         if req.URL.String() == WARCEntrypoint {
85                 var entries []*WARCEntry
86                 warc.WARCsM.RLock()
87                 for warcPath, uris := range warc.WARCs {
88                         for uri, rec := range uris {
89                                 entries = append(entries, &WARCEntry{
90                                         path.Base(warcPath),
91                                         uri,
92                                         humanize.IBytes(uint64(rec.TotalSize())),
93                                 })
94                         }
95                 }
96                 warc.WARCsM.RUnlock()
97                 sort.Sort(ByDepth(entries))
98                 err := TmplWARCIndex.Execute(w, struct{ Entries []*WARCEntry }{entries})
99                 if err == nil {
100                         return false, nil
101                 } else {
102                         log.Printf("WARC: error during %s: %+v\n", req.URL, err)
103                         return false, err
104                 }
105         }
106
107         var rec *warc.Record
108         var warcPath string
109         var uris map[string]*warc.Record
110         hostOrig := req.URL.Host
111         if req.URL.Scheme == "https" {
112                 req.URL.Host = strings.TrimSuffix(req.URL.Host, ":443")
113         }
114         warc.WARCsM.RLock()
115         for warcPath, uris = range warc.WARCs {
116                 rec = uris[req.URL.String()]
117                 if rec != nil {
118                         break
119                 }
120         }
121         warc.WARCsM.RUnlock()
122         req.URL.Host = hostOrig
123         if rec == nil {
124                 if WARCOnly {
125                         http.NotFound(w, req)
126                         fifos.LogNonOK <- fmt.Sprintf("%s %s\tnot in WARC", req.Method, req.URL)
127                         return false, nil
128                 }
129                 return true, nil
130         }
131
132         wr, err := rec.Reader(true, warc.WARCsOffsets)
133         if err != nil {
134                 log.Printf("WARC: error during %s: %+v\n", req.URL, err)
135                 return false, err
136         }
137         defer wr.Close()
138         hj, ok := w.(http.Hijacker)
139         if !ok {
140                 http.Error(w, "can not hijack", http.StatusInternalServerError)
141                 return false, err
142         }
143         conn, _, err := hj.Hijack()
144         if err != nil {
145                 panic(err)
146         }
147         _, err = io.Copy(conn, wr)
148         conn.Close()
149         fifos.LogWARC <- fmt.Sprintf(
150                 "%s %s\t%s\t%s\t%s",
151                 req.Method, req.URL,
152                 strings.TrimSuffix(rec.Hdr.Get("Content-Type"), ";msgtype=response"),
153                 warcPath,
154                 humanize.IBytes(uint64(rec.TotalSize())),
155         )
156         return false, err
157 }