Sergey Matveev's repositories - tofuproxy.git/blob - warc/uris.go
Download link for 0.6.0 release
[tofuproxy.git] / warc / uris.go
1 /*
2 tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU
3              manager, WARC/geminispace browser
4 Copyright (C) 2021-2023 Sergey Matveev <stargrave@stargrave.org>
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, version 3 of the License.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 package warc
20
21 import (
22         "encoding/gob"
23         "errors"
24         "fmt"
25         "io"
26         "io/fs"
27         "log"
28         "os"
29         "strconv"
30         "sync"
31         "time"
32 )
33
34 const IndexExt = ".idx.gob"
35
36 var (
37         WARCs        = map[string]map[string]*Record{}
38         WARCsOffsets = map[string][]Offset{}
39         WARCsM       sync.RWMutex
40
41         Incomplete = map[string]*Record{}
42 )
43
44 func Add(warcPath string) error {
45         fd, err := os.Open(warcPath + IndexExt)
46         if err == nil {
47                 defer fd.Close()
48                 var uris map[string]*Record
49                 var offsets []Offset
50                 dec := gob.NewDecoder(fd)
51                 if err := dec.Decode(&uris); err != nil {
52                         return err
53                 }
54                 if err := dec.Decode(&offsets); err != nil {
55                         return err
56                 }
57                 WARCsM.Lock()
58                 WARCs[warcPath] = uris
59                 WARCsOffsets[warcPath] = offsets
60                 WARCsM.Unlock()
61                 log.Println("loaded marshalled index:", warcPath+IndexExt)
62                 return nil
63         }
64         if err != nil && !errors.Is(err, fs.ErrNotExist) {
65                 return err
66         }
67         r, err := NewReader(warcPath)
68         if err != nil {
69                 return err
70         }
71         defer r.Close()
72         uris := map[string]*Record{}
73         for {
74                 rec, _, err := r.ReadRecord()
75                 if err != nil {
76                         if err == io.EOF {
77                                 break
78                         }
79                         return err
80                 }
81                 rec.HdrLines = nil
82                 segNum := rec.Hdr.Get("WARC-Segment-Number")
83                 switch rec.Hdr.Get("WARC-Type") {
84                 case "response":
85                         uri := rec.URI()
86                         if uri == "" {
87                                 continue
88                         }
89                         if segNum == "1" {
90                                 Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec
91                                 continue
92                         }
93                         uris[uri] = rec
94                 case "continuation":
95                         originID := rec.Hdr.Get("WARC-Segment-Origin-ID")
96                         incomplete := Incomplete[originID]
97                         if incomplete == nil {
98                                 return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID)
99                         }
100                         segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1)
101                         if segNum != segNumExpected {
102                                 return fmt.Errorf(
103                                         "unexpected WARC-Segment-Number %s != %s",
104                                         segNum, segNumExpected,
105                                 )
106                         }
107                         incomplete.Continuations = append(incomplete.Continuations, rec)
108                         if rec.Hdr.Get("WARC-Segment-Total-Length") != "" {
109                                 if incomplete.WARCPath == warcPath {
110                                         uris[incomplete.URI()] = incomplete
111                                 } else {
112                                         WARCsM.Lock()
113                                         WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete
114                                         WARCsM.Unlock()
115                                 }
116                                 delete(Incomplete, originID)
117                         }
118                 }
119         }
120         r.Close()
121         WARCsM.Lock()
122         WARCs[warcPath] = uris
123         WARCsOffsets[warcPath] = r.offsets
124         WARCsM.Unlock()
125         return nil
126 }
127
128 func SaveIndices() error {
129         WARCsM.RLock()
130         defer WARCsM.RUnlock()
131         for warcPath, uris := range WARCs {
132                 p := warcPath + IndexExt
133                 if _, err := os.Stat(p); err == nil {
134                         continue
135                 }
136                 tmpSuffix := strconv.FormatInt(time.Now().UnixNano()+int64(os.Getpid()), 16)
137                 fd, err := os.OpenFile(
138                         p+tmpSuffix,
139                         os.O_WRONLY|os.O_CREATE|os.O_EXCL,
140                         os.FileMode(0666),
141                 )
142                 if err != nil {
143                         return err
144                 }
145                 enc := gob.NewEncoder(fd)
146                 if err = enc.Encode(&uris); err != nil {
147                         fd.Close()
148                         return err
149                 }
150                 offsets := WARCsOffsets[warcPath]
151                 if err = enc.Encode(&offsets); err != nil {
152                         fd.Close()
153                         return err
154                 }
155                 if err = fd.Close(); err != nil {
156                         return err
157                 }
158                 if err = os.Rename(p+tmpSuffix, p); err != nil {
159                         return err
160                 }
161                 log.Println("saved:", p)
162         }
163         return nil
164 }