]> Sergey Matveev's repositories - tofuproxy.git/blob - warc/uris.go
Unify copyright comment format
[tofuproxy.git] / warc / uris.go
1 // tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU
2 //              manager, WARC/geminispace browser
3 // Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
4 //
5 // This program is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, version 3 of the License.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17 package warc
18
19 import (
20         "encoding/gob"
21         "errors"
22         "fmt"
23         "io"
24         "io/fs"
25         "log"
26         "os"
27         "strconv"
28         "sync"
29         "time"
30 )
31
32 const IndexExt = ".idx.gob"
33
34 var (
35         WARCs        = map[string]map[string]*Record{}
36         WARCsOffsets = map[string][]Offset{}
37         WARCsM       sync.RWMutex
38
39         Incomplete = map[string]*Record{}
40 )
41
42 func Add(warcPath string) error {
43         fd, err := os.Open(warcPath + IndexExt)
44         if err == nil {
45                 defer fd.Close()
46                 var uris map[string]*Record
47                 var offsets []Offset
48                 dec := gob.NewDecoder(fd)
49                 if err := dec.Decode(&uris); err != nil {
50                         return err
51                 }
52                 if err := dec.Decode(&offsets); err != nil {
53                         return err
54                 }
55                 WARCsM.Lock()
56                 WARCs[warcPath] = uris
57                 WARCsOffsets[warcPath] = offsets
58                 WARCsM.Unlock()
59                 log.Println("loaded marshalled index:", warcPath+IndexExt)
60                 return nil
61         }
62         if err != nil && !errors.Is(err, fs.ErrNotExist) {
63                 return err
64         }
65         r, err := NewReader(warcPath)
66         if err != nil {
67                 return err
68         }
69         defer r.Close()
70         uris := map[string]*Record{}
71         for {
72                 rec, _, err := r.ReadRecord()
73                 if err != nil {
74                         if err == io.EOF {
75                                 break
76                         }
77                         return err
78                 }
79                 rec.HdrLines = nil
80                 segNum := rec.Hdr.Get("WARC-Segment-Number")
81                 switch rec.Hdr.Get("WARC-Type") {
82                 case "response":
83                         uri := rec.URI()
84                         if uri == "" {
85                                 continue
86                         }
87                         if segNum == "1" {
88                                 Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec
89                                 continue
90                         }
91                         uris[uri] = rec
92                 case "continuation":
93                         originID := rec.Hdr.Get("WARC-Segment-Origin-ID")
94                         incomplete := Incomplete[originID]
95                         if incomplete == nil {
96                                 return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID)
97                         }
98                         segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1)
99                         if segNum != segNumExpected {
100                                 return fmt.Errorf(
101                                         "unexpected WARC-Segment-Number %s != %s",
102                                         segNum, segNumExpected,
103                                 )
104                         }
105                         incomplete.Continuations = append(incomplete.Continuations, rec)
106                         if rec.Hdr.Get("WARC-Segment-Total-Length") != "" {
107                                 if incomplete.WARCPath == warcPath {
108                                         uris[incomplete.URI()] = incomplete
109                                 } else {
110                                         WARCsM.Lock()
111                                         WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete
112                                         WARCsM.Unlock()
113                                 }
114                                 delete(Incomplete, originID)
115                         }
116                 }
117         }
118         r.Close()
119         WARCsM.Lock()
120         WARCs[warcPath] = uris
121         WARCsOffsets[warcPath] = r.offsets
122         WARCsM.Unlock()
123         return nil
124 }
125
126 func SaveIndices() error {
127         WARCsM.RLock()
128         defer WARCsM.RUnlock()
129         for warcPath, uris := range WARCs {
130                 p := warcPath + IndexExt
131                 if _, err := os.Stat(p); err == nil {
132                         continue
133                 }
134                 tmpSuffix := strconv.FormatInt(time.Now().UnixNano()+int64(os.Getpid()), 16)
135                 fd, err := os.OpenFile(
136                         p+tmpSuffix,
137                         os.O_WRONLY|os.O_CREATE|os.O_EXCL,
138                         os.FileMode(0666),
139                 )
140                 if err != nil {
141                         return err
142                 }
143                 enc := gob.NewEncoder(fd)
144                 if err = enc.Encode(&uris); err != nil {
145                         fd.Close()
146                         return err
147                 }
148                 offsets := WARCsOffsets[warcPath]
149                 if err = enc.Encode(&offsets); err != nil {
150                         fd.Close()
151                         return err
152                 }
153                 if err = fd.Close(); err != nil {
154                         return err
155                 }
156                 if err = os.Rename(p+tmpSuffix, p); err != nil {
157                         return err
158                 }
159                 log.Println("saved:", p)
160         }
161         return nil
162 }