Sergey Matveev's repositories - tofuproxy.git/blob - warc/uris.go
WARC
[tofuproxy.git] / warc / uris.go
1 /*
2 tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
3 Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 package warc
19
20 import (
21         "encoding/gob"
22         "fmt"
23         "io"
24         "log"
25         "os"
26         "strconv"
27         "sync"
28 )
29
30 const IndexExt = ".idx.gob"
31
32 var (
33         WARCs  = map[string]map[string]*Record{}
34         WARCsM sync.RWMutex
35
36         Incomplete = map[string]*Record{}
37 )
38
39 func Add(warcPath string) error {
40         fd, err := os.Open(warcPath + IndexExt)
41         if err == nil {
42                 defer fd.Close()
43                 var uris map[string]*Record
44                 if err := gob.NewDecoder(fd).Decode(&uris); err != nil {
45                         return err
46                 }
47                 WARCsM.Lock()
48                 WARCs[warcPath] = uris
49                 WARCsM.Unlock()
50                 return nil
51         }
52         if err != nil && !os.IsNotExist(err) {
53                 return err
54         }
55         r, err := NewReader(warcPath)
56         if err != nil {
57                 return err
58         }
59         defer r.Close()
60         uris := map[string]*Record{}
61         for {
62                 rec, err := r.ReadRecord()
63                 if err != nil {
64                         if err == io.EOF {
65                                 break
66                         }
67                         return err
68                 }
69                 segNum := rec.Hdr.Get("WARC-Segment-Number")
70                 switch rec.Hdr.Get("WARC-Type") {
71                 case "response":
72                         uri := rec.URI()
73                         if uri == "" {
74                                 continue
75                         }
76                         if segNum == "1" {
77                                 Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec
78                                 continue
79                         }
80                         uris[uri] = rec
81                 case "continuation":
82                         originID := rec.Hdr.Get("WARC-Segment-Origin-ID")
83                         incomplete := Incomplete[originID]
84                         if incomplete == nil {
85                                 return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID)
86                         }
87                         segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1)
88                         if segNum != segNumExpected {
89                                 return fmt.Errorf(
90                                         "unexpected WARC-Segment-Number %s != %s",
91                                         segNum, segNumExpected,
92                                 )
93                         }
94                         incomplete.Continuations = append(incomplete.Continuations, rec)
95                         if rec.Hdr.Get("WARC-Segment-Total-Length") != "" {
96                                 WARCsM.Lock()
97                                 WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete
98                                 WARCsM.Unlock()
99                                 delete(Incomplete, originID)
100                         }
101                 }
102         }
103         WARCsM.Lock()
104         WARCs[warcPath] = uris
105         WARCsM.Unlock()
106         return nil
107 }
108
109 func SaveIndexes() error {
110         WARCsM.RLock()
111         defer WARCsM.RUnlock()
112         for warcPath, uris := range WARCs {
113                 p := warcPath + IndexExt
114                 if _, err := os.Stat(p); err == nil {
115                         continue
116                 }
117                 fd, err := os.OpenFile(
118                         p+".tmp",
119                         os.O_CREATE|os.O_WRONLY|os.O_EXCL,
120                         os.FileMode(0666),
121                 )
122                 if err != nil {
123                         return err
124                 }
125                 if err = gob.NewEncoder(fd).Encode(&uris); err != nil {
126                         fd.Close()
127                         return err
128                 }
129                 fd.Close()
130                 if err = os.Rename(p+".tmp", p); err != nil {
131                         return err
132                 }
133                 log.Println("saved:", p)
134         }
135         return nil
136 }