]> Sergey Matveev's repositories - tofuproxy.git/blobdiff - warc/uris.go
Download link for 0.6.0 release
[tofuproxy.git] / warc / uris.go
index a971ff0edce5f35660c9ec6c4c22dbc4a1c2fed3..56cd415d1a51868fc3b05e99bbd53403431107dd 100644 (file)
@@ -1,37 +1,40 @@
-/*
-tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
-Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, version 3 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
+// tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU
+//              manager, WARC/geminispace browser
+// Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 3 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 package warc
 
 import (
        "encoding/gob"
+       "errors"
        "fmt"
        "io"
+       "io/fs"
        "log"
        "os"
        "strconv"
        "sync"
+       "time"
 )
 
 const IndexExt = ".idx.gob"
 
 var (
-       WARCs  = map[string]map[string]*Record{}
-       WARCsM sync.RWMutex
+       WARCs        = map[string]map[string]*Record{}
+       WARCsOffsets = map[string][]Offset{}
+       WARCsM       sync.RWMutex
 
        Incomplete = map[string]*Record{}
 )
@@ -41,15 +44,22 @@ func Add(warcPath string) error {
        if err == nil {
                defer fd.Close()
                var uris map[string]*Record
-               if err := gob.NewDecoder(fd).Decode(&uris); err != nil {
+               var offsets []Offset
+               dec := gob.NewDecoder(fd)
+               if err := dec.Decode(&uris); err != nil {
+                       return err
+               }
+               if err := dec.Decode(&offsets); err != nil {
                        return err
                }
                WARCsM.Lock()
                WARCs[warcPath] = uris
+               WARCsOffsets[warcPath] = offsets
                WARCsM.Unlock()
+               log.Println("loaded marshalled index:", warcPath+IndexExt)
                return nil
        }
-       if err != nil && !os.IsNotExist(err) {
+       if err != nil && !errors.Is(err, fs.ErrNotExist) {
                return err
        }
        r, err := NewReader(warcPath)
@@ -59,13 +69,14 @@ func Add(warcPath string) error {
        defer r.Close()
        uris := map[string]*Record{}
        for {
-               rec, err := r.ReadRecord()
+               rec, _, err := r.ReadRecord()
                if err != nil {
                        if err == io.EOF {
                                break
                        }
                        return err
                }
+               rec.HdrLines = nil
                segNum := rec.Hdr.Get("WARC-Segment-Number")
                switch rec.Hdr.Get("WARC-Type") {
                case "response":
@@ -93,20 +104,26 @@ func Add(warcPath string) error {
                        }
                        incomplete.Continuations = append(incomplete.Continuations, rec)
                        if rec.Hdr.Get("WARC-Segment-Total-Length") != "" {
-                               WARCsM.Lock()
-                               WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete
-                               WARCsM.Unlock()
+                               if incomplete.WARCPath == warcPath {
+                                       uris[incomplete.URI()] = incomplete
+                               } else {
+                                       WARCsM.Lock()
+                                       WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete
+                                       WARCsM.Unlock()
+                               }
                                delete(Incomplete, originID)
                        }
                }
        }
+       r.Close()
        WARCsM.Lock()
        WARCs[warcPath] = uris
+       WARCsOffsets[warcPath] = r.offsets
        WARCsM.Unlock()
        return nil
 }
 
-func SaveIndexes() error {
+func SaveIndices() error {
        WARCsM.RLock()
        defer WARCsM.RUnlock()
        for warcPath, uris := range WARCs {
@@ -114,20 +131,29 @@ func SaveIndexes() error {
                if _, err := os.Stat(p); err == nil {
                        continue
                }
+               tmpSuffix := strconv.FormatInt(time.Now().UnixNano()+int64(os.Getpid()), 16)
                fd, err := os.OpenFile(
-                       p+".tmp",
-                       os.O_CREATE|os.O_WRONLY|os.O_EXCL,
+                       p+tmpSuffix,
+                       os.O_WRONLY|os.O_CREATE|os.O_EXCL,
                        os.FileMode(0666),
                )
                if err != nil {
                        return err
                }
-               if err = gob.NewEncoder(fd).Encode(&uris); err != nil {
+               enc := gob.NewEncoder(fd)
+               if err = enc.Encode(&uris); err != nil {
                        fd.Close()
                        return err
                }
-               fd.Close()
-               if err = os.Rename(p+".tmp", p); err != nil {
+               offsets := WARCsOffsets[warcPath]
+               if err = enc.Encode(&offsets); err != nil {
+                       fd.Close()
+                       return err
+               }
+               if err = fd.Close(); err != nil {
+                       return err
+               }
+               if err = os.Rename(p+tmpSuffix, p); err != nil {
                        return err
                }
                log.Println("saved:", p)