// tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU // manager, WARC/geminispace browser // Copyright (C) 2021-2024 Sergey Matveev // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3 of the License. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . package warc import ( "encoding/gob" "errors" "fmt" "io" "io/fs" "log" "os" "strconv" "sync" "time" ) const IndexExt = ".idx.gob" var ( WARCs = map[string]map[string]*Record{} WARCsOffsets = map[string][]Offset{} WARCsM sync.RWMutex Incomplete = map[string]*Record{} ) func Add(warcPath string) error { fd, err := os.Open(warcPath + IndexExt) if err == nil { defer fd.Close() var uris map[string]*Record var offsets []Offset dec := gob.NewDecoder(fd) if err := dec.Decode(&uris); err != nil { return err } if err := dec.Decode(&offsets); err != nil { return err } WARCsM.Lock() WARCs[warcPath] = uris WARCsOffsets[warcPath] = offsets WARCsM.Unlock() log.Println("loaded marshalled index:", warcPath+IndexExt) return nil } if err != nil && !errors.Is(err, fs.ErrNotExist) { return err } r, err := NewReader(warcPath) if err != nil { return err } defer r.Close() uris := map[string]*Record{} for { rec, _, err := r.ReadRecord() if err != nil { if err == io.EOF { break } return err } rec.HdrLines = nil segNum := rec.Hdr.Get("WARC-Segment-Number") switch rec.Hdr.Get("WARC-Type") { case "response": uri := rec.URI() if uri == "" { continue } if segNum == "1" { Incomplete[rec.Hdr.Get("WARC-Record-ID")] = rec continue } uris[uri] = rec case "continuation": originID := rec.Hdr.Get("WARC-Segment-Origin-ID") incomplete := Incomplete[originID] if incomplete == nil { return fmt.Errorf("can not find WARC-Segment-Origin-ID: %q", originID) } segNumExpected := strconv.Itoa(len(incomplete.Continuations) + 1 + 1) if segNum != segNumExpected { return fmt.Errorf( "unexpected WARC-Segment-Number %s != %s", segNum, segNumExpected, ) } incomplete.Continuations = append(incomplete.Continuations, rec) if rec.Hdr.Get("WARC-Segment-Total-Length") != "" { if incomplete.WARCPath == warcPath { uris[incomplete.URI()] = incomplete } else { WARCsM.Lock() WARCs[incomplete.WARCPath][incomplete.URI()] = incomplete WARCsM.Unlock() } delete(Incomplete, originID) } } } r.Close() WARCsM.Lock() WARCs[warcPath] = uris WARCsOffsets[warcPath] = r.offsets WARCsM.Unlock() return nil } func SaveIndices() error { WARCsM.RLock() defer WARCsM.RUnlock() for warcPath, uris := range WARCs { p := warcPath + IndexExt if _, err := os.Stat(p); err == nil { continue } tmpSuffix := strconv.FormatInt(time.Now().UnixNano()+int64(os.Getpid()), 16) fd, err := os.OpenFile( p+tmpSuffix, os.O_WRONLY|os.O_CREATE|os.O_EXCL, os.FileMode(0666), ) if err != nil { return err } enc := gob.NewEncoder(fd) if err = enc.Encode(&uris); err != nil { fd.Close() return err } offsets := WARCsOffsets[warcPath] if err = enc.Encode(&offsets); err != nil { fd.Close() return err } if err = fd.Close(); err != nil { return err } if err = os.Rename(p+tmpSuffix, p); err != nil { return err } log.Println("saved:", p) } return nil }