// tofuproxy -- flexible HTTP/HTTPS proxy, TLS terminator, X.509 TOFU // manager, WARC/geminispace browser // Copyright (C) 2021-2024 Sergey Matveev // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3 of the License. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . package warc import ( "bufio" "fmt" "io" "strconv" "strings" ) const CRLF = "\r\n" type Reader struct { Path string rrr RawRecordReader br *bufio.Reader offset int64 prevRec *Record offsets []Offset } func NewReader(warcPath string) (*Reader, error) { rrr, err := Open(warcPath, nil, 0) if err != nil { return nil, err } return &Reader{ Path: warcPath, rrr: rrr, br: bufio.NewReader(rrr), }, nil } func (r *Reader) next() error { if r.prevRec == nil { return nil } if _, err := r.br.Discard(int(r.prevRec.Size)); err != nil { return err } r.offset += int64(r.prevRec.HdrLen) + r.prevRec.Size for i := 0; i < 2; i++ { line, err := r.br.ReadString('\n') if err != nil { return err } r.offset += int64(len(line)) if line != CRLF { return fmt.Errorf("non-CRLF: %q", line) } } return nil } func (r *Reader) ReadRecord() (*Record, io.Reader, error) { r.next() line, err := r.br.ReadString('\n') if err != nil { return nil, nil, err } if !strings.HasPrefix(line, "WARC/") { return nil, nil, fmt.Errorf("non-WARC header: %q", line) } hdrLines := []string{line} hdrLen := len(line) hdr := NewHeader() for { line, err := r.br.ReadString('\n') if err != nil { return nil, nil, err } hdrLen += len(line) if line == CRLF { break } hdrLines = append(hdrLines, line) hdr.AddLine(line) } size, err := strconv.ParseUint(hdr.Get("Content-Length"), 10, 64) if err != nil { return nil, nil, err } rec := &Record{ WARCPath: r.Path, Offset: r.offset, Size: int64(size), Hdr: hdr, HdrLen: hdrLen, HdrLines: hdrLines, } r.prevRec = rec return rec, &io.LimitedReader{R: r.br, N: int64(size)}, nil } func (r *Reader) RecordWasRead() { r.prevRec.HdrLen = 0 r.prevRec.Size = 0 } func (r *Reader) Close() error { err := r.rrr.Close() r.offsets = r.rrr.Offsets() return err }