/* tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management Copyright (C) 2021 Sergey Matveev This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ package warc import ( "bufio" "fmt" "io" "strconv" "strings" ) const CRLF = "\r\n" type Reader struct { Path string r *bufio.Reader rsc io.ReadSeekCloser offset int64 prevRec *Record } func NewReader(warcPath string) (*Reader, error) { rsc, err := Open(warcPath) if err != nil { return nil, err } return &Reader{ Path: warcPath, rsc: rsc, r: bufio.NewReader(rsc), }, nil } func (r *Reader) next() error { if r.prevRec == nil { return nil } if _, err := r.r.Discard(int(r.prevRec.Size)); err != nil { return err } r.offset += int64(r.prevRec.HdrLen) + r.prevRec.Size for i := 0; i < 2; i++ { line, err := r.r.ReadString('\n') if err != nil { return err } r.offset += int64(len(line)) if line != CRLF { return fmt.Errorf("non-CRLF: %q", line) } } return nil } func (r *Reader) ReadRecord() (*Record, error) { r.next() line, err := r.r.ReadString('\n') if err != nil { return nil, err } if !strings.HasPrefix(line, "WARC/") { return nil, fmt.Errorf("non-WARC header: %q", line) } hdrLen := len(line) hdr := NewHeader() for { line, err := r.r.ReadString('\n') if err != nil { return nil, err } hdrLen += len(line) if line == CRLF { break } hdr.AddLine(line) } size, err := strconv.ParseUint(hdr.Get("Content-Length"), 10, 64) if err != nil { return nil, err } rec := &Record{ WARCPath: r.Path, Offset: r.offset, Hdr: hdr, HdrLen: hdrLen, Size: int64(size), } r.prevRec = rec return rec, nil } func (r *Reader) Close() error { return r.rsc.Close() }