X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=warc%2Freader.go;h=06a123b4eece7fc352657a9d95491f6983518571;hb=bae1cfe5ce46a1b758ccc4dddda2751b6ac47f3e;hp=e76bd51e299ceaffe2ba5e49cf98a552cb999a5f;hpb=0c0a261a6ef4fddfc34a9150005f7964cc69c420;p=tofuproxy.git diff --git a/warc/reader.go b/warc/reader.go index e76bd51..06a123b 100644 --- a/warc/reader.go +++ b/warc/reader.go @@ -29,21 +29,22 @@ const CRLF = "\r\n" type Reader struct { Path string - r *bufio.Reader - rsc io.ReadSeekCloser + rrr RawRecordReader + br *bufio.Reader offset int64 prevRec *Record + offsets []Offset } func NewReader(warcPath string) (*Reader, error) { - rsc, err := Open(warcPath) + rrr, err := Open(warcPath, nil, 0) if err != nil { return nil, err } return &Reader{ Path: warcPath, - rsc: rsc, - r: bufio.NewReader(rsc), + rrr: rrr, + br: bufio.NewReader(rrr), }, nil } @@ -51,12 +52,12 @@ func (r *Reader) next() error { if r.prevRec == nil { return nil } - if _, err := r.r.Discard(int(r.prevRec.Size)); err != nil { + if _, err := r.br.Discard(int(r.prevRec.Size)); err != nil { return err } r.offset += int64(r.prevRec.HdrLen) + r.prevRec.Size for i := 0; i < 2; i++ { - line, err := r.r.ReadString('\n') + line, err := r.br.ReadString('\n') if err != nil { return err } @@ -68,43 +69,53 @@ func (r *Reader) next() error { return nil } -func (r *Reader) ReadRecord() (*Record, error) { +func (r *Reader) ReadRecord() (*Record, io.Reader, error) { r.next() - line, err := r.r.ReadString('\n') + line, err := r.br.ReadString('\n') if err != nil { - return nil, err + return nil, nil, err } if !strings.HasPrefix(line, "WARC/") { - return nil, fmt.Errorf("non-WARC header: %q", line) + return nil, nil, fmt.Errorf("non-WARC header: %q", line) } + hdrLines := []string{line} hdrLen := len(line) hdr := NewHeader() for { - line, err := r.r.ReadString('\n') + line, err := r.br.ReadString('\n') if err != nil { - return nil, err + return nil, nil, err } hdrLen += len(line) if line == CRLF { break } + hdrLines = append(hdrLines, line) hdr.AddLine(line) } size, err := strconv.ParseUint(hdr.Get("Content-Length"), 10, 64) if err != nil { - return nil, err + return nil, nil, err } rec := &Record{ WARCPath: r.Path, Offset: r.offset, + Size: int64(size), Hdr: hdr, HdrLen: hdrLen, - Size: int64(size), + HdrLines: hdrLines, } r.prevRec = rec - return rec, nil + return rec, &io.LimitedReader{R: r.br, N: int64(size)}, nil +} + +func (r *Reader) RecordWasRead() { + r.prevRec.HdrLen = 0 + r.prevRec.Size = 0 } func (r *Reader) Close() error { - return r.rsc.Close() + err := r.rrr.Close() + r.offsets = r.rrr.Offsets() + return err }