]> Sergey Matveev's repositories - tofuproxy.git/blobdiff - warc/gzip.go
Multistream WARCs and better Zstandard support
[tofuproxy.git] / warc / gzip.go
diff --git a/warc/gzip.go b/warc/gzip.go
new file mode 100644 (file)
index 0000000..335a26e
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package warc
+
+import (
+       "bufio"
+       "compress/gzip"
+       "io"
+       "os"
+)
+
+type CountableReader struct {
+       *bufio.Reader
+       n int64
+}
+
+func (r *CountableReader) ReadByte() (b byte, err error) {
+       b, err = r.Reader.ReadByte()
+       if err == nil {
+               r.n++
+       }
+       return b, err
+}
+
+func (r *CountableReader) Read(p []byte) (n int, err error) {
+       n, err = r.Reader.Read(p)
+       r.n += int64(n)
+       return
+}
+
+type GZIPReader struct {
+       fd      *os.File
+       r       io.Reader
+       offsets []Offset
+}
+
+func (r *GZIPReader) Read(p []byte) (n int, err error) {
+       return r.r.Read(p)
+}
+
+func (r *GZIPReader) Close() error {
+       return r.fd.Close()
+}
+
+func (r *GZIPReader) Offsets() []Offset {
+       return r.offsets
+}
+
+func NewGZIPReader(
+       warcPath string,
+       offsets []Offset,
+       uOffset int64,
+) (*GZIPReader, error) {
+       var offZ, offU int64
+       for _, off := range offsets {
+               if uOffset < offU+off.U {
+                       break
+               }
+               offU += off.U
+               offZ += off.Z
+       }
+       fd, err := os.Open(warcPath)
+       if err != nil {
+               return nil, err
+       }
+       if _, err = fd.Seek(offZ, io.SeekStart); err != nil {
+               fd.Close()
+               return nil, err
+       }
+       cr := &CountableReader{Reader: bufio.NewReader(fd)}
+       z, err := gzip.NewReader(cr)
+       if err != nil {
+               fd.Close()
+               return nil, err
+       }
+       r, w := io.Pipe()
+       gr := GZIPReader{r: r}
+       go func() {
+               z.Multistream(false)
+               var offset, offsetPrev int64
+               for {
+                       written, err := io.Copy(w, z)
+                       if err != nil {
+                               w.CloseWithError(err)
+                               return
+                       }
+                       offset = cr.n
+                       gr.offsets = append(gr.offsets, Offset{offset - offsetPrev, written})
+                       offsetPrev = offset
+                       err = z.Reset(cr)
+                       if err != nil {
+                               if err == io.EOF {
+                                       break
+                               }
+                               w.CloseWithError(err)
+                               return
+                       }
+                       z.Multistream(false)
+               }
+               w.CloseWithError(io.EOF)
+       }()
+       _, err = io.CopyN(io.Discard, r, uOffset-offU)
+       if err != nil {
+               fd.Close()
+               return nil, err
+       }
+       return &gr, nil
+}