Sergey Matveev's repositories - tofuproxy.git/blobdiff - cmd/warc-extract/main.go
Download link for 0.6.0 release
[tofuproxy.git] / cmd / warc-extract / main.go
index 8b1aa2ab99db21fcc0afab6ac52f412c9e6375bb..c9d2a3676b9c21938a6e1b4861c30da5eb66ce09 100644 (file)
@@ -1,23 +1,24 @@
-/*
-tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
-Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, version 3 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
+// warc-extract -- WARC files data extractor
+// Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 3 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 package main
 
 import (
+       "bufio"
+       "bytes"
+       "encoding/binary"
        "flag"
        "fmt"
        "io"
@@ -31,9 +32,55 @@ import (
 func main() {
        uri := flag.String("uri", "", "URI to extract, if specified")
        hdr := flag.Bool("hdr", false, "Also extract WARC's header")
-       idx := flag.Bool("idx", false, "Save WARC indexes")
+       idx := flag.Bool("idx", false, "Save WARC indices")
+       recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
+       unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility")
        flag.Parse()
        log.SetFlags(log.Lshortfile)
+       warc.UnZSTDPath = *unzstdPath
+
+       if *recompress {
+               var hdr bytes.Buffer
+               size := make([]byte, 8)
+               bw := bufio.NewWriter(os.Stdout)
+               for _, p := range flag.Args() {
+                       r, err := warc.NewReader(p)
+                       if err != nil {
+                               log.Fatalln(err)
+                       }
+                       for {
+                               rec, rr, err := r.ReadRecord()
+                               if err != nil {
+                                       if err == io.EOF {
+                                               break
+                                       }
+                                       log.Fatalln(err)
+                               }
+                               for _, line := range rec.HdrLines {
+                                       hdr.WriteString(line)
+                               }
+                               hdr.WriteString("\r\n")
+                               binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
+                               if _, err = bw.Write(size); err != nil {
+                                       log.Fatalln(err)
+                               }
+                               if _, err = io.Copy(bw, &hdr); err != nil {
+                                       log.Fatalln(err)
+                               }
+                               if _, err = io.Copy(bw, rr); err != nil {
+                                       log.Fatalln(err)
+                               }
+                               r.RecordWasRead()
+                               if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
+                                       log.Fatalln(err)
+                               }
+                       }
+               }
+               if err := bw.Flush(); err != nil {
+                       log.Fatalln(err)
+               }
+               return
+       }
 
        for _, p := range flag.Args() {
                log.Println("adding", p)
@@ -42,7 +89,7 @@ func main() {
                }
                log.Println("added", p, len(warc.WARCs[p]), "URIs")
                if *idx {
-                       if err := warc.SaveIndexes(); err != nil {
+                       if err := warc.SaveIndices(); err != nil {
                                log.Fatalln(err)
                        }
                }
@@ -63,12 +110,11 @@ func main() {
                        if rec == nil {
                                continue
                        }
-                       r, err := rec.Reader(!*hdr)
+                       r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
                        if err != nil {
                                log.Fatalln(err)
                        }
                        io.Copy(os.Stdout, r)
                }
        }
-       return
 }