Sergey Matveev's repositories - tofuproxy.git/blobdiff - cmd/warc-extract/main.go
WARC
[tofuproxy.git] / cmd / warc-extract / main.go
diff --git a/cmd/warc-extract/main.go b/cmd/warc-extract/main.go
new file mode 100644 (file)
index 0000000..8b1aa2a
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+       "flag"
+       "fmt"
+       "io"
+       "log"
+       "os"
+
+       "github.com/dustin/go-humanize"
+       "go.stargrave.org/tofuproxy/warc"
+)
+
+// main feeds every path argument to warc.Add (saving indexes after each one
+// when -idx is set), then lists all known URIs or, with -uri, dumps records.
+func main() {
+       uri := flag.String("uri", "", "URI to extract, if specified")
+       hdr := flag.Bool("hdr", false, "Also extract WARC's header")
+       idx := flag.Bool("idx", false, "Save WARC indexes")
+       flag.Parse()
+       log.SetFlags(log.Lshortfile)
+       for _, p := range flag.Args() {
+               log.Println("adding", p)
+               if err := warc.Add(p); err != nil {
+                       log.Fatalln(err)
+               }
+               log.Println("added", p, len(warc.WARCs[p]), "URIs")
+               if *idx {
+                       if err := warc.SaveIndexes(); err != nil {
+                               log.Fatalln(err)
+                       }
+               }
+       }
+       if *uri == "" {
+               for warcPath, uris := range warc.WARCs {
+                       for u, rec := range uris {
+                               size := humanize.IBytes(uint64(rec.TotalSize()))
+                               fmt.Printf("%s\t%s\t%s\n", warcPath, u, size)
+                       }
+               }
+               return
+       }
+       for _, uris := range warc.WARCs {
+               rec := uris[*uri]
+               if rec == nil {
+                       continue
+               }
+               r, err := rec.Reader(!*hdr)
+               if err != nil {
+                       log.Fatalln(err)
+               }
+               // Surface copy failures rather than silently emitting a truncated record.
+               if _, err := io.Copy(os.Stdout, r); err != nil {
+                       log.Fatalln(err)
+               }
+       }
+}