]> Sergey Matveev's repositories - tofuproxy.git/blob - cmd/warc-extract/main.go
WARC
[tofuproxy.git] / cmd / warc-extract / main.go
1 /*
2 tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
3 Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 package main
19
20 import (
21         "flag"
22         "fmt"
23         "io"
24         "log"
25         "os"
26
27         "github.com/dustin/go-humanize"
28         "go.stargrave.org/tofuproxy/warc"
29 )
30
31 func main() {
32         uri := flag.String("uri", "", "URI to extract, if specified")
33         hdr := flag.Bool("hdr", false, "Also extract WARC's header")
34         idx := flag.Bool("idx", false, "Save WARC indexes")
35         flag.Parse()
36         log.SetFlags(log.Lshortfile)
37
38         for _, p := range flag.Args() {
39                 log.Println("adding", p)
40                 if err := warc.Add(p); err != nil {
41                         log.Fatalln(err)
42                 }
43                 log.Println("added", p, len(warc.WARCs[p]), "URIs")
44                 if *idx {
45                         if err := warc.SaveIndexes(); err != nil {
46                                 log.Fatalln(err)
47                         }
48                 }
49         }
50         if *uri == "" {
51                 for warcPath, uris := range warc.WARCs {
52                         for uri, rec := range uris {
53                                 fmt.Printf(
54                                         "%s\t%s\t%s\n",
55                                         warcPath, uri,
56                                         humanize.IBytes(uint64(rec.TotalSize())),
57                                 )
58                         }
59                 }
60         } else {
61                 for _, uris := range warc.WARCs {
62                         rec := uris[*uri]
63                         if rec == nil {
64                                 continue
65                         }
66                         r, err := rec.Reader(!*hdr)
67                         if err != nil {
68                                 log.Fatalln(err)
69                         }
70                         io.Copy(os.Stdout, r)
71                 }
72         }
73         return
74 }