Sergey Matveev's repositories - tofuproxy.git/blob - cmd/warc-extract/main.go
gemini:// support
[tofuproxy.git] / cmd / warc-extract / main.go
1 /*
2 tofuproxy -- flexible HTTP proxy, TLS terminator, X.509 certificates
3              manager, WARC/Gemini browser
4 Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, version 3 of the License.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 package main
20
21 import (
22         "bufio"
23         "bytes"
24         "encoding/binary"
25         "flag"
26         "fmt"
27         "io"
28         "log"
29         "os"
30
31         "github.com/dustin/go-humanize"
32         "go.stargrave.org/tofuproxy/warc"
33 )
34
35 func main() {
36         uri := flag.String("uri", "", "URI to extract, if specified")
37         hdr := flag.Bool("hdr", false, "Also extract WARC's header")
38         idx := flag.Bool("idx", false, "Save WARC indexes")
39         recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
40         flag.Parse()
41         log.SetFlags(log.Lshortfile)
42
43         if *recompress {
44                 var hdr bytes.Buffer
45                 size := make([]byte, 8)
46                 bw := bufio.NewWriter(os.Stdout)
47                 for _, p := range flag.Args() {
48                         r, err := warc.NewReader(p)
49                         if err != nil {
50                                 log.Fatalln(err)
51                         }
52                         for {
53                                 rec, rr, err := r.ReadRecord()
54                                 if err != nil {
55                                         if err == io.EOF {
56                                                 break
57                                         }
58                                         log.Fatalln(err)
59                                 }
60                                 for _, line := range rec.HdrLines {
61                                         hdr.WriteString(line)
62                                 }
63                                 hdr.WriteString("\r\n")
64                                 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
65                                 if _, err = bw.Write(size); err != nil {
66                                         log.Fatalln(err)
67                                 }
68                                 if _, err = io.Copy(bw, &hdr); err != nil {
69                                         log.Fatalln(err)
70                                 }
71                                 if _, err = io.Copy(bw, rr); err != nil {
72                                         log.Fatalln(err)
73                                 }
74                                 r.RecordWasRead()
75                                 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
76                                         log.Fatalln(err)
77                                 }
78                         }
79                 }
80                 if err := bw.Flush(); err != nil {
81                         log.Fatalln(err)
82                 }
83                 return
84         }
85
86         for _, p := range flag.Args() {
87                 log.Println("adding", p)
88                 if err := warc.Add(p); err != nil {
89                         log.Fatalln(err)
90                 }
91                 log.Println("added", p, len(warc.WARCs[p]), "URIs")
92                 if *idx {
93                         if err := warc.SaveIndexes(); err != nil {
94                                 log.Fatalln(err)
95                         }
96                 }
97         }
98         if *uri == "" {
99                 for warcPath, uris := range warc.WARCs {
100                         for uri, rec := range uris {
101                                 fmt.Printf(
102                                         "%s\t%s\t%s\n",
103                                         warcPath, uri,
104                                         humanize.IBytes(uint64(rec.TotalSize())),
105                                 )
106                         }
107                 }
108         } else {
109                 for _, uris := range warc.WARCs {
110                         rec := uris[*uri]
111                         if rec == nil {
112                                 continue
113                         }
114                         r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
115                         if err != nil {
116                                 log.Fatalln(err)
117                         }
118                         io.Copy(os.Stdout, r)
119                 }
120         }
121         return
122 }