Sergey Matveev's repositories - tofuproxy.git/blob - cmd/warc-extract/main.go
Raised copyright years
[tofuproxy.git] / cmd / warc-extract / main.go
1 /*
2 warc-extract -- WARC files data extractor
3 Copyright (C) 2021-2022 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 package main
19
20 import (
21         "bufio"
22         "bytes"
23         "encoding/binary"
24         "flag"
25         "fmt"
26         "io"
27         "log"
28         "os"
29
30         "github.com/dustin/go-humanize"
31         "go.stargrave.org/tofuproxy/warc"
32 )
33
34 func main() {
35         uri := flag.String("uri", "", "URI to extract, if specified")
36         hdr := flag.Bool("hdr", false, "Also extract WARC's header")
37         idx := flag.Bool("idx", false, "Save WARC indexes")
38         recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
39         flag.Parse()
40         log.SetFlags(log.Lshortfile)
41
42         if *recompress {
43                 var hdr bytes.Buffer
44                 size := make([]byte, 8)
45                 bw := bufio.NewWriter(os.Stdout)
46                 for _, p := range flag.Args() {
47                         r, err := warc.NewReader(p)
48                         if err != nil {
49                                 log.Fatalln(err)
50                         }
51                         for {
52                                 rec, rr, err := r.ReadRecord()
53                                 if err != nil {
54                                         if err == io.EOF {
55                                                 break
56                                         }
57                                         log.Fatalln(err)
58                                 }
59                                 for _, line := range rec.HdrLines {
60                                         hdr.WriteString(line)
61                                 }
62                                 hdr.WriteString("\r\n")
63                                 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
64                                 if _, err = bw.Write(size); err != nil {
65                                         log.Fatalln(err)
66                                 }
67                                 if _, err = io.Copy(bw, &hdr); err != nil {
68                                         log.Fatalln(err)
69                                 }
70                                 if _, err = io.Copy(bw, rr); err != nil {
71                                         log.Fatalln(err)
72                                 }
73                                 r.RecordWasRead()
74                                 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
75                                         log.Fatalln(err)
76                                 }
77                         }
78                 }
79                 if err := bw.Flush(); err != nil {
80                         log.Fatalln(err)
81                 }
82                 return
83         }
84
85         for _, p := range flag.Args() {
86                 log.Println("adding", p)
87                 if err := warc.Add(p); err != nil {
88                         log.Fatalln(err)
89                 }
90                 log.Println("added", p, len(warc.WARCs[p]), "URIs")
91                 if *idx {
92                         if err := warc.SaveIndexes(); err != nil {
93                                 log.Fatalln(err)
94                         }
95                 }
96         }
97         if *uri == "" {
98                 for warcPath, uris := range warc.WARCs {
99                         for uri, rec := range uris {
100                                 fmt.Printf(
101                                         "%s\t%s\t%s\n",
102                                         warcPath, uri,
103                                         humanize.IBytes(uint64(rec.TotalSize())),
104                                 )
105                         }
106                 }
107         } else {
108                 for _, uris := range warc.WARCs {
109                         rec := uris[*uri]
110                         if rec == nil {
111                                 continue
112                         }
113                         r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
114                         if err != nil {
115                                 log.Fatalln(err)
116                         }
117                         io.Copy(os.Stdout, r)
118                 }
119         }
120         return
121 }