Sergey Matveev's repositories - tofuproxy.git/blob - cmd/warc-extract/main.go
Unify copyright comment format
[tofuproxy.git] / cmd / warc-extract / main.go
1 // warc-extract -- WARC files data extractor
2 // Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, version 3 of the License.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 // GNU General Public License for more details.
12 //
13 // You should have received a copy of the GNU General Public License
14 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
16 package main
17
18 import (
19         "bufio"
20         "bytes"
21         "encoding/binary"
22         "flag"
23         "fmt"
24         "io"
25         "log"
26         "os"
27
28         "github.com/dustin/go-humanize"
29         "go.stargrave.org/tofuproxy/warc"
30 )
31
32 func main() {
33         uri := flag.String("uri", "", "URI to extract, if specified")
34         hdr := flag.Bool("hdr", false, "Also extract WARC's header")
35         idx := flag.Bool("idx", false, "Save WARC indices")
36         recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
37         unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility")
38         flag.Parse()
39         log.SetFlags(log.Lshortfile)
40         warc.UnZSTDPath = *unzstdPath
41
42         if *recompress {
43                 var hdr bytes.Buffer
44                 size := make([]byte, 8)
45                 bw := bufio.NewWriter(os.Stdout)
46                 for _, p := range flag.Args() {
47                         r, err := warc.NewReader(p)
48                         if err != nil {
49                                 log.Fatalln(err)
50                         }
51                         for {
52                                 rec, rr, err := r.ReadRecord()
53                                 if err != nil {
54                                         if err == io.EOF {
55                                                 break
56                                         }
57                                         log.Fatalln(err)
58                                 }
59                                 for _, line := range rec.HdrLines {
60                                         hdr.WriteString(line)
61                                 }
62                                 hdr.WriteString("\r\n")
63                                 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
64                                 if _, err = bw.Write(size); err != nil {
65                                         log.Fatalln(err)
66                                 }
67                                 if _, err = io.Copy(bw, &hdr); err != nil {
68                                         log.Fatalln(err)
69                                 }
70                                 if _, err = io.Copy(bw, rr); err != nil {
71                                         log.Fatalln(err)
72                                 }
73                                 r.RecordWasRead()
74                                 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
75                                         log.Fatalln(err)
76                                 }
77                         }
78                 }
79                 if err := bw.Flush(); err != nil {
80                         log.Fatalln(err)
81                 }
82                 return
83         }
84
85         for _, p := range flag.Args() {
86                 log.Println("adding", p)
87                 if err := warc.Add(p); err != nil {
88                         log.Fatalln(err)
89                 }
90                 log.Println("added", p, len(warc.WARCs[p]), "URIs")
91                 if *idx {
92                         if err := warc.SaveIndices(); err != nil {
93                                 log.Fatalln(err)
94                         }
95                 }
96         }
97         if *uri == "" {
98                 for warcPath, uris := range warc.WARCs {
99                         for uri, rec := range uris {
100                                 fmt.Printf(
101                                         "%s\t%s\t%s\n",
102                                         warcPath, uri,
103                                         humanize.IBytes(uint64(rec.TotalSize())),
104                                 )
105                         }
106                 }
107         } else {
108                 for _, uris := range warc.WARCs {
109                         rec := uris[*uri]
110                         if rec == nil {
111                                 continue
112                         }
113                         r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
114                         if err != nil {
115                                 log.Fatalln(err)
116                         }
117                         io.Copy(os.Stdout, r)
118                 }
119         }
120 }