Sergey Matveev's repositories - tofuproxy.git/blob - cmd/warc-extract/main.go
abb1618ea40a1faae7489d672eb03f5d70576a0b
[tofuproxy.git] / cmd / warc-extract / main.go
1 /*
2 warc-extract -- WARC files data extractor
3 Copyright (C) 2021-2023 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 package main
19
20 import (
21         "bufio"
22         "bytes"
23         "encoding/binary"
24         "flag"
25         "fmt"
26         "io"
27         "log"
28         "os"
29
30         "github.com/dustin/go-humanize"
31         "go.stargrave.org/tofuproxy/warc"
32 )
33
34 func main() {
35         uri := flag.String("uri", "", "URI to extract, if specified")
36         hdr := flag.Bool("hdr", false, "Also extract WARC's header")
37         idx := flag.Bool("idx", false, "Save WARC indices")
38         recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
39         unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility")
40         flag.Parse()
41         log.SetFlags(log.Lshortfile)
42         warc.UnZSTDPath = *unzstdPath
43
44         if *recompress {
45                 var hdr bytes.Buffer
46                 size := make([]byte, 8)
47                 bw := bufio.NewWriter(os.Stdout)
48                 for _, p := range flag.Args() {
49                         r, err := warc.NewReader(p)
50                         if err != nil {
51                                 log.Fatalln(err)
52                         }
53                         for {
54                                 rec, rr, err := r.ReadRecord()
55                                 if err != nil {
56                                         if err == io.EOF {
57                                                 break
58                                         }
59                                         log.Fatalln(err)
60                                 }
61                                 for _, line := range rec.HdrLines {
62                                         hdr.WriteString(line)
63                                 }
64                                 hdr.WriteString("\r\n")
65                                 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
66                                 if _, err = bw.Write(size); err != nil {
67                                         log.Fatalln(err)
68                                 }
69                                 if _, err = io.Copy(bw, &hdr); err != nil {
70                                         log.Fatalln(err)
71                                 }
72                                 if _, err = io.Copy(bw, rr); err != nil {
73                                         log.Fatalln(err)
74                                 }
75                                 r.RecordWasRead()
76                                 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
77                                         log.Fatalln(err)
78                                 }
79                         }
80                 }
81                 if err := bw.Flush(); err != nil {
82                         log.Fatalln(err)
83                 }
84                 return
85         }
86
87         for _, p := range flag.Args() {
88                 log.Println("adding", p)
89                 if err := warc.Add(p); err != nil {
90                         log.Fatalln(err)
91                 }
92                 log.Println("added", p, len(warc.WARCs[p]), "URIs")
93                 if *idx {
94                         if err := warc.SaveIndices(); err != nil {
95                                 log.Fatalln(err)
96                         }
97                 }
98         }
99         if *uri == "" {
100                 for warcPath, uris := range warc.WARCs {
101                         for uri, rec := range uris {
102                                 fmt.Printf(
103                                         "%s\t%s\t%s\n",
104                                         warcPath, uri,
105                                         humanize.IBytes(uint64(rec.TotalSize())),
106                                 )
107                         }
108                 }
109         } else {
110                 for _, uris := range warc.WARCs {
111                         rec := uris[*uri]
112                         if rec == nil {
113                                 continue
114                         }
115                         r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
116                         if err != nil {
117                                 log.Fatalln(err)
118                         }
119                         io.Copy(os.Stdout, r)
120                 }
121         }
122 }