1 // warc-extract -- WARC files data extractor
2 // Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, version 3 of the License.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program. If not, see <http://www.gnu.org/licenses/>.
28 "github.com/dustin/go-humanize"
29 "go.stargrave.org/tofuproxy/warc"
// NOTE(review): the enclosing function header and a number of intermediate
// lines (branch conditions, error-handling bodies, closing braces) are not
// visible in this view. The comments below describe only the statements
// that are shown; anything about the hidden parts is marked as an assumption.

// Command-line options; each flag's help string states its purpose.
33 uri := flag.String("uri", "", "URI to extract, if specified")
34 hdr := flag.Bool("hdr", false, "Also extract WARC's header")
35 idx := flag.Bool("idx", false, "Save WARC indices")
36 recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
37 unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility")
// Prefix log messages with the source file name and line number.
39 log.SetFlags(log.Lshortfile)
// Hand the configured unzstd path to the warc package.
40 warc.UnZSTDPath = *unzstdPath
// Scratch buffer for the 8-byte big-endian length prefix written before
// each record below.
44 size := make([]byte, 8)
// Buffer writes to stdout; flushed once the records have been written.
45 bw := bufio.NewWriter(os.Stdout)
// Presumably the -for-enzstd output mode (the guarding condition is not
// visible): every WARC named on the command line is read record by record.
46 for _, p := range flag.Args() {
47 r, err := warc.NewReader(p)
// rec supplies the header lines and size used below; rr is streamed as the
// record's contents.
52 rec, rr, err := r.ReadRecord()
// NOTE(review): from here on hdr is used with WriteString, Len and &hdr, so
// it must be a local buffer shadowing the -hdr flag declared above; its
// declaration is not visible.
59 for _, line := range rec.HdrLines {
62 hdr.WriteString("\r\n")
// Length prefix = buffered header bytes + rec.Size + 4, the 4 accounting
// for the "\r\n\r\n" trailer written after the record contents.
63 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
64 if _, err = bw.Write(size); err != nil {
// Emit the buffered header, then the record contents, then the trailer.
67 if _, err = io.Copy(bw, &hdr); err != nil {
70 if _, err = io.Copy(bw, rr); err != nil {
74 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
// Push any remaining buffered output to stdout.
79 if err := bw.Flush(); err != nil {
// Load every WARC given on the command line into the warc package and log
// how many URIs each one contributed.
85 for _, p := range flag.Args() {
86 log.Println("adding", p)
87 if err := warc.Add(p); err != nil {
90 log.Println("added", p, len(warc.WARCs[p]), "URIs")
// Persist the indices; presumably only when -idx is set (condition not
// visible) -- TODO confirm.
92 if err := warc.SaveIndices(); err != nil {
// Walk every loaded WARC and each of its URI records; the visible part
// formats a record's total size as a human-readable byte count.
98 for warcPath, uris := range warc.WARCs {
99 for uri, rec := range uris {
103 humanize.IBytes(uint64(rec.TotalSize())),
// Extraction: a record is selected from the loaded WARCs (selection logic
// not visible) and its reader is copied to stdout. The first argument to
// rec.Reader is the negation of the -hdr flag; presumably it means "skip
// the header" -- TODO confirm against warc.Record.Reader.
108 for _, uris := range warc.WARCs {
113 r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
// NOTE(review): the error returned by io.Copy is discarded here, unlike the
// checked copies above.
117 io.Copy(os.Stdout, r)