2 warc-extract -- WARC files data extractor
3 Copyright (C) 2021-2023 Sergey Matveev <stargrave@stargrave.org>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
30 "github.com/dustin/go-humanize"
31 "go.stargrave.org/tofuproxy/warc"
// Command-line options; each help string describes the option's purpose.
35 uri := flag.String("uri", "", "URI to extract, if specified")
36 hdr := flag.Bool("hdr", false, "Also extract WARC's header")
37 idx := flag.Bool("idx", false, "Save WARC indices")
// The -for-enzstd switch is held in the variable named recompress.
38 recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
39 unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility")
// Replace the default log prefix with just "file:line" (no date/time).
41 log.SetFlags(log.Lshortfile)
// Pass the -unzstd value to the warc package through its UnZSTDPath variable.
// NOTE(review): the flag pointer is dereferenced here, so flag.Parse is
// presumably called on a line not visible in this excerpt -- confirm.
42 warc.UnZSTDPath = *unzstdPath
// 8-byte scratch buffer, refilled with a big-endian length by PutUint64 below
// and written ahead of each record.
46 size := make([]byte, 8)
// All output of this section goes to stdout through one buffered writer.
47 bw := bufio.NewWriter(os.Stdout)
// Every positional command-line argument is opened with warc.NewReader.
48 for _, p := range flag.Args() {
49 r, err := warc.NewReader(p)
// ReadRecord yields rec (its HdrLines and Size are used below) and rr, which
// is streamed to the output after the header.
// NOTE(review): rr is presumably a reader over the record body -- confirm.
54 rec, rr, err := r.ReadRecord()
61 for _, line := range rec.HdrLines {
// NOTE(review): hdr at this point supports WriteString and Len and is read
// through io.Copy(bw, &hdr), so it is presumably a buffer declared on a line
// not visible here that shadows the -hdr flag variable -- confirm.
64 hdr.WriteString("\r\n")
// Length prefix = buffered header bytes + rec.Size + 4; the 4 matches the
// length of the "\r\n\r\n" trailer written after the record below.
65 binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
// Emit, in order: length prefix, buffered header, record data, trailer.
66 if _, err = bw.Write(size); err != nil {
69 if _, err = io.Copy(bw, &hdr); err != nil {
72 if _, err = io.Copy(bw, rr); err != nil {
76 if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
// Push whatever is still buffered in bw out to stdout.
81 if err := bw.Flush(); err != nil {
// Register every positional argument with the warc package, logging before
// and after each one; the "added" message reports len(warc.WARCs[p]).
87 for _, p := range flag.Args() {
88 log.Println("adding", p)
89 if err := warc.Add(p); err != nil {
92 log.Println("added", p, len(warc.WARCs[p]), "URIs")
// NOTE(review): presumably guarded by the -idx flag on a line not visible
// here -- confirm.
94 if err := warc.SaveIndices(); err != nil {
// Walk every loaded WARC and every record in it, keyed by URI.
100 for warcPath, uris := range warc.WARCs {
101 for uri, rec := range uris {
// rec.TotalSize() rendered as a human-readable IEC size string (e.g. "MiB").
105 humanize.IBytes(uint64(rec.TotalSize())),
110 for _, uris := range warc.WARCs {
// The first argument is the negation of the -hdr flag.
// NOTE(review): presumably it means "skip the WARC header" -- confirm
// against rec.Reader's signature.
115 r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
// NOTE(review): both results of io.Copy are discarded here, so a failed copy
// to stdout is not reported.
119 io.Copy(os.Stdout, r)