// warc-extract -- WARC files data extractor // Copyright (C) 2021-2024 Sergey Matveev // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3 of the License. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . package main import ( "bufio" "bytes" "encoding/binary" "flag" "fmt" "io" "log" "os" "github.com/dustin/go-humanize" "go.stargrave.org/tofuproxy/warc" ) func main() { uri := flag.String("uri", "", "URI to extract, if specified") hdr := flag.Bool("hdr", false, "Also extract WARC's header") idx := flag.Bool("idx", false, "Save WARC indices") recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility") unzstdPath := flag.String("unzstd", "cmd/zstd/unzstd", "Path to unzstd utility") flag.Parse() log.SetFlags(log.Lshortfile) warc.UnZSTDPath = *unzstdPath if *recompress { var hdr bytes.Buffer size := make([]byte, 8) bw := bufio.NewWriter(os.Stdout) for _, p := range flag.Args() { r, err := warc.NewReader(p) if err != nil { log.Fatalln(err) } for { rec, rr, err := r.ReadRecord() if err != nil { if err == io.EOF { break } log.Fatalln(err) } for _, line := range rec.HdrLines { hdr.WriteString(line) } hdr.WriteString("\r\n") binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4) if _, err = bw.Write(size); err != nil { log.Fatalln(err) } if _, err = io.Copy(bw, &hdr); err != nil { log.Fatalln(err) } if _, err = io.Copy(bw, rr); err != nil { log.Fatalln(err) } r.RecordWasRead() if _, err = bw.Write([]byte("\r\n\r\n")); err != nil { log.Fatalln(err) } } } if err := bw.Flush(); err != nil { log.Fatalln(err) } return } for _, p := range flag.Args() { log.Println("adding", p) if err := warc.Add(p); err != nil { log.Fatalln(err) } log.Println("added", p, len(warc.WARCs[p]), "URIs") if *idx { if err := warc.SaveIndices(); err != nil { log.Fatalln(err) } } } if *uri == "" { for warcPath, uris := range warc.WARCs { for uri, rec := range uris { fmt.Printf( "%s\t%s\t%s\n", warcPath, uri, humanize.IBytes(uint64(rec.TotalSize())), ) } } } else { for _, uris := range warc.WARCs { rec := uris[*uri] if rec == nil { continue } r, err := rec.Reader(!*hdr, warc.WARCsOffsets) if err != nil { log.Fatalln(err) } io.Copy(os.Stdout, r) } } }