/*
-tofuproxy -- flexible HTTP/WARC proxy with TLS certificates management
-Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+warc-extract -- WARC files data extractor
+Copyright (C) 2021-2023 Sergey Matveev <stargrave@stargrave.org>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
package main
import (
+ "bufio"
+ "bytes"
+ "encoding/binary"
"flag"
"fmt"
"io"
func main() {
uri := flag.String("uri", "", "URI to extract, if specified")
hdr := flag.Bool("hdr", false, "Also extract WARC's header")
- idx := flag.Bool("idx", false, "Save WARC indexes")
+ idx := flag.Bool("idx", false, "Save WARC indices")
+ recompress := flag.Bool("for-enzstd", false, "Output for enzstd utility")
flag.Parse()
log.SetFlags(log.Lshortfile)
+ if *recompress {
+ var hdr bytes.Buffer
+ size := make([]byte, 8)
+ bw := bufio.NewWriter(os.Stdout)
+ for _, p := range flag.Args() {
+ r, err := warc.NewReader(p)
+ if err != nil {
+ log.Fatalln(err)
+ }
+ for {
+ rec, rr, err := r.ReadRecord()
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ log.Fatalln(err)
+ }
+ for _, line := range rec.HdrLines {
+ hdr.WriteString(line)
+ }
+ hdr.WriteString("\r\n")
+ binary.BigEndian.PutUint64(size, uint64(hdr.Len())+uint64(rec.Size)+4)
+ if _, err = bw.Write(size); err != nil {
+ log.Fatalln(err)
+ }
+ if _, err = io.Copy(bw, &hdr); err != nil {
+ log.Fatalln(err)
+ }
+ if _, err = io.Copy(bw, rr); err != nil {
+ log.Fatalln(err)
+ }
+ r.RecordWasRead()
+ if _, err = bw.Write([]byte("\r\n\r\n")); err != nil {
+ log.Fatalln(err)
+ }
+ }
+ }
+ if err := bw.Flush(); err != nil {
+ log.Fatalln(err)
+ }
+ return
+ }
+
for _, p := range flag.Args() {
log.Println("adding", p)
if err := warc.Add(p); err != nil {
}
log.Println("added", p, len(warc.WARCs[p]), "URIs")
if *idx {
- if err := warc.SaveIndexes(); err != nil {
+ if err := warc.SaveIndices(); err != nil {
log.Fatalln(err)
}
}
if rec == nil {
continue
}
- r, err := rec.Reader(!*hdr)
+ r, err := rec.Reader(!*hdr, warc.WARCsOffsets)
if err != nil {
log.Fatalln(err)
}