From db65ffeff7274def395c8ee747873d0e9d8250b75f543b6ac0d7bbd079cce66d Mon Sep 17 00:00:00 2001 From: Sergey Matveev Date: Mon, 8 Aug 2022 22:31:58 +0300 Subject: [PATCH] Return zstd back --- README | 2 +- diff.go | 10 +++------- dirsizer.go | 25 ++++++++++++++++++++----- go.mod | 2 ++ go.sum | 2 ++ index.go | 7 +------ main.go | 3 +-- reader.go | 23 ++++++++++++++++------- writer.go | 26 ++++++++++++++++++++------ 9 files changed, 66 insertions(+), 34 deletions(-) diff --git a/README b/README index 358d530..8c52b35 100644 --- a/README +++ b/README @@ -18,7 +18,7 @@ memory, but that takes 2-3 GiBs of memory, that is huge amount. Moreover it fully loads it to perform any basic searches. So current implementation uses temporary files and heavy use of data streaming. -Its storage format is trivial: +Its storage format is simple: Zstandard-compressed list of records: * 16-bit BE size of the following name * entity (file, directory, symbolic link, etc) name itself. diff --git a/diff.go b/diff.go index 61ba1ac..dda4fd9 100644 --- a/diff.go +++ b/diff.go @@ -105,7 +105,7 @@ func updateWithDiff(dbPath, strip string) *os.File { mods := make([]*Ent, 0, len(modsNames)+len(rens)) if len(rens) > 0 { sort.Sort(BySrc(rens)) - go reader(bufio.NewReaderSize(db, 1<<17), entsReader) + go reader(db, entsReader) var ent Ent var ok, met bool for { @@ -183,7 +183,7 @@ func updateWithDiff(dbPath, strip string) *os.File { entsReader = make(chan Ent, 1<<10) entsDirSizer := make(chan Ent, 1<<10) entsWriter := make(chan Ent, 1<<10) - go reader(bufio.NewReaderSize(db, 1<<17), entsReader) + go reader(db, entsReader) dirSizerJob := make(chan struct{}) var dirSizes []int64 @@ -193,10 +193,9 @@ func updateWithDiff(dbPath, strip string) *os.File { close(dirSizerJob) }() - bw := bufio.NewWriterSize(tmp0, 1<<17) writerJob := make(chan struct{}) go func() { - writer(bw, entsWriter) + writer(tmp0, entsWriter) close(writerJob) }() @@ -243,9 +242,6 @@ func updateWithDiff(dbPath, strip string) *os.File { <-dirSizerJob close(entsWriter) <-writerJob - if err = bw.Flush(); err != nil { - log.Fatalln(err) - } tmp1 := applyDirSizes(tmp0, dirSizes) tmp0.Close() diff --git a/dirsizer.go b/dirsizer.go index b4f5b4e..fda37f4 100644 --- a/dirsizer.go +++ b/dirsizer.go @@ -6,6 +6,8 @@ import ( "io" "log" "os" + + "github.com/klauspost/compress/zstd" ) func dirSizer(dirSizes *[]int64, depth int, sinkBack, sinkIn, sinkOut chan Ent) (curSize int64) { @@ -51,11 +53,22 @@ func applyDirSizes(src *os.File, dirSizes []int64) *os.File { log.Fatalln(err) } - br := bufio.NewReaderSize(src, 1<<17) + compR, err := zstd.NewReader(src) + if err != nil { + log.Fatalln(err) + } + br := bufio.NewReaderSize(compR, 1<<17) + + compW, err := zstd.NewWriter(tmp, + zstd.WithEncoderLevel(zstd.SpeedBestCompression)) + if err != nil { + log.Fatalln(err) + } + bw := bufio.NewWriterSize(compW, 1<<17) + num := make([]byte, 8) var nameLen int name := make([]byte, 0, 1<<16) - bw := bufio.NewWriterSize(tmp, 1<<17) var dirIdx int for { if _, err = io.ReadFull(br, num[:2]); err != nil { @@ -67,9 +80,7 @@ func applyDirSizes(src *os.File, dirSizes []int64) *os.File { mustWrite(bw, num[:2]) nameLen = int(binary.BigEndian.Uint16(num[:2])) name = name[:nameLen] - if _, err = io.ReadFull(br, name); err != nil { - log.Fatalln(err) - } + mustReadFull(br, name) mustWrite(bw, name) if _, err = io.CopyN(bw, br, 1+8); err != nil { log.Fatalln(err) @@ -90,5 +101,9 @@ func applyDirSizes(src *os.File, dirSizes []int64) *os.File { if err = bw.Flush(); err != nil { log.Fatalln(err) } + if err = compW.Close(); err != nil { + log.Fatalln(err) + } + compR.Close() return tmp } diff --git a/go.mod b/go.mod index f8654fc..7d95a30 100644 --- a/go.mod +++ b/go.mod @@ -3,3 +3,5 @@ module go.stargrave.org/glocate go 1.18 require github.com/dustin/go-humanize v1.0.0 + +require github.com/klauspost/compress v1.15.9 diff --git a/go.sum b/go.sum index 4f89ea4..568acaa 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY= +github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= diff --git a/index.go b/index.go index c6fa539..c05e4c0 100644 --- a/index.go +++ b/index.go @@ -1,7 +1,6 @@ package main import ( - "bufio" "fmt" "log" "os" @@ -25,10 +24,9 @@ func index() *os.File { close(dirSizerJob) }() - bw := bufio.NewWriterSize(tmp0, 1<<17) writerJob := make(chan struct{}) go func() { - writer(bw, entsWriter) + writer(tmp0, entsWriter) close(writerJob) }() @@ -45,9 +43,6 @@ func index() *os.File { <-dirSizerJob close(entsWriter) <-writerJob - if err = bw.Flush(); err != nil { - log.Fatalln(err) - } tmp1 := applyDirSizes(tmp0, dirSizes) tmp0.Close() diff --git a/main.go b/main.go index a5fcb41..c8615fd 100644 --- a/main.go +++ b/main.go @@ -1,7 +1,6 @@ package main import ( - "bufio" "flag" "log" "os" @@ -64,7 +63,7 @@ func main() { log.Fatalln(err) } entsReader := make(chan Ent, 1<<10) - go reader(bufio.NewReaderSize(db, 1<<17), entsReader) + go reader(db, entsReader) entsPrinter := make(chan Ent, 1<<10) printerJob := make(chan struct{}) diff --git a/reader.go b/reader.go index 269fc32..8900c26 100644 --- a/reader.go +++ b/reader.go @@ -1,9 +1,12 @@ package main import ( + "bufio" "encoding/binary" "io" "log" + + "github.com/klauspost/compress/zstd" ) func mustReadFull(r io.Reader, buf []byte) { @@ -12,15 +15,20 @@ func mustReadFull(r io.Reader, buf []byte) { } } -func reader(r io.Reader, sink chan Ent) { - var err error +func reader(src io.Reader, sink chan Ent) { + comp, err := zstd.NewReader(src) + if err != nil { + log.Fatalln(err) + } + br := bufio.NewReaderSize(comp, 1<<17) + num := make([]byte, 8) var cols []string var namePrev string var nameLen uint16 var depth, depthPrev uint8 for { - _, err = io.ReadFull(r, num[:2]) + _, err = io.ReadFull(br, num[:2]) if err != nil { if err == io.EOF { break @@ -29,13 +37,13 @@ func reader(r io.Reader, sink chan Ent) { } nameLen = binary.BigEndian.Uint16(num[:2]) nameRaw := make([]byte, nameLen) - mustReadFull(r, nameRaw) + mustReadFull(br, nameRaw) name := string(nameRaw) - mustReadFull(r, num[:1]) + mustReadFull(br, num[:1]) depth = uint8(num[0]) - mustReadFull(r, num) + mustReadFull(br, num) ent := Ent{mtime: int64(binary.BigEndian.Uint64(num))} - mustReadFull(r, num) + mustReadFull(br, num) ent.size = int64(binary.BigEndian.Uint64(num)) if depth > depthPrev { cols = append(cols, namePrev[:len(namePrev)-1]) @@ -48,4 +56,5 @@ func reader(r io.Reader, sink chan Ent) { depthPrev = depth } close(sink) + comp.Close() } diff --git a/writer.go b/writer.go index e12a19c..75b82e5 100644 --- a/writer.go +++ b/writer.go @@ -1,9 +1,12 @@ package main import ( + "bufio" "encoding/binary" "io" "log" + + "github.com/klauspost/compress/zstd" ) func mustWrite(w io.Writer, buf []byte) { @@ -12,7 +15,12 @@ func mustWrite(w io.Writer, buf []byte) { } } -func writer(w io.Writer, sink chan Ent) { +func writer(dst io.Writer, sink chan Ent) { + comp, err := zstd.NewWriter(dst) + if err != nil { + log.Fatalln(err) + } + bw := bufio.NewWriterSize(comp, 1<<17) num := make([]byte, 8) var name string for ent := range sink { @@ -21,15 +29,21 @@ func writer(w io.Writer, sink chan Ent) { panic("too long") } binary.BigEndian.PutUint16(num[:2], uint16(len(name))) - mustWrite(w, num[:2]) - mustWrite(w, []byte(name)) + mustWrite(bw, num[:2]) + mustWrite(bw, []byte(name)) if len(ent.name) >= 1<<8 { panic("too deep") } - mustWrite(w, []byte{byte(len(ent.name) - 1)}) + mustWrite(bw, []byte{byte(len(ent.name) - 1)}) binary.BigEndian.PutUint64(num, uint64(ent.mtime)) - mustWrite(w, num) + mustWrite(bw, num) binary.BigEndian.PutUint64(num, uint64(ent.size)) - mustWrite(w, num) + mustWrite(bw, num) + } + if err = bw.Flush(); err != nil { + log.Fatalln(err) + } + if err = comp.Close(); err != nil { + log.Fatalln(err) } } -- 2.44.0