From 4cb0b1b52ba01d3b200b56c349f4177f5ec355d3 Mon Sep 17 00:00:00 2001 From: Sergey Matveev Date: Fri, 20 Mar 2020 00:24:19 +0300 Subject: [PATCH] Initial version --- README | 88 +++++++++++++++ go.mod | 8 ++ go.sum | 10 ++ main.go | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++ progress.go | 102 +++++++++++++++++ walk.go | 73 ++++++++++++ 6 files changed, 596 insertions(+) create mode 100644 README create mode 100644 go.mod create mode 100644 go.sum create mode 100644 main.go create mode 100644 progress.go create mode 100644 walk.go diff --git a/README b/README new file mode 100644 index 0000000..b34dcd9 --- /dev/null +++ b/README @@ -0,0 +1,88 @@ + sgodup -- file deduplication utility + ==================================== + +DESCRIPTION AND USAGE + +sgodup is utility for duplicate files detection. You supply two +directories: the base and one with possible duplicates, utility +determines duplicate files and replaces them with the links. It +is aimed to have very high performance. + +There are just few arguments: + +-basedir -- directory with files that are possible link targets + -dupdir -- directory with possible duplicates, which are replaced + with the links to basedir's files + -action -- * print: just print to stdout duplicate file path with + relative path to basedir's corresponding file + * symlink: create symbolic link with relative path to + basedir's corresponding file + * hardlink: create hard link instead + -chmod -- if specified, then chmod files in basedir and dupdir + during scan phase. Octal representation is expected + -fsync -- fsync directories where linking occurs + +There are three stages: + +* basedir directory scan: collect all *regular* file paths, sizes and + inodes. If -chmod is specified, then apply it at once. Empty files are + ignored +* dupdir directory scan: same as above. If there is no basedir's file + with the same size, then skip dupdir's file (obviously it can not be + duplicate). 
Check that no basedir's files have the same inode, skip + dupdir's file otherwise, because it is already hardlinked +* deduplication stage. For each dupdir file, find basedir file with the + same size and compare their contents, to determine if dupdir's one is + the duplicate. Perform specified action if so. There are two separate + queues and processing cycles: + + * small files, up to 4 KiB (one disk sector): files are fully read and + compared in memory + * large files (everything else): read and compare first 4 KiB of files + in memory. If they are not equal, then this is not a duplicate. + Fully read each file's contents sequentially with 128 KiB chunks and + calculate BLAKE2b-512 digest otherwise + +Progress is shown at each stage: how many files are counted/processed, +total size of the files, how much space is deduplicated. + + 2020/03/19 22:57:07 processing basedir... + 2020/03/19 22:57:07 464,329 / 0 (0%) files scanned + 2020/03/19 22:57:07 534 GiB / 0 B (0%) + 2020/03/19 22:57:12 processing dupdir... + 2020/03/19 22:57:12 362,245 / 0 (0%) files scanned + 2020/03/19 22:57:12 362 GiB / 0 B (0%) + 2020/03/19 22:57:17 deduplicating... + 2020/03/19 22:58:18 8,193 / 362,245 (2%) files processed + 2020/03/19 22:58:18 7.7 GiB / 362 GiB (2%) deduplicated + [...] + 2020/03/20 11:17:20 321,123 files deduplicated + +It is safe to specify the same directory as a basedir and dupdir. + +SAFETY AND CONSISTENCY + +POSIX has no ability to atomically replace regular file with +symbolic/hard link. So file is removed first, then link created. sgodup +cautiously prevents possible interruption by signal (TERM, INT) of those +two calls. But any other failure could possibly break the program after +file removal without link creation, leading to its loss! + +It is recommended to use filesystems with snapshot capability to be able +to rollback and restore removed file. 
Or you can use "-action print" +beforehand to collect the duplicates and use it as a log for possible +recovery. + +There are no warranties and no defined behaviour if directories (and files +within) that the utility is working with are modified. + +LICENCE + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..d87c9cc --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module go.stargrave.org/sgodup + +go 1.14 + +require ( + github.com/dustin/go-humanize v1.0.0 + golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..dc88a58 --- /dev/null +++ b/go.sum @@ -0,0 +1,10 @@ +github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/main.go b/main.go new file mode 100644 index 0000000..f4fbf6c --- /dev/null +++ b/main.go @@ -0,0 +1,315 @@ +/* +sgodup -- File deduplication utility +Copyright (C) 2020 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +// File deduplication utility +package main + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "io" + "log" + "os" + "os/signal" + "path/filepath" + "strconv" + "sync" + "syscall" + + "golang.org/x/crypto/blake2b" +) + +const ( + SizeBoundary = 1 << 12 // 4 KiB sector size + BufSize = 1 << 17 // ZFS default 128 KiB recordsize +) + +var ( + canExit sync.Mutex + + curDirPath string + curDirFd *os.File +) + +func link(dup, orig, action string, fsync bool) { + tgt, err := filepath.Rel(dup, orig) + if err != nil { + log.Fatal(err) + } + tgt = tgt[3:] + if action == "print" { + fmt.Println(dup, tgt) + return + } + canExit.Lock() + if err = os.Remove(dup); err != nil { + log.Fatal(err) + } + if action == "symlink" { + err = os.Symlink(tgt, dup) + } else { + err = os.Link(orig, dup) + } + if err != nil { + log.Fatal(err) + } + if fsync { + dirPath := filepath.Dir(dup) + if dirPath != curDirPath { + curDirFd, err = os.Open(dirPath) + if err != nil { + log.Fatal(err) + } + curDirPath = dirPath + } + if err = curDirFd.Sync(); err != nil { + log.Fatal(err) + } + } + canExit.Unlock() +} + +func main() { + var ( + 
baseDir = flag.String("basedir", "", "Directory with original files") + dupDir = flag.String("dupdir", "", "Directory with possible duplicates") + action = flag.String("action", "", "print, symlink, hardlink") + doChmod = flag.String("chmod", "", "chmod files") + doFsync = flag.Bool("fsync", false, "fsync directories?") + ) + flag.Parse() + if *baseDir == "" { + log.Fatalln("-basedir is required") + } + if *dupDir == "" { + log.Fatalln("-dupdir is required") + } + var chmod os.FileMode + if *doChmod != "" { + ch, err := strconv.ParseUint(*doChmod, 8, 16) + if err != nil { + log.Fatal(err) + } + chmod = os.FileMode(ch) + } + if !(*action == "print" || *action == "symlink" || *action == "hardlink") { + log.Fatalln("choose action") + } + + log.Println("processing basedir...") + size2fi := make(map[int64][]FileInode, 1<<10) + files := 0 + filesSmall := 0 + filesLarge := 0 + var fullSize int64 + progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total") + for fi := range walk(*baseDir) { + if chmod > 0 { + if err := os.Chmod(fi.Path, chmod); err != nil { + log.Fatal(err) + } + } + if fi.Size == 0 { + continue + } + if fi.Size <= SizeBoundary { + filesSmall++ + } else { + filesLarge++ + } + files++ + fullSize += fi.Size + size2fi[fi.Size] = append(size2fi[fi.Size], fi) + } + progress.Stop() + + log.Println("processing dupdir...") + queueSmall := make(map[string][]string, filesSmall) + queueLarge := make(map[string][]string, filesLarge) + files = 0 + fullSize = 0 + progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total") + for fi := range walk(*dupDir) { + if chmod > 0 { + if err := os.Chmod(fi.Path, chmod); err != nil { + log.Fatal(err) + } + } + if fi.Size == 0 { + continue + } + origs, ok := size2fi[fi.Size] + if !ok { + continue + } + paths := make([]string, 0, len(origs)) + for _, orig := range origs { + if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) { + continue + } + paths = append(paths, orig.Path) + } + files++ + 
fullSize += fi.Size + if fi.Size <= SizeBoundary { + queueSmall[fi.Path] = paths + } else { + queueLarge[fi.Path] = paths + } + } + size2fi = nil + progress.Stop() + + log.Println("deduplicating...") + progress = NewProgress( + files, + fullSize, + &files, + &fullSize, + " processed", + " deduplicated", + ) + files = 0 + fullSize = 0 + deduped := 0 + termRequired := make(chan os.Signal, 1) + signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT) + go func() { + <-termRequired + canExit.Lock() + progress.Stop() + log.Println(deduped, "files deduplicated") + os.Exit(0) + }() + bufDup := make([]byte, SizeBoundary) + bufOrig := make([]byte, SizeBoundary) + seen := make(map[string]struct{}, len(queueSmall)) + for dup, origs := range queueSmall { + files++ + if _, ok := seen[dup]; ok { + continue + } + fdDup, err := os.Open(dup) + if err != nil { + log.Fatal(err) + } + sizeDup, err := io.ReadFull(fdDup, bufDup) + if !(err == nil || err == io.ErrUnexpectedEOF) { + log.Fatal(err) + } + if err = fdDup.Close(); err != nil { + log.Fatal(err) + } + for _, orig := range origs { + fdOrig, err := os.Open(orig) + if err != nil { + log.Fatal(err) + } + sizeOrig, err := io.ReadFull(fdOrig, bufOrig) + if !(err == nil || err == io.ErrUnexpectedEOF) { + log.Fatal(err) + } + if sizeOrig != sizeDup { + log.Fatalln(dup, orig, "unexpectedly different sizes") + } + if err = fdOrig.Close(); err != nil { + log.Fatal(err) + } + if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 { + continue + } + link(dup, orig, *action, *doFsync) + seen[orig] = struct{}{} + deduped++ + fullSize += int64(sizeDup) + break + } + } + queueSmall = nil + + hasher, err := blake2b.New512(nil) + if err != nil { + panic(err) + } + seen = make(map[string]struct{}, len(queueLarge)) + var sizeDup int64 + for dup, origs := range queueLarge { + files++ + if _, ok := seen[dup]; ok { + continue + } + fdDup, err := os.Open(dup) + if err != nil { + log.Fatal(err) + } + if _, err := io.ReadFull(fdDup, bufDup); err 
!= nil { + log.Fatal(err) + } + var hashDup []byte + for _, orig := range origs { + fdOrig, err := os.Open(orig) + if err != nil { + log.Fatal(err) + } + if _, err = io.ReadFull(fdOrig, bufOrig); err != nil { + log.Fatal(err) + } + if bytes.Compare(bufDup, bufOrig) != 0 { + if err = fdOrig.Close(); err != nil { + log.Fatal(err) + } + continue + } + if hashDup == nil { + hasher.Reset() + if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) { + log.Fatalln("can not write to hash", err) + } + sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize)) + if err != nil { + log.Fatal(err) + } + hashDup = hasher.Sum(nil) + } + hasher.Reset() + if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) { + log.Fatalln("can not write to hash", err) + } + if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil { + log.Fatal(err) + } + if err = fdOrig.Close(); err != nil { + log.Fatal(err) + } + if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 { + continue + } + link(dup, orig, *action, *doFsync) + seen[orig] = struct{}{} + deduped++ + fullSize += sizeDup + break + } + if err = fdDup.Close(); err != nil { + log.Fatal(err) + } + } + termRequired <- syscall.SIGTERM + <-termRequired +} diff --git a/progress.go b/progress.go new file mode 100644 index 0000000..863b37a --- /dev/null +++ b/progress.go @@ -0,0 +1,102 @@ +/* +sgodup -- File deduplication utility +Copyright (C) 2020 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +package main + +import ( + "fmt" + "os" + "time" + + "github.com/dustin/go-humanize" +) + +const ( + ESC = 27 +) + +var LineClear = fmt.Sprintf("%c[%dA%c[2K", ESC, 2, ESC) + +type Progress struct { + fullFiles int64 + fullSize uint64 + fullFilesS string + fullSizeS string + suffixFiles string + suffixSize string + stop chan struct{} +} + +func NewProgress( + fullFiles int, + fullSize int64, + files *int, + size *int64, + suffixFiles, suffixSize string, +) Progress { + p := Progress{ + int64(fullFiles), + uint64(fullSize), + humanize.Comma(int64(fullFiles)), + humanize.IBytes(uint64(fullSize)), + suffixFiles, + suffixSize, + make(chan struct{}, 0), + } + go p.Run(files, size) + return p +} + +func (p Progress) Log(prefix string, files int64, size uint64) { + percentageFiles := int64(0) + if p.fullFiles > 0 { + percentageFiles = 100 * files / p.fullFiles + } + percentageSize := uint64(0) + if p.fullSize > 0 { + percentageSize = 100 * size / p.fullSize + } + now := time.Now().Format("2006/01/02 15:04:05") + fmt.Fprintf( + os.Stderr, + "%s%s %s / %s (%d%%) files%s\n%s %s / %s (%d%%)%s\n", + prefix, + now, humanize.Comma(files), p.fullFilesS, percentageFiles, p.suffixFiles, + now, humanize.IBytes(size), p.fullSizeS, percentageSize, p.suffixSize, + ) +} + +func (p Progress) Run(files *int, size *int64) { + p.Log("", 0, 0) + ticker := time.NewTicker(250 * time.Millisecond) + for { + select { + case <-ticker.C: + p.Log(LineClear, int64(*files), uint64(*size)) + case <-p.stop: + ticker.Stop() + p.Log(LineClear, int64(*files), uint64(*size)) + close(p.stop) + return + } + } +} + +func (p Progress) Stop() { + p.stop <- struct{}{} + <-p.stop +} diff --git a/walk.go b/walk.go new file mode 100644 index 0000000..76abb9c --- /dev/null +++ b/walk.go @@ -0,0 +1,73 @@ +/* +sgodup -- File deduplication utility +Copyright (C) 2020 Sergey Matveev + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as 
published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +package main + +import ( + "io" + "log" + "os" + "path" + "syscall" +) + +type FileInode struct { + Path string + Size int64 + Dev uint64 + Ino uint64 +} + +func walker(c chan FileInode, dirPath string) { + dirFd, err := os.Open(dirPath) + if err != nil { + log.Fatal(err) + } + for { + fis, err := dirFd.Readdir(1 << 10) + if err != nil { + if err == io.EOF { + break + } + log.Fatal(err) + } + for _, fi := range fis { + stat := fi.Sys().(*syscall.Stat_t) + if fi.Mode().IsRegular() { + c <- FileInode{ + path.Join(dirPath, fi.Name()), + fi.Size(), + stat.Dev, stat.Ino, + } + } else if fi.IsDir() { + walker(c, path.Join(dirPath, fi.Name())) + } + } + } + if err = dirFd.Close(); err != nil { + log.Fatal(err) + } +} + +func walk(dirPath string) chan FileInode { + c := make(chan FileInode, 1<<10) + go func() { + walker(c, dirPath) + close(c) + }() + return c +} -- 2.44.0