Sergey Matveev's repositories - sgodup.git/commitdiff
Initial version
author    Sergey Matveev <stargrave@stargrave.org>
          Thu, 19 Mar 2020 21:24:19 +0000 (00:24 +0300)
committer Sergey Matveev <stargrave@stargrave.org>
          Fri, 20 Mar 2020 10:51:50 +0000 (13:51 +0300)
README [new file with mode: 0644]
go.mod [new file with mode: 0644]
go.sum [new file with mode: 0644]
main.go [new file with mode: 0644]
progress.go [new file with mode: 0644]
walk.go [new file with mode: 0644]

diff --git a/README b/README
new file mode 100644 (file)
index 0000000..b34dcd9
--- /dev/null
+++ b/README
@@ -0,0 +1,88 @@
+                  sgodup -- file deduplication utility
+                  ====================================
+
+DESCRIPTION AND USAGE
+
+sgodup is a utility for duplicate file detection. You supply two
+directories: the base one and one with possible duplicates; the
+utility finds the duplicate files and replaces them with links.
+It is aimed at very high performance.
+
+There are just a few arguments:
+
+-basedir -- directory with files that are possible link targets
+ -dupdir -- directory with possible duplicates, which are replaced
+            with links to basedir's files
+ -action -- * print: just print to stdout the duplicate file path and
+              the relative path to basedir's corresponding file
+            * symlink: create a symbolic link with the relative path
+              to basedir's corresponding file
+            * hardlink: create a hard link instead
+  -chmod -- if specified, then chmod files in basedir and dupdir
+            during the scan phase. An octal representation is expected
+  -fsync -- fsync the directories where linking occurs
+
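+For example (the paths are hypothetical), to only report duplicates
+found in a backup tree against the main storage:
+
+    $ sgodup -basedir /storage/main -dupdir /storage/backup -action print
+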
+There are three stages:
+
+* basedir directory scan: collect all *regular* file paths, sizes and
+  inodes. If -chmod is specified, then apply it at once. Empty files
+  are ignored
+* dupdir directory scan: same as above. If there is no basedir file
+  with the same size, then skip the dupdir file (obviously it can not
+  be a duplicate). Also skip the dupdir file if some basedir file has
+  the same inode, because they are already hardlinked
+* deduplication stage: for each dupdir file, find a basedir file with
+  the same size and compare their contents to determine whether the
+  dupdir one is a duplicate. Perform the specified action if so. There
+  are two separate queues and processing cycles:
+
+  * small files, up to 4 KiB (one disk sector): files are fully read
+    and compared in memory
+  * large files (everything else): read and compare the first 4 KiB of
+    the files in memory. If they are not equal, then this is not a
+    duplicate. Otherwise fully read each file's contents sequentially
+    in 128 KiB chunks and calculate a BLAKE2b-512 digest (see the
+    sketch after this list)
+
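+The large-file hashing step boils down to the following minimal Go
+sketch (hashFile is a hypothetical helper shown for illustration, not
+a function from the sources below; it assumes the same
+golang.org/x/crypto/blake2b dependency):
+
+    package sketch
+
+    import (
+            "bufio"
+            "io"
+            "os"
+
+            "golang.org/x/crypto/blake2b"
+    )
+
+    // hashFile returns the BLAKE2b-512 digest of a whole file,
+    // reading it sequentially in 128 KiB chunks.
+    func hashFile(path string) ([]byte, error) {
+            fd, err := os.Open(path)
+            if err != nil {
+                    return nil, err
+            }
+            defer fd.Close()
+            h, err := blake2b.New512(nil)
+            if err != nil {
+                    return nil, err
+            }
+            if _, err := io.Copy(h, bufio.NewReaderSize(fd, 1<<17)); err != nil {
+                    return nil, err
+            }
+            return h.Sum(nil), nil
+    }
+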
+Progress is shown at each stage: how many files are counted/processed,
+the total size of the files, and how much space is deduplicated.
+
+    2020/03/19 22:57:07 processing basedir...
+    2020/03/19 22:57:07 464,329 / 0 (0%) files scanned
+    2020/03/19 22:57:07 534 GiB / 0 B (0%)
+    2020/03/19 22:57:12 processing dupdir...
+    2020/03/19 22:57:12 362,245 / 0 (0%) files scanned
+    2020/03/19 22:57:12 362 GiB / 0 B (0%)
+    2020/03/19 22:57:17 deduplicating...
+    2020/03/19 22:58:18 8,193 / 362,245 (2%) files processed
+    2020/03/19 22:58:18 7.7 GiB / 362 GiB (2%) deduplicated
+    [...]
+    2020/03/20 11:17:20 321,123 files deduplicated
+
+It is safe to specify the same directory as both basedir and dupdir.
+
+SAFETY AND CONSISTENCY
+
+POSIX has no way to atomically replace a regular file with a
+symbolic/hard link, so the file is removed first and the link is
+created afterwards. sgodup carefully prevents interruption by
+signals (TERM, INT) between those two calls. But any other failure
+could still kill the program after the file removal and before the
+link creation, leading to the file's loss!
+
+It is recommended to use a filesystem with snapshot capability, to be
+able to rollback and restore a removed file. Or you can run with
+"-action print" beforehand to collect the duplicates and use that
+output as a log for possible recovery.
+
+There are no warranties and no defined behaviour if the directories
+(and the files within them) the utility works with are modified
+concurrently.
+
+LICENCE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
diff --git a/go.mod b/go.mod
new file mode 100644 (file)
index 0000000..d87c9cc
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module go.stargrave.org/sgodup
+
+go 1.14
+
+require (
+       github.com/dustin/go-humanize v1.0.0
+       golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
+)
diff --git a/go.sum b/go.sum
new file mode 100644 (file)
index 0000000..dc88a58
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,10 @@
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/main.go b/main.go
new file mode 100644 (file)
index 0000000..f4fbf6c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,315 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+       "bufio"
+       "bytes"
+       "flag"
+       "fmt"
+       "io"
+       "log"
+       "os"
+       "os/signal"
+       "path/filepath"
+       "strconv"
+       "sync"
+       "syscall"
+
+       "golang.org/x/crypto/blake2b"
+)
+
+const (
+       SizeBoundary = 1 << 12 // 4 KiB sector size
+       BufSize      = 1 << 17 // ZFS default 128 KiB recordsize
+)
+
+var (
+       canExit sync.Mutex
+
+       curDirPath string
+       curDirFd   *os.File
+)
+
+func link(dup, orig, action string, fsync bool) {
+       tgt, err := filepath.Rel(dup, orig)
+       if err != nil {
+               log.Fatal(err)
+       }
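+       // strip the leading "../": Rel was computed against the dup
+       // file path itself, not against its parent directory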
+       tgt = tgt[3:]
+       if action == "print" {
+               fmt.Println(dup, tgt)
+               return
+       }
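+       // hold canExit so a TERM/INT cannot terminate the process
+       // between the Remove and the link creation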
+       canExit.Lock()
+       if err = os.Remove(dup); err != nil {
+               log.Fatal(err)
+       }
+       if action == "symlink" {
+               err = os.Symlink(tgt, dup)
+       } else {
+               err = os.Link(orig, dup)
+       }
+       if err != nil {
+               log.Fatal(err)
+       }
+       if fsync {
+               dirPath := filepath.Dir(dup)
+               if dirPath != curDirPath {
+                       curDirFd, err = os.Open(dirPath)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       curDirPath = dirPath
+               }
+               if err = curDirFd.Sync(); err != nil {
+                       log.Fatal(err)
+               }
+       }
+       canExit.Unlock()
+}
+
+func main() {
+       var (
+               baseDir = flag.String("basedir", "", "Directory with original files")
+               dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
+               action  = flag.String("action", "", "print, symlink, hardlink")
+               doChmod = flag.String("chmod", "", "chmod files")
+               doFsync = flag.Bool("fsync", false, "fsync directories?")
+       )
+       flag.Parse()
+       if *baseDir == "" {
+               log.Fatalln("-basedir is required")
+       }
+       if *dupDir == "" {
+               log.Fatalln("-dupdir is required")
+       }
+       var chmod os.FileMode
+       if *doChmod != "" {
+               ch, err := strconv.ParseUint(*doChmod, 8, 16)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               chmod = os.FileMode(ch)
+       }
+       if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+               log.Fatalln("choose action")
+       }
+
+       log.Println("processing basedir...")
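+       // size2fi maps a file size to all basedir files having that size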
+       size2fi := make(map[int64][]FileInode, 1<<10)
+       files := 0
+       filesSmall := 0
+       filesLarge := 0
+       var fullSize int64
+       progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+       for fi := range walk(*baseDir) {
+               if chmod > 0 {
+                       if err := os.Chmod(fi.Path, chmod); err != nil {
+                               log.Fatal(err)
+                       }
+               }
+               if fi.Size == 0 {
+                       continue
+               }
+               if fi.Size <= SizeBoundary {
+                       filesSmall++
+               } else {
+                       filesLarge++
+               }
+               files++
+               fullSize += fi.Size
+               size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+       }
+       progress.Stop()
+
+       log.Println("processing dupdir...")
+       queueSmall := make(map[string][]string, filesSmall)
+       queueLarge := make(map[string][]string, filesLarge)
+       files = 0
+       fullSize = 0
+       progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+       for fi := range walk(*dupDir) {
+               if chmod > 0 {
+                       if err := os.Chmod(fi.Path, chmod); err != nil {
+                               log.Fatal(err)
+                       }
+               }
+               if fi.Size == 0 {
+                       continue
+               }
+               origs, ok := size2fi[fi.Size]
+               if !ok {
+                       continue
+               }
+               paths := make([]string, 0, len(origs))
+               for _, orig := range origs {
+                       if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+                               continue
+                       }
+                       paths = append(paths, orig.Path)
+               }
+               files++
+               fullSize += fi.Size
+               if fi.Size <= SizeBoundary {
+                       queueSmall[fi.Path] = paths
+               } else {
+                       queueLarge[fi.Path] = paths
+               }
+       }
+       size2fi = nil
+       progress.Stop()
+
+       log.Println("deduplicating...")
+       progress = NewProgress(
+               files,
+               fullSize,
+               &files,
+               &fullSize,
+               " processed",
+               " deduplicated",
+       )
+       files = 0
+       fullSize = 0
+       deduped := 0
+       termRequired := make(chan os.Signal, 1)
+       signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
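+       // the handler acquires canExit, so any link() in progress
+       // always completes before the process exits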
+       go func() {
+               <-termRequired
+               canExit.Lock()
+               progress.Stop()
+               log.Println(deduped, "files deduplicated")
+               os.Exit(0)
+       }()
+       bufDup := make([]byte, SizeBoundary)
+       bufOrig := make([]byte, SizeBoundary)
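+       // seen holds files already used as link targets: never treat
+       // them as duplicates later (basedir and dupdir may coincide)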
+       seen := make(map[string]struct{}, len(queueSmall))
+       for dup, origs := range queueSmall {
+               files++
+               if _, ok := seen[dup]; ok {
+                       continue
+               }
+               fdDup, err := os.Open(dup)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               sizeDup, err := io.ReadFull(fdDup, bufDup)
+               if !(err == nil || err == io.ErrUnexpectedEOF) {
+                       log.Fatal(err)
+               }
+               if err = fdDup.Close(); err != nil {
+                       log.Fatal(err)
+               }
+               for _, orig := range origs {
+                       fdOrig, err := os.Open(orig)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+                       if !(err == nil || err == io.ErrUnexpectedEOF) {
+                               log.Fatal(err)
+                       }
+                       if sizeOrig != sizeDup {
+                               log.Fatalln(dup, orig, "unexpectedly different sizes")
+                       }
+                       if err = fdOrig.Close(); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(bufDup[:sizeDup], bufOrig[:sizeOrig]) {
+                               continue
+                       }
+                       link(dup, orig, *action, *doFsync)
+                       seen[orig] = struct{}{}
+                       deduped++
+                       fullSize += int64(sizeDup)
+                       break
+               }
+       }
+       queueSmall = nil
+
+       hasher, err := blake2b.New512(nil)
+       if err != nil {
+               panic(err)
+       }
+       seen = make(map[string]struct{}, len(queueLarge))
+       var sizeDup int64
+       for dup, origs := range queueLarge {
+               files++
+               if _, ok := seen[dup]; ok {
+                       continue
+               }
+               fdDup, err := os.Open(dup)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+                       log.Fatal(err)
+               }
+               var hashDup []byte
+               for _, orig := range origs {
+                       fdOrig, err := os.Open(orig)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(bufDup, bufOrig) {
+                               if err = fdOrig.Close(); err != nil {
+                                       log.Fatal(err)
+                               }
+                               continue
+                       }
+                       if hashDup == nil {
+                               hasher.Reset()
+                               if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+                                       log.Fatalln("can not write to hash", err)
+                               }
+                               sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+                               if err != nil {
+                                       log.Fatal(err)
+                               }
+                               hashDup = hasher.Sum(nil)
+                       }
+                       hasher.Reset()
+                       if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+                               log.Fatalln("can not write to hash", err)
+                       }
+                       if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+                               log.Fatal(err)
+                       }
+                       if err = fdOrig.Close(); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(hashDup, hasher.Sum(nil)) {
+                               continue
+                       }
+                       link(dup, orig, *action, *doFsync)
+                       seen[orig] = struct{}{}
+                       deduped++
+                       // sizeDup lacks the 4 KiB prefix read before hashing
+                       fullSize += sizeDup + SizeBoundary
+                       break
+               }
+               if err = fdDup.Close(); err != nil {
+                       log.Fatal(err)
+               }
+       }
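+       // reuse the signal handler for the final report and exit;
+       // the receive below blocks until the handler calls os.Exit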
+       termRequired <- syscall.SIGTERM
+       <-termRequired
+}
diff --git a/progress.go b/progress.go
new file mode 100644 (file)
index 0000000..863b37a
--- /dev/null
+++ b/progress.go
@@ -0,0 +1,102 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+       "fmt"
+       "os"
+       "time"
+
+       "github.com/dustin/go-humanize"
+)
+
+const (
+       ESC = 27
+)
+
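+// LineClear moves the cursor two lines up and erases that line,
+// letting Log overwrite the previously printed two-line status.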
+var LineClear = fmt.Sprintf("%c[%dA%c[2K", ESC, 2, ESC)
+
+type Progress struct {
+       fullFiles   int64
+       fullSize    uint64
+       fullFilesS  string
+       fullSizeS   string
+       suffixFiles string
+       suffixSize  string
+       stop        chan struct{}
+}
+
+func NewProgress(
+       fullFiles int,
+       fullSize int64,
+       files *int,
+       size *int64,
+       suffixFiles, suffixSize string,
+) Progress {
+       p := Progress{
+               int64(fullFiles),
+               uint64(fullSize),
+               humanize.Comma(int64(fullFiles)),
+               humanize.IBytes(uint64(fullSize)),
+               suffixFiles,
+               suffixSize,
+               make(chan struct{}, 0),
+       }
+       go p.Run(files, size)
+       return p
+}
+
+func (p Progress) Log(prefix string, files int64, size uint64) {
+       percentageFiles := int64(0)
+       if p.fullFiles > 0 {
+               percentageFiles = 100 * files / p.fullFiles
+       }
+       percentageSize := uint64(0)
+       if p.fullSize > 0 {
+               percentageSize = 100 * size / p.fullSize
+       }
+       now := time.Now().Format("2006/01/02 15:04:05")
+       fmt.Fprintf(
+               os.Stderr,
+               "%s%s %s / %s (%d%%) files%s\n%s %s / %s (%d%%)%s\n",
+               prefix,
+               now, humanize.Comma(files), p.fullFilesS, percentageFiles, p.suffixFiles,
+               now, humanize.IBytes(size), p.fullSizeS, percentageSize, p.suffixSize,
+       )
+}
+
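+// Run redraws the two-line status every 250 ms until Stop is called.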
+func (p Progress) Run(files *int, size *int64) {
+       p.Log("", 0, 0)
+       ticker := time.NewTicker(250 * time.Millisecond)
+       for {
+               select {
+               case <-ticker.C:
+                       p.Log(LineClear, int64(*files), uint64(*size))
+               case <-p.stop:
+                       ticker.Stop()
+                       p.Log(LineClear, int64(*files), uint64(*size))
+                       close(p.stop)
+                       return
+               }
+       }
+}
+
+func (p Progress) Stop() {
+       p.stop <- struct{}{}
+       <-p.stop
+}
diff --git a/walk.go b/walk.go
new file mode 100644 (file)
index 0000000..76abb9c
--- /dev/null
+++ b/walk.go
@@ -0,0 +1,73 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+       "io"
+       "log"
+       "os"
+       "path"
+       "syscall"
+)
+
+type FileInode struct {
+       Path string
+       Size int64
+       Dev  uint64
+       Ino  uint64
+}
+
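+// walker recursively scans dirPath and sends every regular file found to c.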
+func walker(c chan FileInode, dirPath string) {
+       dirFd, err := os.Open(dirPath)
+       if err != nil {
+               log.Fatal(err)
+       }
+       for {
+               fis, err := dirFd.Readdir(1 << 10)
+               if err != nil {
+                       if err == io.EOF {
+                               break
+                       }
+                       log.Fatal(err)
+               }
+               for _, fi := range fis {
+                       stat := fi.Sys().(*syscall.Stat_t)
+                       if fi.Mode().IsRegular() {
+                               c <- FileInode{
+                                       path.Join(dirPath, fi.Name()),
+                                       fi.Size(),
+                                       stat.Dev, stat.Ino,
+                               }
+                       } else if fi.IsDir() {
+                               walker(c, path.Join(dirPath, fi.Name()))
+                       }
+               }
+       }
+       if err = dirFd.Close(); err != nil {
+               log.Fatal(err)
+       }
+}
+
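+// walk starts walker in the background and returns a buffered channel
+// of the regular files found under dirPath.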
+func walk(dirPath string) chan FileInode {
+       c := make(chan FileInode, 1<<10)
+       go func() {
+               walker(c, dirPath)
+               close(c)
+       }()
+       return c
+}