Initial version
diff --git a/main.go b/main.go
new file mode 100644 (file)
index 0000000..f4fbf6c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,315 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+       "bufio"
+       "bytes"
+       "flag"
+       "fmt"
+       "io"
+       "log"
+       "os"
+       "os/signal"
+       "path/filepath"
+       "strconv"
+       "sync"
+       "syscall"
+
+       "golang.org/x/crypto/blake2b"
+)
+
+const (
+       SizeBoundary = 1 << 12 // 4 KiB sector size
+       BufSize      = 1 << 17 // ZFS default 128 KiB recordsize
+)
+
+var (
+       // canExit is held while a duplicate is being replaced, so a
+       // termination signal can not cut a remove+link(+fsync)
+       // sequence in half.
+       canExit sync.Mutex
+
+       // Cached fd of the last fsynced directory, to avoid reopening
+       // it for every deduplicated file in the same directory.
+       curDirPath string
+       curDirFd   *os.File
+)
+
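+// link replaces the duplicate dup with a hardlink or symlink to orig,
+// or just prints the pair when action is "print".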
+func link(dup, orig, action string, fsync bool) {
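+       // Path from dup to orig relative to dup's directory: Rel is
+       // computed against the dup file itself, so the leading "../"
+       // is stripped below.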
+       tgt, err := filepath.Rel(dup, orig)
+       if err != nil {
+               log.Fatal(err)
+       }
+       tgt = tgt[3:]
+       if action == "print" {
+               fmt.Println(dup, tgt)
+               return
+       }
+       canExit.Lock()
+       if err = os.Remove(dup); err != nil {
+               log.Fatal(err)
+       }
+       if action == "symlink" {
+               err = os.Symlink(tgt, dup)
+       } else {
+               err = os.Link(orig, dup)
+       }
+       if err != nil {
+               log.Fatal(err)
+       }
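+       // Make the new link durable by fsyncing its directory. The
+       // directory fd is cached, as duplicates tend to come grouped
+       // by directory.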
+       if fsync {
+               dirPath := filepath.Dir(dup)
+               if dirPath != curDirPath {
+                       curDirFd, err = os.Open(dirPath)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       curDirPath = dirPath
+               }
+               if err = curDirFd.Sync(); err != nil {
+                       log.Fatal(err)
+               }
+       }
+       canExit.Unlock()
+}
+
+func main() {
+       var (
+               baseDir = flag.String("basedir", "", "Directory with original files")
+               dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
+               action  = flag.String("action", "", "print, symlink, hardlink")
+               doChmod = flag.String("chmod", "", "Octal mode to chmod every scanned file to")
+               doFsync = flag.Bool("fsync", false, "fsync directories?")
+       )
+       flag.Parse()
+       if *baseDir == "" {
+               log.Fatalln("-basedir is required")
+       }
+       if *dupDir == "" {
+               log.Fatalln("-dupdir is required")
+       }
+       var chmod os.FileMode
+       if *doChmod != "" {
+               ch, err := strconv.ParseUint(*doChmod, 8, 16)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               chmod = os.FileMode(ch)
+       }
+       if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+               log.Fatalln("choose action")
+       }
+
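+       // Pass 1: walk basedir and group candidate originals by size.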
+       log.Println("processing basedir...")
+       size2fi := make(map[int64][]FileInode, 1<<10)
+       files := 0
+       filesSmall := 0
+       filesLarge := 0
+       var fullSize int64
+       progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+       for fi := range walk(*baseDir) {
+               if chmod > 0 {
+                       if err := os.Chmod(fi.Path, chmod); err != nil {
+                               log.Fatal(err)
+                       }
+               }
+               if fi.Size == 0 {
+                       continue
+               }
+               if fi.Size <= SizeBoundary {
+                       filesSmall++
+               } else {
+                       filesLarge++
+               }
+               files++
+               fullSize += fi.Size
+               size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+       }
+       progress.Stop()
+
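+       // Pass 2: walk dupdir and queue every file whose size matches
+       // some original, splitting the queues at the 4 KiB boundary.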
+       log.Println("processing dupdir...")
+       queueSmall := make(map[string][]string, filesSmall)
+       queueLarge := make(map[string][]string, filesLarge)
+       files = 0
+       fullSize = 0
+       progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+       for fi := range walk(*dupDir) {
+               if chmod > 0 {
+                       if err := os.Chmod(fi.Path, chmod); err != nil {
+                               log.Fatal(err)
+                       }
+               }
+               if fi.Size == 0 {
+                       continue
+               }
+               origs, ok := size2fi[fi.Size]
+               if !ok {
+                       continue
+               }
+               paths := make([]string, 0, len(origs))
+               for _, orig := range origs {
+                       if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+                               continue
+                       }
+                       paths = append(paths, orig.Path)
+               }
+               files++
+               fullSize += fi.Size
+               if fi.Size <= SizeBoundary {
+                       queueSmall[fi.Path] = paths
+               } else {
+                       queueLarge[fi.Path] = paths
+               }
+       }
+       size2fi = nil
+       progress.Stop()
+
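+       // Pass 3: deduplicate, small files first; progress now counts
+       // processed files and deduplicated bytes against the totals
+       // gathered above.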
+       log.Println("deduplicating...")
+       progress = NewProgress(
+               files,
+               fullSize,
+               &files,
+               &fullSize,
+               " processed",
+               " deduplicated",
+       )
+       files = 0
+       fullSize = 0
+       deduped := 0
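+       // On SIGTERM/SIGINT take canExit, so that no remove+link
+       // sequence is cut in half, report the count and exit.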
+       termRequired := make(chan os.Signal, 1)
+       signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+       go func() {
+               <-termRequired
+               canExit.Lock()
+               progress.Stop()
+               log.Println(deduped, "files deduplicated")
+               os.Exit(0)
+       }()
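+       // Small files fit in a single 4 KiB buffer, so they are
+       // compared byte-by-byte without any hashing.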
+       bufDup := make([]byte, SizeBoundary)
+       bufOrig := make([]byte, SizeBoundary)
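+       // seen records originals already used as link targets: such a
+       // file is never itself replaced later as someone's duplicate.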
+       seen := make(map[string]struct{}, len(queueSmall))
+       for dup, origs := range queueSmall {
+               files++
+               if _, ok := seen[dup]; ok {
+                       continue
+               }
+               fdDup, err := os.Open(dup)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               sizeDup, err := io.ReadFull(fdDup, bufDup)
+               if !(err == nil || err == io.ErrUnexpectedEOF) {
+                       log.Fatal(err)
+               }
+               if err = fdDup.Close(); err != nil {
+                       log.Fatal(err)
+               }
+               for _, orig := range origs {
+                       fdOrig, err := os.Open(orig)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+                       if !(err == nil || err == io.ErrUnexpectedEOF) {
+                               log.Fatal(err)
+                       }
+                       if sizeOrig != sizeDup {
+                               log.Fatalln(dup, orig, "unexpectedly different sizes")
+                       }
+                       if err = fdOrig.Close(); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(bufDup[:sizeDup], bufOrig[:sizeOrig]) {
+                               continue
+                       }
+                       link(dup, orig, *action, *doFsync)
+                       seen[orig] = struct{}{}
+                       deduped++
+                       fullSize += int64(sizeDup)
+                       break
+               }
+       }
+       queueSmall = nil
+
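+       // Large files: compare the first 4 KiB directly, then fall
+       // back to comparing BLAKE2b-512 digests of the full contents.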
+       hasher, err := blake2b.New512(nil)
+       if err != nil {
+               panic(err)
+       }
+       seen = make(map[string]struct{}, len(queueLarge))
+       var sizeDup int64
+       for dup, origs := range queueLarge {
+               files++
+               if _, ok := seen[dup]; ok {
+                       continue
+               }
+               fdDup, err := os.Open(dup)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+                       log.Fatal(err)
+               }
+               var hashDup []byte
+               for _, orig := range origs {
+                       fdOrig, err := os.Open(orig)
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(bufDup, bufOrig) {
+                               if err = fdOrig.Close(); err != nil {
+                                       log.Fatal(err)
+                               }
+                               continue
+                       }
+                       if hashDup == nil {
+                               // Hash the duplicate lazily: only when at
+                               // least one candidate shares its first 4 KiB.
+                               hasher.Reset()
+                               if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+                                       log.Fatalln("cannot write to hash", err)
+                               }
+                               sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+                               if err != nil {
+                                       log.Fatal(err)
+                               }
+                               // io.Copy counted only the tail after the
+                               // 4 KiB head already read into bufDup, so
+                               // add it back for the size accounting.
+                               sizeDup += SizeBoundary
+                               hashDup = hasher.Sum(nil)
+                       }
+                       hasher.Reset()
+                       if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+                               log.Fatalln("cannot write to hash", err)
+                       }
+                       if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+                               log.Fatal(err)
+                       }
+                       if err = fdOrig.Close(); err != nil {
+                               log.Fatal(err)
+                       }
+                       if !bytes.Equal(hashDup, hasher.Sum(nil)) {
+                               continue
+                       }
+                       link(dup, orig, *action, *doFsync)
+                       seen[orig] = struct{}{}
+                       deduped++
+                       fullSize += sizeDup
+                       break
+               }
+               if err = fdDup.Close(); err != nil {
+                       log.Fatal(err)
+               }
+       }
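+       // Everything is processed: trigger the same graceful-exit path
+       // as a signal would. The final receive never completes; the
+       // goroutine calls os.Exit instead.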
+       termRequired <- syscall.SIGTERM
+       <-termRequired
+}