--- /dev/null
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "strconv"
+ "sync"
+ "syscall"
+
+ "golang.org/x/crypto/blake2b"
+)
+
const (
	// SizeBoundary splits files into "small" and "large": files no
	// bigger than this are compared in full, byte by byte; larger
	// files are compared by this prefix first and by hash afterwards.
	SizeBoundary = 1 << 12 // 4 KiB sector size
	// BufSize is the bufio read buffer size used while hashing
	// large files.
	BufSize = 1 << 17 // ZFS default 128 KiB recordsize
)

var (
	// canExit is held while a duplicate is being replaced so that the
	// signal handler cannot terminate the process mid-replacement.
	canExit sync.Mutex

	// Cache of the most recently fsynced directory (path and open
	// descriptor), reused by link() across calls.
	curDirPath string
	curDirFd   *os.File
)
+
+func link(dup, orig, action string, fsync bool) {
+ tgt, err := filepath.Rel(dup, orig)
+ if err != nil {
+ log.Fatal(err)
+ }
+ tgt = tgt[3:]
+ if action == "print" {
+ fmt.Println(dup, tgt)
+ return
+ }
+ canExit.Lock()
+ if err = os.Remove(dup); err != nil {
+ log.Fatal(err)
+ }
+ if action == "symlink" {
+ err = os.Symlink(tgt, dup)
+ } else {
+ err = os.Link(orig, dup)
+ }
+ if err != nil {
+ log.Fatal(err)
+ }
+ if fsync {
+ dirPath := filepath.Dir(dup)
+ if dirPath != curDirPath {
+ curDirFd, err = os.Open(dirPath)
+ if err != nil {
+ log.Fatal(err)
+ }
+ curDirPath = dirPath
+ }
+ if err = curDirFd.Sync(); err != nil {
+ log.Fatal(err)
+ }
+ }
+ canExit.Unlock()
+}
+
+func main() {
+ var (
+ baseDir = flag.String("basedir", "", "Directory with original files")
+ dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
+ action = flag.String("action", "", "print, symlink, hardlink")
+ doChmod = flag.String("chmod", "", "chmod files")
+ doFsync = flag.Bool("fsync", false, "fsync directories?")
+ )
+ flag.Parse()
+ if *baseDir == "" {
+ log.Fatalln("-basedir is required")
+ }
+ if *dupDir == "" {
+ log.Fatalln("-dupdir is required")
+ }
+ var chmod os.FileMode
+ if *doChmod != "" {
+ ch, err := strconv.ParseUint(*doChmod, 8, 16)
+ if err != nil {
+ log.Fatal(err)
+ }
+ chmod = os.FileMode(ch)
+ }
+ if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+ log.Fatalln("choose action")
+ }
+
+ log.Println("processing basedir...")
+ size2fi := make(map[int64][]FileInode, 1<<10)
+ files := 0
+ filesSmall := 0
+ filesLarge := 0
+ var fullSize int64
+ progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+ for fi := range walk(*baseDir) {
+ if chmod > 0 {
+ if err := os.Chmod(fi.Path, chmod); err != nil {
+ log.Fatal(err)
+ }
+ }
+ if fi.Size == 0 {
+ continue
+ }
+ if fi.Size <= SizeBoundary {
+ filesSmall++
+ } else {
+ filesLarge++
+ }
+ files++
+ fullSize += fi.Size
+ size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+ }
+ progress.Stop()
+
+ log.Println("processing dupdir...")
+ queueSmall := make(map[string][]string, filesSmall)
+ queueLarge := make(map[string][]string, filesLarge)
+ files = 0
+ fullSize = 0
+ progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+ for fi := range walk(*dupDir) {
+ if chmod > 0 {
+ if err := os.Chmod(fi.Path, chmod); err != nil {
+ log.Fatal(err)
+ }
+ }
+ if fi.Size == 0 {
+ continue
+ }
+ origs, ok := size2fi[fi.Size]
+ if !ok {
+ continue
+ }
+ paths := make([]string, 0, len(origs))
+ for _, orig := range origs {
+ if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+ continue
+ }
+ paths = append(paths, orig.Path)
+ }
+ files++
+ fullSize += fi.Size
+ if fi.Size <= SizeBoundary {
+ queueSmall[fi.Path] = paths
+ } else {
+ queueLarge[fi.Path] = paths
+ }
+ }
+ size2fi = nil
+ progress.Stop()
+
+ log.Println("deduplicating...")
+ progress = NewProgress(
+ files,
+ fullSize,
+ &files,
+ &fullSize,
+ " processed",
+ " deduplicated",
+ )
+ files = 0
+ fullSize = 0
+ deduped := 0
+ termRequired := make(chan os.Signal, 1)
+ signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+ go func() {
+ <-termRequired
+ canExit.Lock()
+ progress.Stop()
+ log.Println(deduped, "files deduplicated")
+ os.Exit(0)
+ }()
+ bufDup := make([]byte, SizeBoundary)
+ bufOrig := make([]byte, SizeBoundary)
+ seen := make(map[string]struct{}, len(queueSmall))
+ for dup, origs := range queueSmall {
+ files++
+ if _, ok := seen[dup]; ok {
+ continue
+ }
+ fdDup, err := os.Open(dup)
+ if err != nil {
+ log.Fatal(err)
+ }
+ sizeDup, err := io.ReadFull(fdDup, bufDup)
+ if !(err == nil || err == io.ErrUnexpectedEOF) {
+ log.Fatal(err)
+ }
+ if err = fdDup.Close(); err != nil {
+ log.Fatal(err)
+ }
+ for _, orig := range origs {
+ fdOrig, err := os.Open(orig)
+ if err != nil {
+ log.Fatal(err)
+ }
+ sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+ if !(err == nil || err == io.ErrUnexpectedEOF) {
+ log.Fatal(err)
+ }
+ if sizeOrig != sizeDup {
+ log.Fatalln(dup, orig, "unexpectedly different sizes")
+ }
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
+ continue
+ }
+ link(dup, orig, *action, *doFsync)
+ seen[orig] = struct{}{}
+ deduped++
+ fullSize += int64(sizeDup)
+ break
+ }
+ }
+ queueSmall = nil
+
+ hasher, err := blake2b.New512(nil)
+ if err != nil {
+ panic(err)
+ }
+ seen = make(map[string]struct{}, len(queueLarge))
+ var sizeDup int64
+ for dup, origs := range queueLarge {
+ files++
+ if _, ok := seen[dup]; ok {
+ continue
+ }
+ fdDup, err := os.Open(dup)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+ log.Fatal(err)
+ }
+ var hashDup []byte
+ for _, orig := range origs {
+ fdOrig, err := os.Open(orig)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+ log.Fatal(err)
+ }
+ if bytes.Compare(bufDup, bufOrig) != 0 {
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ continue
+ }
+ if hashDup == nil {
+ hasher.Reset()
+ if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+ log.Fatalln("can not write to hash", err)
+ }
+ sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+ if err != nil {
+ log.Fatal(err)
+ }
+ hashDup = hasher.Sum(nil)
+ }
+ hasher.Reset()
+ if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+ log.Fatalln("can not write to hash", err)
+ }
+ if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+ log.Fatal(err)
+ }
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
+ continue
+ }
+ link(dup, orig, *action, *doFsync)
+ seen[orig] = struct{}{}
+ deduped++
+ fullSize += sizeDup
+ break
+ }
+ if err = fdDup.Close(); err != nil {
+ log.Fatal(err)
+ }
+ }
+ termRequired <- syscall.SIGTERM
+ <-termRequired
+}