--- /dev/null
+ sgodup -- file deduplication utility
+ ====================================
+
+DESCRIPTION AND USAGE
+
+sgodup is a utility for duplicate file detection. You supply two
+directories: the base one and one with possible duplicates; the
+utility finds the duplicate files and replaces them with links. It
+aims at very high performance.
+
+There are just a few arguments:
+
+-basedir -- directory with files that are possible link targets
+-dupdir  -- directory with possible duplicates, which are replaced
+            with links to basedir's files
+-action  -- * print: just print to stdout the duplicate file path with
+              the relative path to basedir's corresponding file
+            * symlink: create a symbolic link with the relative path to
+              basedir's corresponding file
+            * hardlink: create a hard link instead
+-chmod   -- if specified, chmod files in basedir and dupdir during the
+            scan phases; an octal representation is expected
+-fsync   -- fsync the directories where linking occurs
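+
+For example (the paths here are purely illustrative):
+
+    $ sgodup -basedir /storage/photos -dupdir /backup/photos -action print
+
+Once the printed list looks right, rerun it with "-action symlink" or
+"-action hardlink" to actually replace the duplicates.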
+
+There are three stages:
+
+* basedir scan: collect all *regular* file paths, sizes and inodes.
+  If -chmod is specified, apply it along the way. Empty files are
+  ignored
+* dupdir scan: same as above. If there is no basedir file of the same
+  size, skip the dupdir file (obviously it can not be a duplicate).
+  Skip any basedir candidate that already shares its device and inode
+  with the dupdir file, because that one is already a hardlink
+* deduplication stage: for each dupdir file, find the basedir files of
+  the same size and compare their contents to determine whether the
+  dupdir one is a duplicate. Perform the specified action if so. There
+  are two separate queues and processing cycles:
+
+  * small files, up to 4 KiB (one disk sector): files are fully read
+    and compared in memory
+  * large files (everything else): read and compare the first 4 KiB of
+    the files in memory. If they are not equal, then this is not a
+    duplicate. Otherwise read each file's contents sequentially in
+    128 KiB chunks and compare their BLAKE2b-512 digests (a sketch of
+    this comparison follows the list)
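+
+A standalone sketch of that large-file comparison (simplified: it takes
+two file paths on the command line, assumes both files are larger than
+4 KiB, and the helper names are made up for the illustration; buffer
+reuse, queues and progress reporting are left to the real code below):
+
+    package main
+
+    import (
+        "bufio"
+        "bytes"
+        "fmt"
+        "io"
+        "log"
+        "os"
+
+        "golang.org/x/crypto/blake2b"
+    )
+
+    // head returns the leading 4 KiB of the file.
+    func head(p string) ([]byte, error) {
+        fd, err := os.Open(p)
+        if err != nil {
+            return nil, err
+        }
+        defer fd.Close()
+        buf := make([]byte, 1<<12)
+        if _, err := io.ReadFull(fd, buf); err != nil {
+            return nil, err
+        }
+        return buf, nil
+    }
+
+    // digest reads the whole file sequentially through a 128 KiB
+    // buffered reader and returns its BLAKE2b-512 digest.
+    func digest(p string) ([]byte, error) {
+        fd, err := os.Open(p)
+        if err != nil {
+            return nil, err
+        }
+        defer fd.Close()
+        h, _ := blake2b.New512(nil) // New512 never fails with a nil key
+        if _, err := io.Copy(h, bufio.NewReaderSize(fd, 1<<17)); err != nil {
+            return nil, err
+        }
+        return h.Sum(nil), nil
+    }
+
+    func main() {
+        a, b := os.Args[1], os.Args[2]
+        ha, err := head(a)
+        if err != nil {
+            log.Fatal(err)
+        }
+        hb, err := head(b)
+        if err != nil {
+            log.Fatal(err)
+        }
+        if !bytes.Equal(ha, hb) {
+            fmt.Println("not a duplicate")
+            return
+        }
+        da, err := digest(a)
+        if err != nil {
+            log.Fatal(err)
+        }
+        db, err := digest(b)
+        if err != nil {
+            log.Fatal(err)
+        }
+        fmt.Println("duplicate:", bytes.Equal(da, db))
+    }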
+
+Progress is shown at each stage: how many files are counted/processed,
+the total size of the files, and how much space is deduplicated.
+
+ 2020/03/19 22:57:07 processing basedir...
+ 2020/03/19 22:57:07 464,329 / 0 (0%) files scanned
+ 2020/03/19 22:57:07 534 GiB / 0 B (0%)
+ 2020/03/19 22:57:12 processing dupdir...
+ 2020/03/19 22:57:12 362,245 / 0 (0%) files scanned
+ 2020/03/19 22:57:12 362 GiB / 0 B (0%)
+ 2020/03/19 22:57:17 deduplicating...
+ 2020/03/19 22:58:18 8,193 / 362,245 (2%) files processed
+ 2020/03/19 22:58:18 7.7 GiB / 362 GiB (2%) deduplicated
+ [...]
+ 2020/03/20 11:17:20 321,123 files deduplicated
+
+It is safe to specify the same directory as both basedir and dupdir.
+
+SAFETY AND CONSISTENCY
+
+POSIX has no way to atomically replace a regular file with a
+symbolic/hard link, so the file is removed first and the link is
+created afterwards. sgodup cautiously prevents interruption of those
+two calls by signals (TERM, INT). But any other failure could still
+break the program after the file removal and before the link creation,
+leading to the loss of the file!
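+
+The essence of that protection, as a simplified self-contained sketch
+(the paths are hypothetical; the actual variant is the link() function
+and the signal handler in the code below):
+
+    package main
+
+    import (
+        "log"
+        "os"
+        "os/signal"
+        "sync"
+        "syscall"
+    )
+
+    var canExit sync.Mutex
+
+    func main() {
+        sigs := make(chan os.Signal, 1)
+        signal.Notify(sigs, syscall.SIGTERM, syscall.SIGINT)
+        go func() {
+            <-sigs
+            canExit.Lock() // wait for any in-flight remove+link pair
+            os.Exit(0)
+        }()
+
+        // for each detected duplicate (hypothetical paths):
+        canExit.Lock()
+        if err := os.Remove("dup/file"); err != nil {
+            log.Fatal(err)
+        }
+        if err := os.Symlink("../base/file", "dup/file"); err != nil {
+            log.Fatal(err)
+        }
+        canExit.Unlock()
+    }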
+
+It is recommended to use a filesystem with snapshot capability, to be
+able to roll back and restore a removed file. Alternatively, run with
+"-action print" beforehand to collect the duplicates and use that
+output as a log for possible recovery.
+
+There are no warranties and no defined behaviour if the directories
+(and the files within them) the utility works with are modified
+concurrently.
+
+LICENCE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
--- /dev/null
+module go.stargrave.org/sgodup
+
+go 1.14
+
+require (
+ github.com/dustin/go-humanize v1.0.0
+ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
+)
--- /dev/null
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
--- /dev/null
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "strconv"
+ "sync"
+ "syscall"
+
+ "golang.org/x/crypto/blake2b"
+)
+
+const (
+ SizeBoundary = 1 << 12 // 4 KiB sector size
+ BufSize = 1 << 17 // ZFS default 128 KiB recordsize
+)
+
+var (
+ canExit sync.Mutex
+
+ curDirPath string
+ curDirFd *os.File
+)
+
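+// link replaces the duplicate file dup with a symbolic or hard link to
+// orig (or just prints the pair when action is "print"), optionally
+// fsyncing the containing directory.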
+func link(dup, orig, action string, fsync bool) {
+ tgt, err := filepath.Rel(dup, orig)
+ if err != nil {
+ log.Fatal(err)
+ }
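+ // filepath.Rel treats dup as a directory, so the result carries an
+ // extra leading "../"; strip it to make the target relative to dup's
+ // parent directory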
+ tgt = tgt[3:]
+ if action == "print" {
+ fmt.Println(dup, tgt)
+ return
+ }
+ canExit.Lock()
+ if err = os.Remove(dup); err != nil {
+ log.Fatal(err)
+ }
+ if action == "symlink" {
+ err = os.Symlink(tgt, dup)
+ } else {
+ err = os.Link(orig, dup)
+ }
+ if err != nil {
+ log.Fatal(err)
+ }
+ if fsync {
+ dirPath := filepath.Dir(dup)
+ if dirPath != curDirPath {
+ // close the previously fsynced directory to avoid leaking descriptors
+ if curDirFd != nil {
+ if err = curDirFd.Close(); err != nil {
+ log.Fatal(err)
+ }
+ }
+ curDirFd, err = os.Open(dirPath)
+ if err != nil {
+ log.Fatal(err)
+ }
+ curDirPath = dirPath
+ }
+ if err = curDirFd.Sync(); err != nil {
+ log.Fatal(err)
+ }
+ }
+ canExit.Unlock()
+}
+
+func main() {
+ var (
+ baseDir = flag.String("basedir", "", "Directory with original files")
+ dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
+ action = flag.String("action", "", "print, symlink, hardlink")
+ doChmod = flag.String("chmod", "", "chmod files")
+ doFsync = flag.Bool("fsync", false, "fsync directories?")
+ )
+ flag.Parse()
+ if *baseDir == "" {
+ log.Fatalln("-basedir is required")
+ }
+ if *dupDir == "" {
+ log.Fatalln("-dupdir is required")
+ }
+ var chmod os.FileMode
+ if *doChmod != "" {
+ ch, err := strconv.ParseUint(*doChmod, 8, 16)
+ if err != nil {
+ log.Fatal(err)
+ }
+ chmod = os.FileMode(ch)
+ }
+ if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+ log.Fatalln("choose action")
+ }
+
+ log.Println("processing basedir...")
+ size2fi := make(map[int64][]FileInode, 1<<10)
+ files := 0
+ filesSmall := 0
+ filesLarge := 0
+ var fullSize int64
+ progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+ for fi := range walk(*baseDir) {
+ if chmod > 0 {
+ if err := os.Chmod(fi.Path, chmod); err != nil {
+ log.Fatal(err)
+ }
+ }
+ if fi.Size == 0 {
+ continue
+ }
+ if fi.Size <= SizeBoundary {
+ filesSmall++
+ } else {
+ filesLarge++
+ }
+ files++
+ fullSize += fi.Size
+ size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+ }
+ progress.Stop()
+
+ log.Println("processing dupdir...")
+ queueSmall := make(map[string][]string, filesSmall)
+ queueLarge := make(map[string][]string, filesLarge)
+ files = 0
+ fullSize = 0
+ progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+ for fi := range walk(*dupDir) {
+ if chmod > 0 {
+ if err := os.Chmod(fi.Path, chmod); err != nil {
+ log.Fatal(err)
+ }
+ }
+ if fi.Size == 0 {
+ continue
+ }
+ origs, ok := size2fi[fi.Size]
+ if !ok {
+ continue
+ }
+ paths := make([]string, 0, len(origs))
+ for _, orig := range origs {
+ if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+ continue
+ }
+ paths = append(paths, orig.Path)
+ }
+ files++
+ fullSize += fi.Size
+ if fi.Size <= SizeBoundary {
+ queueSmall[fi.Path] = paths
+ } else {
+ queueLarge[fi.Path] = paths
+ }
+ }
+ size2fi = nil
+ progress.Stop()
+
+ log.Println("deduplicating...")
+ progress = NewProgress(
+ files,
+ fullSize,
+ &files,
+ &fullSize,
+ " processed",
+ " deduplicated",
+ )
+ files = 0
+ fullSize = 0
+ deduped := 0
+ termRequired := make(chan os.Signal, 1)
+ signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
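+ // exit only outside of the remove+link critical section (canExit is
+ // held by link() while it replaces a file)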
+ go func() {
+ <-termRequired
+ canExit.Lock()
+ progress.Stop()
+ log.Println(deduped, "files deduplicated")
+ os.Exit(0)
+ }()
+ bufDup := make([]byte, SizeBoundary)
+ bufOrig := make([]byte, SizeBoundary)
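+ // seen records files already used as link targets, so they are never
+ // removed as duplicates themselves (relevant when basedir and dupdir
+ // overlap)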
+ seen := make(map[string]struct{}, len(queueSmall))
+ for dup, origs := range queueSmall {
+ files++
+ if _, ok := seen[dup]; ok {
+ continue
+ }
+ fdDup, err := os.Open(dup)
+ if err != nil {
+ log.Fatal(err)
+ }
+ sizeDup, err := io.ReadFull(fdDup, bufDup)
+ if !(err == nil || err == io.ErrUnexpectedEOF) {
+ log.Fatal(err)
+ }
+ if err = fdDup.Close(); err != nil {
+ log.Fatal(err)
+ }
+ for _, orig := range origs {
+ fdOrig, err := os.Open(orig)
+ if err != nil {
+ log.Fatal(err)
+ }
+ sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+ if !(err == nil || err == io.ErrUnexpectedEOF) {
+ log.Fatal(err)
+ }
+ if sizeOrig != sizeDup {
+ log.Fatalln(dup, orig, "unexpectedly different sizes")
+ }
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ if !bytes.Equal(bufDup[:sizeDup], bufOrig[:sizeOrig]) {
+ continue
+ }
+ link(dup, orig, *action, *doFsync)
+ seen[orig] = struct{}{}
+ deduped++
+ fullSize += int64(sizeDup)
+ break
+ }
+ }
+ queueSmall = nil
+
+ hasher, err := blake2b.New512(nil)
+ if err != nil {
+ panic(err)
+ }
+ seen = make(map[string]struct{}, len(queueLarge))
+ var sizeDup int64
+ for dup, origs := range queueLarge {
+ files++
+ if _, ok := seen[dup]; ok {
+ continue
+ }
+ fdDup, err := os.Open(dup)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+ log.Fatal(err)
+ }
+ var hashDup []byte
+ for _, orig := range origs {
+ fdOrig, err := os.Open(orig)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+ log.Fatal(err)
+ }
+ if !bytes.Equal(bufDup, bufOrig) {
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ continue
+ }
+ if hashDup == nil {
+ hasher.Reset()
+ if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+ log.Fatalln("can not write to hash", err)
+ }
+ sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+ if err != nil {
+ log.Fatal(err)
+ }
+ hashDup = hasher.Sum(nil)
+ }
+ hasher.Reset()
+ if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+ log.Fatalln("can not write to hash", err)
+ }
+ if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+ log.Fatal(err)
+ }
+ if err = fdOrig.Close(); err != nil {
+ log.Fatal(err)
+ }
+ if !bytes.Equal(hashDup, hasher.Sum(nil)) {
+ continue
+ }
+ link(dup, orig, *action, *doFsync)
+ seen[orig] = struct{}{}
+ deduped++
+ fullSize += sizeDup + SizeBoundary // sizeDup excludes the already-read first 4 KiB
+ break
+ }
+ if err = fdDup.Close(); err != nil {
+ log.Fatal(err)
+ }
+ }
+ termRequired <- syscall.SIGTERM
+ <-termRequired
+}
--- /dev/null
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "time"
+
+ "github.com/dustin/go-humanize"
+)
+
+const (
+ ESC = 27
+)
+
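+// LineClear moves the cursor two lines up and erases the line, so the
+// two progress lines printed by Log are redrawn in place.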
+var LineClear = fmt.Sprintf("%c[%dA%c[2K", ESC, 2, ESC)
+
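+// Progress periodically prints file-count and byte-count counters
+// together with their percentage of the expected totals.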
+type Progress struct {
+ fullFiles int64
+ fullSize uint64
+ fullFilesS string
+ fullSizeS string
+ suffixFiles string
+ suffixSize string
+ stop chan struct{}
+}
+
+func NewProgress(
+ fullFiles int,
+ fullSize int64,
+ files *int,
+ size *int64,
+ suffixFiles, suffixSize string,
+) Progress {
+ p := Progress{
+ int64(fullFiles),
+ uint64(fullSize),
+ humanize.Comma(int64(fullFiles)),
+ humanize.IBytes(uint64(fullSize)),
+ suffixFiles,
+ suffixSize,
+ make(chan struct{}, 0),
+ }
+ go p.Run(files, size)
+ return p
+}
+
+func (p Progress) Log(prefix string, files int64, size uint64) {
+ percentageFiles := int64(0)
+ if p.fullFiles > 0 {
+ percentageFiles = 100 * files / p.fullFiles
+ }
+ percentageSize := uint64(0)
+ if p.fullSize > 0 {
+ percentageSize = 100 * size / p.fullSize
+ }
+ now := time.Now().Format("2006/01/02 15:04:05")
+ fmt.Fprintf(
+ os.Stderr,
+ "%s%s %s / %s (%d%%) files%s\n%s %s / %s (%d%%)%s\n",
+ prefix,
+ now, humanize.Comma(files), p.fullFilesS, percentageFiles, p.suffixFiles,
+ now, humanize.IBytes(size), p.fullSizeS, percentageSize, p.suffixSize,
+ )
+}
+
+func (p Progress) Run(files *int, size *int64) {
+ p.Log("", 0, 0)
+ ticker := time.NewTicker(250 * time.Millisecond)
+ for {
+ select {
+ case <-ticker.C:
+ p.Log(LineClear, int64(*files), uint64(*size))
+ case <-p.stop:
+ ticker.Stop()
+ p.Log(LineClear, int64(*files), uint64(*size))
+ close(p.stop)
+ return
+ }
+ }
+}
+
+func (p Progress) Stop() {
+ p.stop <- struct{}{}
+ <-p.stop
+}
--- /dev/null
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+ "io"
+ "log"
+ "os"
+ "path"
+ "syscall"
+)
+
+type FileInode struct {
+ Path string
+ Size int64
+ Dev uint64
+ Ino uint64
+}
+
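+// walker recursively descends dirPath and sends a FileInode for every
+// regular file found to c.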
+func walker(c chan FileInode, dirPath string) {
+ dirFd, err := os.Open(dirPath)
+ if err != nil {
+ log.Fatal(err)
+ }
+ for {
+ fis, err := dirFd.Readdir(1 << 10)
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ log.Fatal(err)
+ }
+ for _, fi := range fis {
+ stat := fi.Sys().(*syscall.Stat_t)
+ if fi.Mode().IsRegular() {
+ c <- FileInode{
+ path.Join(dirPath, fi.Name()),
+ fi.Size(),
+ stat.Dev, stat.Ino,
+ }
+ } else if fi.IsDir() {
+ walker(c, path.Join(dirPath, fi.Name()))
+ }
+ }
+ }
+ if err = dirFd.Close(); err != nil {
+ log.Fatal(err)
+ }
+}
+
+func walk(dirPath string) chan FileInode {
+ c := make(chan FileInode, 1<<10)
+ go func() {
+ walker(c, dirPath)
+ close(c)
+ }()
+ return c
+}