From 4cb0b1b52ba01d3b200b56c349f4177f5ec355d3 Mon Sep 17 00:00:00 2001
From: Sergey Matveev <stargrave@stargrave.org>
Date: Fri, 20 Mar 2020 00:24:19 +0300
Subject: [PATCH] Initial version

---
 README      |  88 +++++++++++++++
 go.mod      |   8 ++
 go.sum      |  10 ++
 main.go     | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 progress.go | 102 +++++++++++++++++
 walk.go     |  73 ++++++++++++
 6 files changed, 596 insertions(+)
 create mode 100644 README
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 main.go
 create mode 100644 progress.go
 create mode 100644 walk.go

diff --git a/README b/README
new file mode 100644
index 0000000..b34dcd9
--- /dev/null
+++ b/README
@@ -0,0 +1,88 @@
+                  sgodup -- file deduplication utility
+                  ====================================
+
+DESCRIPTION AND USAGE
+
+sgodup is a utility for duplicate file detection. You supply two
+directories: the base one and one with possible duplicates; the
+utility determines duplicate files and replaces them with links. It
+is aimed at very high performance.
+
+There are just a few arguments:
+
+-basedir -- directory with files that are possible link targets
+ -dupdir -- directory with possible duplicates, which are replaced
+            with the links to basedir's files
+ -action -- * print: just print to stdout duplicate file path with
+              relative path to basedir's corresponding file
+            * symlink: create symbolic link with relative path to
+              basedir's corresponding file
+            * hardlink: create hard link instead
+  -chmod -- if specified, then chmod files in basedir and dupdir
+            during scan phase. Octal representation is expected
+  -fsync -- fsync directories where linking occurs
+
+There are three stages:
+
+* basedir directory scan: collect all *regular* file paths, sizes and
+  inodes. If -chmod is specified, then apply it at once. Empty files are
+  ignored
+* dupdir directory scan: same as above. If there is no basedir's file
+  with the same size, then skip dupdir's file (obviously it can not be
+  duplicate). Check that no basedir's files have the same inode, skip
+  dupdir's file otherwise, because it is already hardlinked
+* deduplication stage. For each dupdir file, find basedir file with the
+  same size and compare their contents, to determine if dupdir's one is
+  the duplicate. Perform specified action if so. There are two separate
+  queues and processing cycles:
+
+  * small files, up to 4 KiB (one disk sector): files are fully read and
+    compared in memory
+  * large files (everything else): read and compare first 4 KiB of files
+    in memory. If they are not equal, then this is not a duplicate.
+    Fully read each file's contents sequentially with 128 KiB chunks and
+    calculate BLAKE2b-512 digest otherwise
+
+Progress is shown at each stage: how many files are counted/processed,
+the total size of the files, and how much space is deduplicated.
+
+    2020/03/19 22:57:07 processing basedir...
+    2020/03/19 22:57:07 464,329 / 0 (0%) files scanned
+    2020/03/19 22:57:07 534 GiB / 0 B (0%)
+    2020/03/19 22:57:12 processing dupdir...
+    2020/03/19 22:57:12 362,245 / 0 (0%) files scanned
+    2020/03/19 22:57:12 362 GiB / 0 B (0%)
+    2020/03/19 22:57:17 deduplicating...
+    2020/03/19 22:58:18 8,193 / 362,245 (2%) files processed
+    2020/03/19 22:58:18 7.7 GiB / 362 GiB (2%) deduplicated
+    [...]
+    2020/03/20 11:17:20 321,123 files deduplicated
+
+It is safe to specify same directory as a basedir and dupdir.
+
+SAFETY AND CONSISTENCY
+
+POSIX has no ability to atomically replace a regular file with a
+symbolic/hard link. So the file is removed first, then the link is
+created. sgodup cautiously prevents possible interruption by signal
+(TERM, INT) between those two calls. But any other failure could
+possibly break the program after file removal without link creation,
+leading to its loss!
+
+It is recommended to use filesystems with snapshot capability to be able
+to rollback and restore removed file. Or you can use "-action print"
+beforehand to collect the duplicates and use it as a log for possible
+recovery.
+
+There are no warranties and no defined behaviour if the directories
+(and the files within them) that the utility works with are modified.
+
+LICENCE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..d87c9cc
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module go.stargrave.org/sgodup
+
+go 1.14
+
+require (
+	github.com/dustin/go-humanize v1.0.0
+	golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..dc88a58
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,10 @@
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..f4fbf6c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,315 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strconv"
+	"sync"
+	"syscall"
+
+	"golang.org/x/crypto/blake2b"
+)
+
+const (
+	SizeBoundary = 1 << 12 // 4 KiB sector size
+	BufSize      = 1 << 17 // ZFS default 128 KiB recordsize
+)
+
+var (
+	canExit sync.Mutex
+
+	curDirPath string
+	curDirFd   *os.File
+)
+
+func link(dup, orig, action string, fsync bool) {
+	tgt, err := filepath.Rel(dup, orig)
+	if err != nil {
+		log.Fatal(err)
+	}
+	tgt = tgt[3:]
+	if action == "print" {
+		fmt.Println(dup, tgt)
+		return
+	}
+	canExit.Lock()
+	if err = os.Remove(dup); err != nil {
+		log.Fatal(err)
+	}
+	if action == "symlink" {
+		err = os.Symlink(tgt, dup)
+	} else {
+		err = os.Link(orig, dup)
+	}
+	if err != nil {
+		log.Fatal(err)
+	}
+	if fsync {
+		dirPath := filepath.Dir(dup)
+		if dirPath != curDirPath {
+			curDirFd, err = os.Open(dirPath)
+			if err != nil {
+				log.Fatal(err)
+			}
+			curDirPath = dirPath
+		}
+		if err = curDirFd.Sync(); err != nil {
+			log.Fatal(err)
+		}
+	}
+	canExit.Unlock()
+}
+
+func main() {
+	var (
+		baseDir = flag.String("basedir", "", "Directory with original files")
+		dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
+		action  = flag.String("action", "", "print, symlink, hardlink")
+		doChmod = flag.String("chmod", "", "chmod files")
+		doFsync = flag.Bool("fsync", false, "fsync directories?")
+	)
+	flag.Parse()
+	if *baseDir == "" {
+		log.Fatalln("-basedir is required")
+	}
+	if *dupDir == "" {
+		log.Fatalln("-dupdir is required")
+	}
+	var chmod os.FileMode
+	if *doChmod != "" {
+		ch, err := strconv.ParseUint(*doChmod, 8, 16)
+		if err != nil {
+			log.Fatal(err)
+		}
+		chmod = os.FileMode(ch)
+	}
+	if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+		log.Fatalln("choose action")
+	}
+
+	log.Println("processing basedir...")
+	size2fi := make(map[int64][]FileInode, 1<<10)
+	files := 0
+	filesSmall := 0
+	filesLarge := 0
+	var fullSize int64
+	progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+	for fi := range walk(*baseDir) {
+		if chmod > 0 {
+			if err := os.Chmod(fi.Path, chmod); err != nil {
+				log.Fatal(err)
+			}
+		}
+		if fi.Size == 0 {
+			continue
+		}
+		if fi.Size <= SizeBoundary {
+			filesSmall++
+		} else {
+			filesLarge++
+		}
+		files++
+		fullSize += fi.Size
+		size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+	}
+	progress.Stop()
+
+	log.Println("processing dupdir...")
+	queueSmall := make(map[string][]string, filesSmall)
+	queueLarge := make(map[string][]string, filesLarge)
+	files = 0
+	fullSize = 0
+	progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+	for fi := range walk(*dupDir) {
+		if chmod > 0 {
+			if err := os.Chmod(fi.Path, chmod); err != nil {
+				log.Fatal(err)
+			}
+		}
+		if fi.Size == 0 {
+			continue
+		}
+		origs, ok := size2fi[fi.Size]
+		if !ok {
+			continue
+		}
+		paths := make([]string, 0, len(origs))
+		for _, orig := range origs {
+			if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+				continue
+			}
+			paths = append(paths, orig.Path)
+		}
+		files++
+		fullSize += fi.Size
+		if fi.Size <= SizeBoundary {
+			queueSmall[fi.Path] = paths
+		} else {
+			queueLarge[fi.Path] = paths
+		}
+	}
+	size2fi = nil
+	progress.Stop()
+
+	log.Println("deduplicating...")
+	progress = NewProgress(
+		files,
+		fullSize,
+		&files,
+		&fullSize,
+		" processed",
+		" deduplicated",
+	)
+	files = 0
+	fullSize = 0
+	deduped := 0
+	termRequired := make(chan os.Signal, 1)
+	signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+	go func() {
+		<-termRequired
+		canExit.Lock()
+		progress.Stop()
+		log.Println(deduped, "files deduplicated")
+		os.Exit(0)
+	}()
+	bufDup := make([]byte, SizeBoundary)
+	bufOrig := make([]byte, SizeBoundary)
+	seen := make(map[string]struct{}, len(queueSmall))
+	for dup, origs := range queueSmall {
+		files++
+		if _, ok := seen[dup]; ok {
+			continue
+		}
+		fdDup, err := os.Open(dup)
+		if err != nil {
+			log.Fatal(err)
+		}
+		sizeDup, err := io.ReadFull(fdDup, bufDup)
+		if !(err == nil || err == io.ErrUnexpectedEOF) {
+			log.Fatal(err)
+		}
+		if err = fdDup.Close(); err != nil {
+			log.Fatal(err)
+		}
+		for _, orig := range origs {
+			fdOrig, err := os.Open(orig)
+			if err != nil {
+				log.Fatal(err)
+			}
+			sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+			if !(err == nil || err == io.ErrUnexpectedEOF) {
+				log.Fatal(err)
+			}
+			if sizeOrig != sizeDup {
+				log.Fatalln(dup, orig, "unexpectedly different sizes")
+			}
+			if err = fdOrig.Close(); err != nil {
+				log.Fatal(err)
+			}
+			if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
+				continue
+			}
+			link(dup, orig, *action, *doFsync)
+			seen[orig] = struct{}{}
+			deduped++
+			fullSize += int64(sizeDup)
+			break
+		}
+	}
+	queueSmall = nil
+
+	hasher, err := blake2b.New512(nil)
+	if err != nil {
+		panic(err)
+	}
+	seen = make(map[string]struct{}, len(queueLarge))
+	var sizeDup int64
+	for dup, origs := range queueLarge {
+		files++
+		if _, ok := seen[dup]; ok {
+			continue
+		}
+		fdDup, err := os.Open(dup)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+			log.Fatal(err)
+		}
+		var hashDup []byte
+		for _, orig := range origs {
+			fdOrig, err := os.Open(orig)
+			if err != nil {
+				log.Fatal(err)
+			}
+			if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+				log.Fatal(err)
+			}
+			if bytes.Compare(bufDup, bufOrig) != 0 {
+				if err = fdOrig.Close(); err != nil {
+					log.Fatal(err)
+				}
+				continue
+			}
+			if hashDup == nil {
+				hasher.Reset()
+				if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+					log.Fatalln("can not write to hash", err)
+				}
+				sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+				if err != nil {
+					log.Fatal(err)
+				}
+				hashDup = hasher.Sum(nil)
+			}
+			hasher.Reset()
+			if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+				log.Fatalln("can not write to hash", err)
+			}
+			if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+				log.Fatal(err)
+			}
+			if err = fdOrig.Close(); err != nil {
+				log.Fatal(err)
+			}
+			if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
+				continue
+			}
+			link(dup, orig, *action, *doFsync)
+			seen[orig] = struct{}{}
+			deduped++
+			fullSize += sizeDup
+			break
+		}
+		if err = fdDup.Close(); err != nil {
+			log.Fatal(err)
+		}
+	}
+	termRequired <- syscall.SIGTERM
+	<-termRequired
+}
diff --git a/progress.go b/progress.go
new file mode 100644
index 0000000..863b37a
--- /dev/null
+++ b/progress.go
@@ -0,0 +1,102 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/dustin/go-humanize"
+)
+
+const (
+	ESC = 27
+)
+
+var LineClear = fmt.Sprintf("%c[%dA%c[2K", ESC, 2, ESC)
+
+type Progress struct {
+	fullFiles   int64
+	fullSize    uint64
+	fullFilesS  string
+	fullSizeS   string
+	suffixFiles string
+	suffixSize  string
+	stop        chan struct{}
+}
+
+func NewProgress(
+	fullFiles int,
+	fullSize int64,
+	files *int,
+	size *int64,
+	suffixFiles, suffixSize string,
+) Progress {
+	p := Progress{
+		int64(fullFiles),
+		uint64(fullSize),
+		humanize.Comma(int64(fullFiles)),
+		humanize.IBytes(uint64(fullSize)),
+		suffixFiles,
+		suffixSize,
+		make(chan struct{}, 0),
+	}
+	go p.Run(files, size)
+	return p
+}
+
+func (p Progress) Log(prefix string, files int64, size uint64) {
+	percentageFiles := int64(0)
+	if p.fullFiles > 0 {
+		percentageFiles = 100 * files / p.fullFiles
+	}
+	percentageSize := uint64(0)
+	if p.fullSize > 0 {
+		percentageSize = 100 * size / p.fullSize
+	}
+	now := time.Now().Format("2006/01/02 15:04:05")
+	fmt.Fprintf(
+		os.Stderr,
+		"%s%s %s / %s (%d%%) files%s\n%s %s / %s (%d%%)%s\n",
+		prefix,
+		now, humanize.Comma(files), p.fullFilesS, percentageFiles, p.suffixFiles,
+		now, humanize.IBytes(size), p.fullSizeS, percentageSize, p.suffixSize,
+	)
+}
+
+func (p Progress) Run(files *int, size *int64) {
+	p.Log("", 0, 0)
+	ticker := time.NewTicker(250 * time.Millisecond)
+	for {
+		select {
+		case <-ticker.C:
+			p.Log(LineClear, int64(*files), uint64(*size))
+		case <-p.stop:
+			ticker.Stop()
+			p.Log(LineClear, int64(*files), uint64(*size))
+			close(p.stop)
+			return
+		}
+	}
+}
+
+func (p Progress) Stop() {
+	p.stop <- struct{}{}
+	<-p.stop
+}
diff --git a/walk.go b/walk.go
new file mode 100644
index 0000000..76abb9c
--- /dev/null
+++ b/walk.go
@@ -0,0 +1,73 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+	"io"
+	"log"
+	"os"
+	"path"
+	"syscall"
+)
+
+type FileInode struct {
+	Path string
+	Size int64
+	Dev  uint64
+	Ino  uint64
+}
+
+func walker(c chan FileInode, dirPath string) {
+	dirFd, err := os.Open(dirPath)
+	if err != nil {
+		log.Fatal(err)
+	}
+	for {
+		fis, err := dirFd.Readdir(1 << 10)
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			log.Fatal(err)
+		}
+		for _, fi := range fis {
+			stat := fi.Sys().(*syscall.Stat_t)
+			if fi.Mode().IsRegular() {
+				c <- FileInode{
+					path.Join(dirPath, fi.Name()),
+					fi.Size(),
+					stat.Dev, stat.Ino,
+				}
+			} else if fi.IsDir() {
+				walker(c, path.Join(dirPath, fi.Name()))
+			}
+		}
+	}
+	if err = dirFd.Close(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+func walk(dirPath string) chan FileInode {
+	c := make(chan FileInode, 1<<10)
+	go func() {
+		walker(c, dirPath)
+		close(c)
+	}()
+	return c
+}
-- 
2.51.0