2 sgodup -- File deduplication utility
3 Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 // File deduplication utility
35 "golang.org/x/crypto/blake2b"
// SizeBoundary is the small/large threshold used in this file: duplicates
// with Size <= SizeBoundary are queued in queueSmall, and the comparison
// buffers bufDup/bufOrig are allocated with exactly this many bytes.
39 SizeBoundary = 1 << 12 // 4 KiB sector size
// BufSize is the size of the bufio readers that stream file contents into
// the hasher.
40 BufSize = 1 << 17 // ZFS default 128 KiB recordsize
// link handles one duplicate/original pair according to action.
//
// From the visible lines: a relative path from dup to orig is computed
// first; action "print" has its own branch; dup is removed; then dup is
// recreated either as a symlink to the relative target (action "symlink")
// or as a hard link to orig. Finally the directory containing dup is opened,
// but only when it differs from curDirPath, and synced.
//
// NOTE(review): this listing omits several lines (error handling, the body
// of the "print" branch, where the fsync parameter is consulted). Confirm
// the details against the full source. curDirPath and curDirFd are not
// declared in the visible lines of this function.
50 func link(dup, orig, action string, fsync bool) {
// NOTE(review): the base passed to Rel is dup itself, not its directory, so
// the raw result has one more leading "../" than a symlink placed at dup
// needs. Presumably this is adjusted on a line not shown here; confirm.
51 tgt, err := filepath.Rel(dup, orig)
56 if action == "print" {
// The duplicate is deleted before a link with the same name is created.
61 if err = os.Remove(dup); err != nil {
64 if action == "symlink" {
65 err = os.Symlink(tgt, dup)
// Hard link from orig to dup (presumably the non-symlink branch).
67 err = os.Link(orig, dup)
// Reopen the directory handle only when dup lives in a different directory
// than the one recorded in curDirPath.
73 dirPath := filepath.Dir(dup)
74 if dirPath != curDirPath {
75 curDirFd, err = os.Open(dirPath)
81 if err = curDirFd.Sync(); err != nil {
// Command-line flags, with names and help strings as registered below.
90 baseDir = flag.String("basedir", "", "Directory with original files")
91 dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
92 action = flag.String("action", "", "print, symlink, hardlink")
93 doChmod = flag.String("chmod", "", "chmod files")
94 doFsync = flag.Bool("fsync", false, "fsync directories?")
// Both directories are mandatory; the guarding conditions are on lines not
// shown in this listing.
98 log.Fatalln("-basedir is required")
101 log.Fatalln("-dupdir is required")
// chmod holds the mode parsed from -chmod: the string is read as an octal
// number that must fit in 16 bits.
103 var chmod os.FileMode
105 ch, err := strconv.ParseUint(*doChmod, 8, 16)
109 chmod = os.FileMode(ch)
// Any -action other than print, symlink or hardlink (including the empty
// default) is fatal.
111 if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
112 log.Fatalln("choose action")
// Pass 1: scan the base directory and index every file by its size.
115 log.Println("processing basedir...")
// size2fi maps a file size to the basedir files having that size.
116 size2fi := make(map[int64][]FileInode, 1<<10)
121 progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
122 for fi := range walk(*baseDir) {
// NOTE(review): the condition guarding this chmod is not shown; presumably
// it runs only when -chmod was given. Confirm.
124 if err := os.Chmod(fi.Path, chmod); err != nil {
// Small/large classification; the branch bodies are not shown in this
// listing.
131 if fi.Size <= SizeBoundary {
138 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
// Pass 2: scan the duplicates directory and, for each file, collect the
// basedir files of the same size as candidate originals.
142 log.Println("processing dupdir...")
// Both queues map a duplicate's path to the paths of its candidate
// originals; they are pre-sized from the filesSmall/filesLarge counters.
143 queueSmall := make(map[string][]string, filesSmall)
144 queueLarge := make(map[string][]string, filesLarge)
147 progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
148 for fi := range walk(*dupDir) {
150 if err := os.Chmod(fi.Path, chmod); err != nil {
// Only basedir files with exactly the same size can be originals.
157 origs, ok := size2fi[fi.Size]
161 paths := make([]string, 0, len(origs))
162 for _, orig := range origs {
// Detects the very same file: identical path, or identical device and inode
// (already a hard link). The branch body is not shown; presumably such
// entries are skipped. Confirm.
163 if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
166 paths = append(paths, orig.Path)
// Route the duplicate to the small or large queue by its size.
170 if fi.Size <= SizeBoundary {
171 queueSmall[fi.Path] = paths
173 queueLarge[fi.Path] = paths
// Pass 3 setup: progress reporting and termination signalling.
179 log.Println("deduplicating...")
180 progress = NewProgress(
// termRequired receives SIGTERM and SIGINT; the channel holds at most one
// pending value.
191 termRequired := make(chan os.Signal, 1)
192 signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
// NOTE(review): the code surrounding this summary line is not shown;
// presumably it runs once a value arrives on termRequired. Confirm.
197 log.Println(deduped, "files deduplicated")
// Small files (queueSmall): each file is read into a SizeBoundary-sized
// buffer and compared byte for byte.
200 bufDup := make([]byte, SizeBoundary)
201 bufOrig := make([]byte, SizeBoundary)
// seen records the paths of originals that have already been linked to.
202 seen := make(map[string]struct{}, len(queueSmall))
203 for dup, origs := range queueSmall {
// A path already recorded in seen gets special handling here; the branch
// body is not shown (presumably the duplicate is skipped; confirm).
205 if _, ok := seen[dup]; ok {
208 fdDup, err := os.Open(dup)
// io.ErrUnexpectedEOF is tolerated: ReadFull returns it when the file ends
// before the buffer is full. sizeDup is the number of bytes actually read.
212 sizeDup, err := io.ReadFull(fdDup, bufDup)
213 if !(err == nil || err == io.ErrUnexpectedEOF) {
216 if err = fdDup.Close(); err != nil {
219 for _, orig := range origs {
220 fdOrig, err := os.Open(orig)
224 sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
225 if !(err == nil || err == io.ErrUnexpectedEOF) {
// Candidates were selected by equal size, so a different read length here
// is treated as fatal.
228 if sizeOrig != sizeDup {
229 log.Fatalln(dup, orig, "unexpectedly different sizes")
231 if err = fdOrig.Close(); err != nil {
// Content mismatch; the branch body is not shown (presumably moves on to
// the next candidate; confirm).
234 if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
237 link(dup, orig, *action, *doFsync)
// Record the original as used.
238 seen[orig] = struct{}{}
240 fullSize += int64(sizeDup)
// Large files (queueLarge): the first SizeBoundary bytes are compared
// directly, and whole-file BLAKE2b-512 digests are compared afterwards.
246 hasher, err := blake2b.New512(nil)
250 seen = make(map[string]struct{}, len(queueLarge))
252 for dup, origs := range queueLarge {
254 if _, ok := seen[dup]; ok {
257 fdDup, err := os.Open(dup)
// Unlike the small-file pass, a short read is an error here: any non-nil
// error from ReadFull enters the error branch.
261 if _, err := io.ReadFull(fdDup, bufDup); err != nil {
265 for _, orig := range origs {
266 fdOrig, err := os.Open(orig)
270 if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
// Compare the first blocks. On mismatch the candidate is closed; the rest
// of the branch is not shown.
273 if bytes.Compare(bufDup, bufOrig) != 0 {
274 if err = fdOrig.Close(); err != nil {
// Hash the duplicate: the block already read into bufDup first, then the
// remainder of the file streamed through a BufSize reader.
// NOTE(review): any hasher reset or caching of hashDup between candidates
// happens on lines not shown here. Confirm.
281 if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
282 log.Fatalln("can not write to hash", err)
284 sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
288 hashDup = hasher.Sum(nil)
// Hash the candidate original the same way: first block, then the rest.
291 if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
292 log.Fatalln("can not write to hash", err)
294 if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
297 if err = fdOrig.Close(); err != nil {
// Digest mismatch; the branch body is not shown.
300 if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
303 link(dup, orig, *action, *doFsync)
304 seen[orig] = struct{}{}
309 if err = fdDup.Close(); err != nil {
// Sends SIGTERM on the signal channel from within the program.
// NOTE(review): presumably this reuses the signal-handling path to finish
// up after the queues are processed. Confirm.
313 termRequired <- syscall.SIGTERM