2 sgodup -- File deduplication utility
3 Copyright (C) 2020-2021 Sergey Matveev <stargrave@stargrave.org>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 // File deduplication utility
36 "github.com/dustin/go-humanize"
37 "go.cypherpunks.ru/netstring/v2"
38 "golang.org/x/crypto/blake2b"
// Buffer sizes used in this file: SectorSize sizes the two buffers that hold
// the first chunk of each compared file, BufSize sizes the bufio readers and
// writers.
43 SectorSize = 1 << 12 // 4 KiB sector size
44 BufSize = 1 << 17 // ZFS default 128 KiB recordsize
// Command-line flags.
61 baseDir = flag.String("basedir", "", "Directory with original files")
62 dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
63 actionS = flag.String("action", "", "print, ns, symlink, hardlink")
64 minSize = flag.Int64("minsize", 1, "minimal file size")
// The chmod value is later parsed with strconv.ParseUint in base 8 (octal).
65 chmod = flag.String("chmod", "", "chmod files")
// -ns replays pairs read from stdin; combining it with "-action ns" is
// rejected as meaningless.
66 doNS = flag.Bool("ns", false, "link targets from netstring read from stdin")
67 fsync = flag.Bool("fsync", false, "fsync directories?")
68 version = flag.Bool("version", false, "Print version information")
69 warranty = flag.Bool("warranty", false, "Print warranty information")
// link processes one duplicate/original pair. The visible code can write both
// paths as netstring chunks, print the computed link target, or delete dup and
// recreate it as a symlink or hard link to orig, and then sync the directory
// that contains dup.
// NOTE(review): several guard and error-handling lines of this function are
// not visible in this view; the conditions mentioned below are assumptions.
72 func link(dup, orig string) {
// Emit the pair as two consecutive netstring chunks: dup first, then orig
// (presumably only for the ns action — confirm against the hidden guard).
74 if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
77 if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
// NOTE(review): the base passed to Rel is dup itself, not dup's directory;
// check how tgt is adjusted before it is used as a symlink target.
82 tgt, err := filepath.Rel(dup, orig)
87 if action == ActPrint {
88 fmt.Println(dup, "->", tgt)
// The duplicate is removed first so that the link can be created at its path.
92 if err = os.Remove(dup); err != nil {
// A symlink points at the relative target tgt; the other call hard-links orig.
95 if action == ActSymlink {
96 err = os.Symlink(tgt, dup)
98 err = os.Link(orig, dup)
// The directory handle is reopened only when dup lives in a different
// directory than curDirPath; the handle is then synced
// (presumably only when -fsync is set — confirm against the hidden guard).
104 dirPath := filepath.Dir(dup)
105 if dirPath != curDirPath {
106 curDirFd, err = os.Open(dirPath)
112 if err = curDirFd.Sync(); err != nil {
// signalHandler creates a signal channel with capacity 1, subscribes it to
// SIGTERM and SIGINT, and returns it. The visible body also logs the number of
// deduplicated files, read through the deduped pointer.
// Code later in this file sends syscall.SIGTERM on the returned channel itself
// once its work is finished.
// NOTE(review): the code that receives from the channel and invokes
// progressStop is not visible in this view — confirm its ordering there.
119 func signalHandler(progressStop func(), deduped *int) chan os.Signal {
120 termRequired := make(chan os.Signal, 1)
121 signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
126 log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
// Custom usage output written to stderr: a banner followed by the invocation
// forms. Each of the three %s verbs is filled with os.Args[0].
133 flag.Usage = func() {
134 fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
135 Copyright (C) 2020-2021 Sergey Matveev
136 License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
137 This is free software: you are free to change and redistribute it.
138 There is NO WARRANTY, to the extent permitted by law.
141 %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
142 [-chmod XXX] [-minsize XXX] [-fsync]
144 %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
145 %s -action {print,symlink,hardlink} [-fsync] -ns < state
148 `, os.Args[0], os.Args[0], os.Args[0])
// Version line: program version plus the Go runtime version it was built with
// (presumably printed only when -version is given — guard not visible here).
153 fmt.Println("sgodup version", Version, "built with", runtime.Version())
// Licence/warranty notice printed to stdout
// (presumably only when -warranty is given — guard not visible here).
157 fmt.Println(`This program is free software: you can redistribute it and/or modify
158 it under the terms of the GNU General Public License as published by
159 the Free Software Foundation, version 3 of the License.
161 This program is distributed in the hope that it will be useful,
162 but WITHOUT ANY WARRANTY; without even the implied warranty of
163 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
164 GNU General Public License for more details.
166 You should have received a copy of the GNU General Public License
167 along with this program. If not, see <http://www.gnu.org/licenses/>.`)
// stdoutW stays nil until it is assigned below; it is flushed at the end of
// the deduplication pass.
171 var stdoutW *bufio.Writer
// The netstring writer is layered on a BufSize-buffered stdout writer
// (presumably set up only for the ns action — selecting lines not visible).
177 stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
178 nsW = netstring.NewWriter(stdoutW)
184 log.Fatalln("invalid action")
// Writing netstrings while also replaying them from stdin is rejected.
189 log.Fatalln("\"-action ns\" has no meaning with -ns")
// Replay mode: read (dup, orig) path pairs as netstrings from stdin and hand
// each pair to link.
191 nsR := netstring.NewReader(bufio.NewReaderSize(os.Stdin, BufSize))
// Path buffers are allocated once (1 KiB each) and reused for every pair.
// NOTE(review): they are sliced by lengths taken from the input; no visible
// line checks that a length fits into 1<<10 bytes — confirm a longer path
// cannot reach the slice expressions below.
192 pathDup := make([]byte, 1<<10)
193 pathOrig := make([]byte, 1<<10)
195 var pathDupLen, pathOrigLen uint64
198 progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
199 termRequired := signalHandler(progress.Stop, &files)
// Each pair is two chunks: Next yields the chunk length, ReadFull reads
// exactly that many bytes of path data.
201 pathDupLen, err = nsR.Next()
208 if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
211 pathOrigLen, err = nsR.Next()
215 if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
218 link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
// The program itself sends SIGTERM on the handler's channel once the replay
// is done.
221 termRequired <- syscall.SIGTERM
// Both directories are mandatory once execution reaches this point.
226 log.Fatalln("-basedir is required")
229 log.Fatalln("-dupdir is required")
231 var doChmod os.FileMode
// The -chmod value is parsed as an octal number limited to 16 bits.
233 ch, err := strconv.ParseUint(*chmod, 8, 16)
237 doChmod = os.FileMode(ch)
// Pass 1: walk basedir and group its files by size in size2fi.
240 log.Println("processing basedir...")
241 size2fi := make(map[int64][]FileInode, 1<<10)
244 progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
245 for fi := range walk(*baseDir) {
// (chmod is presumably applied only when -chmod was given — guard not visible)
247 if err := os.Chmod(fi.Path, doChmod); err != nil {
// Size threshold check against -minsize
// (presumably such files are skipped — branch body not visible).
251 if fi.Size < *minSize {
256 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
// Pass 2: walk dupdir and queue the files that have same-sized candidates in
// basedir.
260 log.Println("processing dupdir...")
// The queue capacity uses the file counter value from before it is reset on
// the next line.
261 queue := make([]FileInode, 0, files)
262 files, fullSize = 0, 0
263 progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
264 for fi := range walk(*dupDir) {
266 if err := os.Chmod(fi.Path, doChmod); err != nil {
270 if fi.Size < *minSize {
// Only originals of exactly the same size are considered as candidates.
273 origs, ok := size2fi[fi.Size]
278 for _, orig := range origs {
// Same path, or same device and inode number: the two entries are one file.
279 if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
289 queue = append(queue, fi)
// Pass 3 setup: state shared by every comparison in the deduplication loop.
293 log.Println("deduplicating...")
294 progress = NewProgress(
300 files, fullSize = 0, 0
// One SectorSize buffer per side holds the first chunk of each file.
301 bufDup := make([]byte, SectorSize)
302 bufOrig := make([]byte, SectorSize)
// seenDup records duplicate paths that were linked, seenOrig records original
// paths that were used as link sources.
303 seenDup := make(map[string]struct{}, len(queue)/2)
304 seenOrig := make(map[string]struct{}, len(queue)/2)
// A single BLAKE2b-512 hasher (created without a key) serves both sides.
305 hasher, err := blake2b.New512(nil)
// Buffered readers are created without an underlying reader
// (presumably attached to each opened file via Reset — not visible here).
309 rdDup := bufio.NewReaderSize(nil, BufSize)
310 rdOrig := bufio.NewReaderSize(nil, BufSize)
312 termRequired := signalHandler(progress.Stop, &deduped)
// Pass 3: compare every queued duplicate candidate against the same-sized
// originals; the first chunks are compared byte-wise, then BLAKE2b digests of
// the whole files, and link is called when the digests are equal.
// NOTE(review): many branch bodies and error checks of this loop are not
// visible in this view; comments describe only the visible statements.
313 for _, fi := range queue {
// Check whether this path has already served as the original of a link.
315 if _, ok := seenOrig[fi.Path]; ok {
318 fdDup, err := os.Open(fi.Path)
// Read the first chunk of the candidate (up to SectorSize bytes).
322 readDup, err := io.ReadFull(fdDup, bufDup)
// io.ErrUnexpectedEOF means the file ended before the buffer was filled; the
// visible size check aborts when the bytes read differ from fi.Size.
324 if err != io.ErrUnexpectedEOF {
327 if int64(readDup) != fi.Size {
328 log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
332 for _, orig := range size2fi[fi.Size] {
// Same path, or same device and inode number: the two entries are one file.
333 if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
// Check whether this original was itself already replaced as a duplicate.
336 if _, ok := seenDup[orig.Path]; ok {
339 fdOrig, err := os.Open(orig.Path)
343 readOrig, err := io.ReadFull(fdOrig, bufOrig)
344 if !(err == nil || err == io.ErrUnexpectedEOF) {
347 if readOrig != readDup {
350 "unexpectedly different sizes",
// First-chunk comparison: when the chunks differ, the original is closed
// (presumably followed by moving to the next candidate — not visible here).
354 if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
355 if err = fdOrig.Close(); err != nil {
// Digest of the duplicate: the already-read first chunk, then the remainder
// streamed from rdDup; the byte total must equal the recorded size.
362 if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
363 log.Fatalln("can not write to hash", err)
366 n, err := io.Copy(hasher, rdDup)
370 if int64(readDup)+n != fi.Size {
371 log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
373 hashDup = hasher.Sum(nil)
// Digest of the original, computed the same way from bufOrig and rdOrig.
376 if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
377 log.Fatalln("can not write to hash", err)
380 n, err := io.Copy(hasher, rdOrig)
384 if int64(readOrig)+n != fi.Size {
387 "unexpectedly different sizes",
388 int64(readOrig)+n, fi.Size,
391 if err = fdOrig.Close(); err != nil {
394 if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
// Perform the link and record both paths in the seen sets consulted above.
397 link(fi.Path, orig.Path)
398 seenDup[fi.Path] = struct{}{}
399 seenOrig[orig.Path] = struct{}{}
404 if err = fdDup.Close(); err != nil {
// Flush the buffered stdout writer
// (presumably only when it was created — guard not visible here).
409 if err = stdoutW.Flush(); err != nil {
// The program itself sends SIGTERM on the handler's channel when finished.
413 termRequired <- syscall.SIGTERM