2 sgodup -- File deduplication utility
3 Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 // File deduplication utility
36 "github.com/dustin/go-humanize"
37 "go.cypherpunks.ru/netstring/v2"
38 "golang.org/x/crypto/blake2b"
43 SectorSize = 1 << 12 // 4 KiB sector size
44 BufSize = 1 << 17 // ZFS default 128 KiB recordsize
// Command-line flags. In each call the second argument is the default value
// and the third is the help text.
61 baseDir = flag.String("basedir", "", "Directory with original files")
62 dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
// The help text lists the accepted action names.
63 actionS = flag.String("action", "", "print, ns, symlink, hardlink")
// Scanned files are compared against this size (see the fi.Size < *minSize checks).
64 minSize = flag.Int64("minsize", 1, "minimal file size")
// Parsed later as an octal number and converted to os.FileMode.
65 chmod = flag.String("chmod", "", "chmod files")
// Path of a netstring file that is opened and read back as dup/orig path pairs.
66 nsPath = flag.String("ns", "", "link targets from netstring file")
67 fsync = flag.Bool("fsync", false, "fsync directories?")
68 version = flag.Bool("version", false, "Print version information")
69 warranty = flag.Bool("warranty", false, "Print warranty information")
// link processes one duplicate/original pair: dup is the path of the
// duplicate file, orig is the path of the file it duplicates.
//
// NOTE(review): this view of the function is missing lines (branch
// conditions, error handling and returns between the statements shown), so
// the comments below describe only the visible statements and hedge the rest.
72 func link(dup, orig string) {
// Write the pair to the netstring writer nsW: duplicate path first, original
// path second. Presumably this is the "ns" action and returns afterwards --
// TODO confirm against the hidden lines.
74 if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
77 if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
// tgt is orig expressed relative to dup.
// NOTE(review): filepath.Rel treats its first argument as a directory, so
// relative to the directory containing dup this result has one extra leading
// "../" unless a hidden line trims it -- confirm before relying on tgt.
82 tgt, err := filepath.Rel(dup, orig)
// Print action: report the pair on stdout.
87 if action == ActPrint {
88 fmt.Println(dup, "->", tgt)
// The duplicate is removed before a link is created at the same path.
// NOTE(review): if link creation then fails, dup has already been deleted.
92 if err = os.Remove(dup); err != nil {
// A symlink points at the relative tgt; a hard link is made to orig directly.
95 if action == ActSymlink {
96 err = os.Symlink(tgt, dup)
98 err = os.Link(orig, dup)
// Open the directory containing dup when it differs from curDirPath, then
// sync that directory handle. curDirPath and curDirFd are declared outside
// this block; presumably this step is guarded by -fsync -- TODO confirm.
104 dirPath := filepath.Dir(dup)
105 if dirPath != curDirPath {
106 curDirFd, err = os.Open(dirPath)
112 if err = curDirFd.Sync(); err != nil {
// signalHandler subscribes to SIGTERM and SIGINT on a channel with a buffer
// of one signal. Its declared result is a chan os.Signal; callers in this
// file keep the result and later send syscall.SIGTERM on it themselves.
//
// The visible body logs the integer that deduped points to, formatted by
// humanize.Comma, followed by "files deduplicated".
//
// NOTE(review): the code that receives from the channel, the call to
// progressStop and the return statement are not visible in this view --
// presumably a goroutine waits on the channel, stops progress reporting,
// logs the summary and exits. Confirm against the full source.
119 func signalHandler(progressStop func(), deduped *int) chan os.Signal {
120 termRequired := make(chan os.Signal, 1)
121 signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
126 log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
// Replace the default usage output: write the banner and the synopsis lines
// to stderr, substituting the program name (os.Args[0]) for each of the
// three %s verbs in the text.
133 flag.Usage = func() {
134 fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
135 Copyright (C) 2020 Sergey Matveev
136 License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
137 This is free software: you are free to change and redistribute it.
138 There is NO WARRANTY, to the extent permitted by law.
141 %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
142 [-chmod XXX] [-minsize XXX] [-fsync]
144 %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
145 %s -action {print,symlink,hardlink} [-fsync] -ns state
148 `, os.Args[0], os.Args[0], os.Args[0])
// Print the program version (Version is defined outside this view) together
// with the Go runtime version it was built with.
// NOTE(review): presumably guarded by the -version flag; the condition is
// not visible here.
153 fmt.Println("sgodup version", Version, "built with", runtime.Version())
// Print the GPLv3 licence and no-warranty notice on stdout.
// NOTE(review): presumably guarded by the -warranty flag; the condition is
// not visible here.
157 fmt.Println(`This program is free software: you can redistribute it and/or modify
158 it under the terms of the GNU General Public License as published by
159 the Free Software Foundation, version 3 of the License.
161 This program is distributed in the hope that it will be useful,
162 but WITHOUT ANY WARRANTY; without even the implied warranty of
163 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
164 GNU General Public License for more details.
166 You should have received a copy of the GNU General Public License
167 along with this program. If not, see <http://www.gnu.org/licenses/>.`)
// stdoutW buffers stdout with a BufSize-byte buffer and nsW writes netstring
// chunks into it; the same stdoutW is flushed at the end of deduplication.
// NOTE(review): the condition selecting this setup is not visible --
// presumably it applies only to the "ns" action.
171 var stdoutW *bufio.Writer
177 stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
178 nsW = netstring.NewWriter(stdoutW)
// Abort on an action name that is not recognised (the check itself is hidden).
184 log.Fatalln("invalid action")
// Replaying a netstring file and producing one at the same time is rejected.
189 log.Fatalln("\"-action ns\" has no meaning with -ns")
// Replay mode: read dup/orig path pairs back from the netstring file named
// by -ns and pass each pair to link.
191 fd, err := os.Open(*nsPath)
195 nsR := netstring.NewReader(bufio.NewReaderSize(fd, BufSize))
// Two fixed 1024-byte buffers are reused for every pair of paths.
196 pathDup := make([]byte, 1<<10)
197 pathOrig := make([]byte, 1<<10)
198 var pathDupLen, pathOrigLen uint64
201 progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
202 termRequired := signalHandler(progress.Stop, &files)
// nsR.Next yields the length of the next chunk; io.ReadFull then reads
// exactly that many bytes of the duplicate's path.
// NOTE(review): the lengths come from the input file and are used to slice
// the 1024-byte buffers; a value above 1024 would panic unless a hidden
// check rejects it first -- confirm.
204 pathDupLen, err = nsR.Next()
211 if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
// The original's path follows as the next chunk.
214 pathOrigLen, err = nsR.Next()
218 if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
221 link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
// Send SIGTERM on the handler's channel by hand; presumably this triggers
// the handler's final summary and shutdown -- TODO confirm.
224 termRequired <- syscall.SIGTERM
// Both directories are mandatory outside replay mode (the emptiness checks
// themselves are on lines not visible here).
229 log.Fatalln("-basedir is required")
232 log.Fatalln("-dupdir is required")
// doChmod holds the mode parsed from -chmod: the string is read as an octal
// number (base 8) limited to 16 bits and converted to os.FileMode.
234 var doChmod os.FileMode
236 ch, err := strconv.ParseUint(*chmod, 8, 16)
240 doChmod = os.FileMode(ch)
// Pass 1: walk the base directory and index its files by size.
243 log.Println("processing basedir...")
// size2fi maps a file size to the FileInode entries that have that size.
244 size2fi := make(map[int64][]FileInode, 1<<10)
247 progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
248 for fi := range walk(*baseDir) {
// Apply the parsed mode to the file; presumably only when -chmod was given
// (the guard is not visible).
250 if err := os.Chmod(fi.Path, doChmod); err != nil {
// Size filter against -minsize; presumably such files are skipped -- the
// branch body is not visible.
254 if fi.Size < *minSize {
259 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
// Pass 2: walk the directory of possible duplicates and queue candidates.
263 log.Println("processing dupdir...")
// The queue's capacity is the current value of files, taken before the
// counters are reset for this pass.
264 queue := make([]FileInode, 0, files)
265 files, fullSize = 0, 0
266 progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
267 for fi := range walk(*dupDir) {
// Same chmod and size handling as in the basedir pass; guards and branch
// bodies are not visible here.
269 if err := os.Chmod(fi.Path, doChmod); err != nil {
273 if fi.Size < *minSize {
// Look up base files of the same size.
276 origs, ok := size2fi[fi.Size]
281 for _, orig := range origs {
// True when fi and orig are the same path or share device and inode number,
// i.e. they are already the same file. What happens on a match is not
// visible -- presumably such pairs are not candidates.
282 if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
292 queue = append(queue, fi)
// Pass 3: compare every queued candidate with the same-sized base files and
// link the ones whose contents match.
// NOTE(review): many lines of this loop (continue statements, error
// handling, reader resets, hasher resets) are not visible in this view; the
// comments describe only the statements shown.
296 log.Println("deduplicating...")
297 progress = NewProgress(
303 files, fullSize = 0, 0
// One SectorSize buffer per side holds the leading bytes of each file for a
// quick comparison before any hashing.
304 bufDup := make([]byte, SectorSize)
305 bufOrig := make([]byte, SectorSize)
// Sets of paths already linked: seenDup records files replaced as
// duplicates, seenOrig records files used as link targets.
306 seenDup := make(map[string]struct{}, len(queue)/2)
307 seenOrig := make(map[string]struct{}, len(queue)/2)
// Unkeyed BLAKE2b-512 (nil key) is used to compare whole-file contents.
308 hasher, err := blake2b.New512(nil)
// Buffered readers are created with no source yet; presumably they are
// pointed at each opened file on lines not visible here.
312 rdDup := bufio.NewReaderSize(nil, BufSize)
313 rdOrig := bufio.NewReaderSize(nil, BufSize)
315 termRequired := signalHandler(progress.Stop, &deduped)
316 for _, fi := range queue {
// fi has already served as a link target; presumably it is skipped so it is
// not replaced by a link itself.
318 if _, ok := seenOrig[fi.Path]; ok {
321 fdDup, err := os.Open(fi.Path)
// Read up to SectorSize leading bytes of the candidate.
325 readDup, err := io.ReadFull(fdDup, bufDup)
// io.ErrUnexpectedEOF (file shorter than the buffer) is handled differently
// from other errors; presumably other errors are fatal.
327 if err != io.ErrUnexpectedEOF {
// The number of bytes read is checked against the recorded size; presumably
// only in the short-read case.
330 if int64(readDup) != fi.Size {
331 log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
335 for _, orig := range size2fi[fi.Size] {
// Same path or same device/inode: already the same file.
336 if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
// orig was itself already replaced as a duplicate.
339 if _, ok := seenDup[orig.Path]; ok {
342 fdOrig, err := os.Open(orig.Path)
// Read the leading bytes of the original; nil and io.ErrUnexpectedEOF are
// the two accepted outcomes.
346 readOrig, err := io.ReadFull(fdOrig, bufOrig)
347 if !(err == nil || err == io.ErrUnexpectedEOF) {
350 if readOrig != readDup {
353 "unexpectedly different sizes",
// Leading bytes differ: close the original; presumably move on to the next
// one.
357 if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
358 if err = fdOrig.Close(); err != nil {
// Hash the candidate: the bytes already read, then the rest through rdDup.
// The total byte count must equal the recorded size. Presumably this is
// computed once per candidate and kept in hashDup.
365 if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
366 log.Fatalln("can not write to hash", err)
369 n, err := io.Copy(hasher, rdDup)
373 if int64(readDup)+n != fi.Size {
374 log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
376 hashDup = hasher.Sum(nil)
// Hash the original the same way; presumably the hasher is reset in between
// on a line not visible here.
379 if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
380 log.Fatalln("can not write to hash", err)
383 n, err := io.Copy(hasher, rdOrig)
387 if int64(readOrig)+n != fi.Size {
390 "unexpectedly different sizes",
391 int64(readOrig)+n, fi.Size,
394 if err = fdOrig.Close(); err != nil {
// Digests differ: not a duplicate of this original.
397 if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
// Contents match: act on the pair and record both paths.
400 link(fi.Path, orig.Path)
401 seenDup[fi.Path] = struct{}{}
402 seenOrig[orig.Path] = struct{}{}
407 if err = fdDup.Close(); err != nil {
// Flush the buffered stdout writer.
// NOTE(review): stdoutW is assigned only in the setup shown earlier;
// presumably this flush is guarded so it is not called on a nil writer.
412 if err = stdoutW.Flush(); err != nil {
// Send SIGTERM on the handler's channel by hand; presumably this triggers
// the handler's final summary and shutdown -- TODO confirm.
416 termRequired <- syscall.SIGTERM