/* sgodup -- File deduplication utility Copyright (C) 2020-2023 Sergey Matveev This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ // File deduplication utility package main import ( "bufio" "bytes" "flag" "fmt" "io" "log" "os" "os/signal" "path/filepath" "runtime" "strconv" "sync" "syscall" "github.com/dustin/go-humanize" "go.cypherpunks.ru/netstring/v2" "golang.org/x/crypto/blake2b" ) const ( Version = "0.2.0" SectorSize = 1 << 12 // 4 KiB sector size BufSize = 1 << 17 // ZFS default 128 KiB recordsize ActPrint = iota ActNS = iota ActSymlink = iota ActHardlink = iota ) type Action int var ( canExit sync.Mutex nsW *netstring.Writer curDirPath string curDirFd *os.File action Action baseDir = flag.String("basedir", "", "Directory with original files") dupDir = flag.String("dupdir", "", "Directory with possible duplicates") actionS = flag.String("action", "", "print, ns, symlink, hardlink") minSize = flag.Int64("minsize", 1, "minimal file size") chmod = flag.String("chmod", "", "chmod files") doNS = flag.Bool("ns", false, "link targets from netstring read from stdin") fsync = flag.Bool("fsync", false, "fsync directories?") version = flag.Bool("version", false, "Print version information") warranty = flag.Bool("warranty", false, "Print warranty information") ) func link(dup, orig string) { if action == ActNS { if _, err := nsW.WriteChunk([]byte(dup)); err != nil { log.Fatal(err) } if _, err := nsW.WriteChunk([]byte(orig)); err != nil { log.Fatal(err) } return } tgt, err := filepath.Rel(dup, orig) if err != nil { log.Fatal(err) } tgt = tgt[3:] if action == ActPrint { fmt.Println(dup, "->", tgt) return } canExit.Lock() if err = os.Remove(dup); err != nil { log.Fatal(err) } if action == ActSymlink { err = os.Symlink(tgt, dup) } else { err = os.Link(orig, dup) } if err != nil { log.Fatal(err) } if *fsync { dirPath := filepath.Dir(dup) if dirPath != curDirPath { curDirFd, err = os.Open(dirPath) if err != nil { log.Fatal(err) } curDirPath = dirPath } if err = curDirFd.Sync(); err != nil { log.Fatal(err) } } canExit.Unlock() } func signalHandler(progressStop func(), deduped *int) chan os.Signal { termRequired := make(chan os.Signal, 1) signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT) go func() { <-termRequired canExit.Lock() progressStop() log.Println(humanize.Comma(int64(*deduped)), "files deduplicated") os.Exit(0) }() return termRequired } func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility Copyright (C) 2020-2023 Sergey Matveev License GPLv3: GNU GPL version 3 This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Single pass mode: %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink} [-chmod XXX] [-minsize XXX] [-fsync] Two pass mode: %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state %s -action {print,symlink,hardlink} [-fsync] -ns < state Options: `, os.Args[0], os.Args[0], os.Args[0]) flag.PrintDefaults() } flag.Parse() if *version { fmt.Println("sgodup version", Version, "built with", runtime.Version()) return } if *warranty { fmt.Println(`This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see .`) return } var stdoutW *bufio.Writer switch *actionS { case "print": action = ActPrint case "ns": action = ActNS stdoutW = bufio.NewWriterSize(os.Stdout, BufSize) nsW = netstring.NewWriter(stdoutW) case "symlink": action = ActSymlink case "hardlink": action = ActHardlink default: log.Fatalln("invalid action") } if *doNS { if action == ActNS { log.Fatalln("\"-action ns\" has no meaning with -ns") } nsR := netstring.NewReader(bufio.NewReaderSize(os.Stdin, BufSize)) pathDup := make([]byte, 1<<10) pathOrig := make([]byte, 1<<10) var err error var pathDupLen, pathOrigLen uint64 files := 0 fullSize := int64(0) progress := NewProgress(0, 0, &files, &fullSize, " linked", "") termRequired := signalHandler(progress.Stop, &files) for { pathDupLen, err = nsR.Next() if err != nil { if err == io.EOF { break } log.Fatal(err) } if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil { log.Fatal(err) } pathOrigLen, err = nsR.Next() if err != nil { log.Fatal(err) } if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil { log.Fatal(err) } link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen])) files++ } termRequired <- syscall.SIGTERM <-termRequired } if *baseDir == "" { log.Fatalln("-basedir is required") } if *dupDir == "" { log.Fatalln("-dupdir is required") } var doChmod os.FileMode if *chmod != "" { ch, err := strconv.ParseUint(*chmod, 8, 16) if err != nil { log.Fatal(err) } doChmod = os.FileMode(ch) } log.Println("processing basedir...") size2fi := make(map[int64][]FileInode, 1<<10) var files int var fullSize int64 progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total") for fi := range walk(*baseDir) { if doChmod > 0 { if err := os.Chmod(fi.Path, doChmod); err != nil { log.Fatal(err) } } if fi.Size < *minSize { continue } files++ fullSize += fi.Size size2fi[fi.Size] = append(size2fi[fi.Size], fi) } progress.Stop() log.Println("processing dupdir...") queue := make([]FileInode, 0, files) files, fullSize = 0, 0 progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total") for fi := range walk(*dupDir) { if doChmod > 0 { if err := os.Chmod(fi.Path, doChmod); err != nil { log.Fatal(err) } } if fi.Size < *minSize { continue } origs, ok := size2fi[fi.Size] if !ok { continue } candidates := 0 for _, orig := range origs { if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) { continue } candidates++ } if candidates == 0 { continue } files++ fullSize += fi.Size queue = append(queue, fi) } progress.Stop() log.Println("deduplicating...") progress = NewProgress( files, fullSize, &files, &fullSize, " processed", " deduplicated", ) files, fullSize = 0, 0 bufDup := make([]byte, SectorSize) bufOrig := make([]byte, SectorSize) seenDup := make(map[string]struct{}, len(queue)/2) seenOrig := make(map[string]struct{}, len(queue)/2) hasher, err := blake2b.New512(nil) if err != nil { panic(err) } rdDup := bufio.NewReaderSize(nil, BufSize) rdOrig := bufio.NewReaderSize(nil, BufSize) var deduped int termRequired := signalHandler(progress.Stop, &deduped) for _, fi := range queue { files++ if _, ok := seenOrig[fi.Path]; ok { continue } fdDup, err := os.Open(fi.Path) if err != nil { log.Fatal(err) } readDup, err := io.ReadFull(fdDup, bufDup) if err != nil { if err != io.ErrUnexpectedEOF { log.Fatal(err) } if int64(readDup) != fi.Size { log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size) } } var hashDup []byte for _, orig := range size2fi[fi.Size] { if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) { continue } if _, ok := seenDup[orig.Path]; ok { continue } fdOrig, err := os.Open(orig.Path) if err != nil { log.Fatal(err) } readOrig, err := io.ReadFull(fdOrig, bufOrig) if !(err == nil || err == io.ErrUnexpectedEOF) { log.Fatal(err) } if readOrig != readDup { log.Fatalln( fi.Path, orig.Path, "unexpectedly different sizes", readOrig, readDup, ) } if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 { if err = fdOrig.Close(); err != nil { log.Fatal(err) } continue } if hashDup == nil { hasher.Reset() if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup { log.Fatalln("can not write to hash", err) } rdDup.Reset(fdDup) n, err := io.Copy(hasher, rdDup) if err != nil { log.Fatal(err) } if int64(readDup)+n != fi.Size { log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size) } hashDup = hasher.Sum(nil) } hasher.Reset() if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig { log.Fatalln("can not write to hash", err) } rdOrig.Reset(fdOrig) n, err := io.Copy(hasher, rdOrig) if err != nil { log.Fatal(err) } if int64(readOrig)+n != fi.Size { log.Fatalln( fi.Path, orig.Path, "unexpectedly different sizes", int64(readOrig)+n, fi.Size, ) } if err = fdOrig.Close(); err != nil { log.Fatal(err) } if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 { continue } link(fi.Path, orig.Path) seenDup[fi.Path] = struct{}{} seenOrig[orig.Path] = struct{}{} fullSize += fi.Size deduped++ break } if err = fdDup.Close(); err != nil { log.Fatal(err) } } if action == ActNS { if err = stdoutW.Flush(); err != nil { log.Fatal(err) } } termRequired <- syscall.SIGTERM <-termRequired }