-/*
-sgodup -- File deduplication utility
-Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, version 3 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
+// sgodup -- File deduplication utility
+// Copyright (C) 2020-2024 Sergey Matveev <stargrave@stargrave.org>
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 3 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
// File deduplication utility
package main
"os"
"os/signal"
"path/filepath"
+ "runtime"
"strconv"
"sync"
"syscall"
+ "github.com/dustin/go-humanize"
+ "go.cypherpunks.su/netstring/v3"
"golang.org/x/crypto/blake2b"
)
const (
- SizeBoundary = 1 << 12 // 4 KiB sector size
- BufSize = 1 << 17 // ZFS default 128 KiB recordsize
+ Version = "0.2.0"
+ SectorSize = 1 << 12 // 4 KiB sector size
+ BufSize = 1 << 17 // ZFS default 128 KiB recordsize
+
+	ActPrint = iota
+	ActNS
+	ActSymlink
+	ActHardlink
)
-var (
- canExit sync.Mutex
+type Action int
+var (
+ canExit sync.Mutex
+ nsW *netstring.Writer
curDirPath string
curDirFd *os.File
+ action Action
+
+ baseDir = flag.String("basedir", "", "Directory with original files")
+ dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
+ actionS = flag.String("action", "", "print, ns, symlink, hardlink")
+ minSize = flag.Int64("minsize", 1, "minimal file size")
+ chmod = flag.String("chmod", "", "chmod files")
+ doNS = flag.Bool("ns", false, "link targets from netstring read from stdin")
+ fsync = flag.Bool("fsync", false, "fsync directories?")
+ version = flag.Bool("version", false, "Print version information")
+ warranty = flag.Bool("warranty", false, "Print warranty information")
)
-func link(dup, orig, action string, fsync bool) {
+func link(dup, orig string) {
+ if action == ActNS {
+ if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
+ log.Fatal(err)
+ }
+ if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
+ log.Fatal(err)
+ }
+ return
+ }
tgt, err := filepath.Rel(dup, orig)
if err != nil {
log.Fatal(err)
}
tgt = tgt[3:]
- if action == "print" {
- fmt.Println(dup, tgt)
+ if action == ActPrint {
+ fmt.Println(dup, "->", tgt)
return
}
canExit.Lock()
if err = os.Remove(dup); err != nil {
log.Fatal(err)
}
- if action == "symlink" {
+ if action == ActSymlink {
err = os.Symlink(tgt, dup)
} else {
err = os.Link(orig, dup)
if err != nil {
log.Fatal(err)
}
- if fsync {
+ if *fsync {
dirPath := filepath.Dir(dup)
if dirPath != curDirPath {
curDirFd, err = os.Open(dirPath)
canExit.Unlock()
}
+func signalHandler(progressStop func(), deduped *int) chan os.Signal {
+ termRequired := make(chan os.Signal, 1)
+ signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+ go func() {
+ <-termRequired
+ canExit.Lock()
+ progressStop()
+ log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
+ os.Exit(0)
+ }()
+ return termRequired
+}
+
func main() {
- var (
- baseDir = flag.String("basedir", "", "Directory with original files")
- dupDir = flag.String("dupdir", "", "Directory with possible duplicates")
- action = flag.String("action", "", "print, symlink, hardlink")
- doChmod = flag.String("chmod", "", "chmod files")
- doFsync = flag.Bool("fsync", false, "fsync directories?")
- )
+ flag.Usage = func() {
+ fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
+Copyright (C) 2020-2024 Sergey Matveev
+License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
+This is free software: you are free to change and redistribute it.
+There is NO WARRANTY, to the extent permitted by law.
+
+Single pass mode:
+ %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
+ [-chmod XXX] [-minsize XXX] [-fsync]
+Two pass mode:
+ %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
+ %s -action {print,symlink,hardlink} [-fsync] -ns < state
+
+Options:
+`, os.Args[0], os.Args[0], os.Args[0])
+ flag.PrintDefaults()
+ }
flag.Parse()
+ if *version {
+ fmt.Println("sgodup version", Version, "built with", runtime.Version())
+ return
+ }
+ if *warranty {
+ fmt.Println(`This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.`)
+ return
+ }
+
+ var stdoutW *bufio.Writer
+ switch *actionS {
+ case "print":
+ action = ActPrint
+ case "ns":
+ action = ActNS
+ stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
+ nsW = netstring.NewWriter(stdoutW)
+ case "symlink":
+ action = ActSymlink
+ case "hardlink":
+ action = ActHardlink
+ default:
+		log.Fatalln("invalid action:", *actionS)
+ }
+
+ if *doNS {
+ if action == ActNS {
+ log.Fatalln("\"-action ns\" has no meaning with -ns")
+ }
+ nsR := netstring.NewReader(bufio.NewReaderSize(os.Stdin, BufSize))
+ pathDup := make([]byte, 1<<10)
+ pathOrig := make([]byte, 1<<10)
+ var err error
+ var pathDupLen, pathOrigLen uint64
+ files := 0
+ fullSize := int64(0)
+ progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
+ termRequired := signalHandler(progress.Stop, &files)
+	for {
+		pathDupLen, err = nsR.Next()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			log.Fatal(err)
+		}
+		if pathDupLen > uint64(len(pathDup)) {
+			pathDup = make([]byte, pathDupLen)
+		}
+		if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
+			log.Fatal(err)
+		}
+		pathOrigLen, err = nsR.Next()
+		if err != nil {
+			log.Fatal(err)
+		}
+		if pathOrigLen > uint64(len(pathOrig)) {
+			pathOrig = make([]byte, pathOrigLen)
+		}
+		if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
+			log.Fatal(err)
+		}
+		link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
+		files++
+	}
+ termRequired <- syscall.SIGTERM
+ <-termRequired
+ }
+
if *baseDir == "" {
log.Fatalln("-basedir is required")
}
if *dupDir == "" {
log.Fatalln("-dupdir is required")
}
- var chmod os.FileMode
- if *doChmod != "" {
- ch, err := strconv.ParseUint(*doChmod, 8, 16)
+ var doChmod os.FileMode
+ if *chmod != "" {
+ ch, err := strconv.ParseUint(*chmod, 8, 16)
if err != nil {
log.Fatal(err)
}
- chmod = os.FileMode(ch)
- }
- if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
- log.Fatalln("choose action")
+ doChmod = os.FileMode(ch)
}
log.Println("processing basedir...")
size2fi := make(map[int64][]FileInode, 1<<10)
- files := 0
- filesSmall := 0
- filesLarge := 0
+ var files int
var fullSize int64
progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
for fi := range walk(*baseDir) {
- if chmod > 0 {
- if err := os.Chmod(fi.Path, chmod); err != nil {
+ if doChmod > 0 {
+ if err := os.Chmod(fi.Path, doChmod); err != nil {
log.Fatal(err)
}
}
- if fi.Size == 0 {
+ if fi.Size < *minSize {
continue
}
- if fi.Size <= SizeBoundary {
- filesSmall++
- } else {
- filesLarge++
- }
files++
fullSize += fi.Size
size2fi[fi.Size] = append(size2fi[fi.Size], fi)
progress.Stop()
log.Println("processing dupdir...")
- queueSmall := make(map[string][]string, filesSmall)
- queueLarge := make(map[string][]string, filesLarge)
- files = 0
- fullSize = 0
+ queue := make([]FileInode, 0, files)
+ files, fullSize = 0, 0
progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
for fi := range walk(*dupDir) {
- if chmod > 0 {
- if err := os.Chmod(fi.Path, chmod); err != nil {
+ if doChmod > 0 {
+ if err := os.Chmod(fi.Path, doChmod); err != nil {
log.Fatal(err)
}
}
- if fi.Size == 0 {
+ if fi.Size < *minSize {
continue
}
origs, ok := size2fi[fi.Size]
if !ok {
continue
}
- paths := make([]string, 0, len(origs))
+ candidates := 0
for _, orig := range origs {
if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
continue
}
- paths = append(paths, orig.Path)
+ candidates++
+ }
+ if candidates == 0 {
+ continue
}
files++
fullSize += fi.Size
- if fi.Size <= SizeBoundary {
- queueSmall[fi.Path] = paths
- } else {
- queueLarge[fi.Path] = paths
- }
+ queue = append(queue, fi)
}
- size2fi = nil
progress.Stop()
log.Println("deduplicating...")
progress = NewProgress(
- files,
- fullSize,
- &files,
- &fullSize,
+ files, fullSize,
+ &files, &fullSize,
" processed",
" deduplicated",
)
- files = 0
- fullSize = 0
- deduped := 0
- termRequired := make(chan os.Signal, 1)
- signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
- go func() {
- <-termRequired
- canExit.Lock()
- progress.Stop()
- log.Println(deduped, "files deduplicated")
- os.Exit(0)
- }()
- bufDup := make([]byte, SizeBoundary)
- bufOrig := make([]byte, SizeBoundary)
- seen := make(map[string]struct{}, len(queueSmall))
- for dup, origs := range queueSmall {
- files++
- if _, ok := seen[dup]; ok {
- continue
- }
- fdDup, err := os.Open(dup)
- if err != nil {
- log.Fatal(err)
- }
- sizeDup, err := io.ReadFull(fdDup, bufDup)
- if !(err == nil || err == io.ErrUnexpectedEOF) {
- log.Fatal(err)
- }
- if err = fdDup.Close(); err != nil {
- log.Fatal(err)
- }
- for _, orig := range origs {
- fdOrig, err := os.Open(orig)
- if err != nil {
- log.Fatal(err)
- }
- sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
- if !(err == nil || err == io.ErrUnexpectedEOF) {
- log.Fatal(err)
- }
- if sizeOrig != sizeDup {
- log.Fatalln(dup, orig, "unexpectedly different sizes")
- }
- if err = fdOrig.Close(); err != nil {
- log.Fatal(err)
- }
- if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
- continue
- }
- link(dup, orig, *action, *doFsync)
- seen[orig] = struct{}{}
- deduped++
- fullSize += int64(sizeDup)
- break
- }
- }
- queueSmall = nil
-
+ files, fullSize = 0, 0
+ bufDup := make([]byte, SectorSize)
+ bufOrig := make([]byte, SectorSize)
+ seenDup := make(map[string]struct{}, len(queue)/2)
+ seenOrig := make(map[string]struct{}, len(queue)/2)
hasher, err := blake2b.New512(nil)
if err != nil {
panic(err)
}
- seen = make(map[string]struct{}, len(queueLarge))
- var sizeDup int64
- for dup, origs := range queueLarge {
+ rdDup := bufio.NewReaderSize(nil, BufSize)
+ rdOrig := bufio.NewReaderSize(nil, BufSize)
+ var deduped int
+ termRequired := signalHandler(progress.Stop, &deduped)
+ for _, fi := range queue {
files++
- if _, ok := seen[dup]; ok {
+ if _, ok := seenOrig[fi.Path]; ok {
continue
}
- fdDup, err := os.Open(dup)
+ fdDup, err := os.Open(fi.Path)
if err != nil {
log.Fatal(err)
}
- if _, err := io.ReadFull(fdDup, bufDup); err != nil {
- log.Fatal(err)
+ readDup, err := io.ReadFull(fdDup, bufDup)
+ if err != nil {
+ if err != io.ErrUnexpectedEOF {
+ log.Fatal(err)
+ }
+ if int64(readDup) != fi.Size {
+ log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
+ }
}
var hashDup []byte
- for _, orig := range origs {
- fdOrig, err := os.Open(orig)
+ for _, orig := range size2fi[fi.Size] {
+ if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+ continue
+ }
+ if _, ok := seenDup[orig.Path]; ok {
+ continue
+ }
+ fdOrig, err := os.Open(orig.Path)
if err != nil {
log.Fatal(err)
}
- if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+ readOrig, err := io.ReadFull(fdOrig, bufOrig)
+ if !(err == nil || err == io.ErrUnexpectedEOF) {
log.Fatal(err)
}
- if bytes.Compare(bufDup, bufOrig) != 0 {
+ if readOrig != readDup {
+ log.Fatalln(
+ fi.Path, orig.Path,
+ "unexpectedly different sizes",
+ readOrig, readDup,
+ )
+ }
+ if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
if err = fdOrig.Close(); err != nil {
log.Fatal(err)
}
}
if hashDup == nil {
hasher.Reset()
- if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+ if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
log.Fatalln("can not write to hash", err)
}
- sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+ rdDup.Reset(fdDup)
+ n, err := io.Copy(hasher, rdDup)
if err != nil {
log.Fatal(err)
}
+ if int64(readDup)+n != fi.Size {
+ log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
+ }
hashDup = hasher.Sum(nil)
}
hasher.Reset()
- if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+ if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
log.Fatalln("can not write to hash", err)
}
- if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+ rdOrig.Reset(fdOrig)
+ n, err := io.Copy(hasher, rdOrig)
+ if err != nil {
log.Fatal(err)
}
+ if int64(readOrig)+n != fi.Size {
+ log.Fatalln(
+ fi.Path, orig.Path,
+ "unexpectedly different sizes",
+ int64(readOrig)+n, fi.Size,
+ )
+ }
if err = fdOrig.Close(); err != nil {
log.Fatal(err)
}
if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
continue
}
- link(dup, orig, *action, *doFsync)
- seen[orig] = struct{}{}
+ link(fi.Path, orig.Path)
+ seenDup[fi.Path] = struct{}{}
+ seenOrig[orig.Path] = struct{}{}
+ fullSize += fi.Size
deduped++
- fullSize += sizeDup
break
}
if err = fdDup.Close(); err != nil {
log.Fatal(err)
}
}
+ if action == ActNS {
+ if err = stdoutW.Flush(); err != nil {
+ log.Fatal(err)
+ }
+ }
termRequired <- syscall.SIGTERM
<-termRequired
}