]> Sergey Matveev's repositories - sgodup.git/commitdiff
Higher performance, two pass mode v0.1.0
authorSergey Matveev <stargrave@stargrave.org>
Sat, 21 Mar 2020 07:29:14 +0000 (10:29 +0300)
committerSergey Matveev <stargrave@stargrave.org>
Sat, 21 Mar 2020 17:19:40 +0000 (20:19 +0300)
README
go.mod
go.sum
main.go

diff --git a/README b/README
index b34dcd95fdd5581f84ac08ecb114aa71a9a0e8db..f8b2b8a7b0eafc769c8e661a228126600c99e42f 100644 (file)
--- a/README
+++ b/README
@@ -1,47 +1,53 @@
                   sgodup -- file deduplication utility
                   ====================================
 
                   sgodup -- file deduplication utility
                   ====================================
 
-DESCRIPTION AND USAGE
+sgodup is a utility for file deduplication. You supply two directories:
+the base and one with possible duplicates, utility determines duplicate
+files and replaces them with the links. It is aimed to have very high
+performance.
 
 
-sgodup is utility for duplicate files detection. You supply two
-directories: the base and one with possible duplicates, utility
-determines duplicate files and replaces them with the links. It
-is aimed to have very high performance.
+SINGLE PASS MODE
+================
 
 
-There are just few arguments:
+$ sgodup -basedir DIR -dupdir DIR -action ACTION \
+  [-minsize NNN] [-chmod NNN] [-fsync]
 
 
--basedir -- directory with files that are possible link targets
- -dupdir -- directory with possible duplicates, which are replaced
-            with the links to basedir's files
- -action -- * print: just print to stdout duplicate file path with
-              relative path to basedir's corresponding file
-            * symlink: create symbolic link with relative path to
-              basedir's corresponding file
-            * hardlink: create hard link instead
-  -chmod -- if specified, then chmod files in basedir and dupdir
-            during scan phase. Octal representation is expected
-  -fsync -- fsync directories where linking occurs
+basedir is a directory with "original" files, that are possible link
+targets. dupdir is a directory with possible duplicates, which are to be
+replaced with the links to basedir's file. It is safe to specify same
+directory as a basedir and dupdir.
 
 
-There are three stages:
+There are 3 stages this command will do:
 
 * basedir directory scan: collect all *regular* file paths, sizes and
 
 * basedir directory scan: collect all *regular* file paths, sizes and
-  inodes. If -chmod is specified, then apply it at once. Empty files are
-  ignored
+  inodes. If -chmod is specified, then apply it to them. Files smaller
+  than -minsize (by default it is equal to 1 byte) are not taken for
+  duplication comparison
 * dupdir directory scan: same as above. If there is no basedir's file
 * dupdir directory scan: same as above. If there is no basedir's file
-  with the same size, then skip dupdir's file (obviously it can not be
+  with the same size, then skip dupdir's one (obviously it can not be
   duplicate). Check that no basedir's files have the same inode, skip
   duplicate). Check that no basedir's files have the same inode, skip
-  dupdir's file otherwise, because it is already hardlinked
-* deduplication stage. For each dupdir file, find basedir file with the
-  same size and compare their contents, to determine if dupdir's one is
-  the duplicate. Perform specified action if so. There are two separate
-  queues and processing cycles:
-
-  * small files, up to 4 KiB (one disk sector): files are fully read and
-    compared in memory
-  * large files (everything else): read and compare first 4 KiB of files
-    in memory. If they are not equal, then this is not a duplicate.
-    Fully read each file's contents sequentially with 128 KiB chunks and
-    calculate BLAKE2b-512 digest otherwise
+  dupdir's file otherwise (it is hardlink)
+* deduplication stage. For each dupdir file, find basedir one with the
+  same size and compare their contents, to determine if dupdir one is
+  the duplicate. Perform specified action if so. Comparing is done the
+  following way:
+  * read first 4 KiB (one disk sector) of each file
+  * if that sector differs, then files are not duplicates
+  * read each file's contents sequentially with 128 KiB chunks and
+    calculate BLAKE2b-512 digest
+
+Action can be the following:
+
+* print: print to stdout duplicate file path with corresponding relative
+  path to basedir's file
+* symlink: create symbolic link with relative path to corresponding
+  basedir's file
+* hardlink: create hard link instead
+* ns: write to stdout series of netstring encoded pairs of duplicate
+  file path and its corresponding basedir's one. It is used in two pass
+  mode. Hint: it is highly compressible
+
+If -fsync is specified, then fsync directories where linking occurs.
 
 Progress is showed at each stage: how many files are counted/processed,
 total size of the files, how much space is deduplicated.
 
 Progress is showed at each stage: how many files are counted/processed,
 total size of the files, how much space is deduplicated.
@@ -58,9 +64,27 @@ total size of the files, how much space is deduplicated.
     [...]
     2020/03/20 11:17:20 321,123 files deduplicated
 
     [...]
     2020/03/20 11:17:20 321,123 files deduplicated
 
-It is safe to specify same directory as a basedir and dupdir.
+TWO PASS MODE
+=============
+
+$ sgodup -basedir DIR -dupdir DIR -action ns [-minsize NNN] [-chmod NNN] > state
+$ sgodup -action ACTION [-fsync] -ns state
+
+If you are dealing with huge amount of small files, then simultaneous
+reading (duplicate detection) and writing (duplicate files linking) on
+the same disk can dramatically decrease performance. It is advisable to
+separate the whole process on two stages: read-only duplicates
+detection, write-only duplicates linking.
+
+Start sgodup with "-action ns" and redirect stdout output to some
+temporary state file, for storing detected duplicate files information
+in it. Then start again with "-ns state" option to relink files.
 
 SAFETY AND CONSISTENCY
 
 SAFETY AND CONSISTENCY
+======================
+
+It was not tested on 32-bit platforms and probably won't work on them
+correctly.
 
 POSIX has no ability to atomically replace a regular file with a
 symbolic/hard link. So file is removed first, then link created. sgodup
 
 POSIX has no ability to atomically replace a regular file with a
 symbolic/hard link. So file is removed first, then link created. sgodup
@@ -77,6 +101,7 @@ There are no warranties and any defined behaviour if directories (and files
 within) where utility is working with are modified.
 
 LICENCE
 within) where utility is working with are modified.
 
 LICENCE
+=======
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/go.mod b/go.mod
index d87c9ccff3978ba8ed8382c30fb6369c7d2850a4..42b833c82d5b4f0d60a8bf4c9ca39b849e011c3e 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -4,5 +4,6 @@ go 1.14
 
 require (
        github.com/dustin/go-humanize v1.0.0
 
 require (
        github.com/dustin/go-humanize v1.0.0
+       go.cypherpunks.ru/netstring/v2 v2.0.0
        golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
 )
        golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
 )
diff --git a/go.sum b/go.sum
index dc88a584d05d902c0f240e4cd26b04ee860ca5ef..820ce5b330c7f70d460a96a12ffa2e988c58c3ab 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,7 @@
 github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+go.cypherpunks.ru/netstring/v2 v2.0.0 h1:or1LDZO3fSd6iITGR3jJUfUrjvRgeNlUpEYI13qaRBk=
+go.cypherpunks.ru/netstring/v2 v2.0.0/go.mod h1:6YDx4gW414SmHdvSBMKbHaB2/7w9WZ04NQb7XIUV/pA=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
diff --git a/main.go b/main.go
index f4fbf6ca776668269f98c1baac66b4d6a8457c6e..9c5382ce03f79d536422e03e383a7c403cfe6faa 100644 (file)
--- a/main.go
+++ b/main.go
@@ -28,40 +28,71 @@ import (
        "os"
        "os/signal"
        "path/filepath"
        "os"
        "os/signal"
        "path/filepath"
+       "runtime"
        "strconv"
        "sync"
        "syscall"
 
        "strconv"
        "sync"
        "syscall"
 
+       "github.com/dustin/go-humanize"
+       "go.cypherpunks.ru/netstring/v2"
        "golang.org/x/crypto/blake2b"
 )
 
 const (
        "golang.org/x/crypto/blake2b"
 )
 
 const (
-       SizeBoundary = 1 << 12 // 4 KiB sector size
-       BufSize      = 1 << 17 // ZFS default 128 KiB recordsize
+       Version    = "0.1.0"
+       SectorSize = 1 << 12 // 4 KiB sector size
+       BufSize    = 1 << 17 // ZFS default 128 KiB recordsize
+
+       ActPrint    = iota
+       ActNS       = iota
+       ActSymlink  = iota
+       ActHardlink = iota
 )
 
 )
 
-var (
-       canExit sync.Mutex
+type Action int
 
 
+var (
+       canExit    sync.Mutex
+       nsW        *netstring.Writer
        curDirPath string
        curDirFd   *os.File
        curDirPath string
        curDirFd   *os.File
+       action     Action
+
+       baseDir  = flag.String("basedir", "", "Directory with original files")
+       dupDir   = flag.String("dupdir", "", "Directory with possible duplicates")
+       actionS  = flag.String("action", "", "print, ns, symlink, hardlink")
+       minSize  = flag.Int64("minsize", 1, "minimal file size")
+       chmod    = flag.String("chmod", "", "chmod files")
+       nsPath   = flag.String("ns", "", "link targets from netstring file")
+       fsync    = flag.Bool("fsync", false, "fsync directories?")
+       version  = flag.Bool("version", false, "Print version information")
+       warranty = flag.Bool("warranty", false, "Print warranty information")
 )
 
 )
 
-func link(dup, orig, action string, fsync bool) {
+func link(dup, orig string) {
+       if action == ActNS {
+               if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
+                       log.Fatal(err)
+               }
+               if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
+                       log.Fatal(err)
+               }
+               return
+       }
        tgt, err := filepath.Rel(dup, orig)
        if err != nil {
                log.Fatal(err)
        }
        tgt = tgt[3:]
        tgt, err := filepath.Rel(dup, orig)
        if err != nil {
                log.Fatal(err)
        }
        tgt = tgt[3:]
-       if action == "print" {
-               fmt.Println(dup, tgt)
+       if action == ActPrint {
+               fmt.Println(dup, "->", tgt)
                return
        }
        canExit.Lock()
        if err = os.Remove(dup); err != nil {
                log.Fatal(err)
        }
                return
        }
        canExit.Lock()
        if err = os.Remove(dup); err != nil {
                log.Fatal(err)
        }
-       if action == "symlink" {
+       if action == ActSymlink {
                err = os.Symlink(tgt, dup)
        } else {
                err = os.Link(orig, dup)
                err = os.Symlink(tgt, dup)
        } else {
                err = os.Link(orig, dup)
@@ -69,7 +100,7 @@ func link(dup, orig, action string, fsync bool) {
        if err != nil {
                log.Fatal(err)
        }
        if err != nil {
                log.Fatal(err)
        }
-       if fsync {
+       if *fsync {
                dirPath := filepath.Dir(dup)
                if dirPath != curDirPath {
                        curDirFd, err = os.Open(dirPath)
                dirPath := filepath.Dir(dup)
                if dirPath != curDirPath {
                        curDirFd, err = os.Open(dirPath)
@@ -85,54 +116,144 @@ func link(dup, orig, action string, fsync bool) {
        canExit.Unlock()
 }
 
        canExit.Unlock()
 }
 
+func signalHandler(progressStop func(), deduped *int) chan os.Signal {
+       termRequired := make(chan os.Signal, 1)
+       signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+       go func() {
+               <-termRequired
+               canExit.Lock()
+               progressStop()
+               log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
+               os.Exit(0)
+       }()
+       return termRequired
+}
+
 func main() {
 func main() {
-       var (
-               baseDir = flag.String("basedir", "", "Directory with original files")
-               dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
-               action  = flag.String("action", "", "print, symlink, hardlink")
-               doChmod = flag.String("chmod", "", "chmod files")
-               doFsync = flag.Bool("fsync", false, "fsync directories?")
-       )
+       flag.Usage = func() {
+               fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
+Copyright (C) 2020 Sergey Matveev
+License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
+This is free software: you are free to change and redistribute it.
+There is NO WARRANTY, to the extent permitted by law.
+
+Single pass mode:
+  %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
+    [-chmod XXX] [-minsize XXX] [-fsync]
+Two pass mode:
+  %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
+  %s -action {print,symlink,hardlink} [-fsync] -ns state
+
+Options:
+`, os.Args[0], os.Args[0], os.Args[0])
+               flag.PrintDefaults()
+       }
        flag.Parse()
        flag.Parse()
+       if *version {
+               fmt.Println("sgodup version", Version, "built with", runtime.Version())
+               return
+       }
+       if *warranty {
+               fmt.Println(`This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.`)
+               return
+       }
+
+       var stdoutW *bufio.Writer
+       switch *actionS {
+       case "print":
+               action = ActPrint
+       case "ns":
+               action = ActNS
+               stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
+               nsW = netstring.NewWriter(stdoutW)
+       case "symlink":
+               action = ActSymlink
+       case "hardlink":
+               action = ActHardlink
+       default:
+               log.Fatalln("invalid action")
+       }
+
+       if *nsPath != "" {
+               if action == ActNS {
+                       log.Fatalln("\"-action ns\" has no meaning with -ns")
+               }
+               fd, err := os.Open(*nsPath)
+               if err != nil {
+                       log.Fatal(err)
+               }
+               nsR := netstring.NewReader(bufio.NewReaderSize(fd, BufSize))
+               pathDup := make([]byte, 1<<10)
+               pathOrig := make([]byte, 1<<10)
+               var pathDupLen, pathOrigLen uint64
+               files := 0
+               fullSize := int64(0)
+               progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
+               termRequired := signalHandler(progress.Stop, &files)
+               for {
+                       pathDupLen, err = nsR.Next()
+                       if err != nil {
+                               if err == io.EOF {
+                                       break
+                               }
+                               log.Fatal(err)
+                       }
+                       if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
+                               log.Fatal(err)
+                       }
+                       pathOrigLen, err = nsR.Next()
+                       if err != nil {
+                               log.Fatal(err)
+                       }
+                       if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
+                               log.Fatal(err)
+                       }
+                       link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
+                       files++
+               }
+               termRequired <- syscall.SIGTERM
+               <-termRequired
+       }
+
        if *baseDir == "" {
                log.Fatalln("-basedir is required")
        }
        if *dupDir == "" {
                log.Fatalln("-dupdir is required")
        }
        if *baseDir == "" {
                log.Fatalln("-basedir is required")
        }
        if *dupDir == "" {
                log.Fatalln("-dupdir is required")
        }
-       var chmod os.FileMode
-       if *doChmod != "" {
-               ch, err := strconv.ParseUint(*doChmod, 8, 16)
+       var doChmod os.FileMode
+       if *chmod != "" {
+               ch, err := strconv.ParseUint(*chmod, 8, 16)
                if err != nil {
                        log.Fatal(err)
                }
                if err != nil {
                        log.Fatal(err)
                }
-               chmod = os.FileMode(ch)
-       }
-       if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
-               log.Fatalln("choose action")
+               doChmod = os.FileMode(ch)
        }
 
        log.Println("processing basedir...")
        size2fi := make(map[int64][]FileInode, 1<<10)
        }
 
        log.Println("processing basedir...")
        size2fi := make(map[int64][]FileInode, 1<<10)
-       files := 0
-       filesSmall := 0
-       filesLarge := 0
+       var files int
        var fullSize int64
        progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
        for fi := range walk(*baseDir) {
        var fullSize int64
        progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
        for fi := range walk(*baseDir) {
-               if chmod > 0 {
-                       if err := os.Chmod(fi.Path, chmod); err != nil {
+               if doChmod > 0 {
+                       if err := os.Chmod(fi.Path, doChmod); err != nil {
                                log.Fatal(err)
                        }
                }
                                log.Fatal(err)
                        }
                }
-               if fi.Size == 0 {
+               if fi.Size < *minSize {
                        continue
                }
                        continue
                }
-               if fi.Size <= SizeBoundary {
-                       filesSmall++
-               } else {
-                       filesLarge++
-               }
                files++
                fullSize += fi.Size
                size2fi[fi.Size] = append(size2fi[fi.Size], fi)
                files++
                fullSize += fi.Size
                size2fi[fi.Size] = append(size2fi[fi.Size], fi)
@@ -140,137 +261,100 @@ func main() {
        progress.Stop()
 
        log.Println("processing dupdir...")
        progress.Stop()
 
        log.Println("processing dupdir...")
-       queueSmall := make(map[string][]string, filesSmall)
-       queueLarge := make(map[string][]string, filesLarge)
-       files = 0
-       fullSize = 0
+       queue := make([]FileInode, 0, files)
+       files, fullSize = 0, 0
        progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
        for fi := range walk(*dupDir) {
        progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
        for fi := range walk(*dupDir) {
-               if chmod > 0 {
-                       if err := os.Chmod(fi.Path, chmod); err != nil {
+               if doChmod > 0 {
+                       if err := os.Chmod(fi.Path, doChmod); err != nil {
                                log.Fatal(err)
                        }
                }
                                log.Fatal(err)
                        }
                }
-               if fi.Size == 0 {
+               if fi.Size < *minSize {
                        continue
                }
                origs, ok := size2fi[fi.Size]
                if !ok {
                        continue
                }
                        continue
                }
                origs, ok := size2fi[fi.Size]
                if !ok {
                        continue
                }
-               paths := make([]string, 0, len(origs))
+               candidates := 0
                for _, orig := range origs {
                        if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
                                continue
                        }
                for _, orig := range origs {
                        if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
                                continue
                        }
-                       paths = append(paths, orig.Path)
+                       candidates++
+               }
+               if candidates == 0 {
+                       continue
                }
                files++
                fullSize += fi.Size
                }
                files++
                fullSize += fi.Size
-               if fi.Size <= SizeBoundary {
-                       queueSmall[fi.Path] = paths
-               } else {
-                       queueLarge[fi.Path] = paths
-               }
+               queue = append(queue, fi)
        }
        }
-       size2fi = nil
        progress.Stop()
 
        log.Println("deduplicating...")
        progress = NewProgress(
        progress.Stop()
 
        log.Println("deduplicating...")
        progress = NewProgress(
-               files,
-               fullSize,
-               &files,
-               &fullSize,
+               files, fullSize,
+               &files, &fullSize,
                " processed",
                " deduplicated",
        )
                " processed",
                " deduplicated",
        )
-       files = 0
-       fullSize = 0
-       deduped := 0
-       termRequired := make(chan os.Signal, 1)
-       signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
-       go func() {
-               <-termRequired
-               canExit.Lock()
-               progress.Stop()
-               log.Println(deduped, "files deduplicated")
-               os.Exit(0)
-       }()
-       bufDup := make([]byte, SizeBoundary)
-       bufOrig := make([]byte, SizeBoundary)
-       seen := make(map[string]struct{}, len(queueSmall))
-       for dup, origs := range queueSmall {
-               files++
-               if _, ok := seen[dup]; ok {
-                       continue
-               }
-               fdDup, err := os.Open(dup)
-               if err != nil {
-                       log.Fatal(err)
-               }
-               sizeDup, err := io.ReadFull(fdDup, bufDup)
-               if !(err == nil || err == io.ErrUnexpectedEOF) {
-                       log.Fatal(err)
-               }
-               if err = fdDup.Close(); err != nil {
-                       log.Fatal(err)
-               }
-               for _, orig := range origs {
-                       fdOrig, err := os.Open(orig)
-                       if err != nil {
-                               log.Fatal(err)
-                       }
-                       sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
-                       if !(err == nil || err == io.ErrUnexpectedEOF) {
-                               log.Fatal(err)
-                       }
-                       if sizeOrig != sizeDup {
-                               log.Fatalln(dup, orig, "unexpectedly different sizes")
-                       }
-                       if err = fdOrig.Close(); err != nil {
-                               log.Fatal(err)
-                       }
-                       if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
-                               continue
-                       }
-                       link(dup, orig, *action, *doFsync)
-                       seen[orig] = struct{}{}
-                       deduped++
-                       fullSize += int64(sizeDup)
-                       break
-               }
-       }
-       queueSmall = nil
-
+       files, fullSize = 0, 0
+       bufDup := make([]byte, SectorSize)
+       bufOrig := make([]byte, SectorSize)
+       seenDup := make(map[string]struct{}, len(queue)/2)
+       seenOrig := make(map[string]struct{}, len(queue)/2)
        hasher, err := blake2b.New512(nil)
        if err != nil {
                panic(err)
        }
        hasher, err := blake2b.New512(nil)
        if err != nil {
                panic(err)
        }
-       seen = make(map[string]struct{}, len(queueLarge))
-       var sizeDup int64
-       for dup, origs := range queueLarge {
+       rdDup := bufio.NewReaderSize(nil, BufSize)
+       rdOrig := bufio.NewReaderSize(nil, BufSize)
+       var deduped int
+       termRequired := signalHandler(progress.Stop, &deduped)
+       for _, fi := range queue {
                files++
                files++
-               if _, ok := seen[dup]; ok {
+               if _, ok := seenOrig[fi.Path]; ok {
                        continue
                }
                        continue
                }
-               fdDup, err := os.Open(dup)
+               fdDup, err := os.Open(fi.Path)
                if err != nil {
                        log.Fatal(err)
                }
                if err != nil {
                        log.Fatal(err)
                }
-               if _, err := io.ReadFull(fdDup, bufDup); err != nil {
-                       log.Fatal(err)
+               readDup, err := io.ReadFull(fdDup, bufDup)
+               if err != nil {
+                       if err != io.ErrUnexpectedEOF {
+                               log.Fatal(err)
+                       }
+                       if int64(readDup) != fi.Size {
+                               log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
+                       }
                }
                var hashDup []byte
                }
                var hashDup []byte
-               for _, orig := range origs {
-                       fdOrig, err := os.Open(orig)
+               for _, orig := range size2fi[fi.Size] {
+                       if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+                               continue
+                       }
+                       if _, ok := seenDup[orig.Path]; ok {
+                               continue
+                       }
+                       fdOrig, err := os.Open(orig.Path)
                        if err != nil {
                                log.Fatal(err)
                        }
                        if err != nil {
                                log.Fatal(err)
                        }
-                       if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+                       readOrig, err := io.ReadFull(fdOrig, bufOrig)
+                       if !(err == nil || err == io.ErrUnexpectedEOF) {
                                log.Fatal(err)
                        }
                                log.Fatal(err)
                        }
-                       if bytes.Compare(bufDup, bufOrig) != 0 {
+                       if readOrig != readDup {
+                               log.Fatalln(
+                                       fi.Path, orig.Path,
+                                       "unexpectedly different sizes",
+                                       readOrig, readDup,
+                               )
+                       }
+                       if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
                                if err = fdOrig.Close(); err != nil {
                                        log.Fatal(err)
                                }
                                if err = fdOrig.Close(); err != nil {
                                        log.Fatal(err)
                                }
@@ -278,38 +362,57 @@ func main() {
                        }
                        if hashDup == nil {
                                hasher.Reset()
                        }
                        if hashDup == nil {
                                hasher.Reset()
-                               if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+                               if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
                                        log.Fatalln("can not write to hash", err)
                                }
                                        log.Fatalln("can not write to hash", err)
                                }
-                               sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+                               rdDup.Reset(fdDup)
+                               n, err := io.Copy(hasher, rdDup)
                                if err != nil {
                                        log.Fatal(err)
                                }
                                if err != nil {
                                        log.Fatal(err)
                                }
+                               if int64(readDup)+n != fi.Size {
+                                       log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
+                               }
                                hashDup = hasher.Sum(nil)
                        }
                        hasher.Reset()
                                hashDup = hasher.Sum(nil)
                        }
                        hasher.Reset()
-                       if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+                       if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
                                log.Fatalln("can not write to hash", err)
                        }
                                log.Fatalln("can not write to hash", err)
                        }
-                       if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+                       rdOrig.Reset(fdOrig)
+                       n, err := io.Copy(hasher, rdOrig)
+                       if err != nil {
                                log.Fatal(err)
                        }
                                log.Fatal(err)
                        }
+                       if int64(readOrig)+n != fi.Size {
+                               log.Fatalln(
+                                       fi.Path, orig.Path,
+                                       "unexpectedly different sizes",
+                                       int64(readOrig)+n, fi.Size,
+                               )
+                       }
                        if err = fdOrig.Close(); err != nil {
                                log.Fatal(err)
                        }
                        if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
                                continue
                        }
                        if err = fdOrig.Close(); err != nil {
                                log.Fatal(err)
                        }
                        if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
                                continue
                        }
-                       link(dup, orig, *action, *doFsync)
-                       seen[orig] = struct{}{}
+                       link(fi.Path, orig.Path)
+                       seenDup[fi.Path] = struct{}{}
+                       seenOrig[orig.Path] = struct{}{}
+                       fullSize += fi.Size
                        deduped++
                        deduped++
-                       fullSize += sizeDup
                        break
                }
                if err = fdDup.Close(); err != nil {
                        log.Fatal(err)
                }
        }
                        break
                }
                if err = fdDup.Close(); err != nil {
                        log.Fatal(err)
                }
        }
+       if action == ActNS {
+               if err = stdoutW.Flush(); err != nil {
+                       log.Fatal(err)
+               }
+       }
        termRequired <- syscall.SIGTERM
        <-termRequired
 }
        termRequired <- syscall.SIGTERM
        <-termRequired
 }