Sergey Matveev's repositories - sgodup.git/blob - main.go
9c5382ce03f79d536422e03e383a7c403cfe6faa
[sgodup.git] / main.go
1 /*
2 sgodup -- File deduplication utility
3 Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 // File deduplication utility
19 package main
20
21 import (
22         "bufio"
23         "bytes"
24         "flag"
25         "fmt"
26         "io"
27         "log"
28         "os"
29         "os/signal"
30         "path/filepath"
31         "runtime"
32         "strconv"
33         "sync"
34         "syscall"
35
36         "github.com/dustin/go-humanize"
37         "go.cypherpunks.ru/netstring/v2"
38         "golang.org/x/crypto/blake2b"
39 )
40
const (
	// Version is the program version reported by -version.
	Version = "0.1.0"
	// SectorSize is the size of the leading chunk that is compared
	// byte-for-byte before any hashing is attempted (4 KiB).
	SectorSize = 4 << 10
	// BufSize is the buffered I/O size (128 KiB, the ZFS default recordsize).
	BufSize = 128 << 10

	// Supported actions. iota is the index of the ConstSpec inside this
	// declaration, so the numbering starts at 3 here, not at 0; only
	// equality between these values is ever relied upon.
	ActPrint = iota
	ActNS
	ActSymlink
	ActHardlink
)
51
52 type Action int
53
54 var (
55         canExit    sync.Mutex
56         nsW        *netstring.Writer
57         curDirPath string
58         curDirFd   *os.File
59         action     Action
60
61         baseDir  = flag.String("basedir", "", "Directory with original files")
62         dupDir   = flag.String("dupdir", "", "Directory with possible duplicates")
63         actionS  = flag.String("action", "", "print, ns, symlink, hardlink")
64         minSize  = flag.Int64("minsize", 1, "minimal file size")
65         chmod    = flag.String("chmod", "", "chmod files")
66         nsPath   = flag.String("ns", "", "link targets from netstring file")
67         fsync    = flag.Bool("fsync", false, "fsync directories?")
68         version  = flag.Bool("version", false, "Print version information")
69         warranty = flag.Bool("warranty", false, "Print warranty information")
70 )
71
72 func link(dup, orig string) {
73         if action == ActNS {
74                 if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
75                         log.Fatal(err)
76                 }
77                 if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
78                         log.Fatal(err)
79                 }
80                 return
81         }
82         tgt, err := filepath.Rel(dup, orig)
83         if err != nil {
84                 log.Fatal(err)
85         }
86         tgt = tgt[3:]
87         if action == ActPrint {
88                 fmt.Println(dup, "->", tgt)
89                 return
90         }
91         canExit.Lock()
92         if err = os.Remove(dup); err != nil {
93                 log.Fatal(err)
94         }
95         if action == ActSymlink {
96                 err = os.Symlink(tgt, dup)
97         } else {
98                 err = os.Link(orig, dup)
99         }
100         if err != nil {
101                 log.Fatal(err)
102         }
103         if *fsync {
104                 dirPath := filepath.Dir(dup)
105                 if dirPath != curDirPath {
106                         curDirFd, err = os.Open(dirPath)
107                         if err != nil {
108                                 log.Fatal(err)
109                         }
110                         curDirPath = dirPath
111                 }
112                 if err = curDirFd.Sync(); err != nil {
113                         log.Fatal(err)
114                 }
115         }
116         canExit.Unlock()
117 }
118
119 func signalHandler(progressStop func(), deduped *int) chan os.Signal {
120         termRequired := make(chan os.Signal, 1)
121         signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
122         go func() {
123                 <-termRequired
124                 canExit.Lock()
125                 progressStop()
126                 log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
127                 os.Exit(0)
128         }()
129         return termRequired
130 }
131
132 func main() {
133         flag.Usage = func() {
134                 fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
135 Copyright (C) 2020 Sergey Matveev
136 License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
137 This is free software: you are free to change and redistribute it.
138 There is NO WARRANTY, to the extent permitted by law.
139
140 Single pass mode:
141   %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
142     [-chmod XXX] [-minsize XXX] [-fsync]
143 Two pass mode:
144   %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
145   %s -action {print,symlink,hardlink} [-fsync] -ns state
146
147 Options:
148 `, os.Args[0], os.Args[0], os.Args[0])
149                 flag.PrintDefaults()
150         }
151         flag.Parse()
152         if *version {
153                 fmt.Println("sgodup version", Version, "built with", runtime.Version())
154                 return
155         }
156         if *warranty {
157                 fmt.Println(`This program is free software: you can redistribute it and/or modify
158 it under the terms of the GNU General Public License as published by
159 the Free Software Foundation, version 3 of the License.
160
161 This program is distributed in the hope that it will be useful,
162 but WITHOUT ANY WARRANTY; without even the implied warranty of
163 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
164 GNU General Public License for more details.
165
166 You should have received a copy of the GNU General Public License
167 along with this program.  If not, see <http://www.gnu.org/licenses/>.`)
168                 return
169         }
170
171         var stdoutW *bufio.Writer
172         switch *actionS {
173         case "print":
174                 action = ActPrint
175         case "ns":
176                 action = ActNS
177                 stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
178                 nsW = netstring.NewWriter(stdoutW)
179         case "symlink":
180                 action = ActSymlink
181         case "hardlink":
182                 action = ActHardlink
183         default:
184                 log.Fatalln("invalid action")
185         }
186
187         if *nsPath != "" {
188                 if action == ActNS {
189                         log.Fatalln("\"-action ns\" has no meaning with -ns")
190                 }
191                 fd, err := os.Open(*nsPath)
192                 if err != nil {
193                         log.Fatal(err)
194                 }
195                 nsR := netstring.NewReader(bufio.NewReaderSize(fd, BufSize))
196                 pathDup := make([]byte, 1<<10)
197                 pathOrig := make([]byte, 1<<10)
198                 var pathDupLen, pathOrigLen uint64
199                 files := 0
200                 fullSize := int64(0)
201                 progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
202                 termRequired := signalHandler(progress.Stop, &files)
203                 for {
204                         pathDupLen, err = nsR.Next()
205                         if err != nil {
206                                 if err == io.EOF {
207                                         break
208                                 }
209                                 log.Fatal(err)
210                         }
211                         if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
212                                 log.Fatal(err)
213                         }
214                         pathOrigLen, err = nsR.Next()
215                         if err != nil {
216                                 log.Fatal(err)
217                         }
218                         if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
219                                 log.Fatal(err)
220                         }
221                         link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
222                         files++
223                 }
224                 termRequired <- syscall.SIGTERM
225                 <-termRequired
226         }
227
228         if *baseDir == "" {
229                 log.Fatalln("-basedir is required")
230         }
231         if *dupDir == "" {
232                 log.Fatalln("-dupdir is required")
233         }
234         var doChmod os.FileMode
235         if *chmod != "" {
236                 ch, err := strconv.ParseUint(*chmod, 8, 16)
237                 if err != nil {
238                         log.Fatal(err)
239                 }
240                 doChmod = os.FileMode(ch)
241         }
242
243         log.Println("processing basedir...")
244         size2fi := make(map[int64][]FileInode, 1<<10)
245         var files int
246         var fullSize int64
247         progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
248         for fi := range walk(*baseDir) {
249                 if doChmod > 0 {
250                         if err := os.Chmod(fi.Path, doChmod); err != nil {
251                                 log.Fatal(err)
252                         }
253                 }
254                 if fi.Size < *minSize {
255                         continue
256                 }
257                 files++
258                 fullSize += fi.Size
259                 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
260         }
261         progress.Stop()
262
263         log.Println("processing dupdir...")
264         queue := make([]FileInode, 0, files)
265         files, fullSize = 0, 0
266         progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
267         for fi := range walk(*dupDir) {
268                 if doChmod > 0 {
269                         if err := os.Chmod(fi.Path, doChmod); err != nil {
270                                 log.Fatal(err)
271                         }
272                 }
273                 if fi.Size < *minSize {
274                         continue
275                 }
276                 origs, ok := size2fi[fi.Size]
277                 if !ok {
278                         continue
279                 }
280                 candidates := 0
281                 for _, orig := range origs {
282                         if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
283                                 continue
284                         }
285                         candidates++
286                 }
287                 if candidates == 0 {
288                         continue
289                 }
290                 files++
291                 fullSize += fi.Size
292                 queue = append(queue, fi)
293         }
294         progress.Stop()
295
296         log.Println("deduplicating...")
297         progress = NewProgress(
298                 files, fullSize,
299                 &files, &fullSize,
300                 " processed",
301                 " deduplicated",
302         )
303         files, fullSize = 0, 0
304         bufDup := make([]byte, SectorSize)
305         bufOrig := make([]byte, SectorSize)
306         seenDup := make(map[string]struct{}, len(queue)/2)
307         seenOrig := make(map[string]struct{}, len(queue)/2)
308         hasher, err := blake2b.New512(nil)
309         if err != nil {
310                 panic(err)
311         }
312         rdDup := bufio.NewReaderSize(nil, BufSize)
313         rdOrig := bufio.NewReaderSize(nil, BufSize)
314         var deduped int
315         termRequired := signalHandler(progress.Stop, &deduped)
316         for _, fi := range queue {
317                 files++
318                 if _, ok := seenOrig[fi.Path]; ok {
319                         continue
320                 }
321                 fdDup, err := os.Open(fi.Path)
322                 if err != nil {
323                         log.Fatal(err)
324                 }
325                 readDup, err := io.ReadFull(fdDup, bufDup)
326                 if err != nil {
327                         if err != io.ErrUnexpectedEOF {
328                                 log.Fatal(err)
329                         }
330                         if int64(readDup) != fi.Size {
331                                 log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
332                         }
333                 }
334                 var hashDup []byte
335                 for _, orig := range size2fi[fi.Size] {
336                         if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
337                                 continue
338                         }
339                         if _, ok := seenDup[orig.Path]; ok {
340                                 continue
341                         }
342                         fdOrig, err := os.Open(orig.Path)
343                         if err != nil {
344                                 log.Fatal(err)
345                         }
346                         readOrig, err := io.ReadFull(fdOrig, bufOrig)
347                         if !(err == nil || err == io.ErrUnexpectedEOF) {
348                                 log.Fatal(err)
349                         }
350                         if readOrig != readDup {
351                                 log.Fatalln(
352                                         fi.Path, orig.Path,
353                                         "unexpectedly different sizes",
354                                         readOrig, readDup,
355                                 )
356                         }
357                         if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
358                                 if err = fdOrig.Close(); err != nil {
359                                         log.Fatal(err)
360                                 }
361                                 continue
362                         }
363                         if hashDup == nil {
364                                 hasher.Reset()
365                                 if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
366                                         log.Fatalln("can not write to hash", err)
367                                 }
368                                 rdDup.Reset(fdDup)
369                                 n, err := io.Copy(hasher, rdDup)
370                                 if err != nil {
371                                         log.Fatal(err)
372                                 }
373                                 if int64(readDup)+n != fi.Size {
374                                         log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
375                                 }
376                                 hashDup = hasher.Sum(nil)
377                         }
378                         hasher.Reset()
379                         if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
380                                 log.Fatalln("can not write to hash", err)
381                         }
382                         rdOrig.Reset(fdOrig)
383                         n, err := io.Copy(hasher, rdOrig)
384                         if err != nil {
385                                 log.Fatal(err)
386                         }
387                         if int64(readOrig)+n != fi.Size {
388                                 log.Fatalln(
389                                         fi.Path, orig.Path,
390                                         "unexpectedly different sizes",
391                                         int64(readOrig)+n, fi.Size,
392                                 )
393                         }
394                         if err = fdOrig.Close(); err != nil {
395                                 log.Fatal(err)
396                         }
397                         if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
398                                 continue
399                         }
400                         link(fi.Path, orig.Path)
401                         seenDup[fi.Path] = struct{}{}
402                         seenOrig[orig.Path] = struct{}{}
403                         fullSize += fi.Size
404                         deduped++
405                         break
406                 }
407                 if err = fdDup.Close(); err != nil {
408                         log.Fatal(err)
409                 }
410         }
411         if action == ActNS {
412                 if err = stdoutW.Flush(); err != nil {
413                         log.Fatal(err)
414                 }
415         }
416         termRequired <- syscall.SIGTERM
417         <-termRequired
418 }