Sergey Matveev's repositories - sgodup.git/blob - main.go
Raise copyright years
[sgodup.git] / main.go
1 /*
2 sgodup -- File deduplication utility
3 Copyright (C) 2020-2023 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 // File deduplication utility
19 package main
20
21 import (
22         "bufio"
23         "bytes"
24         "flag"
25         "fmt"
26         "io"
27         "log"
28         "os"
29         "os/signal"
30         "path/filepath"
31         "runtime"
32         "strconv"
33         "sync"
34         "syscall"
35
36         "github.com/dustin/go-humanize"
37         "go.cypherpunks.ru/netstring/v2"
38         "golang.org/x/crypto/blake2b"
39 )
40
const (
	// Version is the program version reported by -version.
	Version = "0.2.0"
	// SectorSize is the size of the leading chunk (4 KiB) that is compared
	// byte-by-byte before any hashing is done.
	SectorSize = 1 << 12
	// BufSize is the size of the buffered readers/writers (128 KiB, the
	// default ZFS recordsize).
	BufSize = 1 << 17

	// Supported actions. iota counts every spec of this const block, so
	// these evaluate to 3, 4, 5 and 6; within this file they are only ever
	// compared for equality.
	ActPrint = iota
	ActNS
	ActSymlink
	ActHardlink
)
51
// Action is the type of the global action variable, which is assigned and
// compared against the Act* constants to decide what is done with every
// detected duplicate.
type Action int
53
54 var (
55         canExit    sync.Mutex
56         nsW        *netstring.Writer
57         curDirPath string
58         curDirFd   *os.File
59         action     Action
60
61         baseDir  = flag.String("basedir", "", "Directory with original files")
62         dupDir   = flag.String("dupdir", "", "Directory with possible duplicates")
63         actionS  = flag.String("action", "", "print, ns, symlink, hardlink")
64         minSize  = flag.Int64("minsize", 1, "minimal file size")
65         chmod    = flag.String("chmod", "", "chmod files")
66         doNS     = flag.Bool("ns", false, "link targets from netstring read from stdin")
67         fsync    = flag.Bool("fsync", false, "fsync directories?")
68         version  = flag.Bool("version", false, "Print version information")
69         warranty = flag.Bool("warranty", false, "Print warranty information")
70 )
71
72 func link(dup, orig string) {
73         if action == ActNS {
74                 if _, err := nsW.WriteChunk([]byte(dup)); err != nil {
75                         log.Fatal(err)
76                 }
77                 if _, err := nsW.WriteChunk([]byte(orig)); err != nil {
78                         log.Fatal(err)
79                 }
80                 return
81         }
82         tgt, err := filepath.Rel(dup, orig)
83         if err != nil {
84                 log.Fatal(err)
85         }
86         tgt = tgt[3:]
87         if action == ActPrint {
88                 fmt.Println(dup, "->", tgt)
89                 return
90         }
91         canExit.Lock()
92         if err = os.Remove(dup); err != nil {
93                 log.Fatal(err)
94         }
95         if action == ActSymlink {
96                 err = os.Symlink(tgt, dup)
97         } else {
98                 err = os.Link(orig, dup)
99         }
100         if err != nil {
101                 log.Fatal(err)
102         }
103         if *fsync {
104                 dirPath := filepath.Dir(dup)
105                 if dirPath != curDirPath {
106                         curDirFd, err = os.Open(dirPath)
107                         if err != nil {
108                                 log.Fatal(err)
109                         }
110                         curDirPath = dirPath
111                 }
112                 if err = curDirFd.Sync(); err != nil {
113                         log.Fatal(err)
114                 }
115         }
116         canExit.Unlock()
117 }
118
119 func signalHandler(progressStop func(), deduped *int) chan os.Signal {
120         termRequired := make(chan os.Signal, 1)
121         signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
122         go func() {
123                 <-termRequired
124                 canExit.Lock()
125                 progressStop()
126                 log.Println(humanize.Comma(int64(*deduped)), "files deduplicated")
127                 os.Exit(0)
128         }()
129         return termRequired
130 }
131
132 func main() {
133         flag.Usage = func() {
134                 fmt.Fprintf(os.Stderr, `sgodup -- file deduplication utility
135 Copyright (C) 2020-2023 Sergey Matveev
136 License GPLv3: GNU GPL version 3 <http://gnu.org/licenses/gpl.html>
137 This is free software: you are free to change and redistribute it.
138 There is NO WARRANTY, to the extent permitted by law.
139
140 Single pass mode:
141   %s -basedir DIR -dupdir DIR -action {print,ns,symlink,hardlink}
142     [-chmod XXX] [-minsize XXX] [-fsync]
143 Two pass mode:
144   %s -basedir DIR -dupdir DIR -action ns [-chmod XXX] [-minsize XXX] > state
145   %s -action {print,symlink,hardlink} [-fsync] -ns < state
146
147 Options:
148 `, os.Args[0], os.Args[0], os.Args[0])
149                 flag.PrintDefaults()
150         }
151         flag.Parse()
152         if *version {
153                 fmt.Println("sgodup version", Version, "built with", runtime.Version())
154                 return
155         }
156         if *warranty {
157                 fmt.Println(`This program is free software: you can redistribute it and/or modify
158 it under the terms of the GNU General Public License as published by
159 the Free Software Foundation, version 3 of the License.
160
161 This program is distributed in the hope that it will be useful,
162 but WITHOUT ANY WARRANTY; without even the implied warranty of
163 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
164 GNU General Public License for more details.
165
166 You should have received a copy of the GNU General Public License
167 along with this program.  If not, see <http://www.gnu.org/licenses/>.`)
168                 return
169         }
170
171         var stdoutW *bufio.Writer
172         switch *actionS {
173         case "print":
174                 action = ActPrint
175         case "ns":
176                 action = ActNS
177                 stdoutW = bufio.NewWriterSize(os.Stdout, BufSize)
178                 nsW = netstring.NewWriter(stdoutW)
179         case "symlink":
180                 action = ActSymlink
181         case "hardlink":
182                 action = ActHardlink
183         default:
184                 log.Fatalln("invalid action")
185         }
186
187         if *doNS {
188                 if action == ActNS {
189                         log.Fatalln("\"-action ns\" has no meaning with -ns")
190                 }
191                 nsR := netstring.NewReader(bufio.NewReaderSize(os.Stdin, BufSize))
192                 pathDup := make([]byte, 1<<10)
193                 pathOrig := make([]byte, 1<<10)
194                 var err error
195                 var pathDupLen, pathOrigLen uint64
196                 files := 0
197                 fullSize := int64(0)
198                 progress := NewProgress(0, 0, &files, &fullSize, " linked", "")
199                 termRequired := signalHandler(progress.Stop, &files)
200                 for {
201                         pathDupLen, err = nsR.Next()
202                         if err != nil {
203                                 if err == io.EOF {
204                                         break
205                                 }
206                                 log.Fatal(err)
207                         }
208                         if _, err = io.ReadFull(nsR, pathDup[:pathDupLen]); err != nil {
209                                 log.Fatal(err)
210                         }
211                         pathOrigLen, err = nsR.Next()
212                         if err != nil {
213                                 log.Fatal(err)
214                         }
215                         if _, err = io.ReadFull(nsR, pathOrig[:pathOrigLen]); err != nil {
216                                 log.Fatal(err)
217                         }
218                         link(string(pathDup[:pathDupLen]), string(pathOrig[:pathOrigLen]))
219                         files++
220                 }
221                 termRequired <- syscall.SIGTERM
222                 <-termRequired
223         }
224
225         if *baseDir == "" {
226                 log.Fatalln("-basedir is required")
227         }
228         if *dupDir == "" {
229                 log.Fatalln("-dupdir is required")
230         }
231         var doChmod os.FileMode
232         if *chmod != "" {
233                 ch, err := strconv.ParseUint(*chmod, 8, 16)
234                 if err != nil {
235                         log.Fatal(err)
236                 }
237                 doChmod = os.FileMode(ch)
238         }
239
240         log.Println("processing basedir...")
241         size2fi := make(map[int64][]FileInode, 1<<10)
242         var files int
243         var fullSize int64
244         progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
245         for fi := range walk(*baseDir) {
246                 if doChmod > 0 {
247                         if err := os.Chmod(fi.Path, doChmod); err != nil {
248                                 log.Fatal(err)
249                         }
250                 }
251                 if fi.Size < *minSize {
252                         continue
253                 }
254                 files++
255                 fullSize += fi.Size
256                 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
257         }
258         progress.Stop()
259
260         log.Println("processing dupdir...")
261         queue := make([]FileInode, 0, files)
262         files, fullSize = 0, 0
263         progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
264         for fi := range walk(*dupDir) {
265                 if doChmod > 0 {
266                         if err := os.Chmod(fi.Path, doChmod); err != nil {
267                                 log.Fatal(err)
268                         }
269                 }
270                 if fi.Size < *minSize {
271                         continue
272                 }
273                 origs, ok := size2fi[fi.Size]
274                 if !ok {
275                         continue
276                 }
277                 candidates := 0
278                 for _, orig := range origs {
279                         if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
280                                 continue
281                         }
282                         candidates++
283                 }
284                 if candidates == 0 {
285                         continue
286                 }
287                 files++
288                 fullSize += fi.Size
289                 queue = append(queue, fi)
290         }
291         progress.Stop()
292
293         log.Println("deduplicating...")
294         progress = NewProgress(
295                 files, fullSize,
296                 &files, &fullSize,
297                 " processed",
298                 " deduplicated",
299         )
300         files, fullSize = 0, 0
301         bufDup := make([]byte, SectorSize)
302         bufOrig := make([]byte, SectorSize)
303         seenDup := make(map[string]struct{}, len(queue)/2)
304         seenOrig := make(map[string]struct{}, len(queue)/2)
305         hasher, err := blake2b.New512(nil)
306         if err != nil {
307                 panic(err)
308         }
309         rdDup := bufio.NewReaderSize(nil, BufSize)
310         rdOrig := bufio.NewReaderSize(nil, BufSize)
311         var deduped int
312         termRequired := signalHandler(progress.Stop, &deduped)
313         for _, fi := range queue {
314                 files++
315                 if _, ok := seenOrig[fi.Path]; ok {
316                         continue
317                 }
318                 fdDup, err := os.Open(fi.Path)
319                 if err != nil {
320                         log.Fatal(err)
321                 }
322                 readDup, err := io.ReadFull(fdDup, bufDup)
323                 if err != nil {
324                         if err != io.ErrUnexpectedEOF {
325                                 log.Fatal(err)
326                         }
327                         if int64(readDup) != fi.Size {
328                                 log.Fatalln(fi.Path, "unexpected size", readDup, fi.Size)
329                         }
330                 }
331                 var hashDup []byte
332                 for _, orig := range size2fi[fi.Size] {
333                         if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
334                                 continue
335                         }
336                         if _, ok := seenDup[orig.Path]; ok {
337                                 continue
338                         }
339                         fdOrig, err := os.Open(orig.Path)
340                         if err != nil {
341                                 log.Fatal(err)
342                         }
343                         readOrig, err := io.ReadFull(fdOrig, bufOrig)
344                         if !(err == nil || err == io.ErrUnexpectedEOF) {
345                                 log.Fatal(err)
346                         }
347                         if readOrig != readDup {
348                                 log.Fatalln(
349                                         fi.Path, orig.Path,
350                                         "unexpectedly different sizes",
351                                         readOrig, readDup,
352                                 )
353                         }
354                         if bytes.Compare(bufDup[:readDup], bufOrig[:readOrig]) != 0 {
355                                 if err = fdOrig.Close(); err != nil {
356                                         log.Fatal(err)
357                                 }
358                                 continue
359                         }
360                         if hashDup == nil {
361                                 hasher.Reset()
362                                 if n, err := hasher.Write(bufDup[:readDup]); err != nil || n != readDup {
363                                         log.Fatalln("can not write to hash", err)
364                                 }
365                                 rdDup.Reset(fdDup)
366                                 n, err := io.Copy(hasher, rdDup)
367                                 if err != nil {
368                                         log.Fatal(err)
369                                 }
370                                 if int64(readDup)+n != fi.Size {
371                                         log.Fatalln(fi.Path, "unexpected size", int64(readDup)+n, fi.Size)
372                                 }
373                                 hashDup = hasher.Sum(nil)
374                         }
375                         hasher.Reset()
376                         if n, err := hasher.Write(bufOrig[:readOrig]); err != nil || n != readOrig {
377                                 log.Fatalln("can not write to hash", err)
378                         }
379                         rdOrig.Reset(fdOrig)
380                         n, err := io.Copy(hasher, rdOrig)
381                         if err != nil {
382                                 log.Fatal(err)
383                         }
384                         if int64(readOrig)+n != fi.Size {
385                                 log.Fatalln(
386                                         fi.Path, orig.Path,
387                                         "unexpectedly different sizes",
388                                         int64(readOrig)+n, fi.Size,
389                                 )
390                         }
391                         if err = fdOrig.Close(); err != nil {
392                                 log.Fatal(err)
393                         }
394                         if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
395                                 continue
396                         }
397                         link(fi.Path, orig.Path)
398                         seenDup[fi.Path] = struct{}{}
399                         seenOrig[orig.Path] = struct{}{}
400                         fullSize += fi.Size
401                         deduped++
402                         break
403                 }
404                 if err = fdDup.Close(); err != nil {
405                         log.Fatal(err)
406                 }
407         }
408         if action == ActNS {
409                 if err = stdoutW.Flush(); err != nil {
410                         log.Fatal(err)
411                 }
412         }
413         termRequired <- syscall.SIGTERM
414         <-termRequired
415 }