]> Sergey Matveev's repositories - sgodup.git/blob - main.go
Initial version
[sgodup.git] / main.go
1 /*
2 sgodup -- File deduplication utility
3 Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, version 3 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 // File deduplication utility
19 package main
20
21 import (
22         "bufio"
23         "bytes"
24         "flag"
25         "fmt"
26         "io"
27         "log"
28         "os"
29         "os/signal"
30         "path/filepath"
31         "strconv"
32         "sync"
33         "syscall"
34
35         "golang.org/x/crypto/blake2b"
36 )
37
const (
	// SizeBoundary is the 4 KiB (sector-sized) threshold separating "small"
	// files from "large" ones. It is also the length of the in-memory
	// buffers used to compare file contents.
	SizeBoundary = 4 << 10

	// BufSize is the size of the buffered reader used while hashing large
	// files: 128 KiB, the default ZFS recordsize.
	BufSize = 128 << 10
)
42
var (
	// canExit is held by link for the whole remove/link/fsync sequence.
	// The signal handler in main acquires it before exiting, so the
	// process is not terminated halfway through replacing a file.
	canExit sync.Mutex

	// curDirPath and curDirFd cache the most recently fsync-ed directory,
	// so that consecutive links in the same directory reuse one handle.
	curDirPath string
	curDirFd   *os.File
)

// link replaces the file dup with a link to orig.
//
// action selects what is done:
//
//   - "print": only print dup and the relative path from dup's directory
//     to orig; the filesystem is not touched.
//   - "symlink": remove dup and create a symbolic link with that relative
//     path as its target.
//   - anything else: remove dup and create a hard link to orig.
//
// If fsync is true, the directory containing dup is synced after the link
// has been created. Any failure terminates the program through log.Fatal.
func link(dup, orig, action string, fsync bool) {
	// A symlink target is resolved relative to the directory holding the
	// link, so compute the path from there rather than from dup itself.
	tgt, err := filepath.Rel(filepath.Dir(dup), orig)
	if err != nil {
		log.Fatal(err)
	}
	if action == "print" {
		fmt.Println(dup, tgt)
		return
	}

	canExit.Lock()
	// log.Fatal exits without running deferred calls; the lock state is
	// irrelevant at that point.
	defer canExit.Unlock()

	if err = os.Remove(dup); err != nil {
		log.Fatal(err)
	}
	if action == "symlink" {
		err = os.Symlink(tgt, dup)
	} else {
		err = os.Link(orig, dup)
	}
	if err != nil {
		log.Fatal(err)
	}
	if !fsync {
		return
	}

	dirPath := filepath.Dir(dup)
	if dirPath != curDirPath {
		// Release the handle of the previous directory before replacing
		// it, otherwise one descriptor leaks per visited directory.
		if curDirFd != nil {
			if err = curDirFd.Close(); err != nil {
				log.Fatal(err)
			}
		}
		curDirFd, err = os.Open(dirPath)
		if err != nil {
			log.Fatal(err)
		}
		curDirPath = dirPath
	}
	if err = curDirFd.Sync(); err != nil {
		log.Fatal(err)
	}
}
87
88 func main() {
89         var (
90                 baseDir = flag.String("basedir", "", "Directory with original files")
91                 dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
92                 action  = flag.String("action", "", "print, symlink, hardlink")
93                 doChmod = flag.String("chmod", "", "chmod files")
94                 doFsync = flag.Bool("fsync", false, "fsync directories?")
95         )
96         flag.Parse()
97         if *baseDir == "" {
98                 log.Fatalln("-basedir is required")
99         }
100         if *dupDir == "" {
101                 log.Fatalln("-dupdir is required")
102         }
103         var chmod os.FileMode
104         if *doChmod != "" {
105                 ch, err := strconv.ParseUint(*doChmod, 8, 16)
106                 if err != nil {
107                         log.Fatal(err)
108                 }
109                 chmod = os.FileMode(ch)
110         }
111         if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
112                 log.Fatalln("choose action")
113         }
114
115         log.Println("processing basedir...")
116         size2fi := make(map[int64][]FileInode, 1<<10)
117         files := 0
118         filesSmall := 0
119         filesLarge := 0
120         var fullSize int64
121         progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
122         for fi := range walk(*baseDir) {
123                 if chmod > 0 {
124                         if err := os.Chmod(fi.Path, chmod); err != nil {
125                                 log.Fatal(err)
126                         }
127                 }
128                 if fi.Size == 0 {
129                         continue
130                 }
131                 if fi.Size <= SizeBoundary {
132                         filesSmall++
133                 } else {
134                         filesLarge++
135                 }
136                 files++
137                 fullSize += fi.Size
138                 size2fi[fi.Size] = append(size2fi[fi.Size], fi)
139         }
140         progress.Stop()
141
142         log.Println("processing dupdir...")
143         queueSmall := make(map[string][]string, filesSmall)
144         queueLarge := make(map[string][]string, filesLarge)
145         files = 0
146         fullSize = 0
147         progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
148         for fi := range walk(*dupDir) {
149                 if chmod > 0 {
150                         if err := os.Chmod(fi.Path, chmod); err != nil {
151                                 log.Fatal(err)
152                         }
153                 }
154                 if fi.Size == 0 {
155                         continue
156                 }
157                 origs, ok := size2fi[fi.Size]
158                 if !ok {
159                         continue
160                 }
161                 paths := make([]string, 0, len(origs))
162                 for _, orig := range origs {
163                         if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
164                                 continue
165                         }
166                         paths = append(paths, orig.Path)
167                 }
168                 files++
169                 fullSize += fi.Size
170                 if fi.Size <= SizeBoundary {
171                         queueSmall[fi.Path] = paths
172                 } else {
173                         queueLarge[fi.Path] = paths
174                 }
175         }
176         size2fi = nil
177         progress.Stop()
178
179         log.Println("deduplicating...")
180         progress = NewProgress(
181                 files,
182                 fullSize,
183                 &files,
184                 &fullSize,
185                 " processed",
186                 " deduplicated",
187         )
188         files = 0
189         fullSize = 0
190         deduped := 0
191         termRequired := make(chan os.Signal, 1)
192         signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
193         go func() {
194                 <-termRequired
195                 canExit.Lock()
196                 progress.Stop()
197                 log.Println(deduped, "files deduplicated")
198                 os.Exit(0)
199         }()
200         bufDup := make([]byte, SizeBoundary)
201         bufOrig := make([]byte, SizeBoundary)
202         seen := make(map[string]struct{}, len(queueSmall))
203         for dup, origs := range queueSmall {
204                 files++
205                 if _, ok := seen[dup]; ok {
206                         continue
207                 }
208                 fdDup, err := os.Open(dup)
209                 if err != nil {
210                         log.Fatal(err)
211                 }
212                 sizeDup, err := io.ReadFull(fdDup, bufDup)
213                 if !(err == nil || err == io.ErrUnexpectedEOF) {
214                         log.Fatal(err)
215                 }
216                 if err = fdDup.Close(); err != nil {
217                         log.Fatal(err)
218                 }
219                 for _, orig := range origs {
220                         fdOrig, err := os.Open(orig)
221                         if err != nil {
222                                 log.Fatal(err)
223                         }
224                         sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
225                         if !(err == nil || err == io.ErrUnexpectedEOF) {
226                                 log.Fatal(err)
227                         }
228                         if sizeOrig != sizeDup {
229                                 log.Fatalln(dup, orig, "unexpectedly different sizes")
230                         }
231                         if err = fdOrig.Close(); err != nil {
232                                 log.Fatal(err)
233                         }
234                         if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
235                                 continue
236                         }
237                         link(dup, orig, *action, *doFsync)
238                         seen[orig] = struct{}{}
239                         deduped++
240                         fullSize += int64(sizeDup)
241                         break
242                 }
243         }
244         queueSmall = nil
245
246         hasher, err := blake2b.New512(nil)
247         if err != nil {
248                 panic(err)
249         }
250         seen = make(map[string]struct{}, len(queueLarge))
251         var sizeDup int64
252         for dup, origs := range queueLarge {
253                 files++
254                 if _, ok := seen[dup]; ok {
255                         continue
256                 }
257                 fdDup, err := os.Open(dup)
258                 if err != nil {
259                         log.Fatal(err)
260                 }
261                 if _, err := io.ReadFull(fdDup, bufDup); err != nil {
262                         log.Fatal(err)
263                 }
264                 var hashDup []byte
265                 for _, orig := range origs {
266                         fdOrig, err := os.Open(orig)
267                         if err != nil {
268                                 log.Fatal(err)
269                         }
270                         if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
271                                 log.Fatal(err)
272                         }
273                         if bytes.Compare(bufDup, bufOrig) != 0 {
274                                 if err = fdOrig.Close(); err != nil {
275                                         log.Fatal(err)
276                                 }
277                                 continue
278                         }
279                         if hashDup == nil {
280                                 hasher.Reset()
281                                 if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
282                                         log.Fatalln("can not write to hash", err)
283                                 }
284                                 sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
285                                 if err != nil {
286                                         log.Fatal(err)
287                                 }
288                                 hashDup = hasher.Sum(nil)
289                         }
290                         hasher.Reset()
291                         if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
292                                 log.Fatalln("can not write to hash", err)
293                         }
294                         if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
295                                 log.Fatal(err)
296                         }
297                         if err = fdOrig.Close(); err != nil {
298                                 log.Fatal(err)
299                         }
300                         if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
301                                 continue
302                         }
303                         link(dup, orig, *action, *doFsync)
304                         seen[orig] = struct{}{}
305                         deduped++
306                         fullSize += sizeDup
307                         break
308                 }
309                 if err = fdDup.Close(); err != nil {
310                         log.Fatal(err)
311                 }
312         }
313         termRequired <- syscall.SIGTERM
314         <-termRequired
315 }