1 /* FDUPES Copyright (c) 1999 Adrian Lopez
3 Permission is hereby granted, free of charge, to any person
4 obtaining a copy of this software and associated documentation files
5 (the "Software"), to deal in the Software without restriction,
6 including without limitation the rights to use, copy, modify, merge,
7 publish, distribute, sublicense, and/or sell copies of the Software,
8 and to permit persons to whom the Software is furnished to do so,
9 subject to the following conditions:
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
37 #define ISFLAG(a,b) ((a & b) == b)
38 #define SETFLAG(a,b) (a |= b)
40 #define F_RECURSE 0x001
41 #define F_HIDEPROGRESS 0x002
42 #define F_DSAMELINE 0x004
43 #define F_FOLLOWLINKS 0x008
44 #define F_DELETEFILES 0x010
45 #define F_EXCLUDEEMPTY 0x020
46 #define F_CONSIDERHARDLINKS 0x040
47 #define F_SHOWSIZE 0x080
48 #define F_OMITFIRST 0x100
52 unsigned long flags = 0;
54 #define CHUNK_SIZE 8192
56 #define INPUT_SIZE 256
58 typedef struct _file {
63 int hasdupes; /* true only if file is first on duplicate chain */
64 struct _file *duplicates;
68 typedef struct _filetree {
70 #ifdef EXPERIMENTAL_RBTREE
72 struct _filetree *parent;
74 struct _filetree *left;
75 struct _filetree *right;
78 #ifdef EXPERIMENTAL_RBTREE
83 void errormsg(char *message, ...)
87 va_start(ap, message);
89 fprintf(stderr, "\r%40s\r%s: ", "", program_name);
90 vfprintf(stderr, message, ap);
93 void escapefilename(char *escape_list, char **filename_ptr)
100 filename = *filename_ptr;
102 tmp = (char*) malloc(strlen(filename) * 2 + 1);
104 errormsg("out of memory!\n");
108 for (x = 0, tx = 0; x < strlen(filename); x++) {
109 if (strchr(escape_list, filename[x]) != NULL) tmp[tx++] = '\\';
110 tmp[tx++] = filename[x];
116 *filename_ptr = realloc(*filename_ptr, strlen(tmp) + 1);
117 if (*filename_ptr == NULL) {
118 errormsg("out of memory!\n");
121 strcpy(*filename_ptr, tmp);
125 off_t filesize(char *filename) {
128 if (stat(filename, &s) != 0) return -1;
133 ino_t getinode(char *filename) {
136 if (stat(filename, &s) != 0) return 0;
141 int grokdir(char *dir, file_t **filelistp)
145 struct dirent *dirinfo;
150 static int progress = 0;
151 static char indicator[] = "-\\|/";
156 errormsg("could not chdir to %s\n", dir);
160 while ((dirinfo = readdir(cd)) != NULL) {
161 if (strcmp(dirinfo->d_name, ".") && strcmp(dirinfo->d_name, "..")) {
162 if (!ISFLAG(flags, F_HIDEPROGRESS)) {
163 fprintf(stderr, "\rBuilding file list %c ", indicator[progress]);
164 progress = (progress + 1) % 4;
167 newfile = (file_t*) malloc(sizeof(file_t));
170 errormsg("out of memory!\n");
173 } else newfile->next = *filelistp;
176 newfile->crcsignature = NULL;
177 newfile->duplicates = NULL;
178 newfile->hasdupes = 0;
180 newfile->d_name = (char*)malloc(strlen(dir)+strlen(dirinfo->d_name)+2);
182 if (!newfile->d_name) {
183 errormsg("out of memory!\n");
189 strcpy(newfile->d_name, dir);
190 lastchar = strlen(dir) - 1;
191 if (lastchar >= 0 && dir[lastchar] != '/')
192 strcat(newfile->d_name, "/");
193 strcat(newfile->d_name, dirinfo->d_name);
195 if (filesize(newfile->d_name) == 0 && ISFLAG(flags, F_EXCLUDEEMPTY)) {
196 free(newfile->d_name);
201 if (stat(newfile->d_name, &info) == -1) {
202 free(newfile->d_name);
207 if (lstat(newfile->d_name, &linfo) == -1) {
208 free(newfile->d_name);
213 if (S_ISDIR(info.st_mode)) {
214 if (ISFLAG(flags, F_RECURSE) && (ISFLAG(flags, F_FOLLOWLINKS) || !S_ISLNK(linfo.st_mode)))
215 filecount += grokdir(newfile->d_name, filelistp);
216 free(newfile->d_name);
219 if (S_ISREG(linfo.st_mode) || (S_ISLNK(linfo.st_mode) && ISFLAG(flags, F_FOLLOWLINKS))) {
220 *filelistp = newfile;
223 free(newfile->d_name);
237 /* If EXTERNAL_MD5 is not defined, use L. Peter Deutsch's MD5 library.
239 char *getcrcsignature(char *filename)
245 md5_byte_t digest[16];
246 static md5_byte_t chunk[CHUNK_SIZE];
247 static char signature[16*2 + 1];
253 fsize = filesize(filename);
255 file = fopen(filename, "rb");
257 errormsg("error opening file %s\n", filename);
262 toread = (fsize % CHUNK_SIZE) ? (fsize % CHUNK_SIZE) : CHUNK_SIZE;
263 if (fread(chunk, toread, 1, file) != 1) {
264 errormsg("error reading from file %s\n", filename);
267 md5_append(&state, chunk, toread);
271 md5_finish(&state, digest);
275 for (x = 0; x < 16; x++) {
276 sprintf(sigp, "%02x", digest[x]);
277 sigp = strchr(sigp, '\0');
285 #endif /* [#ifndef EXTERNAL_MD5] */
289 /* If EXTERNAL_MD5 is defined, use md5sum program to calculate signatures.
291 char *getcrcsignature(char *filename)
293 static char signature[256];
298 command = (char*) malloc(strlen(filename)+strlen(EXTERNAL_MD5)+2);
299 if (command == NULL) {
300 errormsg("out of memory\n");
304 sprintf(command, "%s %s", EXTERNAL_MD5, filename);
306 result = popen(command, "r");
307 if (result == NULL) {
308 errormsg("error invoking %s\n", EXTERNAL_MD5);
314 if (fgets(signature, 256, result) == NULL) {
315 errormsg("error generating signature for %s\n", filename);
318 separator = strchr(signature, ' ');
319 if (separator) *separator = '\0';
326 #endif /* [#ifdef EXTERNAL_MD5] */
328 void purgetree(filetree_t *checktree)
330 if (checktree->left != NULL) purgetree(checktree->left);
332 if (checktree->right != NULL) purgetree(checktree->right);
337 #ifdef EXPERIMENTAL_RBTREE
338 /* Use a red-black tree structure to store file information.
341 void rotate_left(filetree_t **root, filetree_t *node)
345 subject = node->right;
346 node->right = subject->left;
348 if (subject->left != NULL) subject->left->parent = node;
349 subject->parent = node->parent;
351 if (node->parent == NULL) {
354 if (node == node->parent->left)
355 node->parent->left = subject;
357 node->parent->right = subject;
360 subject->left = node;
361 node->parent = subject;
364 void rotate_right(filetree_t **root, filetree_t *node)
368 subject = node->left;
369 node->left = subject->right;
371 if (subject->right != NULL) subject->right->parent = node;
372 subject->parent = node->parent;
374 if (node->parent == NULL) {
377 if (node == node->parent->left)
378 node->parent->left = subject;
380 node->parent->right = subject;
383 subject->right = node;
384 node->parent = subject;
391 void registerfile(filetree_t **root, filetree_t *parent, int loc, file_t *file)
396 file->size = filesize(file->d_name);
397 file->inode = getinode(file->d_name);
399 node = (filetree_t*) malloc(sizeof(filetree_t));
401 errormsg("out of memory!\n");
408 node->parent = parent;
409 node->color = COLOR_RED;
411 if (loc == TREE_ROOT)
413 else if (loc == TREE_LEFT)
416 parent->right = node;
418 while (node != *root && node->parent->color == COLOR_RED) {
419 if (node->parent->parent == NULL) return;
421 if (node->parent == node->parent->parent->left) {
422 uncle = node->parent->parent->right;
423 if (uncle == NULL) return;
425 if (uncle->color == COLOR_RED) {
426 node->parent->color = COLOR_BLACK;
427 uncle->color = COLOR_BLACK;
428 node->parent->parent->color = COLOR_RED;
429 node = node->parent->parent;
431 if (node == node->parent->right) {
433 rotate_left(root, node);
435 node->parent->color = COLOR_BLACK;
436 node->parent->parent->color = COLOR_RED;
437 rotate_right(root, node->parent->parent);
440 uncle = node->parent->parent->left;
441 if (uncle == NULL) return;
443 if (uncle->color == COLOR_RED) {
444 node->parent->color = COLOR_BLACK;
445 uncle->color = COLOR_BLACK;
446 node->parent->parent->color = COLOR_RED;
447 node = node->parent->parent;
449 if (node == node->parent->right) {
451 rotate_left(root, node);
453 node->parent->color = COLOR_BLACK;
454 node->parent->parent->color = COLOR_RED;
455 rotate_right(root, node->parent->parent);
460 (*root)->color = COLOR_BLACK;
463 #endif /* [#ifdef EXPERIMENTAL_RBTREE] */
465 #ifndef EXPERIMENTAL_RBTREE
467 int registerfile(filetree_t **branch, file_t *file)
469 file->size = filesize(file->d_name);
470 file->inode = getinode(file->d_name);
472 *branch = (filetree_t*) malloc(sizeof(filetree_t));
473 if (*branch == NULL) {
474 errormsg("out of memory!\n");
478 (*branch)->file = file;
479 (*branch)->left = NULL;
480 (*branch)->right = NULL;
485 #endif /* [#ifndef EXPERIMENTAL_RBTREE] */
487 file_t *checkmatch(filetree_t **root, filetree_t *checktree, file_t *file)
493 /* If inodes are equal one of the files is a hard link, which
494 is usually not accidental. We don't want to flag them as
495 duplicates, unless the user specifies otherwise. */
497 if (!ISFLAG(flags, F_CONSIDERHARDLINKS) && getinode(file->d_name) ==
498 checktree->file->inode) return NULL;
500 fsize = filesize(file->d_name);
502 if (fsize < checktree->file->size)
505 if (fsize > checktree->file->size) cmpresult = 1;
507 if (checktree->file->crcsignature == NULL) {
508 crcsignature = getcrcsignature(checktree->file->d_name);
509 if (crcsignature == NULL) return NULL;
511 checktree->file->crcsignature = (char*) malloc(strlen(crcsignature)+1);
512 if (checktree->file->crcsignature == NULL) {
513 errormsg("out of memory\n");
516 strcpy(checktree->file->crcsignature, crcsignature);
519 if (file->crcsignature == NULL) {
520 crcsignature = getcrcsignature(file->d_name);
521 if (crcsignature == NULL) return NULL;
523 file->crcsignature = (char*) malloc(strlen(crcsignature)+1);
524 if (file->crcsignature == NULL) {
525 errormsg("out of memory\n");
528 strcpy(file->crcsignature, crcsignature);
531 cmpresult = strcmp(file->crcsignature, checktree->file->crcsignature);
535 if (checktree->left != NULL) {
536 return checkmatch(root, checktree->left, file);
538 #ifndef EXPERIMENTAL_RBTREE
539 registerfile(&(checktree->left), file);
541 registerfile(root, checktree, TREE_LEFT, file);
545 } else if (cmpresult > 0) {
546 if (checktree->right != NULL) {
547 return checkmatch(root, checktree->right, file);
549 #ifndef EXPERIMENTAL_RBTREE
550 registerfile(&(checktree->right), file);
552 registerfile(root, checktree, TREE_RIGHT, file);
556 } else return checktree->file;
559 /* Do a bit-for-bit comparison in case two different files produce the
560 same signature. Unlikely, but better safe than sorry. */
562 int confirmmatch(FILE *file1, FILE *file2)
569 fseek(file1, 0, SEEK_SET);
570 fseek(file2, 0, SEEK_SET);
573 r1 = fread(&c1, sizeof(c1), 1, file1);
574 r2 = fread(&c2, sizeof(c2), 1, file2);
576 if (c1 != c2) return 0; /* file contents are different */
579 if (r1 != r2) return 0; /* file lengths are different */
584 void printmatches(file_t *files)
588 while (files != NULL) {
589 if (files->hasdupes) {
590 if (!ISFLAG(flags, F_OMITFIRST)) {
591 if (ISFLAG(flags, F_SHOWSIZE)) printf("%ld byte%seach:\n", files->size,
592 (files->size != 1) ? "s " : " ");
593 if (ISFLAG(flags, F_DSAMELINE)) escapefilename("\\ ", &files->d_name);
594 printf("%s%c", files->d_name, ISFLAG(flags, F_DSAMELINE)?' ':'\n');
596 tmpfile = files->duplicates;
597 while (tmpfile != NULL) {
598 if (ISFLAG(flags, F_DSAMELINE)) escapefilename("\\ ", &tmpfile->d_name);
599 printf("%s%c", tmpfile->d_name, ISFLAG(flags, F_DSAMELINE)?' ':'\n');
600 tmpfile = tmpfile->duplicates;
610 void autodelete(file_t *files)
631 if (curfile->hasdupes) {
635 tmpfile = curfile->duplicates;
638 tmpfile = tmpfile->duplicates;
641 if (counter > max) max = counter;
644 curfile = curfile->next;
649 dupelist = (file_t**) malloc(sizeof(file_t*) * max);
650 preserve = (int*) malloc(sizeof(int) * max);
651 preservestr = (char*) malloc(INPUT_SIZE);
653 if (!dupelist || !preserve || !preservestr) {
654 errormsg("out of memory\n");
659 if (files->hasdupes) {
662 dupelist[counter] = files;
664 printf("[%d] %s\n", counter, files->d_name);
666 tmpfile = files->duplicates;
669 dupelist[++counter] = tmpfile;
670 printf("[%d] %s\n", counter, tmpfile->d_name);
671 tmpfile = tmpfile->duplicates;
677 printf("Set %d of %d, preserve files [1 - %d, all]",
678 curgroup, groups, counter);
679 if (ISFLAG(flags, F_SHOWSIZE)) printf(" (%ld byte%seach)", files->size,
680 (files->size != 1) ? "s " : " ");
684 fgets(preservestr, INPUT_SIZE, stdin);
686 i = strlen(preservestr) - 1;
688 while (preservestr[i]!='\n'){ /* tail of buffer must be a newline */
690 realloc(preservestr, strlen(preservestr) + 1 + INPUT_SIZE);
691 if (!tstr) { /* couldn't allocate memory, treat as fatal */
692 errormsg("out of memory!\n");
697 if (!fgets(preservestr + i + 1, INPUT_SIZE, stdin))
698 break; /* stop if fgets fails -- possible EOF? */
699 i = strlen(preservestr)-1;
702 for (x = 1; x <= counter; x++) preserve[x] = 0;
704 token = strtok(preservestr, " ,\n");
706 while (token != NULL) {
707 if (strcasecmp(token, "all") == 0)
708 for (x = 0; x <= counter; x++) preserve[x] = 1;
711 sscanf(token, "%d", &number);
712 if (number > 0 && number <= counter) preserve[number] = 1;
714 token = strtok(NULL, " ,\n");
717 for (sum = 0, x = 1; x <= counter; x++) sum += preserve[x];
718 } while (sum < 1); /* make sure we've preserved at least one file */
722 for (x = 1; x <= counter; x++) {
724 printf(" [+] %s\n", dupelist[x]->d_name);
726 printf(" [-] %s\n", dupelist[x]->d_name);
727 remove(dupelist[x]->d_name);
743 printf("Usage: fdupes [options] DIRECTORY...\n\n");
745 printf(" -r --recurse \tinclude files residing in subdirectories\n");
746 printf(" -s --symlinks \tfollow symlinks\n");
747 printf(" -H --hardlinks \tnormally, when two or more files point to the same\n");
748 printf(" \tdisk area they are treated as non-duplicates; this\n");
749 printf(" \toption will change this behavior\n");
750 printf(" -n --noempty \texclude zero-length files from consideration\n");
751 printf(" -f --omitfirst \tomit the first file in each set of matches\n");
752 printf(" -1 --sameline \tlist each set of matches on a single line\n");
753 printf(" -S --size \tshow size of duplicate files\n");
754 printf(" -q --quiet \thide progress indicator\n");
755 printf(" -d --delete \tprompt user for files to preserve and delete all\n");
756 printf(" \tothers; important: under particular circumstances,\n");
757 printf(" \tdata may be lost when using this option together\n");
758 printf(" \twith -s or --symlinks, or when specifying a\n");
759 printf(" \tparticular directory more than once; refer to the\n");
760 printf(" \tfdupes documentation for additional information\n");
761 printf(" -v --version \tdisplay fdupes version\n");
762 printf(" -h --help \tdisplay this help message\n\n");
765 int main(int argc, char **argv) {
770 file_t *files = NULL;
772 file_t *match = NULL;
773 filetree_t *checktree = NULL;
777 static struct option long_options[] =
779 { "omitfirst", 0, 0, 'f' },
780 { "recurse", 0, 0, 'r' },
781 { "quiet", 0, 0, 'q' },
782 { "sameline", 0, 0, '1' },
783 { "size", 0, 0, 'S' },
784 { "symlinks", 0, 0, 's' },
785 { "hardlinks", 0, 0, 'H' },
786 { "noempty", 0, 0, 'n' },
787 { "delete", 0, 0, 'd' },
788 { "version", 0, 0, 'v' },
789 { "help", 0, 0, 'h' },
793 program_name = argv[0];
795 while ((opt = getopt_long(argc, argv, "frq1SsHndvh", long_options, NULL)) != EOF) {
798 SETFLAG(flags, F_OMITFIRST);
801 SETFLAG(flags, F_RECURSE);
804 SETFLAG(flags, F_HIDEPROGRESS);
807 SETFLAG(flags, F_DSAMELINE);
810 SETFLAG(flags, F_SHOWSIZE);
813 SETFLAG(flags, F_FOLLOWLINKS);
816 SETFLAG(flags, F_CONSIDERHARDLINKS);
819 SETFLAG(flags, F_EXCLUDEEMPTY);
822 SETFLAG(flags, F_DELETEFILES);
825 printf("fdupes %s\n", VERSION);
831 fprintf(stderr, "Try `fdupes --help' for more information\n");
836 if (optind >= argc) {
837 errormsg("no directories specified\n");
841 for (x = optind; x < argc; x++) filecount += grokdir(argv[x], &files);
849 #ifndef EXPERIMENTAL_RBTREE
850 registerfile(&checktree, curfile);
852 registerfile(&checktree, NULL, TREE_ROOT, curfile);
855 match = checkmatch(&checktree, checktree, curfile);
858 file1 = fopen(curfile->d_name, "rb");
860 curfile = curfile->next;
864 file2 = fopen(match->d_name, "rb");
867 curfile = curfile->next;
871 if (confirmmatch(file1, file2)) {
873 curfile->duplicates = match->duplicates;
874 match->duplicates = curfile;
881 curfile = curfile->next;
883 if (!ISFLAG(flags, F_HIDEPROGRESS)) {
884 fprintf(stderr, "\rProgress [%d/%d] %d%% ", progress, filecount,
885 (int)((float) progress / (float) filecount * 100.0));
890 if (!ISFLAG(flags, F_HIDEPROGRESS)) fprintf(stderr, "\r%40s\r", " ");
892 if (ISFLAG(flags, F_DELETEFILES)) autodelete(files);
893 else printmatches(files);
896 curfile = files->next;
902 purgetree(checktree);