-
Notifications
You must be signed in to change notification settings - Fork 33
/
find_similar_filenames_unidec.pl
executable file
·115 lines (89 loc) · 2.52 KB
/
find_similar_filenames_unidec.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl
# Daniel "Trizen" Șuteu
# License: GPLv3
# Date: 22 June 2012
# https://github.com/trizen
# Find files which have exactly or *ALMOST*
# exactly the same name in a given path.
# Improved version here:
# https://github.com/trizen/perl-scripts/blob/master/Finders/fsfn.pl
use 5.014;
use strict;
use warnings;
use File::Find qw(find);
use Getopt::Std qw(getopts);
use Text::Unidecode qw(unidecode);
my @dirs = grep { -d } @ARGV;
die <<"HELP" if !@dirs;
usage: $0 [options] /my/path [...]
Options:
-f : keep only the first file
-l : keep only the last file
HELP
my %opts;
if (@ARGV) {
getopts("fl", \%opts);
}
sub compare_strings ($$) {
my ($name1, $name2) = @_;
return 0 if $name1 eq $name2;
if (length($name1) > length($name2)) {
($name2, $name1) = ($name1, $name2);
}
my $len1 = length($name1);
my $len2 = length($name2);
my $min = int(0.5 + $len2 / 2);
return -1 if $min > $len1;
my $diff = $len1 - $min;
foreach my $i (0 .. $diff) {
foreach my $j ($i .. $diff) {
if (index($name2, substr($name1, $i, $min + $j - $i)) != -1) {
return 0;
}
}
}
return 1;
}
sub find_duplicated_files (&@) {
my $code = shift;
my %files;
find {
wanted => sub {
lstat;
-f _ && (not -l _) && push @{$files{"key"}}, # to group files by size, change the "key" to '-s _' (unquoted)
{
name => do { utf8::decode($_); lc(unidecode($_) =~ s{\.\w+\z}{}r) },
real_name => $File::Find::name,
};
}
} => @_;
foreach my $files (values %files) {
next if $#{$files} < 1;
my %dups;
foreach my $i (0 .. $#{$files} - 1) {
for (my $j = $i + 1 ; $j <= $#{$files} ; $j++) {
if (compare_strings($files->[$i]{name}, $files->[$j]{name}) == 0) {
push @{$dups{$files->[$i]{real_name}}}, ${splice @{$files}, $j--, 1}{real_name};
}
}
}
while (my ($fparent, $fdups) = each %dups) {
$code->(sort $fparent, @{$fdups});
}
}
return 1;
}
{
local $, = "\n";
find_duplicated_files {
say @_, "-" x 80 if @_;
foreach my $i (
$opts{f} ? (1 .. $#_)
: $opts{l} ? (0 .. $#_ - 1)
: ()
) {
unlink $_[$i] or warn "[error]: Can't delete: $!\n";
}
}
@dirs;
}