2010-08-09

Check Out Which Files Are Duplicated

I wrote a little script to find out which files are duplicated inside a directory tree. It also reports how many bytes are wasted by storing the same data more than once.


#!/usr/bin/perl -w

# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software
# and associated documentation files (the
# "Software"), to deal in the Software without
# restriction, including without limitation the
# rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom
# the Software is furnished to do so, subject
# to the following conditions:
#
# This permission notice shall be included in
# all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT
# WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR
# IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.

use strict;
use File::Find;
use Digest::MD5;
use Digest::SHA;


# Set to a true value to trace every visited entry and its digests.
my $debug = 0;

# The directories to scan are taken from the command line; when none are
# given, the current directory is scanned.
@ARGV = ('.') if !@ARGV;
my $banner = @ARGV > 1
    ? "Base directories: @ARGV\n\n"
    : "Base directory: @ARGV\n\n";
print $banner;

# Collected results, filled by save_data(). Each key is the MD5 and SHA hex
# digests of a file joined together; each value is a hash holding "count"
# (number of files with that content), "size" (bytes of one copy) and
# "filenames" (the paths of those files).
my %md5_sha_data;

# Walk all base directories, handing every entry to process_file().
find({ wanted => \&process_file }, @ARGV);

# Print the duplicate groups and the total wasted space.
report();


# File::Find callback: fingerprint one directory entry and record it.
#
# Reads the entry name from $_ (used for all file operations) and the path
# to report from $File::Find::name. Regular files are hashed with both MD5
# and SHA and handed to save_data() together with their size in bytes.
#
# Skipped without being recorded:
#   - symbolic links and directories,
#   - anything else that is not a regular file (FIFOs, sockets, devices),
#     because reading those can block or never reach EOF,
#   - files that cannot be opened or rewound; a warning is written to
#     STDERR and the scan continues with the next entry.
sub process_file {
    my $file     = $_;
    my $fullfile = $File::Find::name;
    print "Processing: $file\n" if $debug;

    # -l must be tested on its own: -f follows symbolic links, so a link
    # pointing at a regular file would otherwise be hashed as a second copy.
    if (-l $file or not -f $file) {
        print "skipped\n\n" if $debug;
        return;
    }

    my $fh;
    if (!open($fh, '<', $file)) {
        warn "Error: cannot open $fullfile: $!\n";
        return;
    }
    binmode($fh);

    my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;

    # addfile() consumes the handle up to EOF. Without rewinding, the second
    # digest would be computed over zero bytes and be identical for every
    # file, adding nothing to the comparison.
    if (!seek($fh, 0, 0)) {
        warn "Error: cannot rewind $fullfile: $!\n";
        close($fh);
        return;
    }
    my $sha = Digest::SHA->new->addfile($fh)->hexdigest;

    # -s yields an empty string for a zero-length file; store a plain 0.
    my $size = (-s $fh) || 0;
    close($fh);
    print "$md5 $sha $size\n\n" if $debug;

    save_data($fullfile, $md5, $sha, $size);
}


# Record one hashed file in %md5_sha_data.
#
# Arguments, in order: the file's path, its MD5 hex digest, its SHA hex
# digest and its size in bytes. Files sharing both digests land in the same
# entry; the size is stored when the entry is first created and the path is
# appended to the entry's list of filenames on every call.
sub save_data {
    my ($path, $md5, $sha, $size) = @_;
    my $key = $md5 . $sha;

    # Create the entry on first sight of this digest pair, then treat new
    # and existing entries the same way.
    my $entry = $md5_sha_data{$key} ||= {
        "count"     => 0,
        "size"      => $size,
        "filenames" => [],
    };

    $entry->{"count"}++;
    push @{ $entry->{"filenames"} }, $path;
}


# Print every group of identical files followed by the space wasted overall.
#
# Reads %md5_sha_data and writes to STDOUT. Groups are listed in sorted
# order of their digest key; within a group the paths appear in the order
# they were recorded. Takes no arguments.
sub report {
    my $total = 0;

    for my $hash (sort keys %md5_sha_data) {
        my $entry = $md5_sha_data{$hash};
        next unless $entry->{"count"} > 1;

        print "Multiple copies of equal file\n";
        print " $_\n" for @{ $entry->{"filenames"} };

        # One copy is legitimate; only the remaining copies are wasted.
        my $sum = ($entry->{"count"} - 1) * $entry->{"size"};
        $total += $sum;
        print "$sum Bytes\n\n";
    }

    # Derive both figures from the exact byte count. Computing MiB from the
    # already-rounded KiB value would round twice and can be off by one
    # (e.g. 1572400 bytes -> 1536 KiB -> "2 MiB" instead of "1 MiB").
    my $KBytes = sprintf("%.0f", $total / 1024);
    my $MBytes = sprintf("%.0f", $total / (1024 * 1024));

    print "$total Bytes ($KBytes KiB, $MBytes MiB) wasted space\n\n";
}