IT Fuzzy Blog: Check Out Which Files Are Double

I wrote a little script that finds duplicate files inside a directory tree. It also reports how many bytes are wasted by storing the same data more than once.

#!/usr/bin/perl -w

# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software
# and associated documentation files (the
# "Software"), to deal in the Software without
# restriction, including without limitation the
# rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom
# the Software is furnished to do so, subject
# to the following conditions:
#
# This permission notice shall be included in
# all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT
# WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR
# IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.

use strict;
use File::Find;
use Digest::MD5;
use Digest::SHA;


# Set to a true value to print each visited entry and its digests.
my $debug = 0;

# The command-line arguments are the base directories to scan;
# without arguments the current directory is used.
@ARGV = ('.') if !@ARGV;
print((@ARGV > 1 ? "Base directories" : "Base directory") . ": @ARGV\n\n");

# One record per distinct file content, keyed by the concatenated
# MD5 and SHA hex digests. Each record has the fields "count",
# "size" and "filenames".
my %md5_sha_data;

# Walk the directory trees and collect the digests
find(\&process_file, @ARGV);

# Print the duplicate groups and the wasted-space total
report();


# File::Find callback: hashes one directory entry and records it.
#
# Reads the entry name from $_ and its path from $File::Find::name.
# Symbolic links and everything that is not a regular file
# (directories, FIFOs, sockets, devices) are skipped. For regular
# files the MD5 and SHA digests and the byte count are passed to
# save_data() together with the path.
#
# Dies if the file cannot be opened or read.
sub process_file {
  my $file = $_;
  my $fullfile = $File::Find::name;
  print "Processing: $file\n" if $debug;

  # -l performs an lstat; "_" reuses that result, so -f tests the
  # entry itself and never the target of a link.
  if(-l $file || !-f _) {
    print "skipped\n\n" if $debug;
    return;
  }

  open(my $fh, '<', $file) or die "Error: $!\n\n";
  binmode($fh);

  # Read the file once and feed every chunk to both digests.
  # Calling addfile() twice on the same handle does not work: the
  # first call reads up to EOF, so the second one would hash zero
  # bytes and yield the same constant value for every file.
  my $md5_ctx = Digest::MD5->new;
  my $sha_ctx = Digest::SHA->new;
  my $size = 0;
  my $chunk;
  while(1) {
    my $got = read($fh, $chunk, 65536);
    die "Error: $!\n\n" unless defined $got;
    last if $got == 0;
    $md5_ctx->add($chunk);
    $sha_ctx->add($chunk);
    # Count the bytes actually hashed so size and digests agree
    $size += $got;
  }
  close($fh);

  my $md5 = $md5_ctx->hexdigest;
  my $sha = $sha_ctx->hexdigest;
  print "$md5 $sha $size\n\n" if $debug;

  save_data($fullfile, $md5, $sha, $size);
}


# Record one hashed file in %md5_sha_data.
#
# Arguments (positional): file path, MD5 hex digest, SHA hex digest,
# size in bytes. The two digests are concatenated to form the key.
# The first file seen for a key creates the record and fixes its
# "size"; every file increments "count" and is appended to
# "filenames".
sub save_data {
  my ($path, $md5_hex, $sha_hex, $bytes) = @_;
  my $key = $md5_hex . $sha_hex;

  # Fetch the record for this content, creating it on first sight
  my $entry = $md5_sha_data{$key} ||= {
    "count"     => 0,
    "size"      => $bytes,
    "filenames" => [],
  };

  $entry->{"count"}++;
  push @{$entry->{"filenames"}}, $path;
}


# Print the duplicate report to STDOUT.
#
# For every record in %md5_sha_data with more than one file, the
# file names are listed followed by the bytes wasted by that group.
# Groups are printed in sorted key order. The last line gives the
# total wasted space in Bytes, KiB and MiB.
sub report {
  my $total = 0;
  for my $hash ( sort keys %md5_sha_data ){
    my $entry = $md5_sha_data{$hash};
    next unless $entry->{"count"} > 1;

    print "Multiple copies of equal file\n";
    print "  $_\n" for @{$entry->{"filenames"}};

    # One copy is needed; every additional copy is wasted space
    my $sum = ($entry->{"count"} - 1) * $entry->{"size"};
    $total += $sum;
    print "$sum Bytes\n\n";
  }

  # Derive both units from the exact byte total. Dividing the
  # already rounded KiB value again would round twice and can
  # print a MiB figure that is off by one.
  my $KBytes = sprintf("%.0f", $total / 1024);
  my $MBytes = sprintf("%.0f", $total / (1024 * 1024));

  print "$total Bytes ($KBytes KiB, $MBytes MiB) wasted space\n\n";
}
IT Fuzzy Blog

2010-08-09

Check Out Which Files Are Double

About Me