# finddup

#! /usr/bin/env python

#
# Script to find duplicate files
# Uses multiprocessing to scan files in parallel.
#
# Sudhi Herle <sudhi@herle.net>
# GPLv2
#

import os, sys,  os.path
import re, zlib, mmap, signal
import multiprocessing as m
from os.path  import basename, dirname, join, isfile, isdir
from optparse import Option, OptionParser, OptionValueError

# Bind md5sum to an MD5 constructor. hashlib is preferred; the legacy
# standalone md5 module is used only when hashlib cannot be imported.
# Only ImportError is caught: a bare except would also swallow unrelated
# failures such as KeyboardInterrupt or SystemExit.
try:
    import hashlib
except ImportError:
    import md5
    md5sum = md5.md5
else:
    md5sum = hashlib.md5

# Binary size units; each one is 1024 times the previous one.
KB = 1 << 10
MB = KB << 10
GB = MB << 10
TB = GB << 10
PB = TB << 10

# (suffix, bytes-per-unit) pairs, ordered from the largest unit down to
# the smallest.
Divisors = [('PB', PB), ('TB', TB), ('GB', GB), ('MB', MB), ('kB', KB)]

# Usage text for the option parser; both placeholders are filled with
# the name this script was invoked as.
usage = """%s dir [dir...]

%s - Find duplicate files in one or more directories
""" % ((sys.argv[0],) * 2)

parser = OptionParser(usage)
parser.add_option(
    "-s", "--shell",
    action="store_true", dest="shell_cmds", default=False,
    help="Generate shell commands to delete duplicate files [%default]",
)

opt, args = parser.parse_args()

# With no directory on the command line, scan the current directory.
if not args:
    args.append('.')

# Regular expressions for paths that are skipped during the scan. Raw
# strings keep the backslashes literal instead of relying on invalid
# string escapes; the pattern text itself is unchanged.
Ignore_re = (r'\.svn',
             r'.*~$',
             r'\.*sw.$',
             r'\.CVS',
             r'\.hg',
             )

# Compile once into a real list. On Python 3 map() returns a one-shot
# iterator: it would be consumed while the first paths are checked and
# from then on no path would ever match an ignore pattern.
Ignore_re_list = [re.compile(x) for x in Ignore_re]


def block_sigs():
    """Make the calling process ignore SIGINT, SIGPIPE and SIGHUP.

    Used as the initializer of the worker pool, so it runs once in each
    worker process.
    """
    for signum in (signal.SIGINT, signal.SIGPIPE, signal.SIGHUP):
        signal.signal(signum, signal.SIG_IGN)


def ignore(name):
    """Tell whether the path 'name' must be ignored.

    Returns True as soon as one pattern of Ignore_re_list is found
    anywhere in 'name', and False when none of them matches.
    """
    return any(pat.search(name) is not None for pat in Ignore_re_list)

class bundle:
    """Plain record whose attributes are the keyword arguments given.

    For example bundle(size=1, fname='x') has attributes .size and
    .fname.
    """

    def __init__(self, **kwargs):
        for attr, value in kwargs.items():
            self.__dict__[attr] = value

class cksum_db:
    """Checksum files in a pool of worker processes and group the results.

    'cksum' is the function run in the pool for every file added; each
    value it returns is later read through its .cksum attribute.
    """

    def __init__(self, cksum):
        self.cksum = cksum
        # file name -> pending async result of cksum(file name)
        self.w = {}
        # Every worker calls block_sigs() once when it starts.
        # NOTE: nothing in this class closes or joins the pool.
        self.m = m.Pool(processes=None, initializer=block_sigs)

    def addfile(self, fn):
        """Queue file 'fn' for checksumming without waiting for the result."""
        self.w[fn] = self.m.apply_async(self.cksum, args=(fn,))

    def dups(self):
        """Wait for every queued checksum and return the duplicate groups.

        The result maps a checksum to the list of worker results that
        share it; checksums seen for only one file are left out.
        """
        by_cksum = {}
        for pending in self.w.values():
            res = pending.get()
            by_cksum.setdefault(res.cksum, []).append(res)

        return {ck: group for ck, group in by_cksum.items() if len(group) > 1}


def mmap_gen(fn, chunksize=None):
    """Generator that yields mmap'd chunks of a file and their length.

    Parameters:
        fn        - path of the file to map; opened read-only in binary mode.
        chunksize - largest number of bytes mapped at once. Defaults to
                    2 GB. Every chunk after the first starts at a
                    multiple of this value, so for files larger than one
                    chunk it must be a multiple of
                    mmap.ALLOCATIONGRANULARITY.

    Yields:
        (mm, n) where mm is a read-only mmap object covering the next n
        bytes of the file. The mapping is closed as soon as the consumer
        asks for the next chunk or abandons the generator, so it must not
        be used after that. An empty file yields nothing.

    Raises:
        Exception - when a chunk cannot be mapped; the original error is
                    attached as the cause.
    """
    if chunksize is None:
        chunksize = 2 * GB

    # The with-block closes the file on every exit path, including a
    # failed mmap and a generator that is closed before it is exhausted.
    with open(fn, 'rb') as fd:
        fdn = fd.fileno()
        remaining = os.fstat(fdn).st_size
        off = 0
        while remaining > 0:
            z = min(chunksize, remaining)

            # Only the mmap call is guarded, so errors raised by the
            # consumer are never re-labelled as mmap failures.
            try:
                mm = mmap.mmap(fdn, z, access=mmap.ACCESS_READ, offset=off)
            except Exception as ex:
                ss = "can't mmap %d bytes of %s: %s" % (z, fn, str(ex))
                raise Exception(ss) from ex

            try:
                yield mm, z
            finally:
                # Also runs when the consumer stops iterating early.
                mm.close()

            remaining -= z
            off += z


def cksum_slow(filename):
    """Compute the strong checksum of a file, using the md5sum hash.

    The file is walked chunk by chunk through mmap_gen().

    Returns a bundle with:
        size  - total number of bytes hashed
        fname - 'filename' exactly as given
        cksum - hexadecimal digest string
    """
    digest = md5sum()
    sz = 0

    for mm, n in mmap_gen(filename):
        # Hash the mapping itself (it is a bytes-like object). Calling
        # mm.read(n) first would copy the entire chunk into a new bytes
        # object before hashing it.
        digest.update(mm)
        sz += n

    return bundle(size=sz, fname=filename, cksum=digest.hexdigest())

def cksum_quick(filename):
    """Compute the cheap, coarse checksum of a file with zlib.adler32.

    The file is walked chunk by chunk through mmap_gen(), carrying the
    running checksum from one chunk to the next.

    Returns a bundle with:
        size  - total number of bytes checksummed
        fname - 'filename' exactly as given
        cksum - the checksum formatted as a '0x...' hexadecimal string
    """
    v = 0   # starting value; kept at 0 so existing checksums are unchanged
    sz = 0
    for mm, n in mmap_gen(filename):
        # Checksum the mapping itself (it is a bytes-like object).
        # Calling mm.read(n) first would copy the entire chunk into a
        # new bytes object.
        v = zlib.adler32(mm, v)
        sz += n

    # Fold a negative result to its magnitude. Python 3 always returns
    # an unsigned value, so this only matters on older interpreters.
    v = -v if v < 0 else v
    return bundle(size=sz, fname=filename, cksum="%#x" % v)


def descend(db, dn):
    """Walk the tree rooted at 'dn' and hand its files to 'db'.

    Every entry for which isfile() is true and ignore() is false is
    passed, as a path joined onto its directory, to db.addfile().
    """
    for root, _subdirs, names in os.walk(dn, topdown=True):
        for name in names:
            path = join(root, name)
            if isfile(path) and not ignore(path):
                db.addfile(path)

def human(n):
    """Return human readable size for n bytes.

    The largest unit in Divisors that fits into 'n' is used and the
    value is printed with two decimals, e.g. 1536 -> '1.50 kB'. Values
    below the smallest unit are returned as a plain integer string.
    """
    for suffix, unit in Divisors:
        # >= rather than >: exactly one unit (e.g. 1024 bytes) must be
        # shown as '1.00 kB', not fall through to the next smaller unit.
        if n >= unit:
            return "%4.2f %s" % (float(n) / unit, suffix)

    return "%lu" % n


class shell:
    """Abstraction to print shell commands to remove dups.

    The output is a /bin/sh script: one 'rm -f' line per duplicate, a
    commented-out 'rm -f' line for the file that is kept, and a final
    comment with the total space saved.
    """

    def __init__(self):
        # Running total, in bytes, of the space freed by the rm commands.
        self.tot = 0
        print("#! /bin/sh\n")

    @staticmethod
    def _quote(path):
        """Return 'path' as one single-quoted /bin/sh word.

        Embedded single quotes are written as '\\'' (close the quote,
        escaped quote, reopen). Names without quotes come out exactly
        as before, always wrapped in single quotes.
        """
        return "'" + path.replace("'", "'\\''") + "'"

    def dups(self, k, keep, rm):
        """Print the commands for one group of identical files.

        k    - checksum shared by the group
        keep - the file that is kept (needs .size and .fname)
        rm   - list of the files to delete (each needs .fname)
        """
        waste     = keep.size * len(rm)
        self.tot += waste
        # File names come from the scanned tree and are untrusted: they
        # must be quoted, or a quote in a name would break out of the
        # command.
        cmds = '\n'.join(["rm -f %s" % self._quote(x.fname) for x in rm])
        # The kept file's command is emitted as a comment. A newline in
        # its name would end that comment, so every continuation line
        # is commented as well.
        kept = ("#rm -f %s" % self._quote(keep.fname)).replace("\n", "\n#")
        print("# %s: %s, saving %s\n%s\n%s\n" % \
                (k, human(keep.size), human(waste), kept, cmds))

    def finish(self):
        """Print the total amount of space saved as a trailing comment."""
        print("# %s saved" % human(self.tot))


class plain:
    """Abstraction to simply print the duplicated files.

    Each group is printed as a header line, a tab-indented KEEP line
    and one tab-indented line per duplicate.
    """
    def __init__(self):
        # Running total, in bytes, of the space taken by duplicates.
        self.tot = 0
        print("Report of duplicate files\n")

    def dups(self, k, keep, rm):
        """Print one group of identical files.

        k    - checksum shared by the group
        keep - the file listed as kept (needs .size and .fname)
        rm   - list of the duplicate files (each needs .fname)
        """
        waste     = keep.size * len(rm)
        self.tot += waste
        s = '\n\t'.join( [x.fname for x in rm ])
        # The tab before the last %s indents the first duplicate like
        # the following ones, which get theirs from the join above.
        print("%s: %s, wasted %s\n\tKEEP %s\n\t%s" % \
                (k, human(keep.size), human(waste), keep.fname, s))

    def finish(self):
        """Print the total amount of wasted space."""
        print("\n%s total wasted space" % human(self.tot))


def sighandler(a, b):
    """Signal handler: leave the program silently with exit status 1.

    'a' and 'b' are the two arguments the signal module passes to a
    handler; neither is used.
    """
    raise SystemExit(1)


def main():
    """Scan the directories in 'args' and report the duplicate files.

    Duplicates are detected in two stages. First a quick checksum
    coarsely distinguishes files; two files with the same quick checksum
    may be identical or may merely collide. Those candidates are then
    checksummed again with a strong checksum to disambiguate.

    The report is written as shell commands when opt.shell_cmds is set,
    and as a plain listing otherwise.
    """
    # Start off by installing the signal handler.
    signal.signal(signal.SIGINT, sighandler)

    # Stage 1: quick checksum of every file below the given directories.
    db = cksum_db(cksum_quick)
    for d in args:
        if not isdir(d):
            print("Skipping non-directory %s" % d, file=sys.stderr)
            continue

        descend(db, d)

    # Stage 2: strong checksum, only for the files that may be dups.
    db2 = cksum_db(cksum_slow)
    for v in db.dups().values():
        for f in v:
            db2.addfile(f.fname)

    dups = db2.dups()

    pr = shell() if opt.shell_cmds else plain()

    # The first file of every group is kept; the others are reported as
    # removable.
    for k, v in dups.items():
        pr.dups(k, v[0], v[1:])

    pr.finish()


# Worker processes started with the spawn or forkserver method import
# this file again; without the guard each worker would start the whole
# scan, and its own pools, once more.
if __name__ == '__main__':
    main()

# vim: expandtab:sw=4:ts=4:tw=72:notextmode: