import os
from zlib import crc32
import argparse
def searchForFiles(path, extensions=None):
    """Recursively yield the path of every file found below *path*.

    Args:
        path: Directory to walk with ``os.walk``.
        extensions: Optional file-name ending(s) to keep. May be a single
            string, a tuple of strings, or a list/set/frozenset of strings
            (those are converted to a tuple, which ``str.endswith``
            requires). ``None`` or any empty value disables filtering.
            Matching is case sensitive.

    Yields:
        The directory reported by ``os.walk`` (with trailing slashes
        collapsed to exactly one ``/``) followed by the file name.
    """
    # str.endswith() accepts only a str or a tuple; a list or set would raise
    # TypeError on the first file, so normalise once before walking.
    if isinstance(extensions, (list, set, frozenset)):
        extensions = tuple(extensions)
    for root, _subFolders, files in os.walk(path):
        # Guarantee exactly one trailing slash regardless of what was passed.
        prefix = root.rstrip("/") + "/"
        for fName in files:
            if not extensions or fName.endswith(extensions):
                yield prefix + fName
def genCRC32(fName, bufferSize=1024*1024*10):
    """Return the CRC32 checksum of the file *fName*.

    The file is read in binary mode in chunks of *bufferSize* bytes so that
    arbitrarily large files can be hashed without loading them into memory.

    Args:
        fName: Path of the file to read.
        bufferSize: Number of bytes passed to each ``read`` call
            (default 10 MiB). Must not be 0.

    Returns:
        The running ``zlib.crc32`` value over the whole file; 0 for an
        empty file.

    Raises:
        ValueError: If *bufferSize* is 0. ``read(0)`` always returns an
            empty chunk, so every file would otherwise hash to 0 and look
            like a duplicate of every other file of the same size.
    """
    if bufferSize == 0:
        raise ValueError("bufferSize must not be 0")
    checksum = 0
    # The context manager closes the handle even when a read fails part-way.
    with open(fName, 'rb') as fd:
        # iter() with a sentinel stops at the empty chunk that marks EOF.
        for chunk in iter(lambda: fd.read(bufferSize), b''):
            checksum = crc32(chunk, checksum)
    return checksum
def findDupsByCRC(path, handler, minFileSize=1, extensions=None):
    """Walk *path* and hand every pair of identical files to *handler*.

    Two files count as duplicates when both their size and their CRC32
    match. Files are bucketed by size first and a checksum is only computed
    once a second file of the same size turns up, so files with a unique
    size are never read.

    Args:
        path: Directory to scan (forwarded to ``searchForFiles``).
        handler: Callable ``handler(oldName, newName, size)`` invoked for
            each duplicate pair. A true return value makes *newName* the
            remembered copy for later comparisons (e.g. because the handler
            removed *oldName*); a false value keeps *oldName*.
        minFileSize: Files smaller than this many bytes are skipped.
        extensions: Optional file-name endings forwarded to
            ``searchForFiles``.

    Any exception raised while examining a file, or by *handler*, is printed
    and the scan continues with the next file.
    """
    # known maps size -> file name while only one file of that size has been
    # seen, and size -> {crc32: file name} once a second one has shown up.
    known = {}
    for fName in searchForFiles(path, extensions):
        try:
            # Test for links before asking for the size: getsize() follows
            # links and raises on dangling ones, which only produced noise.
            if os.path.islink(fName):
                continue
            msize = os.path.getsize(fName)
            if msize < minFileSize:
                continue
            if msize not in known:
                known[msize] = fName
                continue
            bySum = known[msize]
            # Checking for "not a dict" instead of a string type keeps this
            # working on Python 3, where ``basestring`` no longer exists.
            if not isinstance(bySum, dict):
                # Second file of this size: hash the first one lazily now.
                # If hashing fails the entry is left untouched.
                bySum = {genCRC32(bySum): bySum}
                known[msize] = bySum
            ncrc = genCRC32(fName)
            if ncrc in bySum:
                if handler(bySum[ncrc], fName, msize):
                    bySum[ncrc] = fName
            else:
                bySum[ncrc] = fName
        except Exception as e:
            # Deliberate best effort: one unreadable file must not abort
            # the whole scan.
            print(e)
if __name__ == '__main__':
    # Command-line entry point: parse the options, choose what to do with
    # each duplicate pair, run the scan and report how long it took.
    import time

    # raw_input() only exists on Python 2; input() is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    def reportDuplicate(oldName, newName, size):
        """Print one tab-separated line describing a duplicate pair."""
        print("Duplicate (%0.2f mb):\t%s\t%s" % (size/1024.0/1024, oldName, newName))

    def log(oldName, newName, size):
        """Handler for 'log': report the pair and keep both files."""
        reportDuplicate(oldName, newName, size)
        return False

    def remove_log(oldName, newName, size):
        """Handler for 'del': report the pair, then delete the newer find."""
        reportDuplicate(oldName, newName, size)
        print("Delete:\t%s" % newName)
        os.remove(newName)
        # The old file survives, so it stays the remembered copy.
        return False

    def ask(oldName, newName, size):
        """Handler for 'ask': prompt until the user picks a valid option.

        Returns True only when the old file was deleted, so that the new
        file becomes the remembered copy.
        """
        while True:
            print("Duplicate:%s-%s%s-%s%smb:%0.2f" % (os.linesep, oldName, os.linesep, newName, os.linesep, size/1024.0/1024))
            print("1\tDelete %s" % oldName)
            print("2\tDelete %s" % newName)
            print("3\tTo skip")
            v = readLine("Selection: ")
            if v == "1":
                print("Delete %s" % oldName)
                os.remove(oldName)
                return True
            if v == "2":
                print("Delete %s" % newName)
                os.remove(newName)
                return False
            if v == "3":
                print("SKIP")
                return False
            # Anything else: show the menu again.

    parser = argparse.ArgumentParser()
    parser.add_argument("directory", help="The directory you want to search for duplicates in.")
    parser.add_argument("-a", "--action", choices=['del', 'log', 'ask'], default="log", help="DELete, LOG or ASK at found duplicates.")
    parser.add_argument("-e", "--extensions", default=None, help="A list of file extensions/ending in the form of \".mp3,.mpg,somePostfix.mp3\". The extensions are case sensitive. (TODO fix this). ")
    parser.add_argument("-m", "--minsize", default=1, type=int, help="The minimum file size in bytes to check.")
    args = parser.parse_args()

    path = args.directory
    print(path)
    # argparse already restricts the value to these three choices.
    action = {'del': remove_log, 'log': log, 'ask': ask}[args.action]
    if args.extensions is None:
        extensions = None
    else:
        # str.endswith() needs a tuple of endings.
        extensions = tuple(args.extensions.split(","))
    minSize = args.minsize

    stime = time.time()
    findDupsByCRC(path, action, minSize, extensions)
    print("Needed %s seconds" % (time.time() - stime))
# Note: --minsize is the minimum file size in bytes to check.