Remove duplicate files in Python
The script below walks a directory tree, computes an MD5 hash of each file's contents, and treats any hash that appears more than once as a set of duplicate files; the os.remove call is left commented out so the matches can be reviewed before anything is deleted.
import os
import hashlib

path = '/path/to/folder'

def file_as_bytes(file):
    # Read the whole file, closing it when done.
    with file:
        return file.read()

# Map each MD5 digest to every file path that produced it.
hashes = {}
for root, d_names, f_names in os.walk(path):
    for f in f_names:
        file_path = os.path.join(root, f)
        hash_string = hashlib.md5(file_as_bytes(open(file_path, 'rb'))).hexdigest()
        hashes.setdefault(hash_string, []).append(file_path)

# A digest shared by more than one path marks a set of duplicates;
# keep the first copy and report (or remove) the rest.
for hash_string, paths in hashes.items():
    for file_path in paths[1:]:
        print('hash_string: {} exists for file {}'.format(hash_string, file_path))
        # os.remove(file_path)
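
One caveat: file_as_bytes loads each file into memory in a single read, which can be slow or fail outright on very large files. A minimal sketch of a chunked alternative, assuming the same folder layout as above (hash_file and the 64 KiB chunk size are illustrative choices, not part of the original script):

import hashlib

def hash_file(file_path, chunk_size=65536):
    # Feed the file to MD5 in fixed-size chunks so memory use stays flat
    # regardless of file size.
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

Swapping hash_file(file_path) in for the hashlib.md5(file_as_bytes(...)) call above produces the same digests without ever holding a whole file in memory.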