Remove duplicate files in Python

Python -- Posted on Jan. 2, 2020

The script below walks a folder with os.walk, computes an MD5 hash of every file, and prints the files whose contents appear more than once; the os.remove call that would actually delete them is left commented out.

import os
import hashlib

path = u'/path/to/folder'

def file_as_bytes(file):
    with file:
        return file.read()

hashes = []
fname = []
seen_names = set()
for root, d_names, f_names in os.walk(path):
    for f in f_names:
        # Report filenames that appear more than once anywhere under the path
        if f in seen_names:
            print(f)
        seen_names.add(f)

        # Hash the file contents so duplicates are detected by content, not by name
        file_path = os.path.join(root, f)
        fname.append(file_path)
        hash_string = hashlib.md5(file_as_bytes(open(file_path, 'rb'))).hexdigest()
        hashes.append(hash_string)

# A hash that occurs more than once means at least two files have identical content
multiple_item = list(set([x for x in hashes if hashes.count(x) > 1]))

for file_path in fname:
    hash_string = hashlib.md5(file_as_bytes(open(file_path, 'rb'))).hexdigest()
    if hash_string in multiple_item:
        print('hash_string: {}  exists for file {}'.format(hash_string, file_path))
        # Uncomment to delete, but note this removes every copy, not just the extras
        # os.remove(file_path)
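
Two caveats with the approach above: each file is read into memory in one go before hashing, and uncommenting os.remove would delete every copy of a duplicated file, including the one you presumably want to keep. Below is a minimal sketch of an alternative that hashes in fixed-size chunks and keeps the first copy found in each group; the function names (md5_of_file, remove_duplicates) and the 64 KB chunk size are illustrative choices, not part of the original script.

import os
import hashlib

def md5_of_file(file_path, chunk_size=65536):
    # Hash the file in fixed-size chunks so large files never sit fully in memory
    md5 = hashlib.md5()
    with open(file_path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def remove_duplicates(path, dry_run=True):
    # Group every file under `path` by its content hash
    by_hash = {}
    for root, d_names, f_names in os.walk(path):
        for f in f_names:
            file_path = os.path.join(root, f)
            by_hash.setdefault(md5_of_file(file_path), []).append(file_path)

    # Keep the first file in each group and drop the rest
    for hash_string, paths in by_hash.items():
        for duplicate in paths[1:]:
            print('duplicate of {}: {}'.format(paths[0], duplicate))
            if not dry_run:
                os.remove(duplicate)

remove_duplicates(u'/path/to/folder')

With dry_run=True the function only prints what it would delete; pass dry_run=False once the printed list looks right.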