
"""
A helper for finding duplicate directories.


The idea is to define a customizable* hash function for a directory that returns the same result for directories with equal (*or similar) contents.
"""

from collections import Counter as C
from hashlib import sha256 as H
from pathlib import Path as P

# Path to the stuff to analyze, with content like:
#   dir1/         \
#   dir2/          > payload
#   ...           /
#   dir1.sha256  \
#   dir2.sha256   > checksum files hashed with the same func. as imported,
#   ...           /  with paths relative to this root (e.g. dir1/sub/file)
# Every *.sha256 file directly under this directory is loaded, and file
# checksums are looked up by their path relative to it.
# This is a hard-coded sample location; edit it before running.
ROOTDIR = 'c:\\Users\\u\\Documents'

##  Prepare checksums
# (
#   pushd <root_dir>
#   find dir1 dir2 -type f -exec shasum -a256 {} \; > dir_1_2.sha256
#   popd
# )

##  i-1. Load checksums     TODO Calculate here with hashlib, as an alternative

def parse_hashline(hashline):
    """Split one checksum-file line into ``(Path(filename), digest)``.

    The expected layout is ``<digest> <mode><filename>``: the digest, one
    space, a one-character mode marker and the file name.  Both the
    binary-mode marker ``*`` and the text-mode marker (a second space) are
    accepted, and a trailing line terminator is dropped so that lines can be
    passed in exactly as they were read from the file.

    Raises:
        ValueError: if the line does not follow that layout.
    """
    (h, sep, rest) = hashline.rstrip('\r\n').partition(' ')
    # rest[0] is the mode marker; everything after it is the file name,
    # which may itself contain spaces or asterisks.
    if not sep or rest[:1] not in ('*', ' '):
        raise ValueError('malformed checksum line: {!r}'.format(hashline))
    return (P(rest[1:]), h)

# Maps each file path, as written in the checksum files, to its hex digest.
fhash = dict()

for sf in P(ROOTDIR).glob('*.sha256'):
    with open(sf) as fo:
        # NOTE(review): the body of this `with` was missing from the source;
        # loading every non-empty line into fhash is the assumed intent.
        for line in fo:
            # Drop the line terminator here so the parsed file name is clean
            # regardless of how parse_hashline treats trailing whitespace.
            line = line.rstrip('\n')
            if line:
                (f, h) = parse_hashline(line)
                fhash[f] = h

##  i-2. Calculate hashes for directories

def calc_dirhash(p):
    """Return a hex digest for directory *p* and record it in ``dhash``.

    The digest is built from the sorted digests of the immediate
    sub-directories (computed recursively, each recorded in ``dhash`` too)
    followed by the sorted ``fhash`` digests of the immediate regular files.
    Entry names are not hashed and symlinks are skipped, so directories whose
    contents have the same digests get the same result whatever the names.

    Raises:
        KeyError: if a regular file has no entry in ``fhash``.
        ValueError: if a file's path is not located under ``ROOTDIR``.
    """
    p = P(p)
    z = list(p.iterdir())
    h = H()
    # h.update(b'init hash for directory, to distinguish empty dirs and empty files')
    # NOTE(review): both loop bodies were missing from the source; feeding
    # each hex digest into the running hash is the assumed intent.
    for _dh in sorted(calc_dirhash(_)
                      for _ in z if _.is_dir() and not _.is_symlink()):
        h.update(_dh.encode())
    # File checksums are keyed by path relative to ROOTDIR.
    for _fh in sorted(fhash[_.relative_to(ROOTDIR)]
                      for _ in z if _.is_file() and not _.is_symlink()):
        h.update(_fh.encode())
    digest = h.hexdigest()
    dhash[p] = digest
    return digest

if True: # Copy-paste-to-IDLE-friendliness
    # Start from an empty table, then hash each top-level payload directory.
    dhash = {}
    calc_dirhash(P(ROOTDIR) / 'dir1')
    calc_dirhash(P(ROOTDIR) / 'dir2')
    # calc_dirhash(P(ROOTDIR) / '...')

##  ii. Calculate occupied space for directories     TODO motivation for blocks

def calc_dirsize(p):
    """Return the total size of directory *p* and record it in ``dsize``.

    Real sub-directories are measured recursively (and recorded in ``dsize``
    as well); every other entry, including symlinks, contributes its
    ``lstat().st_size``, so symlinks are never followed.  An empty directory
    has size 0.
    """
    p = P(p)
    # NOTE(review): the aggregation and the iterable were missing from the
    # source; summing over the directory's entries is the assumed intent.
    s = sum(
        calc_dirsize(_) if _.is_dir() and not _.is_symlink() else _.lstat().st_size
        for _ in p.iterdir()
    )
    dsize[p] = s
    return s

if True: # Copy-paste-to-IDLE-friendliness
    # Start from an empty table, then measure each top-level payload directory.
    dsize = {}
    calc_dirsize(P(ROOTDIR) / 'dir1')
    calc_dirsize(P(ROOTDIR) / 'dir2')
    # calc_dirsize(P(ROOTDIR) / '...')

##  iii. Go on with analysis

# Count hashes: how many directories share each digest
c = C(dhash.values())

# Long list of (Path, Hash, Overhead estimation)    # TODO Explain "estimation"
# The estimate is the directory's size divided by 1024, times the number of
# other directories sharing its hash; only hashes seen more than once are kept.
# NOTE(review): the head of the tuple and the end of this call were missing
# from the source.  The path is stored as str so that len(x[0]) in the sort
# key is valid, and descending order is assumed so that z[:10] starts with the
# largest overheads -- confirm both.
z = sorted(
    ((str(d), dh,
        round(dsize[d] * (c[dh] - 1) / 1024, 1)
    ) for (d, dh) in dhash.items() if c[dh] > 1),
    key=lambda x: (x[2], x[1], len(x[0])),
    reverse=True,
)

# Sample report: print hash, overhead and path for the first ten entries of z
for entry in z[:10]:
    (p, h, oe) = entry
    print('{} {:8} {}'.format(h, oe, p))