dirhashing.py appliance

A helper for finding duplicate directories

The idea is to define a customizable* hash function for a directory that returns the same result for directories with equal (*or similar) contents.

from collections import Counter as C
from hashlib import sha256 as H
from pathlib import Path as P

#
# Path to the stuff to analyze, with content like:
#   dir1/         \
#   dir2/          > payload
#   ...           /
#   dir1.sha256  \
#   dir2.sha256   > checksum files produced with the same hash function as
#   ...           /  imported above, paths relative to the root (dir1/..., dir2/...)
#
ROOTDIR = 'c:\\Users\\u\\Documents'


###############################################################################
##  Prepare checksums
# (
#   pushd <root_dir>
#   find dir1 dir2 -type f -exec shasum -a 256 -b {} \; > dir_1_2.sha256
#   popd
# )
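#
# This yields one line per file in shasum's binary-mode format, which is what
# the parser below expects:
#   <64-hex-char digest> *dir1/sub/file.ext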


###############################################################################
##  i-1. Load checksums     TODO Calculate here with hashlib, as an alternative (a sketch follows the loading loop below)

def parse_hashline(hashline):
    (h, f) = hashline.split(' *', maxsplit=1)
    return (P(f), h)

fhash = dict()  # relative Path -> file checksum (hex digest)

for sf in P(ROOTDIR).glob('*.sha256'):
    with open(sf) as fo:
        fhash.update(map(parse_hashline,
                         fo.read().splitlines(keepends=False)))

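
# The TODO above asks for an in-place alternative: compute the checksums with
# hashlib instead of importing *.sha256 files.  A minimal sketch that fills the
# same `fhash` dict; `calc_filehash` and the 1 MiB chunk size are choices made
# here, nothing else in the script depends on them.

def calc_filehash(f, chunksize=2**20):
    h = H()
    with open(f, 'rb') as fo:
        for chunk in iter(lambda: fo.read(chunksize), b''):
            h.update(chunk)
    return h.hexdigest()

# Uncomment to fill `fhash` without *.sha256 files:
# for _d in ('dir1', 'dir2'):
#     for _f in P(ROOTDIR, _d).rglob('*'):
#         if _f.is_file() and not _f.is_symlink():
#             fhash[_f.relative_to(ROOTDIR)] = calc_filehash(_f)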

###############################################################################
##  i-2. Calculate hashes for directories

def calc_dirhash(p):
    # Directory hash: sha256 over the sorted hashes of child directories,
    # then the sorted hashes of child files (symlinks skipped, entry names
    # ignored); the result is also recorded in the global `dhash`.
    p = P(p)
    z = list(p.iterdir())
    h = H()
    # h.update(b'init hash for directory, to distinguish empty dirs and empty files')
    for _dh in sorted(calc_dirhash(_)
                      for _ in z if _.is_dir() and not _.is_symlink()):
        h.update(_dh.encode())
    for _fh in sorted(fhash[_.relative_to(ROOTDIR)]
                      for _ in z if _.is_file() and not _.is_symlink()):
        h.update(_fh.encode())
    dhash[p] = h.hexdigest()
    return dhash[p]

if True: # Copy-paste-to-IDLE-friendliness
    dhash = dict()
    calc_dirhash(P(ROOTDIR, 'dir1'))
    calc_dirhash(P(ROOTDIR, 'dir2'))
    # calc_dirhash(P(ROOTDIR, '...'))
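
# The hash above looks only at contents (child hashes are sorted, entry names
# are ignored), so a renamed copy of a tree still gets the same hash.  If an
# exact, name-sensitive match is wanted instead, one possible customization is
# to mix the entry names in as well.  A sketch, not used by the rest of the
# script:

def calc_dirhash_strict(p):
    p = P(p)
    h = H()
    for c in sorted(p.iterdir()):
        if c.is_symlink():
            continue
        h.update(c.name.encode())
        if c.is_dir():
            h.update(calc_dirhash_strict(c).encode())
        elif c.is_file():
            h.update(fhash[c.relative_to(ROOTDIR)].encode())
    return h.hexdigest()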


###############################################################################
##  ii. Calculate occupied space for directories     TODO motivation for blocks

def calc_dirsize(p):
    # Apparent size of a directory in bytes: recurse into subdirectories,
    # otherwise add lstat().st_size (so a symlink counts as the size of the
    # link itself); the result is also recorded in the global `dsize`.
    p = P(p)
    s = sum(calc_dirsize(_) if _.is_dir() and not _.is_symlink()
            else _.lstat().st_size
            for _ in p.iterdir())
    dsize[p] = s
    return s

if True: # Copy-paste-to-IDLE-friendliness
    dsize = dict()
    calc_dirsize(P(ROOTDIR, 'dir1'))
    calc_dirsize(P(ROOTDIR, 'dir2'))
    # calc_dirsize(P(ROOTDIR, '...'))
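
# The TODO above presumably refers to filesystem blocks: st_size is the
# apparent length of a file, while the space actually occupied on disk is
# better approximated by st_blocks (counted in 512-byte units on POSIX;
# st_blocks is not available on Windows).  A sketch of that variant, filling
# a separate `dsize_blocks` dict introduced only here:

def calc_dirsize_blocks(p):
    p = P(p)
    s = sum(calc_dirsize_blocks(_) if _.is_dir() and not _.is_symlink()
            else _.lstat().st_blocks * 512
            for _ in p.iterdir())
    dsize_blocks[p] = s
    return s

# dsize_blocks = dict()
# calc_dirsize_blocks(P(ROOTDIR, 'dir1'))
# calc_dirsize_blocks(P(ROOTDIR, 'dir2'))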


###############################################################################
##  iii. Go on with analysis

# Count how many directories share each hash
c = C(dhash.values())

# Long list of (Path, Hash, Overhead estimate): dirsize * (copies - 1), in KiB,
# i.e. roughly the space reclaimed by keeping a single copy.  Only an estimate:
# nested duplicates are counted at every level, and sizes are apparent (st_size).
z = sorted(
    ((
        str(d.relative_to(ROOTDIR)),
        dh,
        round(dsize[d] * (c[dh] - 1) / 1024, 1)
    ) for (d, dh) in dhash.items() if c[dh] > 1),
    key=lambda x: (x[2], x[1], len(x[0])),
    reverse=True
)

# Sample report: top 10 candidates (hash, overhead in KiB, relative path)
for (p, h, oe) in z[:10]:
    print('{} {:8} {}'.format(h, oe, p))
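
# A complementary view, grouping directories that share a hash, so each set of
# duplicates can be inspected side by side (a sketch; `groups` is introduced
# only here):

groups = dict()
for (d, dh) in dhash.items():
    if c[dh] > 1:
        groups.setdefault(dh, []).append(str(d.relative_to(ROOTDIR)))

for (dh, dirs) in sorted(groups.items(), key=lambda kv: -len(kv[1])):
    print(dh, ' | '.join(sorted(dirs)))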