# dirhashing.py
#
# A helper for finding duplicate directories.
# The idea is to define a customizable* hash function for a directory that
# returns the same result for directories with equal (*or similar) contents.
from collections import Counter as C
from hashlib import sha256 as H
from pathlib import Path as P
#
# Path to the stuff to analyze, with content like:
# dir1/ \
# dir2/ > payload
# ... /
# dir1.sha256 \
# dir2.sha256 > checksum files produced with the same hash function as
# ... / imported above, paths relative to ROOTDIR (dir1/..., dir2/...)
#
ROOTDIR = 'c:\\Users\\u\\Documents'
###############################################################################
## Prepare checksums
# (
# pushd <root_dir>
# find dir1 dir2 -type f -exec shasum -a 256 -b {} \; > dir_1_2.sha256
# popd
# )
# -b (binary mode) emits "<hash> *<path>" lines, the format parsed below.
###############################################################################
## i-1. Load checksums TODO Calculate here with hashlib, as an alternative (sketch below)
def parse_hashline(hashline):
    # shasum binary-mode line format: "<hash> *<relative/path>"
    (h, f) = hashline.split(' *', maxsplit=1)
    return (P(f), h)

fhash = dict()
for sf in P(ROOTDIR).glob('*.sha256'):
    with open(sf) as fo:
        fhash.update(map(parse_hashline,
                         fo.read().splitlines(keepends=False)))
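# An in-Python alternative for the TODO above: compute the file checksums
# with hashlib instead of loading shasum output. A minimal sketch;
# calc_filehash and the commented-out loop are hypothetical names, not part
# of the original flow.
def calc_filehash(f, blocksize=1 << 20):
    h = H()
    with open(f, 'rb') as fo:
        for chunk in iter(lambda: fo.read(blocksize), b''):
            h.update(chunk)
    return h.hexdigest()
# for d in ('dir1', 'dir2'):
#     for f in P(ROOTDIR, d).rglob('*'):
#         if f.is_file() and not f.is_symlink():
#             fhash[f.relative_to(ROOTDIR)] = calc_filehash(f)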
###############################################################################
## i-2. Calculate hashes for directories
def calc_dirhash(p):
    # Hash of a directory: hashes of its subdirectories, sorted, then hashes
    # of its files, sorted -- so names and ordering of entries do not matter.
    p = P(p)
    z = list(p.iterdir())
    h = H()
    # h.update(b'init hash for directory, to distinguish empty dirs and empty files')
    for _dh in sorted(calc_dirhash(_)
                      for _ in z if _.is_dir() and not _.is_symlink()):
        h.update(_dh.encode())
    for _fh in sorted(fhash[_.relative_to(ROOTDIR)]  # KeyError if no checksum
                      for _ in z if _.is_file() and not _.is_symlink()):
        h.update(_fh.encode())
    dhash[p] = h.hexdigest()
    return dhash[p]
if True: # Copy-paste-to-IDLE-friendliness
    dhash = dict()
    calc_dirhash(P(ROOTDIR, 'dir1'))
    calc_dirhash(P(ROOTDIR, 'dir2'))
    # calc_dirhash(P(ROOTDIR, '...'))
###############################################################################
## ii. Calculate occupied space for directories TODO motivate st_size vs. st_blocks (sketch below)
def calc_dirsize(p):
    # Total apparent size (st_size) of a tree; symlinks counted, not followed.
    p = P(p)
    s = sum(calc_dirsize(_) if _.is_dir() and not _.is_symlink()
            else _.lstat().st_size
            for _ in p.iterdir())
    dsize[p] = s
    return s
if True: # Copy-paste-to-IDLE-friendliness
    dsize = dict()
    calc_dirsize(P(ROOTDIR, 'dir1'))
    calc_dirsize(P(ROOTDIR, 'dir2'))
    # calc_dirsize(P(ROOTDIR, '...'))
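# Sketch for the TODO in the section header: on POSIX, st_blocks counts
# 512-byte blocks actually allocated, so it measures occupied disk space
# rather than apparent size. calc_dirsize_blocks is a hypothetical variant,
# not part of the original flow; st_blocks does not exist on Windows, so the
# getattr default makes this degrade to 0 there.
def calc_dirsize_blocks(p):
    p = P(p)
    return sum(calc_dirsize_blocks(_) if _.is_dir() and not _.is_symlink()
               else getattr(_.lstat(), 'st_blocks', 0) * 512
               for _ in p.iterdir())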
###############################################################################
## iii. Go on with the analysis
# Count how many directories share each hash
c = C(dhash.values())
# Long list of (Path, Hash, Overhead estimate): dir size * (copies - 1), in KiB.
# It is only an estimate: nested duplicated directories are counted again
# inside their duplicated parents.
z = sorted(
    ((
        str(d.relative_to(ROOTDIR)),
        dh,
        round(dsize[d] * (c[dh] - 1) / 1024, 1)
    ) for (d, dh) in dhash.items() if c[dh] > 1),
    key=lambda x: (x[2], x[1], len(x[0])),
    reverse=True
)
# Sample report: hash, overhead (KiB), path -- largest overhead first
for (p, h, oe) in z[:10]:
    print('{} {:8} {}'.format(h, oe, p))
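# A possible follow-up (not in the original report): group the duplicated
# directories by hash so each duplicate set prints together. defaultdict is
# stdlib; the 5-group cutoff is arbitrary.
from collections import defaultdict
groups = defaultdict(list)
for (d, dh) in dhash.items():
    if c[dh] > 1:
        groups[dh].append(str(d.relative_to(ROOTDIR)))
for (dh, dirs) in sorted(groups.items(), key=lambda kv: -len(kv[1]))[:5]:
    print(dh, *sorted(dirs), sep='\n  ')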