"""dirhashing.py appliance

A helper searching for duplicate directories.

The idea is to define a customizable* hash function for a directory that
returns the same result for directories with equal (*or similar) contents.
"""
from collections import Counter as C
from hashlib import sha256 as H
from pathlib import Path as P
#
# Path to the stuff to analyze, with content like:
#   dir1/         \
#   dir2/          > payload
#   ...           /
#   dir1.sha256   \
#   dir2.sha256    > checksum files, hashed with the same function as imported,
#   ...           /   with paths relative to dir1, dir2, ...
#
ROOTDIR = 'c:\\Users\\u\\Documents'
###############################################################################
## Prepare checksums
# (
#   pushd <root_dir>
#   find dir1 dir2 -type f -exec shasum -a256 -b {} \; > dir_1_2.sha256
#   popd
# )
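# In binary mode (-b) each output line reads "<64 hex digits> *<path>", e.g.
# for an empty file (hypothetical name):
#   e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 *dir1/empty.bin
# " *" is exactly the separator parse_hashline() below splits on.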
###############################################################################
## i-1. Load checksums  TODO Calculate here with hashlib, as an alternative (sketched below)
def parse_hashline(hashline):
    (h, f) = hashline.split(' *', maxsplit=1)
    return (P(f), h)
fhash = dict()
for sf in P(ROOTDIR).glob('*.sha256'):
    with open(sf) as fo:
        fhash.update(map(parse_hashline,
                         fo.read().splitlines(keepends=False)))
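# A minimal sketch for the TODO above: compute checksums here with hashlib
# instead of loading prepared *.sha256 files. Chunked reading and the
# 'dir1'/'dir2' names are assumptions mirroring the layout described at the
# top; uncomment the loop to replace the file-based step entirely.
def calc_filehash(f):
    h = H()
    with open(f, 'rb') as fo:
        for chunk in iter(lambda: fo.read(1 << 16), b''):
            h.update(chunk)
    return h.hexdigest()
# for _d in ('dir1', 'dir2'):
#     for _f in P(ROOTDIR, _d).rglob('*'):
#         if _f.is_file() and not _f.is_symlink():
#             fhash[_f.relative_to(ROOTDIR)] = calc_filehash(_f)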
###############################################################################
## i-2. Calculate hashes for directories
def calc_dirhash(p):
    p = P(p)
    z = list(p.iterdir())
    h = H()
    # h.update(b'init hash for directory, to distinguish empty dirs and empty files')
    # Feed the sorted subdirectory digests first, then the sorted file digests,
    # so the result does not depend on directory-listing order.
    for _dh in sorted(calc_dirhash(_)
                      for _ in z if _.is_dir() and not _.is_symlink()):
        h.update(_dh.encode())
    for _fh in sorted(fhash[_.relative_to(ROOTDIR)]
                      for _ in z if _.is_file() and not _.is_symlink()):
        h.update(_fh.encode())
    dhash.update({p: h.hexdigest()})
    return h.hexdigest()
if True: # Copy-paste-to-IDLE-friendliness
    dhash = dict()
    calc_dirhash(P(ROOTDIR, 'dir1'))
    calc_dirhash(P(ROOTDIR, 'dir2'))
    # calc_dirhash(P(ROOTDIR, '...'))
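# Hypothetical check: directories with equal payload end up with the same
# digest, so duplicates are exactly the keys of dhash that share a value:
#   dhash[P(ROOTDIR, 'dir1')] == dhash[P(ROOTDIR, 'dir2')]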
###############################################################################
## ii. Calculate occupied space for directories  TODO motivation for blocks (sketched below)
def calc_dirsize(p):
    p = P(p)
    s = sum(map(lambda _: (calc_dirsize(_) if _.is_dir() and not _.is_symlink()
                           else _.lstat().st_size),
                p.iterdir()))
    dsize.update({p: s})
    return s
if True: # Copy-paste-to-IDLE-friendliness
    dsize = dict()
    calc_dirsize(P(ROOTDIR, 'dir1'))
    calc_dirsize(P(ROOTDIR, 'dir2'))
    # calc_dirsize(P(ROOTDIR, '...'))
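# Sketch for the "blocks" TODO above: st_size is the logical file length,
# while on POSIX filesystems lstat().st_blocks * 512 gives the space actually
# allocated on disk (block granularity, sparse files). st_blocks does not
# exist on Windows, so this assumed variant is a non-invasive alternative,
# defined but not called:
def calc_dirsize_blocks(p):
    p = P(p)
    return sum(calc_dirsize_blocks(_) if _.is_dir() and not _.is_symlink()
               else _.lstat().st_blocks * 512
               for _ in p.iterdir())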
###############################################################################
## iii. Go on with analysis
# Count hashes
c = C(dhash.values())
# Long list of (Path, Hash, Overhead estimation in KiB): directory size times
# the number of extra copies. A rough figure: it assumes all but one copy get
# removed, and nested duplicates are counted again at every enclosing level.
z = sorted(
    ((str(d.relative_to(ROOTDIR)),
      dh,
      round(dsize[d] * (c[dh] - 1) / 1024, 1))
     for (d, dh) in dhash.items() if c[dh] > 1),
    key=lambda x: (x[2], x[1], len(x[0])),
    reverse=True
)
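# Worked example with assumed numbers: a directory of dsize 3_145_728 bytes
# (3 MiB) whose digest occurs c[dh] == 3 times is reported with
# round(3_145_728 * (3 - 1) / 1024, 1) == 6144.0 KiB of estimated overhead.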
# Sample report
for (p, h, oe) in z[:10]:
    print('{} {:8} {}'.format(h, oe, p))