So, I found this question: How to assign a Git SHA1's to a file without Git?
but I am unsure how to do this method for a directory. How can I hash a directory in a program without using git so that it matches the sha1 given by git?
So, I found this question: How to assign a Git SHA1's to a file without Git?
but I am unsure how to do this method for a directory. How can I hash a directory in a program without using git so that it matches the sha1 given by git?
This turned out to be harder than I expected, but I do have it working now.
As I commented and hobbs answered, computing a tree hash is nontrivial. You must hash each file in each sub-tree, computing the hash for those sub-trees, and use those hashes to compute the hash for a top level tree.
The attached python code seems to work for at least some test cases (e.g., computing tree hashes for the git source itself). I put in, as comments, explanations of some unexpected things I discovered along the way.
This is also now in my github "scripts" repository.
[Edit: the github version has some Python3 fixes now, and will likely generally be newer/better.]
#! /usr/bin/env python
"""
Compute git hash values.
This is meant to work with both Python2 and Python3, but
has only been tested with Python2.7.
"""
from __future__ import print_function
import argparse
import os
import stat
import sys
from hashlib import sha1
def strmode(mode):
"""
Turn internal mode (octal with leading 0s suppressed) into
print form (i.e., left pad => right justify with 0s as needed).
"""
return mode.rjust(6, '0')
def classify(path):
"""
Return git classification of a path (as both mode,
100644/100755 etc, and git object type, i.e., blob vs tree).
Also throw in st_size field since we want it for file blobs.
"""
# We need the X bit of regular files for the mode, so
# might as well just use lstat rather than os.isdir().
st = os.lstat(path)
if stat.S_ISLNK(st.st_mode):
gitclass = 'blob'
mode = '120000'
elif stat.S_ISDIR(st.st_mode):
gitclass = 'tree'
mode = '40000' # note: no leading 0!
elif stat.S_ISREG(st.st_mode):
# 100755 if any execute permission bit set, else 100644
gitclass = 'blob'
mode = '100755' if (st.st_mode & 0111) != 0 else '100644'
else:
raise ValueError('un-git-able file system entity %s' % fullpath)
return mode, gitclass, st.st_size
def blob_hash(stream, size):
"""
Return (as hash instance) the hash of a blob,
as read from the given stream.
"""
hasher = sha1()
hasher.update(b'blob %u\0' % size)
nread = 0
while True:
# We read just 64K at a time to be kind to
# runtime storage requirements.
data = stream.read(65536)
if data == '':
break
nread += len(data)
hasher.update(data)
if nread != size:
raise ValueError('%s: expected %u bytes, found %u bytes' %
(stream.name, size, nread))
return hasher
def symlink_hash(path):
"""
Return (as hash instance) the hash of a symlink.
Caller must use hexdigest() or digest() as needed on
the result.
"""
hasher = sha1()
# XXX os.readlink produces a string, even though the
# underlying data read from the inode (which git will hash)
# are raw bytes. It's not clear what happens if the raw
# data bytes are not decode-able into Unicode; it might
# be nice to have a raw_readlink.
data = os.readlink(path).encode('utf8')
hasher.update(b'blob %u\0' % len(data))
hasher.update(data)
return hasher
def tree_hash(path, args):
"""
Return the hash of a tree. We need to know all
files and sub-trees. Since order matters, we must
walk the sub-trees and files in their natural (byte) order,
so we cannot use os.walk.
This is also slightly defective in that it does not know
about .gitignore files (we can't just read them since git
retains files that are in the index, even if they would be
ignored by a .gitignore directive).
We also do not (cannot) deal with submodules here.
"""
# Annoyingly, the tree object encodes its size, which requires
# two passes, one to find the size and one to compute the hash.
contents = os.listdir(path)
tsize = 0
to_skip = ('.', '..') if args.keep_dot_git else ('.', '..', '.git')
pass1 = []
for entry in contents:
if entry not in to_skip:
fullpath = os.path.join(path, entry)
mode, gitclass, esize = classify(fullpath)
# git stores as mode<sp><entry-name>\0<digest-bytes>
encoded_form = entry.encode('utf8')
tsize += len(mode) + 1 + len(encoded_form) + 1 + 20
pass1.append((fullpath, mode, gitclass, esize, encoded_form))
# Git's cache sorts foo/bar before fooXbar but after foo-bar,
# because it actually stores foo/bar as the literal string
# "foo/bar" in the index, rather than using recursion. That is,
# a directory name should sort as if it ends with '/' rather than
# with '\0'. Sort pass1 contents with funky sorting.
#
# (i[4] is the utf-8 encoded form of the name, i[1] is the
# mode which is '40000' for directories.)
pass1.sort(key = lambda i: i[4] + '/' if i[1] == '40000' else i[4])
args.depth += 1
hasher = sha1()
hasher.update(b'tree %u\0' % tsize)
for (fullpath, mode, gitclass, esize, encoded_form) in pass1:
sub_hash = generic_hash(fullpath, mode, esize, args)
if args.debug: # and args.depth == 0:
print('%s%s %s %s\t%s' % (' ' * args.depth,
strmode(mode), gitclass, sub_hash.hexdigest(),
encoded_form.decode('utf8')))
# Annoyingly, git stores the tree hash as 20 bytes, rather
# than 40 ASCII characters. This is why we return the
# hash instance (so we can use .digest() directly).
# The format here is <mode><sp><path>\0<raw-hash>.
hasher.update(b'%s %s\0' % (mode, encoded_form))
hasher.update(sub_hash.digest())
args.depth -= 1
return hasher
def generic_hash(path, mode, size, args):
"""
Hash an object based on its mode.
"""
if mode == '120000':
hasher = symlink_hash(path)
elif mode == '40000':
hasher = tree_hash(path, args)
else:
# 100755 if any execute permission bit set, else 100644
with open(path, 'rb') as stream:
hasher = blob_hash(stream, size)
return hasher
def main():
"""
Parse arguments and invoke hashers.
"""
parser = argparse.ArgumentParser('compute git hashes')
parser.add_argument('-d', '--debug', action='store_true')
parser.add_argument('-k', '--keep-dot-git', action='store_true')
parser.add_argument('path', nargs='+')
args = parser.parse_args()
args.depth = -1 # for debug print
status = 0
for path in args.path:
try:
try:
mode, gitclass, size = classify(path)
except ValueError:
print('%s: unhashable!' % path)
status = 1
continue
hasher = generic_hash(path, mode, size, args)
result = hasher.hexdigest()
if args.debug:
print('%s %s %s\t%s' % (strmode(mode), gitclass, result,
path))
else:
print('%s: %s hash = %s' % (path, gitclass, result))
except OSError as err:
print(str(err))
status = 1
sys.exit(status)
if __name__ == '__main__':
try:
sys.exit(main())
except KeyboardInterrupt:
sys.exit('\nInterrupted')
How can I hash a directory in a program without using git so that it matches the sha1 given by git?
Not at all - git doesn't "hash" directories. It hashes the files contained and has a representation of the tree; see the git object storage docs.
The state of all of the files in a directory in git is represented by a "tree" object, described in this SO answer and this section of the Git book. In order to calculate the hash of the tree object, you will have to produce the tree itself.
For each item in the directory you need four things:
st_mode
field of struct stat
), but restricted to a few values: of primary use are 040000 for directories, 100644 for non-executable files, 100755 for executable files, and 120000 for symlinks.If you collect this information for all of the entries in the directory, and write out the data in the correct format, in the correct order, you will have a valid tree object. If you compute the SHA of this object, you should get the same result as git and GitHub.