-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor blob creation #7
Conversation
Signed-off-by: Jaideep Rao <[email protected]>
Hi @jaideepr97 thanks for this PR but I'm not following why:
The idea was to have 2 functions which would write a Layer, either just Tar, or TarGz, and while writing this Layer, return the simple Hash (while writing, without re-reading) and the Hash Pre/Post to use it as DiffID and Digest. It seems to me, this refactor into `file_checksums_without_compression` and `file_checksums_with_compression` makes computing the hash require re-reading the files, and further it deletes the actual layer blob. Can you kindly clarify? Thanks in advance for your clarifications |
Thanks for reviewing @tarilabs. I haven't removed any functionality; I moved these functions into the
these functions are only created for unit testing purposes, that's why they are placed in The actual logic has not been changed |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thank you so much @jaideepr97
Thanks for reviewing @tarilabs
I haven't removed any functionality; I moved these functions into the
utils
package in a previous PR; I am only removing those functions from `basics.py`
in this PR. The same is true for `HashingWriter`
as well. It seems to me, this refactor into `file_checksums_without_compression` and `file_checksums_with_compression` makes computing the hash require re-reading the files, and further it deletes the actual layer blob.
these functions are only created for unit testing purposes, that's why they are placed in
tests/common.py
The actual logic has not been changed
I see now, noted below — thanks for your patience; I didn't recall that previous refactor.
/approve
I think in a follow-up PR we can deal with small comment below about test utility function refactor, indeed separately
To make things hopefully easier, exceptionally will merge with merge-commit this one
def tar_into_ocilayout(ocilayout: Path, model: Path):
    """Write *model* as an uncompressed tar layer blob under ``blobs/sha256`` of *ocilayout*.

    The tar stream is piped through a ``HashingWriter`` so the sha256 digest is
    computed while writing; the blob is then renamed to that digest and the
    digest is returned (suitable for use in the OCI manifest).
    """
    blobs_dir = ocilayout / "blobs" / "sha256"
    tmp_blob = blobs_dir / "temp_layer"
    with open(tmp_blob, "wb") as out:
        hashing_out = HashingWriter(out)
        with tarfile.open(fileobj=hashing_out, mode="w") as layer_tar:  # type:ignore
            # tar_filter_fn normalizes entry metadata; entries live under /models/ in the modelcar
            layer_tar.add(model, arcname="/models/" + model.name, filter=tar_filter_fn)
    layer_digest = hashing_out.hash_func.hexdigest()
    click.echo(f"digest of the tar file: {layer_digest}")
    # name the blob after its own digest, per the OCI image layout spec
    os.rename(tmp_blob, blobs_dir / layer_digest)
    click.echo(f"tar file renamed to: {layer_digest}")
    return layer_digest
|
||
|
||
def targz_into_ocilayout(ocilayout: Path, model: Path):
    """Write *model* as a gzip-compressed tar layer blob under ``blobs/sha256`` of *ocilayout*.

    Two ``HashingWriter`` wrappers are used: the outer one hashes the compressed
    bytes (the blob digest) and the inner one hashes the raw tar bytes (the
    DiffID). Returns ``(blob_digest, tar_digest)``.
    """
    blobs_dir = ocilayout / "blobs" / "sha256"
    tmp_blob = blobs_dir / "temp_layer"
    with open(tmp_blob, "wb") as out:
        outer = HashingWriter(out)
        # mtime=0 keeps the gzip stream reproducible; compresslevel=6 is the zlib default
        with gzip.GzipFile(fileobj=outer, mode="wb", mtime=0, compresslevel=6) as gz:  # type:ignore
            inner = HashingWriter(gz)
            with tarfile.open(fileobj=inner, mode="w") as layer_tar:  # type:ignore
                layer_tar.add(model, arcname="/models/" + model.name, filter=tar_filter_fn)
    blob_digest = outer.hash_func.hexdigest()
    tar_digest = inner.hash_func.hexdigest()
    # name the blob after the digest of its compressed form, per the OCI image layout spec
    os.rename(tmp_blob, blobs_dir / blob_digest)
    return (blob_digest, tar_digest)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see now, these were moved into olot/utils/files.py
def tar_into_ocilayout(ocilayout: Path, model: Path):
    """Produce the uncompressed tar layer blob for *model* inside *ocilayout* and return its sha256 digest.

    The digest is accumulated by a ``HashingWriter`` during the write, so the
    file never needs to be re-read; the temporary blob is renamed to the digest.
    """
    dest_dir = ocilayout / "blobs" / "sha256"
    staging = dest_dir / "temp_layer"
    with open(staging, "wb") as sink:
        hasher = HashingWriter(sink)
        with tarfile.open(fileobj=hasher, mode="w") as archive:  # type:ignore
            archive.add(model, arcname="/models/" + model.name, filter=tar_filter_fn)
    digest = hasher.hash_func.hexdigest()
    click.echo(f"digest of the tar file: {digest}")
    os.rename(staging, dest_dir / digest)
    click.echo(f"tar file renamed to: {digest}")
    return digest
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
indeed moved to
Lines 68 to 85 in c7ceea6
if not file_path.exists(): | |
raise ValueError(f"Input file '{file_path}' does not exist.") | |
os.makedirs(dest, exist_ok=True) | |
temp_dest = dest / "temp" | |
try: | |
with open(temp_dest, "wb") as temp_file: | |
writer = HashingWriter(temp_file) | |
with tarfile.open(fileobj=writer, mode="w") as tar: # type:ignore | |
tar.add(file_path, arcname="/models/"+file_path.name, filter=tar_filter_fn) | |
checksum = writer.hash_func.hexdigest() | |
os.rename(temp_dest, dest / checksum) | |
return checksum | |
except tarfile.TarError as e: | |
raise tarfile.TarError(f"Error creating tarball: {e}") from e | |
except OSError as e: | |
raise OSError(f"File operation failed: {e}") from e |
def targz_into_ocilayout(ocilayout: Path, model: Path):
    """Produce the gzip-compressed tar layer blob for *model* inside *ocilayout*.

    Returns ``(compressed_digest, uncompressed_digest)``: the first names the
    blob (and is the layer Digest), the second is the DiffID of the tar stream
    before compression. Both are computed while writing via ``HashingWriter``.
    """
    dest_dir = ocilayout / "blobs" / "sha256"
    staging = dest_dir / "temp_layer"
    with open(staging, "wb") as sink:
        compressed_hasher = HashingWriter(sink)
        with gzip.GzipFile(fileobj=compressed_hasher, mode="wb", mtime=0, compresslevel=6) as gz:  # type:ignore
            raw_hasher = HashingWriter(gz)
            with tarfile.open(fileobj=raw_hasher, mode="w") as archive:  # type:ignore
                archive.add(model, arcname="/models/" + model.name, filter=tar_filter_fn)
    compressed_digest = compressed_hasher.hash_func.hexdigest()
    uncompressed_digest = raw_hasher.hash_func.hexdigest()
    os.rename(staging, dest_dir / compressed_digest)
    return (compressed_digest, uncompressed_digest)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
indeed was moved to
Lines 108 to 128 in c7ceea6
if not file_path.exists(): | |
raise ValueError(f"Input file '{file_path}' does not exist.") | |
os.makedirs(dest, exist_ok=True) | |
temp_dest = dest / "temp" | |
try: | |
with open(temp_dest, "wb") as temp_file: | |
writer = HashingWriter(temp_file) | |
with gzip.GzipFile(fileobj=writer, mode="wb", mtime=0, compresslevel=6) as gz: # type:ignore | |
inner_writer = HashingWriter(gz) | |
with tarfile.open(fileobj=inner_writer, mode="w") as tar: # type:ignore | |
tar.add(file_path, arcname="/models/"+file_path.name, filter=tar_filter_fn) | |
precompress_checksum = inner_writer.hash_func.hexdigest() | |
postcompress_checksum = writer.hash_func.hexdigest() | |
os.rename(temp_dest, dest / postcompress_checksum) | |
return (postcompress_checksum, precompress_checksum) | |
except tarfile.TarError as e: | |
raise tarfile.TarError(f"Error creating tarball: {e}") from e | |
except OSError as e: | |
raise OSError(f"File operation failed: {e}") from e |
@@ -114,9 +114,9 @@ def targz_from_file(file_path: Path, dest: Path) -> tuple[str, str]: | |||
try: | |||
with open(temp_dest, "wb") as temp_file: | |||
writer = HashingWriter(temp_file) | |||
with gzip.GzipFile(fileobj=writer, mode="wb", mtime=0, compresslevel=6) as gz: # type:ignore | |||
with gzip.GzipFile(fileobj=writer, mode="wb", mtime=0, compresslevel=6) as gz: # type: ignore[call-overload] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thank you for specifying better for lint ignore
def file_checksums_without_compression(source_file: Path, tmp_dest: Path) -> str:
    """Return the sha256 digest of the uncompressed tar layer that would be built for *source_file*.

    The tar stream is piped through a ``HashingWriter``, so the digest is
    accumulated while writing — the temp file no longer needs to be re-read
    with ``get_file_hash`` afterwards (same digest, one pass instead of two).
    The scratch directory *tmp_dest* is always removed, even on error.

    :param source_file: file to be packed under ``/models/`` in the layer.
    :param tmp_dest: scratch directory for the temporary tar; deleted before returning.
    :return: hex sha256 digest of the tar stream.
    """
    tmp_dest.mkdir(parents=True, exist_ok=True)
    temp_tar_filename = tmp_dest / "temp_layer"
    try:
        with open(temp_tar_filename, "wb") as temp_file:
            writer = HashingWriter(temp_file)
            with tarfile.open(fileobj=writer, mode="w") as tar:  # type: ignore[call-overload]
                tar.add(source_file, arcname="/models/" + source_file.name, filter=tar_filter_fn)
        # digest of everything written through the wrapper == digest of the file on disk
        sha = writer.hash_func.hexdigest()
    finally:
        shutil.rmtree(tmp_dest)
    return sha
|
||
|
||
def file_checksums_with_compression(source_file: Path, tmp_dest: Path) -> tuple[str, str]:
    """Return ``(postcompress_digest, precompress_digest)`` for *source_file*'s tar.gz layer.

    Bug fix: the previous version derived the pre-compression digest by
    building a *second* tar with a different ``arcname`` (no ``/models/``
    prefix) and without ``tar_filter_fn``, so its digest could not match the
    DiffID of the tar actually written inside the gzip stream. Both digests
    are now taken from the ``HashingWriter`` wrappers used during the single
    write: the outer one hashes the compressed bytes (Digest), the inner one
    the raw tar bytes (DiffID). Also creates *tmp_dest* (consistent with
    ``file_checksums_without_compression``) and cleans it up even on error.

    :param source_file: file to be packed under ``/models/`` in the layer.
    :param tmp_dest: scratch directory for the temporary blob; deleted before returning.
    :return: tuple of (compressed-stream digest, uncompressed-tar digest).
    """
    tmp_dest.mkdir(parents=True, exist_ok=True)
    temp_tar_filename = tmp_dest / "temp_layer.tar.gz"
    try:
        with open(temp_tar_filename, "wb") as temp_file:
            writer = HashingWriter(temp_file)
            # mtime=0 keeps the gzip output reproducible across runs
            with gzip.GzipFile(fileobj=writer, mode="wb", mtime=0, compresslevel=6) as gz:  # type: ignore[call-overload]
                inner_writer = HashingWriter(gz)
                with tarfile.open(fileobj=inner_writer, mode="w") as tar:  # type: ignore[call-overload]
                    tar.add(source_file, arcname="/models/" + source_file.name, filter=tar_filter_fn)
        postcompress_chksum = writer.hash_func.hexdigest()
        precompress_chksum = inner_writer.hash_func.hexdigest()
    finally:
        shutil.rmtree(tmp_dest)
    return postcompress_chksum, precompress_chksum
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we could refactor these to re-use existing functions (above) and then remove the added layer, ie to avoid duplicating part of the hashing logic, but I also think this could be done in a subsequent PR, wdyt?
def test_Hashwriter_tar(tmp_path):
    """Example bespoke use of HashingWriter for .tar
    """
    hello_path = sample_model_path() / "hello.md"
    write_dest = sha256_path(tmp_path)
    write_dest.mkdir(parents=True, exist_ok=True)
    staging = write_dest / "temp_layer"

    with open(staging, "wb") as fh:
        writer = HashingWriter(fh)
        with tarfile.open(fileobj=writer, mode="w") as tar:
            tar.add(hello_path, arcname=hello_path.name)

    # digest accumulated during the write must equal the digest of the file on disk
    streamed_digest = writer.hash_func.hexdigest()
    disk_digest = get_file_hash(staging)
    assert streamed_digest == disk_digest

    renamed = f"{streamed_digest}.tar"
    os.rename(staging, write_dest / renamed)
    print(f"tar file renamed to: {renamed}")

    # debugging aid: list every file produced under the temp tree
    for entry in tmp_path.rglob('*'):
        if entry.is_file():
            print(entry)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def test_Hashwriter_tar_gz(tmp_path):
    """Example bespoke use of HashingWriter for .tar.gz
    """
    hello_path = sample_model_path() / "hello.md"
    write_dest = sha256_path(tmp_path)
    write_dest.mkdir(parents=True, exist_ok=True)
    staging = write_dest / "temp_layer.tar.gz"

    with open(staging, "wb") as fh:
        outer = HashingWriter(fh)
        with gzip.GzipFile(fileobj=outer, mode="wb", mtime=0, compresslevel=6) as gz:
            inner = HashingWriter(gz)
            with tarfile.open(fileobj=inner, mode="w") as tar:
                tar.add(hello_path, arcname=hello_path.name)

    # outer writer hashed the compressed bytes: must match the file on disk
    compressed_digest = outer.hash_func.hexdigest()
    assert compressed_digest == get_file_hash(staging)

    # rebuild the equivalent plain tar to cross-check the inner (pre-compression) hash
    plain_tar = tmp_path / "uncompressed.tar"
    with tarfile.open(plain_tar, mode="w") as tar:
        tar.add(hello_path, arcname=hello_path.name)

    uncompressed_digest = get_file_hash(plain_tar)
    assert inner.hash_func.hexdigest() == uncompressed_digest

    os.rename(staging, write_dest / f"{compressed_digest}.tar.gz")

    # debugging aid: list every file produced under the temp tree
    for entry in tmp_path.rglob('*'):
        if entry.is_file():
            print(entry)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def test_tarball_from_file(tmp_path):
    """Test tarball_from_file() function is able to produce the expected tar (uncompressed) layer blob in the oci-layout
    """
    model_path = sample_model_path() / "model.joblib"
    write_dest = sha256_path(tmp_path)
    write_dest.mkdir(parents=True, exist_ok=True)
    # forcing it into a partial temp directory with blobs subdir for tests
    digest = tarball_from_file(model_path, write_dest)
    for entry in tmp_path.rglob('*'):
        if entry.is_file():
            print(entry)

    # the blob's filename must be the sha256 of its own content
    assert digest == get_file_hash(write_dest / digest)

    # model.joblib must land under models/ inside the modelcar with the expected permissions
    with tarfile.open(write_dest / digest, "r") as tar:
        found = next(
            (ti for ti in tar if ti.name == "models/model.joblib" and ti.mode == 0o664),
            None,
        )
    assert found
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def test_targz_from_file(tmp_path):
    """Test targz_from_file() function is able to produce the expected tar.gz layer blob in the oci-layout
    """
    model_path = sample_model_path() / "model.joblib"
    write_dest = sha256_path(tmp_path)
    write_dest.mkdir(parents=True, exist_ok=True)
    # forcing it into a partial temp directory with blobs subdir for tests
    postcomp_chksum, precomp_chksum = targz_from_file(model_path, write_dest)

    for entry in tmp_path.rglob('*'):
        if entry.is_file():
            print(entry)

    # the blob's filename must be the sha256 of its compressed content
    assert postcomp_chksum == get_file_hash(write_dest / postcomp_chksum)

    # model.joblib must land under models/ inside the modelcar with the expected permissions
    with tarfile.open(write_dest / postcomp_chksum, "r:gz") as tar:
        found = next(
            (ti for ti in tar if ti.name == "models/model.joblib" and ti.mode == 0o664),
            None,
        )
    assert found

    # decompress the blob and verify the pre-compression digest matches the inner tar
    uncompressed_tar = write_dest / "uncompressed.tar"
    with gzip.open(write_dest / postcomp_chksum, "rb") as g_in, open(uncompressed_tar, "wb") as f_out:
        shutil.copyfileobj(g_in, f_out)
    for entry in tmp_path.rglob('*'):
        if entry.is_file():
            print(entry)
    assert precomp_chksum == get_file_hash(uncompressed_tar)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def test_compute_hash_of_str():
    """Basis compute_hash_of_str() fn testing
    """
    # expected value reproducible with:
    # echo -n "Hello, World! Thanks Andrew, Roland, Syed and Quay team for the idea :)" | shasum -a 256
    subject = "Hello, World! Thanks Andrew, Roland, Syed and Quay team for the idea :)"
    expected = "8695589abd30ec83cde42fabc3b4f6fd7da629eca94cf146c7920c6b067f4087"
    assert compute_hash_of_str(subject) == expected
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This PR does the following:
create_blobs
function