def _member_is_under(member_name: str, directory: str) -> bool:
    """Return True when `member_name` is `directory` itself or a path below it."""
    if not directory:
        # An empty filter (e.g. tar_filter_dir="/") selects the whole archive.
        return True
    # Compare on a path-component boundary so "models" does not match "models2/...".
    return member_name == directory or member_name.startswith(directory + "/")


def _extract_matching_members(layer_blob: Path, output_root: Path, wanted_dir: str) -> List[str]:
    """Extract the regular files under `wanted_dir` from one layer tarball and return their names."""
    names: List[str] = []
    resolved_root = output_root.resolve()
    # Use the stdlib "data" extraction filter when this Python provides it;
    # older interpreters rely on the explicit containment check below.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(layer_blob, "r:*") as tar:
        for member in tar.getmembers():
            if not (member.isfile() and _member_is_under(member.name, wanted_dir)):
                continue
            # Layer content is untrusted: refuse names ("../", absolute paths)
            # that would be written outside of the requested output directory.
            destination = (resolved_root / member.name).resolve()
            if destination != resolved_root and resolved_root not in destination.parents:
                raise ValueError(f"Refusing to extract {member.name!r}: it would be written outside of {output_root}")
            tar.extract(member, path=output_root, **extract_kwargs)
            names.append(member.name)
    return names


def crawl_ocilayout_blobs_to_extract(ocilayout: Path,
                                     output_path: Path,
                                     tar_filter_dir: str = "/models") -> List[str]:
    """
    Extract from OCI Image/ModelCar only the contents from a specific directory.

    Only regular files are extracted; directories needed to hold them are created
    under `output_path`, which is itself created when missing.

    Args:
        ocilayout: The directory containing the oci-layout of the OCI Image/ModelCar.
        output_path: The directory where to extract the ML model assets from the ModelCar to.
        tar_filter_dir: The subdirectory in the ModelCar to extract, defaults to `"/models"`.
            Leading and trailing slashes are ignored; matching is done on whole path components.

    Returns:
        The list of extracted ML contents from the OCI Image/ModelCar,
        as member names found in the layer tarballs (e.g. `"models/README.md"`).

    Raises:
        ValueError: If the root index does not hold exactly one manifest, if that manifest
            is not an Image manifest, or if a selected tar member would be written
            outside of `output_path`.
    """
    extracted: List[str] = []
    wanted_dir = tar_filter_dir.strip("/")
    blobs_path = ocilayout / "blobs" / "sha256"
    output_root = Path(output_path)
    output_root.mkdir(parents=True, exist_ok=True)
    verify_ocilayout(ocilayout)
    ocilayout_root_index = read_ocilayout_root_index(ocilayout)
    manifest_count = len(ocilayout_root_index.manifests)
    if manifest_count != 1:
        raise ValueError(f"TODO expected the root index to hold exactly one manifest (single ModelCar), found {manifest_count}")
    manifest0 = ocilayout_root_index.manifests[0]
    if manifest0.mediaType != MediaTypes.manifest:
        raise ValueError("Can only extract from ModelCar Image manifests")
    manifest_path = blobs_path / manifest0.digest.removeprefix("sha256:")
    with open(manifest_path, "r", encoding="utf-8") as ip:
        image_manifest = OCIImageManifest.model_validate_json(ip.read())
    for layer in image_manifest.layers:
        # Only plain or gzip-compressed tar layers are inspected; other media types are skipped.
        if layer.mediaType not in (MediaTypes.layer, MediaTypes.layer_gzip):
            continue
        layer_blob = blobs_path / layer.digest.removeprefix("sha256:")
        extracted.extend(_extract_matching_members(layer_blob, output_root, wanted_dir))
    return extracted
def test_crawl_ocilayout_blobs_to_extract(tmp_path: Path):
    """Extract the ocilayout4 ModelCar fixture and check both the returned names and the files on disk.

    The fixture is a ModelCar holding one ML file "model.joblib" and one text file
    "README.md" (the ModelCarD); extraction must yield exactly those 2 assets.
    """
    layout_dir = Path(__file__).parent / "data" / "ocilayout4"
    extracted_names = crawl_ocilayout_blobs_to_extract(layout_dir, tmp_path)

    # The returned listing names both assets, and nothing else.
    assert len(extracted_names) == 2
    for expected_name in ("models/README.md", "models/model.joblib"):
        assert expected_name in extracted_names

    # The output directory holds exactly those two regular files.
    files_on_disk = [entry for entry in tmp_path.rglob("*") if entry.is_file()]
    assert len(files_on_disk) == 2
    models_dir = tmp_path / "models"
    assert (models_dir / "README.md").exists()
    assert (models_dir / "model.joblib").exists()
+{"created":"2024-11-29T10:50:43.850369Z","architecture":"arm64","variant":"v8","os":"linux","config":{"Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"],"Cmd":["/bin/sh","-c","echo 'Hello, World! and will wait forever' && sleep infinity"],"Labels":{"io.buildah.version":"1.37.1"}},"rootfs":{"diff_ids":["sha256:f21ad18174949794e810922c8ada6ff8416aabab8ef3fd3bd144e47058359f52","sha256:cb48e126671f2c2d857bc7f2ada5f7b9ae6628add26353af15f309eb99b84b93","sha256:7f426b6872596a77dca738ad102c7694f4f78f0729ae218202b74c32c4832e3f"],"type":"layers"},"history":[{"created":"2024-09-26T21:31:42Z","created_by":"BusyBox 1.37.0 (glibc), Debian 12"},{"created":"2024-11-29T10:50:43.850412Z","created_by":"/bin/sh -c #(nop) CMD [\"/bin/sh\", \"-c\", \"echo 'Hello, World! and will wait forever' && sleep infinity\"]","comment":"FROM docker.io/library/busybox:latest","empty_layer":true}]} \ No newline at end of file diff --git a/tests/data/ocilayout4/blobs/sha256/6b679ab88ee14309a20d40544941f67b890e4ca49e1c40cfa133357f59b134d0 b/tests/data/ocilayout4/blobs/sha256/6b679ab88ee14309a20d40544941f67b890e4ca49e1c40cfa133357f59b134d0 new file mode 100644 index 0000000..700af92 --- /dev/null +++ b/tests/data/ocilayout4/blobs/sha256/6b679ab88ee14309a20d40544941f67b890e4ca49e1c40cfa133357f59b134d0 @@ -0,0 +1 @@ 
+{"schemaVersion":2,"mediaType":"application/vnd.oci.image.manifest.v1+json","config":{"mediaType":"application/vnd.oci.image.config.v1+json","digest":"sha256:2df53fcc3c2170bcbe65e6d9c4809caee2b1792976d5867f12c04a481823ee13","size":900},"layers":[{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:1933e30a3373776d5c7155591a6dacbc205cf6a2665b6dced682c6d2ea7b000f","size":1949749},{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:c294efacaa47e3bb60a3f718f28500d4f4ea95c32ac43c63fefe9807977eb7b2","size":1519},{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:e14aeeb6d3b72007eaf5646fc5dc000a4d4aed7c22d2300529687c8013929034","size":2261}],"annotations":{"io.opendatahub.temp.author":"olot","org.opencontainers.image.base.digest":"sha256:6ca1ac3927a17445a61188b4f91af0bfb1e0b16757b07ec9f556e9e1e0851b15","org.opencontainers.image.base.name":"docker.io/library/busybox:latest","org.opencontainers.image.url":"https://github.com/docker-library/busybox","org.opencontainers.image.version":"1.37.0-glibc"}} \ No newline at end of file diff --git a/tests/data/ocilayout4/blobs/sha256/c294efacaa47e3bb60a3f718f28500d4f4ea95c32ac43c63fefe9807977eb7b2 b/tests/data/ocilayout4/blobs/sha256/c294efacaa47e3bb60a3f718f28500d4f4ea95c32ac43c63fefe9807977eb7b2 new file mode 100644 index 0000000..0e31f05 Binary files /dev/null and b/tests/data/ocilayout4/blobs/sha256/c294efacaa47e3bb60a3f718f28500d4f4ea95c32ac43c63fefe9807977eb7b2 differ diff --git a/tests/data/ocilayout4/blobs/sha256/e14aeeb6d3b72007eaf5646fc5dc000a4d4aed7c22d2300529687c8013929034 b/tests/data/ocilayout4/blobs/sha256/e14aeeb6d3b72007eaf5646fc5dc000a4d4aed7c22d2300529687c8013929034 new file mode 100644 index 0000000..9abad62 Binary files /dev/null and b/tests/data/ocilayout4/blobs/sha256/e14aeeb6d3b72007eaf5646fc5dc000a4d4aed7c22d2300529687c8013929034 differ diff --git a/tests/data/ocilayout4/index.json b/tests/data/ocilayout4/index.json new file mode 100644 
index 0000000..ac1ed34 --- /dev/null +++ b/tests/data/ocilayout4/index.json @@ -0,0 +1 @@ +{"schemaVersion":2,"manifests":[{"mediaType":"application/vnd.oci.image.manifest.v1+json","digest":"sha256:6b679ab88ee14309a20d40544941f67b890e4ca49e1c40cfa133357f59b134d0","size":1077,"annotations":{"org.opencontainers.image.ref.name":"latest"},"platform":{"architecture":"arm64","os":"linux"}}]} \ No newline at end of file diff --git a/tests/data/ocilayout4/oci-layout b/tests/data/ocilayout4/oci-layout new file mode 100644 index 0000000..1343d37 --- /dev/null +++ b/tests/data/ocilayout4/oci-layout @@ -0,0 +1 @@ +{"imageLayoutVersion":"1.0.0"} \ No newline at end of file