Skip to content

Commit

Permalink
Merge pull request #13 from containers/tarilabs-20250212-extract-from…
Browse files Browse the repository at this point in the history
…-modelcar

utils: extract `/models` content from ModelCar
  • Loading branch information
jaideepr97 authored Feb 13, 2025
2 parents b865091 + 278de46 commit 5042c27
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 1 deletion.
44 changes: 44 additions & 0 deletions olot/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from pathlib import Path
from pprint import pprint
import tarfile
from typing import Dict, List
import typing
import click
Expand Down Expand Up @@ -138,5 +139,48 @@ def crawl_ocilayout_indexes(ocilayout: Path, ocilayout_root_index: OCIImageIndex
click.echo(f"Found Image Manifest {m.digest} in root index, TODO assuming these are referred through the other indexes")
return ocilayout_indexes


def crawl_ocilayout_blobs_to_extract(ocilayout: Path,
                                     output_path: Path,
                                     tar_filter_dir: str = "/models") -> List[str]:
    """
    Extract from OCI Image/ModelCar only the contents from a specific directory.

    The root index of the oci-layout must reference exactly one Image manifest.
    Every tar (optionally gzip-compressed) layer of that manifest is scanned and
    the regular files located under `tar_filter_dir` are written below
    `output_path`, keeping their path relative to the layer root
    (e.g. `<output_path>/models/model.joblib`).

    Args:
        ocilayout: The directory containing the oci-layout of the OCI Image/ModelCar.
        output_path: The directory where to extract the ML model assets from the ModelCar to.
            It is created if missing, but only once the oci-layout has been validated.
        tar_filter_dir: The subdirectory in the ModelCar to extract, defaults to `"/models"`.
            Leading and trailing slashes are ignored; an empty value selects every file.

    Returns:
        The list of extracted ML contents from the OCI Image/ModelCar, as tar member
        names in the order they were encountered.

    Raises:
        ValueError: if the root index does not contain exactly one manifest, if that
            manifest is not an Image manifest, or if a selected tar member would be
            written outside of `output_path`.
    """
    # Validate the layout before touching the filesystem, so that an invalid
    # input does not leave an empty output directory behind.
    verify_ocilayout(ocilayout)
    ocilayout_root_index = read_ocilayout_root_index(ocilayout)
    manifests = ocilayout_root_index.manifests
    if len(manifests) != 1:
        raise ValueError(
            f"Expected a single ModelCar manifest in the root index, found {len(manifests)} "
            "(TODO: support more than one manifest)"
        )
    manifest0 = manifests[0]
    if manifest0.mediaType != MediaTypes.manifest:
        raise ValueError("Can only extract from ModelCar Image manifests")

    blobs_path = ocilayout / "blobs" / "sha256"
    manifest_path = blobs_path / manifest0.digest.removeprefix("sha256:")
    with open(manifest_path, "r") as ip:
        image_manifest = OCIImageManifest.model_validate_json(ip.read())

    # Match on a path boundary: "models" must select "models/x" but not
    # "models_backup/x". An empty filter keeps the "select everything" behavior.
    wanted_dir = tar_filter_dir.strip("/")
    wanted_prefix = wanted_dir + "/" if wanted_dir else ""

    os.makedirs(output_path, exist_ok=True)
    output_root = os.path.realpath(output_path)
    # Use the stdlib "data" extraction filter when this Python provides it
    # (feature check documented by the tarfile module).
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    extracted: List[str] = []
    for layer in image_manifest.layers:
        if layer.mediaType not in (MediaTypes.layer, MediaTypes.layer_gzip):
            continue
        layer_path = blobs_path / layer.digest.removeprefix("sha256:")
        with tarfile.open(layer_path, "r:*") as tar:
            for member in tar.getmembers():
                if not member.isfile():
                    continue
                if member.name != wanted_dir and not member.name.startswith(wanted_prefix):
                    continue
                # Layer content is untrusted: refuse names such as
                # "models/../../x" that would resolve outside of output_path.
                destination = os.path.realpath(os.path.join(output_root, member.name))
                if os.path.commonpath([output_root, destination]) != output_root:
                    raise ValueError(
                        f"Refusing to extract {member.name!r}: it would be written outside of {output_path}"
                    )
                tar.extract(member, path=output_path, **extract_kwargs)
                extracted.append(member.name)
    return extracted


# Script entry-point guard: when this module is executed directly (rather than
# imported), it only prints a placeholder "?".
if __name__ == "__main__":
    print("?")
19 changes: 18 additions & 1 deletion tests/basic_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
from typing import Dict

from olot.basics import crawl_ocilayout_indexes, crawl_ocilayout_manifests
from olot.basics import crawl_ocilayout_blobs_to_extract, crawl_ocilayout_indexes, crawl_ocilayout_manifests

from olot.oci.oci_image_index import OCIImageIndex, read_ocilayout_root_index
from olot.oci.oci_image_manifest import OCIImageManifest
Expand Down Expand Up @@ -46,3 +46,20 @@ def test_crawl_ocilayout_manifests():
assert layer0.size == 1949749
assert layer0.mediaType == "application/vnd.oci.image.layer.v1.tar+gzip"


def test_crawl_ocilayout_blobs_to_extract(tmp_path: Path):
    """Extract the `ocilayout4` fixture, a ModelCar holding one ML file "model.joblib" and one text file "README.md" (the ModelCarD).
    Check that exactly those 2 assets are reported by the function and written under the temporary directory.
    """
    layout_dir = Path(__file__).parent / "data" / "ocilayout4"
    extracted_names = crawl_ocilayout_blobs_to_extract(layout_dir, tmp_path)

    # The returned names are the tar member names, relative to the layer root.
    assert len(extracted_names) == 2
    for expected_name in ("models/README.md", "models/model.joblib"):
        assert expected_name in extracted_names

    # Nothing else than the two model assets must have been written to disk.
    files_on_disk = [entry for entry in tmp_path.rglob("*") if entry.is_file()]
    assert len(files_on_disk) == 2
    models_dir = tmp_path / "models"
    assert (models_dir / "README.md").exists()
    assert (models_dir / "model.joblib").exists()
3 changes: 3 additions & 0 deletions tests/data/ocilayout4/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
A ModelCar for linux/arm64 with a layer containing the ML model file, and a ModelCarD.

oras copy --platform linux/arm64 --to-oci-layout quay.io/mmortari/demo20241208:latest ./download:latest
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"created":"2024-11-29T10:50:43.850369Z","architecture":"arm64","variant":"v8","os":"linux","config":{"Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"],"Cmd":["/bin/sh","-c","echo 'Hello, World! and will wait forever' && sleep infinity"],"Labels":{"io.buildah.version":"1.37.1"}},"rootfs":{"diff_ids":["sha256:f21ad18174949794e810922c8ada6ff8416aabab8ef3fd3bd144e47058359f52","sha256:cb48e126671f2c2d857bc7f2ada5f7b9ae6628add26353af15f309eb99b84b93","sha256:7f426b6872596a77dca738ad102c7694f4f78f0729ae218202b74c32c4832e3f"],"type":"layers"},"history":[{"created":"2024-09-26T21:31:42Z","created_by":"BusyBox 1.37.0 (glibc), Debian 12"},{"created":"2024-11-29T10:50:43.850412Z","created_by":"/bin/sh -c #(nop) CMD [\"/bin/sh\", \"-c\", \"echo 'Hello, World! and will wait forever' && sleep infinity\"]","comment":"FROM docker.io/library/busybox:latest","empty_layer":true}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"schemaVersion":2,"mediaType":"application/vnd.oci.image.manifest.v1+json","config":{"mediaType":"application/vnd.oci.image.config.v1+json","digest":"sha256:2df53fcc3c2170bcbe65e6d9c4809caee2b1792976d5867f12c04a481823ee13","size":900},"layers":[{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:1933e30a3373776d5c7155591a6dacbc205cf6a2665b6dced682c6d2ea7b000f","size":1949749},{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:c294efacaa47e3bb60a3f718f28500d4f4ea95c32ac43c63fefe9807977eb7b2","size":1519},{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"sha256:e14aeeb6d3b72007eaf5646fc5dc000a4d4aed7c22d2300529687c8013929034","size":2261}],"annotations":{"io.opendatahub.temp.author":"olot","org.opencontainers.image.base.digest":"sha256:6ca1ac3927a17445a61188b4f91af0bfb1e0b16757b07ec9f556e9e1e0851b15","org.opencontainers.image.base.name":"docker.io/library/busybox:latest","org.opencontainers.image.url":"https://github.com/docker-library/busybox","org.opencontainers.image.version":"1.37.0-glibc"}}
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/ocilayout4/index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"schemaVersion":2,"manifests":[{"mediaType":"application/vnd.oci.image.manifest.v1+json","digest":"sha256:6b679ab88ee14309a20d40544941f67b890e4ca49e1c40cfa133357f59b134d0","size":1077,"annotations":{"org.opencontainers.image.ref.name":"latest"},"platform":{"architecture":"arm64","os":"linux"}}]}
1 change: 1 addition & 0 deletions tests/data/ocilayout4/oci-layout
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"imageLayoutVersion":"1.0.0"}

0 comments on commit 5042c27

Please sign in to comment.