From 61642f5f864b8f6bb5f0d4fb51a5ba1aa134321a Mon Sep 17 00:00:00 2001 From: Chris Goller Date: Tue, 6 Aug 2024 07:50:39 -0500 Subject: [PATCH 1/4] feat(fly): support launching A10 GPUs in Fly's ORD region Signed-off-by: Chris Goller --- proto/depot/cloud/v2/cloud.proto | 1 + src/proto/depot/cloud/v2/cloud_pb.ts | 6 ++++ src/utils/fly/buildkit.ts | 41 ++++++++++++++++++++++++++++ src/utils/fly/reconcile.ts | 9 ++++-- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/proto/depot/cloud/v2/cloud.proto b/proto/depot/cloud/v2/cloud.proto index 3e59b9f..37ebf5a 100644 --- a/proto/depot/cloud/v2/cloud.proto +++ b/proto/depot/cloud/v2/cloud.proto @@ -86,6 +86,7 @@ message GetDesiredStateResponse { enum Kind { KIND_UNSPECIFIED = 0; KIND_BUILDKIT = 1; + KIND_BUILDKIT_GPU = 2; } enum MachineState { diff --git a/src/proto/depot/cloud/v2/cloud_pb.ts b/src/proto/depot/cloud/v2/cloud_pb.ts index c1ba88a..041815c 100644 --- a/src/proto/depot/cloud/v2/cloud_pb.ts +++ b/src/proto/depot/cloud/v2/cloud_pb.ts @@ -230,11 +230,17 @@ export enum GetDesiredStateResponse_Kind { * @generated from enum value: KIND_BUILDKIT = 1; */ BUILDKIT = 1, + + /** + * @generated from enum value: KIND_BUILDKIT_GPU = 2; + */ + BUILDKIT_GPU = 2, } // Retrieve enum metadata with: proto3.getEnumType(GetDesiredStateResponse_Kind) proto3.util.setEnumType(GetDesiredStateResponse_Kind, 'depot.cloud.v2.GetDesiredStateResponse.Kind', [ {no: 0, name: 'KIND_UNSPECIFIED'}, {no: 1, name: 'KIND_BUILDKIT'}, + {no: 2, name: 'KIND_BUILDKIT_GPU'}, ]) /** diff --git a/src/utils/fly/buildkit.ts b/src/utils/fly/buildkit.ts index 31e4150..3eef31f 100644 --- a/src/utils/fly/buildkit.ts +++ b/src/utils/fly/buildkit.ts @@ -44,6 +44,47 @@ export async function launchBuildkitMachine(buildkit: BuildkitMachineRequest): P return machine } +export async function launchBuildkitGPUMachine(buildkit: BuildkitMachineRequest): Promise { + const {depotID, region, volumeID, image, env, files} = buildkit + if (region !== 'ord') { + throw new Error('GPU machines are only available in the ord region') + } + + const machine = await launchMachine({ + name: depotID, + region, + config: { + guest: { + cpu_kind: 'performance', + cpus: 16, + memory_mb: 1024 * 32, + gpus: 1, + gpu_kind: 'a10', + }, + files: Object.entries(files).map(([guest_path, raw_value]) => ({ + guest_path, + raw_value: Buffer.from(raw_value).toString('base64'), + })), + init: { + entryPoint: ['/usr/bin/machine-agent'], + }, + env, + image, + mounts: [ + { + encrypted: false, + path: '/var/lib/buildkit', + volume: volumeID, + }, + ], + auto_destroy: false, + restart: {policy: 'no'}, + dns: {}, + }, + }) + return machine +} + export interface BuildkitVolumeRequest { depotID: string region: string diff --git a/src/utils/fly/reconcile.ts b/src/utils/fly/reconcile.ts index 7b9e08a..79f6fe9 100644 --- a/src/utils/fly/reconcile.ts +++ b/src/utils/fly/reconcile.ts @@ -1,6 +1,7 @@ import { GetDesiredStateResponse, GetDesiredStateResponse_Architecture, + GetDesiredStateResponse_Kind, GetDesiredStateResponse_MachineChange, GetDesiredStateResponse_MachineState, GetDesiredStateResponse_NewMachine, @@ -13,7 +14,7 @@ import {CLOUD_AGENT_CONNECTION_ID, FLY_REGION} from '../env' import {errorMessage} from '../errors' import {client} from '../grpc' import {toPlainObject} from '../plain' -import {createBuildkitVolume, launchBuildkitMachine} from './buildkit' +import {createBuildkitVolume, launchBuildkitGPUMachine, launchBuildkitMachine} from './buildkit' import { V1Machine, Volume, @@ -116,7 +117,11 @@ async function reconcileNewMachine(state: V1Machine[], machine: GetDesiredStateR } try { - const flyMachine = await launchBuildkitMachine(req) + const flyMachine = + machine.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU + ? await launchBuildkitGPUMachine(req) + : await launchBuildkitMachine(req) + if (!flyMachine) throw new Error(`Unable to launch machine ${machine.id}`) console.log(`Launched new machine ${machine.id} ${flyMachine.id}`) From 1e25bf7bcbff6b3ec81cb3d40ba8ad95a2ee944a Mon Sep 17 00:00:00 2001 From: Chris Goller Date: Tue, 6 Aug 2024 11:26:45 -0500 Subject: [PATCH 2/4] fix: disallow GPU machines from recreating volumes Signed-off-by: Chris Goller --- src/utils/fly/reconcile.ts | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/utils/fly/reconcile.ts b/src/utils/fly/reconcile.ts index 79f6fe9..9d91e51 100644 --- a/src/utils/fly/reconcile.ts +++ b/src/utils/fly/reconcile.ts @@ -116,14 +116,16 @@ async function reconcileNewMachine(state: V1Machine[], machine: GetDesiredStateR files: flyOptions.files, } - try { - const flyMachine = - machine.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU - ? await launchBuildkitGPUMachine(req) - : await launchBuildkitMachine(req) + if (machine.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) { + const flyMachine = await launchBuildkitGPUMachine(req) + if (!flyMachine) throw new Error(`Unable to launch gpu machine ${machine.id}`) + console.log(`Launched new gpu machine ${machine.id} ${flyMachine.id}`) + return + } + try { + const flyMachine = await launchBuildkitMachine(req) if (!flyMachine) throw new Error(`Unable to launch machine ${machine.id}`) - console.log(`Launched new machine ${machine.id} ${flyMachine.id}`) } catch (err) { // If we get a capacity error, delete the volume and try again. From 449d5e961b27cbee88a37825cfc7b0fd64beac15 Mon Sep 17 00:00:00 2001 From: Chris Goller Date: Tue, 6 Aug 2024 12:26:44 -0500 Subject: [PATCH 3/4] feat: create volume on GPU instance Signed-off-by: Chris Goller --- src/utils/fly/buildkit.ts | 24 +++++++++++++++++++++++- src/utils/fly/reconcile.ts | 16 +++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/utils/fly/buildkit.ts b/src/utils/fly/buildkit.ts index 3eef31f..dbd9cef 100644 --- a/src/utils/fly/buildkit.ts +++ b/src/utils/fly/buildkit.ts @@ -1,5 +1,7 @@ import {V1Machine, Volume, createVolume, launchMachine} from './client' +const GPU_KIND = 'a10' + export interface BuildkitMachineRequest { depotID: string region: string @@ -59,7 +61,7 @@ export async function launchBuildkitGPUMachine(buildkit: BuildkitMachineRequest) cpus: 16, memory_mb: 1024 * 32, gpus: 1, - gpu_kind: 'a10', + gpu_kind: GPU_KIND, }, files: Object.entries(files).map(([guest_path, raw_value]) => ({ guest_path, @@ -103,3 +105,23 @@ export async function createBuildkitVolume(req: BuildkitVolumeRequest): Promise< }) return volume } + +export async function createBuildkitGPUVolume(req: BuildkitVolumeRequest): Promise { + const {depotID, region, sizeGB} = req + const volume = await createVolume({ + name: depotID, + region, + size_gb: sizeGB, + snapshot_retention: 5, // 5 is fly's minimum value. + encrypted: false, + fstype: 'ext4', + compute: { + cpu_kind: 'performance', + cpus: 16, + memory_mb: 1024 * 32, + gpus: 1, + gpu_kind: GPU_KIND, + }, + }) + return volume +} diff --git a/src/utils/fly/reconcile.ts b/src/utils/fly/reconcile.ts index 9d91e51..5baa53c 100644 --- a/src/utils/fly/reconcile.ts +++ b/src/utils/fly/reconcile.ts @@ -14,7 +14,12 @@ import {CLOUD_AGENT_CONNECTION_ID, FLY_REGION} from '../env' import {errorMessage} from '../errors' import {client} from '../grpc' import {toPlainObject} from '../plain' -import {createBuildkitVolume, launchBuildkitGPUMachine, launchBuildkitMachine} from './buildkit' +import { + createBuildkitGPUVolume, + createBuildkitVolume, + launchBuildkitGPUMachine, + launchBuildkitMachine, +} from './buildkit' import { V1Machine, Volume, @@ -63,8 +68,13 @@ async function reconcileNewVolume(state: Volume[], volume: GetDesiredStateRespon const existing = state.find((v) => v.name === volume.id) if (existing) return - console.log(`Creating new volume ${volume.id}`) - await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + if (volume.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) { + console.log(`Creating new gpu volume ${volume.id}`) + await createBuildkitGPUVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + } else { + console.log(`Creating new volume ${volume.id}`) + await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + } } // fly volumes are not attached/detatched. The only modification is deleting the volume. From 3228cf268859bc7c64ad6d0a7f11400dadc5f93f Mon Sep 17 00:00:00 2001 From: Chris Goller Date: Tue, 6 Aug 2024 18:45:52 -0500 Subject: [PATCH 4/4] fix: use entrypoint that will run ldconfig We need ldconfig to be run because the nvidia container runtime requires the location of the various driver libraries. Signed-off-by: Chris Goller --- src/utils/fly/buildkit.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/fly/buildkit.ts b/src/utils/fly/buildkit.ts index dbd9cef..97a3e9b 100644 --- a/src/utils/fly/buildkit.ts +++ b/src/utils/fly/buildkit.ts @@ -68,7 +68,7 @@ export async function launchBuildkitGPUMachine(buildkit: BuildkitMachineRequest) raw_value: Buffer.from(raw_value).toString('base64'), })), init: { - entryPoint: ['/usr/bin/machine-agent'], + entryPoint: ['/usr/bin/entrypoint.sh'], }, env, image,