/** GPU model requested (as `gpu_kind`) for GPU builder machines and for the compute hint of their volumes. */
const GPU_KIND = 'a10'

/** The only region GPU builders are placed in; requests for any other region are rejected before calling fly. */
const GPU_REGION = 'ord'

/**
 * Builds the guest sizing used both for GPU machines (`config.guest`) and for GPU volumes (`compute`).
 *
 * Keeping the spec in one place guarantees the volume is created for the same
 * machine shape that is later launched against it. A fresh object is returned
 * on every call so no two requests share a mutable value.
 */
function gpuGuest() {
  return {
    cpu_kind: 'performance',
    cpus: 16,
    memory_mb: 1024 * 32, // 32 GiB expressed in MiB
    gpus: 1,
    gpu_kind: GPU_KIND,
  } as const
}

/**
 * Rejects any region other than {@link GPU_REGION}.
 *
 * @param region - region requested by the caller
 * @param resource - plural noun used in the error message
 * @throws Error when `region` is not the GPU region
 */
function assertGPURegion(region: string, resource: 'machines' | 'volumes'): void {
  if (region !== GPU_REGION) {
    throw new Error(`GPU ${resource} are only available in the ${GPU_REGION} region`)
  }
}

/**
 * Launches a GPU-backed buildkit machine named after the depot ID.
 *
 * The machine mounts `volumeID` at `/var/lib/buildkit`, starts
 * `/usr/bin/entrypoint.sh`, and receives every entry of `files` as a
 * base64-encoded file at the given guest path. It is configured to neither
 * auto-destroy nor restart.
 *
 * @param buildkit - depot ID, region, volume, image, environment and files for the machine
 * @returns whatever `launchMachine` resolves to
 * @throws Error when `buildkit.region` is not `ord`; `launchMachine` is not called in that case
 */
// NOTE(review): the type argument of the return annotation is not visible in SOURCE;
// V1Machine is the machine type this file imports from './client' — confirm.
export async function launchBuildkitGPUMachine(buildkit: BuildkitMachineRequest): Promise<V1Machine> {
  const {depotID, region, volumeID, image, env, files} = buildkit
  assertGPURegion(region, 'machines')

  const machine = await launchMachine({
    name: depotID,
    region,
    config: {
      guest: gpuGuest(),
      // File contents are sent as `raw_value`, base64-encoded.
      files: Object.entries(files).map(([guest_path, raw_value]) => ({
        guest_path,
        raw_value: Buffer.from(raw_value).toString('base64'),
      })),
      init: {
        entryPoint: ['/usr/bin/entrypoint.sh'],
      },
      env,
      image,
      mounts: [
        {
          encrypted: false,
          path: '/var/lib/buildkit',
          volume: volumeID,
        },
      ],
      auto_destroy: false,
      restart: {policy: 'no'},
      dns: {},
    },
  })
  return machine
}

/**
 * Creates the unencrypted ext4 volume that backs a GPU buildkit machine.
 *
 * The volume carries the same compute spec the GPU machine is launched with.
 * The region is validated here as well, so a volume is never created in a
 * region where `launchBuildkitGPUMachine` would refuse to launch its machine.
 *
 * @param req - depot ID (used as the volume name), region and size in GB
 * @returns whatever `createVolume` resolves to
 * @throws Error when `req.region` is not `ord`; `createVolume` is not called in that case
 */
// NOTE(review): the type argument of the return annotation is not visible in SOURCE;
// Volume is the volume type this file imports from './client' — confirm.
export async function createBuildkitGPUVolume(req: BuildkitVolumeRequest): Promise<Volume> {
  const {depotID, region, sizeGB} = req
  assertGPURegion(region, 'volumes')

  const volume = await createVolume({
    name: depotID,
    region,
    size_gb: sizeGB,
    snapshot_retention: 5, // 5 is fly's minimum value.
    encrypted: false,
    fstype: 'ext4',
    compute: gpuGuest(),
  })
  return volume
}
volume.id) if (existing) return - console.log(`Creating new volume ${volume.id}`) - await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + if (volume.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) { + console.log(`Creating new gpu volume ${volume.id}`) + await createBuildkitGPUVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + } else { + console.log(`Creating new volume ${volume.id}`) + await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size}) + } } // fly volumes are not attached/detatched. The only modification is deleting the volume. @@ -115,10 +126,16 @@ async function reconcileNewMachine(state: V1Machine[], machine: GetDesiredStateR files: flyOptions.files, } + if (machine.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) { + const flyMachine = await launchBuildkitGPUMachine(req) + if (!flyMachine) throw new Error(`Unable to launch gpu machine ${machine.id}`) + console.log(`Launched new gpu machine ${machine.id} ${flyMachine.id}`) + return + } + try { const flyMachine = await launchBuildkitMachine(req) if (!flyMachine) throw new Error(`Unable to launch machine ${machine.id}`) - console.log(`Launched new machine ${machine.id} ${flyMachine.id}`) } catch (err) { // If we get a capacity error, delete the volume and try again.