Skip to content

Commit

Permalink
Merge pull request #100 from depot/feat/gpu
Browse files Browse the repository at this point in the history
feat(fly): support launching A10 GPUs in Fly's ORD region
  • Loading branch information
goller authored Aug 6, 2024
2 parents a0f2a19 + 3228cf2 commit 91a51bb
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 4 deletions.
1 change: 1 addition & 0 deletions proto/depot/cloud/v2/cloud.proto
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ message GetDesiredStateResponse {
// Kind identifies which flavor of builder resource a desired-state entry
// describes. The Fly reconciler compares the `kind` of new volumes and new
// machines against KIND_BUILDKIT_GPU to decide which create/launch helper to
// call.
enum Kind {
// Zero value; presumably what readers see when the field is unset (TODO confirm).
KIND_UNSPECIFIED = 0;
// Buildkit resource; the Fly reconciler sends every non-GPU kind down this path.
KIND_BUILDKIT = 1;
// Buildkit resource that the Fly reconciler routes to the GPU volume/machine helpers.
KIND_BUILDKIT_GPU = 2;
}

enum MachineState {
Expand Down
6 changes: 6 additions & 0 deletions src/proto/depot/cloud/v2/cloud_pb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -230,11 +230,17 @@ export enum GetDesiredStateResponse_Kind {
* @generated from enum value: KIND_BUILDKIT = 1;
*/
BUILDKIT = 1,

/**
* @generated from enum value: KIND_BUILDKIT_GPU = 2;
*/
BUILDKIT_GPU = 2,
}
// Retrieve enum metadata with: proto3.getEnumType(GetDesiredStateResponse_Kind)
// Each entry below pairs an enum number (`no`) with the value name declared in
// the .proto file. This file carries `@generated` markers, so changes
// presumably belong in the .proto followed by regeneration, not in hand edits here.
proto3.util.setEnumType(GetDesiredStateResponse_Kind, 'depot.cloud.v2.GetDesiredStateResponse.Kind', [
{no: 0, name: 'KIND_UNSPECIFIED'},
{no: 1, name: 'KIND_BUILDKIT'},
{no: 2, name: 'KIND_BUILDKIT_GPU'},
])

/**
Expand Down
63 changes: 63 additions & 0 deletions src/utils/fly/buildkit.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import {V1Machine, Volume, createVolume, launchMachine} from './client'

/**
 * GPU model identifier used for GPU builders. It is sent as `gpu_kind` both in
 * the machine's `guest` config and in the volume's `compute` settings, so the
 * two always request the same GPU.
 */
const GPU_KIND = 'a10'

export interface BuildkitMachineRequest {
depotID: string
region: string
Expand Down Expand Up @@ -44,6 +46,47 @@ export async function launchBuildkitMachine(buildkit: BuildkitMachineRequest): P
return machine
}

/**
 * Launches a buildkit machine with a single GPU attached.
 *
 * The machine is named after the depot ID, mounts the given volume at
 * `/var/lib/buildkit`, starts `/usr/bin/entrypoint.sh`, and is configured with
 * `auto_destroy: false` and a `no` restart policy.
 *
 * @param buildkit - Depot ID, region, volume ID, image, environment and files
 *   to place on the machine.
 * @returns The machine returned by `launchMachine`.
 * @throws Error if `buildkit.region` is anything other than `ord`.
 */
export async function launchBuildkitGPUMachine(buildkit: BuildkitMachineRequest): Promise<V1Machine> {
  // Guard: refuse any region other than `ord` before calling launchMachine.
  if (buildkit.region !== 'ord') {
    throw new Error('GPU machines are only available in the ord region')
  }

  // File contents are sent base64-encoded in `raw_value`, keyed by guest path.
  const encodedFiles = Object.entries(buildkit.files).map(([path, contents]) => {
    return {guest_path: path, raw_value: Buffer.from(contents).toString('base64')}
  })

  return await launchMachine({
    name: buildkit.depotID,
    region: buildkit.region,
    config: {
      guest: {
        cpu_kind: 'performance',
        cpus: 16,
        memory_mb: 1024 * 32,
        gpus: 1,
        gpu_kind: GPU_KIND,
      },
      files: encodedFiles,
      init: {
        entryPoint: ['/usr/bin/entrypoint.sh'],
      },
      env: buildkit.env,
      image: buildkit.image,
      mounts: [
        {
          encrypted: false,
          path: '/var/lib/buildkit',
          volume: buildkit.volumeID,
        },
      ],
      auto_destroy: false,
      restart: {policy: 'no'},
      dns: {},
    },
  })
}

export interface BuildkitVolumeRequest {
depotID: string
region: string
Expand All @@ -62,3 +105,23 @@ export async function createBuildkitVolume(req: BuildkitVolumeRequest): Promise<
})
return volume
}

/**
 * Creates the volume that backs a GPU buildkit machine.
 *
 * The volume is named after the depot ID, is unencrypted ext4, and carries
 * `compute` settings with the same CPU, memory and GPU values that the GPU
 * machine's `guest` config uses.
 *
 * @param req - Depot ID, region and size in GB for the new volume.
 * @returns The volume returned by `createVolume`.
 */
export async function createBuildkitGPUVolume(req: BuildkitVolumeRequest): Promise<Volume> {
  return await createVolume({
    name: req.depotID,
    region: req.region,
    size_gb: req.sizeGB,
    // 5 is fly's minimum value.
    snapshot_retention: 5,
    encrypted: false,
    fstype: 'ext4',
    compute: {
      cpu_kind: 'performance',
      cpus: 16,
      memory_mb: 1024 * 32,
      gpus: 1,
      gpu_kind: GPU_KIND,
    },
  })
}
25 changes: 21 additions & 4 deletions src/utils/fly/reconcile.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {
GetDesiredStateResponse,
GetDesiredStateResponse_Architecture,
GetDesiredStateResponse_Kind,
GetDesiredStateResponse_MachineChange,
GetDesiredStateResponse_MachineState,
GetDesiredStateResponse_NewMachine,
Expand All @@ -13,7 +14,12 @@ import {CLOUD_AGENT_CONNECTION_ID, FLY_REGION} from '../env'
import {errorMessage} from '../errors'
import {client} from '../grpc'
import {toPlainObject} from '../plain'
import {createBuildkitVolume, launchBuildkitMachine} from './buildkit'
import {
createBuildkitGPUVolume,
createBuildkitVolume,
launchBuildkitGPUMachine,
launchBuildkitMachine,
} from './buildkit'
import {
V1Machine,
Volume,
Expand Down Expand Up @@ -62,8 +68,13 @@ async function reconcileNewVolume(state: Volume[], volume: GetDesiredStateRespon
const existing = state.find((v) => v.name === volume.id)
if (existing) return

console.log(`Creating new volume ${volume.id}`)
await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size})
if (volume.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) {
console.log(`Creating new gpu volume ${volume.id}`)
await createBuildkitGPUVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size})
} else {
console.log(`Creating new volume ${volume.id}`)
await createBuildkitVolume({depotID: volume.id, region: volume.zone ?? FLY_REGION, sizeGB: volume.size})
}
}

// fly volumes are not attached/detached. The only modification is deleting the volume.
Expand Down Expand Up @@ -115,10 +126,16 @@ async function reconcileNewMachine(state: V1Machine[], machine: GetDesiredStateR
files: flyOptions.files,
}

if (machine.kind === GetDesiredStateResponse_Kind.BUILDKIT_GPU) {
const flyMachine = await launchBuildkitGPUMachine(req)
if (!flyMachine) throw new Error(`Unable to launch gpu machine ${machine.id}`)
console.log(`Launched new gpu machine ${machine.id} ${flyMachine.id}`)
return
}

try {
const flyMachine = await launchBuildkitMachine(req)
if (!flyMachine) throw new Error(`Unable to launch machine ${machine.id}`)

console.log(`Launched new machine ${machine.id} ${flyMachine.id}`)
} catch (err) {
// If we get a capacity error, delete the volume and try again.
Expand Down

0 comments on commit 91a51bb

Please sign in to comment.