Skip to content

Commit

Permalink
add GPU support on Google
Browse files Browse the repository at this point in the history
  • Loading branch information
hpratt committed Mar 14, 2023
1 parent cb1b928 commit f1e1d64
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 6 deletions.
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ plugins {
}

group = "io.krews"
version = "0.12.5"
version = "0.13.0"

repositories {
maven { setUrl("http://dl.bintray.com/kotlin/kotlin-eap") }
Expand Down
15 changes: 14 additions & 1 deletion src/main/kotlin/krews/config/GoogleConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,20 @@ data class GoogleTaskConfig(
// Disk Size. Can be used to override the runtime value.
val diskSize: Capacity? = null,
// Type of disk, HDD vs SSD.
val diskType: GoogleDiskType = GoogleDiskType.HDD
val diskType: GoogleDiskType = GoogleDiskType.HDD,
// GPUs to attach to the VM
val gpus: GoogleGPUConfig? = null,
// Image for the machine boot disk
val bootImage: String? = null
)

/**
 * GPU settings for the VM that runs a task on Google.
 *
 * @property gpuType The type of GPU to attach.
 * @property gpuCount The number of GPUs to attach.
 * @property bootImage Boot image to use; `null` when none is configured.
 */
data class GoogleGPUConfig(
    val gpuType: String,
    val gpuCount: Long,
    val bootImage: String? = null
)

enum class GoogleDiskType(val value: String) {
Expand Down
27 changes: 24 additions & 3 deletions src/main/kotlin/krews/executor/google/GoogleLocalExecutor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ class GoogleLocalExecutor(workflowConfig: WorkflowConfig) : LocallyDirectedExecu
val maxMemory = taskRunContexts.map { it.memory }.maxBy { it?.bytes ?: -1 } ?: taskConfig.google?.mem
virtualMachine.machineType = googleMachineType(taskConfig.google, maxCpus, maxMemory)

// Apply the task-level boot image, if one is configured. Previously this
// setting was declared on GoogleTaskConfig but never copied to the VM.
taskConfig.google?.bootImage?.let { virtualMachine.bootImage = it }

// Attach the requested GPUs to the VM. The safe-call + let binds the config
// once, replacing the repeated `!!` assertions on the same nullable chain.
taskConfig.google?.gpus?.let { gpuConfig ->
    val accelerator = Accelerator()
    accelerator.type = gpuConfig.gpuType
    accelerator.count = gpuConfig.gpuCount
    virtualMachine.accelerators = listOf(accelerator)
    // A boot image set on the GPU config takes precedence over the task-level one.
    gpuConfig.bootImage?.let { virtualMachine.bootImage = it }
}

val serviceAccount = ServiceAccount()
virtualMachine.serviceAccount = serviceAccount
serviceAccount.scopes = listOf(STORAGE_READ_WRITE_SCOPE)
Expand Down Expand Up @@ -121,7 +130,8 @@ class GoogleLocalExecutor(workflowConfig: WorkflowConfig) : LocallyDirectedExecu
taskRunContext.inputsDir,
taskRunContext.outputsDir,
taskRunContext.command,
taskRunContext.env
taskRunContext.env,
taskConfig.google?.gpus != null
)
)

Expand Down Expand Up @@ -223,6 +233,13 @@ class GoogleLocalExecutor(workflowConfig: WorkflowConfig) : LocallyDirectedExecu

}

/**
 * Shell snippet that [createExecuteTaskAction] places in front of the task
 * command when GPUs are requested. It runs `cos-extensions install gpu`, bind
 * mounts /var/lib/nvidia onto itself and remounts it with the `exec` option.
 */
val NVIDIA_DOCKER_COMMANDS: String = """
sudo cos-extensions install gpu;
sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
sudo mount -o remount,exec /var/lib/nvidia
"""

/**
* Create a pipeline action that will execute the task
*/
Expand All @@ -231,7 +248,8 @@ internal fun createExecuteTaskAction(
inputsDir: String,
outputsDir: String,
command: String?,
env: Map<String, String>?
env: Map<String, String>?,
gpus: Boolean
): Action {
val action = Action()
action.imageUri = dockerImage
Expand All @@ -240,7 +258,10 @@ internal fun createExecuteTaskAction(
val actionEnv = mutableMapOf("TMPDIR" to tmpDir)
if (env != null) actionEnv.putAll(env)
action.environment = actionEnv
if (command != null) action.commands = listOf("/bin/sh", "-c", "[ ! -d $tmpDir ] && mkdir $tmpDir;\n $command")
if (command != null) {
if (gpus) action.commands = listOf("/bin/sh", "-c", "$NVIDIA_DOCKER_COMMANDS\n [ ! -d $tmpDir ] && mkdir $tmpDir;\n $command")
else action.commands = listOf("/bin/sh", "-c", "[ ! -d $tmpDir ] && mkdir $tmpDir;\n $command")
}
return action
}

Expand Down
10 changes: 9 additions & 1 deletion src/test/kotlin/krews/GoogleExecutorTests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import org.junit.jupiter.api.*
import java.io.*
import java.util.*


@Disabled
@TestMethodOrder(MethodOrderer.OrderAnnotation::class)
class GoogleExecutorTests {
Expand All @@ -28,6 +27,14 @@ class GoogleExecutorTests {
task.default {
grouping = $grouping
}
task.gpus {
google {
gpus {
gpu-type = "nvidia-tesla-k80"
gpu-count = 1
}
}
}
google {
bucket = "$testBucket"
project-id = "$googleProjectId"
Expand Down Expand Up @@ -71,6 +78,7 @@ class GoogleExecutorTests {
"outputs/base64/test-$i.b64".existsInGS(testBucket, workflowBaseDir)
"outputs/gzip/test-$i.b64.gz".existsInGS(testBucket, workflowBaseDir)
"outputs/gzip/test-$i.b64.fake.none".existsInGS(testBucket, workflowBaseDir)
"outputs/nvidia-smi/test-$i.nvidia-smi-output.txt".existsInGS(testBucket, workflowBaseDir)
}

// Confirm that an html report was generated
Expand Down
12 changes: 12 additions & 0 deletions src/test/kotlin/krews/util/Workflows.kt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,18 @@ fun gsFilesWorkflow() = workflow("gs-files-workflow") {
"""
}

val gpus = task<File, OutputFile>("gpus", inputFiles) {
dockerImage = "nvidia/cuda:12.1.0-devel-ubuntu18.04"
output = OutputFile("nvidia-smi/${input.filenameNoExt()}.nvidia-smi-output.txt")
command =
"""
mkdir -p $(dirname ${output!!.dockerPath})
apt -y update
apt -y install lshw
lshw -C display > ${output!!.dockerPath}
"""
}

task<File, GSGzipOutput>("gzip", base64.map { it.base64 }) {
dockerImage = "alpine:3.8"
val outGz = OutputFile("gzip/${input.filename()}.gz")
Expand Down

0 comments on commit f1e1d64

Please sign in to comment.