Skip to content

Commit 8e2862c

Browse files
feat: Add Lustre support to the Vertex Training Custom Job API (#6952)
* feat: Add Lustre support to the Vertex Training Custom Job API

  docs: A comment for field `timeout` in message `.google.cloud.aiplatform.v1beta1.Scheduling` is changed
  docs: A comment for field `restart_job_on_worker_restart` in message `.google.cloud.aiplatform.v1beta1.Scheduling` is changed

  PiperOrigin-RevId: 845354898
  Source-Link: googleapis/googleapis@a8e146a
  Source-Link: googleapis/googleapis-gen@06bf339
  Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLWFpcGxhdGZvcm0vLk93bEJvdC55YW1sIiwiaCI6IjA2YmYzMzkzYjhhYjVkYjVlNjljNWNkODA0ODZiM2Q2MGI0YjNiNjkifQ==

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 7af18d7 commit 8e2862c

File tree

5 files changed

+504
-3
lines changed

5 files changed

+504
-3
lines changed

packages/google-cloud-aiplatform/protos/google/cloud/aiplatform/v1beta1/custom_job.proto

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -312,6 +312,10 @@ message WorkerPoolSpec {
312312
// Optional. List of NFS mount specs.
313313
repeated NfsMount nfs_mounts = 4 [(google.api.field_behavior) = OPTIONAL];
314314

315+
// Optional. List of Lustre mounts.
316+
repeated LustreMount lustre_mounts = 9
317+
[(google.api.field_behavior) = OPTIONAL];
318+
315319
// Disk spec.
316320
DiskSpec disk_spec = 5;
317321
}
@@ -387,10 +391,10 @@ message Scheduling {
387391
FLEX_START = 6;
388392
}
389393

390-
// The maximum job running time. The default is 7 days.
394+
// Optional. The maximum job running time. The default is 7 days.
391395
google.protobuf.Duration timeout = 1;
392396

393-
// Restarts the entire CustomJob if a worker gets restarted.
397+
// Optional. Restarts the entire CustomJob if a worker gets restarted.
394398
// This feature can be used by distributed training jobs that are not
395399
// resilient to workers leaving and joining a job.
396400
bool restart_job_on_worker_restart = 3;

packages/google-cloud-aiplatform/protos/google/cloud/aiplatform/v1beta1/machine_resources.proto

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,22 @@ message NfsMount {
355355
string mount_point = 3 [(google.api.field_behavior) = REQUIRED];
356356
}
357357

358+
// Represents a mount configuration for a Lustre file system.
359+
message LustreMount {
360+
// Required. IP address of the Lustre instance.
361+
string instance_ip = 1 [(google.api.field_behavior) = REQUIRED];
362+
363+
// Required. The unique identifier of the Lustre volume.
364+
string volume_handle = 2 [(google.api.field_behavior) = REQUIRED];
365+
366+
// Required. The name of the Lustre filesystem.
367+
string filesystem = 3 [(google.api.field_behavior) = REQUIRED];
368+
369+
// Required. Destination mount path. The Lustre file system will be mounted
370+
// for the user under /mnt/lustre/<mount_point>.
371+
string mount_point = 4 [(google.api.field_behavior) = REQUIRED];
372+
}
373+
358374
// The metric specification that defines the target resource utilization
359375
// (CPU utilization, accelerator's duty cycle, and so on) for calculating the
360376
// desired replica count.

0 commit comments

Comments
 (0)