Skip to main content
Version: v2509

integration.kubernetes.pytorchjob

PyTorchJob wrapper for convenient job definition manipulation.

PyTorchJob Objects

class PyTorchJob()

A convenient interface for modifying PyTorchJob definitions.

This class wraps the standard Kubernetes PyTorchJob dictionary format and provides methods for common modifications during hyperparameter tuning.

Supports zenith-tune optimization configuration via annotations:

Note: This is not the actual Kubernetes PyTorchJob CRD object, but a user-friendly wrapper for job definition manipulation.

metadata:
  annotations:
    zenith-tune/optimization-config: |
      variables:
        - name: "learning_rate"
          type: "float"
          range: [0.001, 0.1]
          log: true
          target_env: "LEARNING_RATE"
        - name: "batch_size"
          type: "int"
          range: [16, 128]
          step: 16
          target_env: "BATCH_SIZE"
        - name: "optimizer"
          type: "categorical"
          choices: ["adam", "sgd"]
          target_env: "OPTIMIZER"
      objective:
        name: "loss"
        regex: "Loss: ([0-9]+\.?[0-9]*)"
        direction: "minimize"
      n_trials: 100

__init__

def __init__(job_dict_or_job: Union[Dict[str, Any], "PyTorchJob"])

Initialize with a PyTorchJob dictionary or another PyTorchJob.

Arguments:

  • job_dict_or_job - Dictionary representation of a PyTorchJob or another PyTorchJob instance

Raises:

  • ValueError - If job_dict has invalid PyTorchJob structure

set_env

def set_env(key: str,
            value: str,
            replica_type: str = "Worker",
            container_index: int = 0) -> "PyTorchJob"

Set environment variable for specified replica type.

Arguments:

  • key - Environment variable name
  • value - Environment variable value
  • replica_type - Replica type (Worker, Master, etc.)
  • container_index - Container index (default: 0)

Returns:

Self for method chaining

set_name

def set_name(name: str) -> "PyTorchJob"

Set job name.

Arguments:

  • name - Job name

Returns:

Self for method chaining

get_name

def get_name() -> Optional[str]

Get job name.

Returns:

Job name or None if not set

set_command

def set_command(command: list,
                replica_type: str = "Worker",
                container_index: int = 0) -> "PyTorchJob"

Set command for specified replica type.

Arguments:

  • command - Command list (e.g., ["python", "train.py"])
  • replica_type - Replica type (Worker, Master, etc.)
  • container_index - Container index (default: 0)

Returns:

Self for method chaining

get_command

def get_command(replica_type: str = "Worker",
                container_index: int = 0) -> Optional[list]

Get command for specified replica type.

Arguments:

  • replica_type - Replica type (Worker, Master, etc.)
  • container_index - Container index (default: 0)

Returns:

Command list or None if not set

set_worker_replicas

def set_worker_replicas(replicas: int) -> "PyTorchJob"

Set number of worker replicas.

Arguments:

  • replicas - Number of worker replicas

Returns:

Self for method chaining

get_env

def get_env(key: str,
            replica_type: str = "Worker",
            container_index: int = 0) -> Optional[str]

Get environment variable value.

Arguments:

  • key - Environment variable name
  • replica_type - Replica type (Worker, Master, etc.)
  • container_index - Container index (default: 0)

Returns:

Environment variable value or None if not found

get_env_list

def get_env_list(replica_type: str = "Worker",
                 container_index: int = 0) -> Dict[str, str]

Get all environment variables as a dictionary.

Arguments:

  • replica_type - Replica type (Worker, Master, etc.)
  • container_index - Container index (default: 0)

Returns:

Dictionary of environment variables

to_dict

def to_dict() -> Dict[str, Any]

Convert back to dictionary format.

Returns:

Dictionary representation of the PyTorchJob

has_tuning_config

def has_tuning_config() -> bool

Check if job has tuning optimization config annotation.

Returns:

True if zenith-tune/optimization-config annotation exists

get_tuning_config

def get_tuning_config() -> Optional[Dict[str, Any]]

Get parsed tuning optimization config from annotations.

Returns:

Parsed config dictionary or None if not found/invalid

get_optimization_variables

def get_optimization_variables() -> List[Dict[str, Any]]

Get optimization variables from tuning config.

Returns:

List of variable configurations, empty if not found

get_objective_config

def get_objective_config() -> Optional[Dict[str, Any]]

Get objective configuration from tuning config.

Returns:

Objective config dictionary or None if not found

get_objective_direction

def get_objective_direction() -> str

Get objective direction (minimize/maximize) from tuning config.

Returns:

"minimize" or "maximize", defaults to "minimize"

should_maximize

def should_maximize() -> bool

Check if objective should be maximized.

Returns:

True if should maximize, False if should minimize

get_n_trials

def get_n_trials() -> int | None

Get number of trials from tuning config.

Returns:

Number of trials or None if not specified

get_status

def get_status() -> str

Get the status of this PyTorchJob.

Returns:

Job status: "Succeeded", "Failed", "InProgress", or "Unknown"

validate_tuning_config

def validate_tuning_config() -> list[str]

Validate tuning configuration format.

Returns:

List of validation error messages, empty if valid

is_tuning_config_valid

def is_tuning_config_valid() -> bool

Check if tuning configuration is valid.

Returns:

True if configuration is valid

__getitem__

def __getitem__(key)

Support dict-like access: job['spec']

__setitem__

def __setitem__(key, value)

Support dict-like assignment: job['spec'] = value

__delitem__

def __delitem__(key)

Support dict-like deletion: del job['status']

__contains__

def __contains__(key)

Support 'in' operator: 'spec' in job

__len__

def __len__()

Support len() function: len(job)

__iter__

def __iter__()

Support iteration: for key in job

keys

def keys()

Support dict.keys(): job.keys()

values

def values()

Support dict.values(): job.values()

items

def items()

Support dict.items(): job.items()

get

def get(key, default=None)

Support dict.get(): job.get('spec')

update

def update(other)

Support dict.update(): job.update(other_dict)

__repr__

def __repr__()

String representation for debugging.