integration.kubernetes.pytorchjob
PyTorchJob wrapper for convenient job definition manipulation.
PyTorchJob Objects
class PyTorchJob()
A convenient interface for modifying PyTorchJob definitions.
This class wraps the standard Kubernetes PyTorchJob dictionary format and provides methods for common modifications during hyperparameter tuning.
Supports zenith-tune optimization configuration via annotations:
Note: This is not the actual Kubernetes PyTorchJob CRD object, but a user-friendly wrapper for job definition manipulation.
metadata:
annotations:
zenith-tune/optimization-config: |
variables:
- name: "learning_rate"
type: "float"
range: [0.001, 0.1]
log: true
target_env: "LEARNING_RATE"
- name: "batch_size"
type: "int"
range: [16, 128]
step: 16
target_env: "BATCH_SIZE"
- name: "optimizer"
type: "categorical"
choices: ["adam", "sgd"]
target_env: "OPTIMIZER"
objective:
name: "loss"
regex: "Loss: ([0-9]+\.?[0-9]*)"
direction: "minimize"
n_trials: 100
__init__
def __init__(job_dict_or_job: Union[Dict[str, Any], "PyTorchJob"])
Initialize with a PyTorchJob dictionary or another PyTorchJob.
Arguments:
job_dict_or_job
- Dictionary representation of a PyTorchJob or another PyTorchJob instance
Raises:
ValueError
- If job_dict has invalid PyTorchJob structure
set_env
def set_env(key: str,
value: str,
replica_type: str = "Worker",
container_index: int = 0) -> "PyTorchJob"
Set environment variable for specified replica type.
Arguments:
key
- Environment variable name
value
- Environment variable value
replica_type
- Replica type (Worker, Master, etc.)
container_index
- Container index (default: 0)
Returns:
Self for method chaining
set_name
def set_name(name: str) -> "PyTorchJob"
Set job name.
Arguments:
name
- Job name
Returns:
Self for method chaining
get_name
def get_name() -> Optional[str]
Get job name.
Returns:
Job name or None if not set
set_command
def set_command(command: list,
replica_type: str = "Worker",
container_index: int = 0) -> "PyTorchJob"
Set command for specified replica type.
Arguments:
command
- Command list (e.g., ["python", "train.py"])
replica_type
- Replica type (Worker, Master, etc.)
container_index
- Container index (default: 0)
Returns:
Self for method chaining
get_command
def get_command(replica_type: str = "Worker",
container_index: int = 0) -> Optional[list]
Get command for specified replica type.
Arguments:
replica_type
- Replica type (Worker, Master, etc.)
container_index
- Container index (default: 0)
Returns:
Command list or None if not set
set_worker_replicas
def set_worker_replicas(replicas: int) -> "PyTorchJob"
Set number of worker replicas.
Arguments:
replicas
- Number of worker replicas
Returns:
Self for method chaining
get_env
def get_env(key: str,
replica_type: str = "Worker",
container_index: int = 0) -> Optional[str]
Get environment variable value.
Arguments:
key
- Environment variable name
replica_type
- Replica type (Worker, Master, etc.)
container_index
- Container index (default: 0)
Returns:
Environment variable value or None if not found
get_env_list
def get_env_list(replica_type: str = "Worker",
container_index: int = 0) -> Dict[str, str]
Get all environment variables as a dictionary.
Arguments:
replica_type
- Replica type (Worker, Master, etc.)
container_index
- Container index (default: 0)
Returns:
Dictionary of environment variables
to_dict
def to_dict() -> Dict[str, Any]
Convert back to dictionary format.
Returns:
Dictionary representation of the PyTorchJob
has_tuning_config
def has_tuning_config() -> bool
Check if job has tuning optimization config annotation.
Returns:
True if zenith-tune/optimization-config annotation exists
get_tuning_config
def get_tuning_config() -> Optional[Dict[str, Any]]
Get parsed tuning optimization config from annotations.
Returns:
Parsed config dictionary or None if not found/invalid
get_optimization_variables
def get_optimization_variables() -> List[Dict[str, Any]]
Get optimization variables from tuning config.
Returns:
List of variable configurations, empty if not found
get_objective_config
def get_objective_config() -> Optional[Dict[str, Any]]
Get objective configuration from tuning config.
Returns:
Objective config dictionary or None if not found
get_objective_direction
def get_objective_direction() -> str
Get objective direction (minimize/maximize) from tuning config.
Returns:
"minimize" or "maximize", defaults to "minimize"
should_maximize
def should_maximize() -> bool
Check if objective should be maximized.
Returns:
True if should maximize, False if should minimize
get_n_trials
def get_n_trials() -> int | None
Get number of trials from tuning config.
Returns:
Number of trials or None if not specified
get_status
def get_status() -> str
Get the status of this PyTorchJob.
Returns:
Job status: "Succeeded", "Failed", "InProgress", or "Unknown"
validate_tuning_config
def validate_tuning_config() -> list[str]
Validate tuning configuration format.
Returns:
List of validation error messages, empty if valid
is_tuning_config_valid
def is_tuning_config_valid() -> bool
Check if tuning configuration is valid.
Returns:
True if configuration is valid
__getitem__
def __getitem__(key)
Support dict-like access: job['spec']
__setitem__
def __setitem__(key, value)
Support dict-like assignment: job['spec'] = value
__delitem__
def __delitem__(key)
Support dict-like deletion: del job['status']
__contains__
def __contains__(key)
Support 'in' operator: 'spec' in job
__len__
def __len__()
Support len() function: len(job)
__iter__
def __iter__()
Support iteration: for key in job
keys
def keys()
Support dict.keys(): job.keys()
values
def values()
Support dict.values(): job.values()
items
def items()
Support dict.items(): job.items()
get
def get(key, default=None)
Support dict.get(): job.get('spec')
update
def update(other)
Support dict.update(): job.update(other_dict)
__repr__
def __repr__()
String representation for debugging.