Use checkpointing with exit_for_resume()
This is an example of how to use checkpointing and restarting of jobs via
exit_for_resume() when training a neural network with PyTorch.
For more information on exit_for_resume(), see
Restart jobs using exit_for_resume().
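The core pattern is small. Below is a minimal, self-contained sketch (no PyTorch, just a counter written to disk) that shows the same structure as the full example further down. The checkpoint file name "counter.json" and the iteration counts in it are illustrative only and not part of the example; like the example, the sketch only works when run as a cluster_utils job.

import json
import os

from cluster_utils import exit_for_resume, finalize_job, initialize_job

params = initialize_job()  # parameters from the config file
os.makedirs(params.working_dir, exist_ok=True)
checkpoint = os.path.join(params.working_dir, "counter.json")

# resume from a previous run if a checkpoint exists, otherwise start fresh
step = 0
if os.path.isfile(checkpoint):
    with open(checkpoint) as f:
        step = json.load(f)["step"]

while step < 200:
    step += 1  # stand-in for one unit of real work
    if step == 100:
        # save everything needed to continue, then ask cluster_utils to
        # resubmit this job; the process exits here and later starts again
        # from the top, finding the checkpoint file
        with open(checkpoint, "w") as f:
            json.dump({"step": step}, f)
        exit_for_resume()

# only reached by the run that completes all the work
finalize_job({"final_step": step}, params)

The full PyTorch example: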
import os
import sys

import torch

from cluster_utils import (
    exit_for_resume,
    finalize_job,
    initialize_job,
)
def save_checkpoint(save_path, model, optim, iteration):
    """Save a dict with all variables necessary for a resume in a file.
    Make sure to also save the optimizer state if it uses an adaptive
    learning rate!"""
    torch.save(
        {
            "model_weights": model.state_dict(),
            "optimizer_weights": optim.state_dict(),
            "iteration": iteration,
        },
        save_path,
    )
def load_checkpoint(load_path, model, optim):
    """Load all previously saved variables. The program starts clean
    after a resume, so we have to check whether a checkpoint file exists in
    the current folder. If not, we assume the program runs for the first
    time."""
    iteration = 0
    if os.path.isfile(load_path):
        checkpoint = torch.load(load_path)
        model.load_state_dict(checkpoint.get("model_weights"))
        optim.load_state_dict(checkpoint.get("optimizer_weights"))
        iteration = checkpoint.get("iteration")
        print(f"Resuming from checkpoint at iteration {iteration}")
    return iteration
if __name__ == "__main__":
    # parameters are loaded from the json config file
    params = initialize_job()

    # a separate folder is created for each run
    os.makedirs(params.working_dir, exist_ok=True)
    checkpoint_path = os.path.join(params.working_dir, "checkpoint.pt")

    # this value is taken from the json config file for illustration
    total_iterations = params.total_iterations

    # initialize toy model and optimizer
    model = torch.nn.Linear(10, 20)
    optim = torch.optim.Adam(model.parameters(), lr=1e-4)
    target = torch.ones(size=(128, 20))

    # if a checkpoint.pt file exists, it is loaded
    iteration = load_checkpoint(checkpoint_path, model, optim)

    # redirect output to a log file to make it easier to follow what happens;
    # the log file is written after the program ends.
    sys.stdout = open(f"{params.working_dir}/log_{iteration}.txt", "w")  # noqa: SIM115
    while True:
        # do some training
        x = torch.normal(0, 1.0, size=(128, 10))
        y = model(x)
        loss = torch.nn.functional.mse_loss(y, target)
        optim.zero_grad()
        loss.backward()
        optim.step()
        print(f"loss {loss} episode {iteration}")
        iteration += 1

        # It is best to replace the iteration constraint with a time constraint:
        # jobs grow in cost based on runtime by (0.1 * running_bid * n_compute_units)
        # every hour!
        if iteration == 100:
            # we first save the necessary data to restart our job
            save_checkpoint(checkpoint_path, model, optim, iteration)
            # then we exit the job by calling a special function;
            # HTCondor internally restarts the job in the same cluster_utils working_dir.
            # You will not see this in the cluster_utils progress bar; check
            # /working_directories/0/log.txt after the job.
            print(f"Exit job at iteration {iteration}")
            exit_for_resume()

        if iteration >= total_iterations:
            break

    metrics = {"loss": loss, "iterations": iteration}
    # save the final metrics; you will only see the resuming in the cluster_run.log file
    finalize_job(metrics, params)

    print(f"Training finished, final loss {loss} at episode {iteration}")
The corresponding cluster_utils config file:
{
    "optimization_procedure_name": "grid_search_checkpointing",
    "results_dir": "/is/rg/al/Projects/tmp/example_results",
    "git_params": {
        "branch": "master",
        "commit": null
    },
    "script_relative_path": "examples/checkpointing/checkpoint_example.py",
    "load_existing_results": false,
    "run_in_working_dir": true,
    "environment_setup": {},
    "cluster_requirements": {
        "request_cpus": 1,
        "request_gpus": 0,
        "cuda_requirement": null,
        "memory_in_mb": 16000,
        "bid": 800,
        "hostname_list": ["g018", "g025", "g026", "g027", "g028", "g029", "g030", "g031", "p001", "p002", "p003", "t001"]
    },
    "restarts": 1,
    "samples": null,
    "remove_jobs_dir": false,
    "fixed_params": {},
    "hyperparam_list": [
        {
            "param": "total_iterations",
            "values": [200]
        }
    ]
}
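With this configuration, cluster_utils performs a grid search over a single point: total_iterations = 200 is exposed to the script as params.total_iterations. Since the script saves a checkpoint and calls exit_for_resume() at iteration 100, the job is executed twice and only the second execution reaches finalize_job(). A small illustration of this mapping (the assert is only for demonstration and not part of the example):

from cluster_utils import initialize_job

params = initialize_job()
# the single grid point defined in "hyperparam_list" above
assert params.total_iterations == 200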
Note
This example is included in cluster_utils/examples/checkpointing and can be run
directly from there.