Using the cluster_main Decorator

This is an example of how to use the cluster_main() decorator.

import os
import time

import numpy as np

from cluster_utils import cluster_main, exit_for_resume


def fn_to_optimize(*, u, v, w, x, y, sharp_penalty, tuple_input=None):
    """
    A dummy objective function for testing hyperparameter optimization.

    :param u: real variable
    :param v: integer variable living on logscale
    :param w: integer variable
    :param x: real variable
    :param y: real variable living on log-scale
    :param sharp_penalty: discrete variable; adds a penalty of 1 when x > 3.20
    :param tuple_input: a tuple (we only use its length here)
    :return: result of some random computation
    :raises ValueError: randomly for ~10% of calls, to simulate dying jobs
    """
    tuple_len = len(tuple_input or ())
    # Epsilon goes OUTSIDE abs() so the argument of log() is strictly
    # positive for every input (abs(y + 1e-7) is 0 for y == -1e-7,
    # which would produce log(0) = -inf).
    y_log = np.log(np.abs(y) + 1e-7)
    v_log = np.log(np.abs(v) + 1e-7)
    assert isinstance(w, int), "w has to be integer"
    assert isinstance(v, int), "v has to be integer"

    result = (
        (x - 3.14) ** 2
        + (y_log - 2.78) ** 2
        + (u * v_log * w + 1) ** 2
        + (u + v_log + w - 5 + tuple_len) ** 2
    )
    if sharp_penalty and x > 3.20:
        result += 1

    # Simulate unreliable cluster jobs dying at random.
    if np.random.rand() < 0.1:
        raise ValueError("10 percent of all jobs die here on purpose")

    return result


@cluster_main
def main(working_dir, id, **kwargs):  # noqa A002
    """Job entry point; all parameters from the settings file arrive as kwargs.

    Everything in grid_search.json (fixed parameters and the ones
    searched over) is passed in here.  We could equally have written
    ``main(working_dir, id, fn_args, test_resume, ...)`` explicitly.
    """
    fn_args = kwargs["fn_args"]
    test_resume = kwargs["test_resume"]

    # Pretend the job does some real work for a few seconds.
    time.sleep(np.random.randint(0, 10))
    result_file = os.path.join(working_dir, "result.npy")

    # Small simulation of checkpointing and resuming.
    if not os.path.isfile(result_file):
        # First run: compute the result, checkpoint it, and (optionally)
        # exit to simulate a preempted job that resumes later.
        clean_result = fn_to_optimize(**fn_args)
        print(f"save result to {result_file}")
        np.save(result_file, clean_result)
        if test_resume:
            exit_for_resume()
    else:
        # Resumed run: pick up the previously checkpointed result.
        clean_result = np.load(result_file)

    return {
        "result": clean_result + 0.5 * np.random.normal(),
        "noiseless_result": clean_result,
    }


if __name__ == "__main__":
    # No explicit arguments: the cluster_main decorator supplies them
    # (presumably parsed from the settings passed on the command line by
    # cluster_utils — see the project documentation to confirm).
    main()

The corresponding cluster_utils config file:

{
  "__import__": "examples/basic/grid_search.json",
  "optimization_procedure_name": "decorator_test",
  "results_dir": "/is/cluster/work/{$USER}/tmp",
  "script_relative_path": "examples/basic/main_with_decorator.py"
}

Note

This example is included in cluster_utils/examples/basic and can be run directly from there.