Source code for deepmd.cluster.slurm

"""Module to get resources on SLURM cluster.

References
----------
https://github.com/deepsense-ai/tensorflow_on_slurm ####
"""

import hostlist
import os

from deepmd.cluster import local
from typing import List, Tuple, Optional

# Public API: the only name exported by a star-import of this module.
__all__ = ["get_resource"]


def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
    """Get SLURM resources: nodename, nodelist, and gpus.

    The node list is expanded from the ``SLURM_JOB_NODELIST`` environment
    variable, the current node name is read from ``SLURMD_NODENAME``, and the
    expected node count from ``SLURM_JOB_NUM_NODES`` is checked against the
    length of the expanded node list.

    Returns
    -------
    Tuple[str, List[str], Optional[List[int]]]
        nodename, nodelist, and gpus (gpus is whatever ``local.get_gpus()``
        returns)

    Raises
    ------
    KeyError
        if ``SLURM_JOB_NODELIST`` or ``SLURMD_NODENAME`` is not set
    RuntimeError
        if number of nodes could not be retrieved
    ValueError
        if ``SLURM_JOB_NUM_NODES`` cannot be parsed as an integer
    ValueError
        list of nodes is not of the same length as number of nodes
    ValueError
        if current nodename is not found in node list
    """
    nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
    nodename = os.environ["SLURMD_NODENAME"]

    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
    if not num_nodes_env:
        # Covers both an unset variable (None) and an empty string.
        raise RuntimeError("Could not get SLURM number of nodes")
    num_nodes = int(num_nodes_env)

    # Sanity check: the expanded host list must agree with the node count
    # reported in the environment.
    if len(nodelist) != num_nodes:
        raise ValueError(
            f"Number of slurm nodes {len(nodelist)} not equal to {num_nodes}"
        )

    if nodename not in nodelist:
        raise ValueError(
            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
        )

    gpus = local.get_gpus()
    return nodename, nodelist, gpus