Source code for libmuscle.manager.qcgpj_instantiator

import asyncio
import logging
import multiprocessing as mp
import os
from pathlib import Path
import queue
import sys
import traceback
from typing import Dict, List, Tuple

from qcg.pilotjob.allocation import (
        Allocation as qcg_Allocation, NodeAllocation as qcg_NodeAllocation)
from qcg.pilotjob.config import Config as qcg_Config
from qcg.pilotjob.errors import InternalError as qcg_InternalError
from qcg.pilotjob.executor import Executor as qcg_Executor
from qcg.pilotjob.joblist import (
        Job as qcg_Job, JobExecution as qcg_JobExecution,
        JobResources as qcg_JobResources)
from qcg.pilotjob.manager import (
        SchedulingIteration as qcg_SchedulingIteration,
        SchedulingJob as qcg_SchedulingJob)
from qcg.pilotjob.parseres import get_resources as qcg_get_resources
from qcg.pilotjob.resources import (
        Node as qcg_Node, ResourcesType as qcg_ResourcesType)

from ymmsl import ExecutionModel, MPICoresResReq, Reference, ThreadedResReq

from libmuscle.manager.instantiator import (
        CancelAllRequest, CrashedResult, InstantiationRequest, Process,
        ProcessStatus, QueueingLogHandler, ShutdownRequest)
from libmuscle.planner.planner import Resources


_logger = logging.getLogger(__name__)


[docs]class StateTracker: """Tracks processes and their state. This keeps a list of running processes and their state. It receives callbacks from QCG-PJ when the state of a job changes, and updates the list accordingly. Attributes: processes: Dict mapping instance names to Process objects. """ def __init__(self) -> None: """Create a StateTracker.""" self.processes: Dict[Reference, Process] = {} # These are for communicating with QCG-PJ self.queued_to_execute = 0 self.stop_processing = False self.zmq_address = '' # QCG-PJ callbacks
[docs] def job_executing(self, job_iteration: qcg_SchedulingIteration) -> None: """Called by Executor when a job has started to run. Args: job_iteration: The job iteration that has started """ name = job_iteration._scheduling_job.job.name self.processes[name].status = ProcessStatus.RUNNING _logger.debug(f'Job {name} running')
[docs] def job_finished( self, job_iteration: qcg_SchedulingIteration, allocation: qcg_Allocation, exit_code: int, error_msg: str, canceled: bool) -> None: """Called by executor when a job has finished. Args: job_iteration: The job iteration that has finished allocation: The allocation it ran on exit_code: Its exit code error_msg: Its error message canceled: Whether it was canceled """ name = job_iteration._scheduling_job.job.name process = self.processes[name] if canceled: process.status = ProcessStatus.CANCELED elif exit_code == 0: process.status = ProcessStatus.SUCCESS else: process.status = ProcessStatus.ERROR process.exit_code = exit_code process.error_msg = error_msg
[docs]class QCGPJInstantiator(mp.Process): """Background process for interacting with the QCG-PJ executor.""" def __init__( self, resources: mp.Queue, requests: mp.Queue, results: mp.Queue, log_records: mp.Queue, run_dir: Path) -> None: """Create a QCGPJProcessManager. Args: resources: Queue for returning the available resources requests: Queue to take requests from results: Queue to communicate finished processes over log_messages: Queue to push log messages to """ super().__init__(name='QCGPJProcessManager') self._resources_out = resources self._requests_in = requests self._results_out = results self._log_records_out = log_records self._run_dir = run_dir
[docs] def run(self) -> None: """Entry point for the process.""" # Put QCG-PJ output in run dir # The configuration setting below is ignored by the agents # due to a bug in QCG-PJ qcgpj_dir = self._run_dir / 'qcgpj' qcgpj_dir.mkdir(exist_ok=True) os.chdir(qcgpj_dir) self._reconfigure_logging() # Executor needs to be instantiated before we go async qcg_config: Dict[str, str] = {qcg_Config.AUX_DIR: str(qcgpj_dir)} self._qcg_resources = qcg_get_resources(qcg_config) self._state_tracker = StateTracker() self._executor = qcg_Executor( self._state_tracker, qcg_config, self._qcg_resources) self._send_resources() try: loop = asyncio.get_event_loop() loop.run_until_complete(self._main()) except: # noqa for line in traceback.format_exception(*sys.exc_info()): _logger.error(line) self._results_out.put(CrashedResult())
async def _main(self) -> None: """Main function for the background process. This sets up QCG-PJ and then accepts requests for instantiating jobs, stopping them, or shutting down. Results of finished jobs are returned via the results queue. """ qcg_iters: Dict[Reference, qcg_SchedulingIteration] = {} await asyncio.sleep(0.01) # allow requests_in queue to be populated shutting_down = False done = False while not done: while not shutting_down: try: request = self._requests_in.get_nowait() if isinstance(request, ShutdownRequest): _logger.debug('Got ShutdownRequest') self._state_tracker.stop_processing = True shutting_down = True elif isinstance(request, CancelAllRequest): _logger.debug('Got CancelAllRequest') await self._cancel_all(qcg_iters) _logger.debug('Done CancelAllRequest') elif isinstance(request, InstantiationRequest): if not shutting_down: qcg_alloc, qcg_iter = self._create_job( request, self._qcg_resources.rtype) qcg_iters[request.instance] = qcg_iter self._state_tracker.processes[request.instance] = ( Process( request.instance, request.resources)) self._state_tracker.queued_to_execute += 1 await self._executor.execute(qcg_alloc, qcg_iter) except queue.Empty: break for name, process in list(self._state_tracker.processes.items()): if process.status.is_finished(): _logger.debug(f'Reporting {name} done') self._results_out.put(process) del self._state_tracker.processes[name] if shutting_down: _logger.debug(f'Done: {self._state_tracker.processes}') done = len(self._state_tracker.processes) == 0 if not done: await asyncio.sleep(0.1) _logger.debug('Stopping executor') await self._executor.stop() def _reconfigure_logging(self) -> None: """Reconfigure logging to send to log_records_out.""" root_logger = logging.getLogger() for h in list(root_logger.handlers): root_logger.removeHandler(h) handler = QueueingLogHandler(self._log_records_out) root_logger.addHandler(handler) def _send_resources(self) -> None: """Converts and sends QCG available resources.""" resources = Resources() for node in self._qcg_resources.nodes: resources.cores[node.name] = set(map(int, node.free_ids)) self._resources_out.put(resources) async def _cancel_all( self, qcg_iters: Dict[Reference, qcg_SchedulingIteration]) -> None: """Cancels all running jobs.""" # Repeat cancel until they're gone to work around QCG-PJ # race condition. while any([ not p.status.is_finished() for p in self._state_tracker.processes.values()]): for instance, process in self._state_tracker.processes.items(): if process.status.is_finished(): continue qcg_iter = qcg_iters[instance] try: await self._executor.cancel_iteration( qcg_iter.job, qcg_iter.iteration) _logger.debug(f'Canceled {instance}') except qcg_InternalError: _logger.debug(f'Canceled {instance} not found') raise except AttributeError: # Workaround for QCG-PJ bug _logger.debug(f'Canceled {instance} not found') raise await asyncio.sleep(0.1) def _create_job( self, request: InstantiationRequest, qcg_resources_type: qcg_ResourcesType ) -> Tuple[qcg_Allocation, qcg_SchedulingIteration]: """Creates a QCG allocation and job for a request.""" total_cores = sum(map(len, request.resources.cores.values())) env = self._create_env(request.instance, request.implementation.env) if request.implementation.script: execution = self._qcg_job_execution_with_script(request, env) else: execution = self._qcg_job_execution_normal( request, env, qcg_resources_type) resources = qcg_JobResources(numCores=total_cores) qcg_job = qcg_Job( str(request.instance), execution=execution, resources=resources) qcg_allocation = qcg_Allocation() for node_name, cores in request.resources.cores.items(): qcg_cores = [str(i) for i in cores] qcg_allocation.add_node( qcg_NodeAllocation(qcg_Node(node_name), qcg_cores, {})) sjob = qcg_SchedulingJob(self._state_tracker, qcg_job) qcg_iteration = qcg_SchedulingIteration(sjob, None, None, resources, []) return qcg_allocation, qcg_iteration def _create_env( self, instance: Reference, overlay: Dict[str, str] ) -> Dict[str, str]: """Updates the environment with the implementation's env. This updates env in-place. Keys from overlay that start with + will have the corresponding value appended to the matching (by key, without the +) value in env, otherwise the value in env gets overwritten. """ env = os.environ.copy() env['MUSCLE_INSTANCE'] = str(instance) for key, value in overlay.items(): if key.startswith('+'): if key[1:] in env: env[key[1:]] += value else: env[key[1:]] = value else: env[key] = value return env def _qcg_job_execution_with_script( self, request: InstantiationRequest, env: Dict[str, str] ) -> qcg_JobExecution: """Create a JobExecution with a run script.""" impl = request.implementation if impl.script is None: raise RuntimeError() script_file = request.instance_dir / 'run_script.sh' with script_file.open('w') as f: f.write(impl.script) script_file.chmod(0o700) if isinstance(request.res_req, ThreadedResReq): env['MUSCLE_THREADS'] = str(request.res_req.threads) elif isinstance(request.res_req, MPICoresResReq): # OpenMPI support rank_file = request.instance_dir / 'rankfile' with rank_file.open('w') as f: i = 0 for node, cores in request.resources.cores.items(): for c in sorted(cores): f.write(f'rank {i}={node} slot={c}\n') i += 1 env['MUSCLE_OPENMPI_RANK_FILE'] = str(rank_file) # IntelMPI support mpi_res_args = list() for node, cores in request.resources.cores.items(): mpi_res_args.extend(['-host', node, '-n', str(len(cores))]) env['MUSCLE_INTELMPI_RESOURCES'] = ' '.join(mpi_res_args) # General environment env['MUSCLE_MPI_PROCESSES'] = str( request.res_req.mpi_processes) env['MUSCLE_THREADS_PER_MPI_PROCESS'] = str( request.res_req.threads_per_mpi_process) return qcg_JobExecution( exec=str(script_file), env=env, stdout=str(request.stdout_path), stderr=str(request.stderr_path), wd=str(request.work_dir), model='default') def _qcg_job_execution_normal( self, request: InstantiationRequest, env: Dict[str, str], qcg_resources_type: qcg_ResourcesType) -> qcg_JobExecution: """Create a JobExecution for a normal description.""" impl = request.implementation total_cores = sum(map(len, request.resources.cores.values())) if impl.execution_model == ExecutionModel.DIRECT: env['OMP_NUM_THREADS'] = str(total_cores) else: env['OMP_NUM_THREADS'] = '1' model_map = { ExecutionModel.DIRECT: 'threads', ExecutionModel.OPENMPI: 'openmpi', ExecutionModel.INTELMPI: 'intelmpi', ExecutionModel.SRUNMPI: 'srunmpi'} qcg_execution_model = model_map[impl.execution_model] executable = str(impl.executable) args = impl.args if impl.args is not None else [] if qcg_resources_type == qcg_ResourcesType.LOCAL: if impl.execution_model == ExecutionModel.DIRECT: if not impl.virtual_env and not impl.modules: # QCG-PJ uses a bash call like this in all but this # case, so we add it here to be consistent. cmd = ' '.join([executable] + args) executable = 'bash' args = ['-l', '-c', cmd] elif impl.execution_model == ExecutionModel.OPENMPI: executable, args = self._with_local_open_mpi( executable, args, total_cores) elif impl.execution_model == ExecutionModel.INTELMPI: executable, args = self._with_local_intel_mpi( executable, args, total_cores) elif impl.execution_model == ExecutionModel.SRUNMPI: raise RuntimeError( f'Cannot instantiate implementation {impl.name} with' ' execution model "srunmpi", because there is no' ' Slurm/srun here.') _logger.debug(f'Starting {executable} with {args}') return qcg_JobExecution( exec=executable, args=args, env=env, stdout=str(request.stdout_path), stderr=str(request.stderr_path), modules=impl.modules, venv=str(impl.virtual_env) if impl.virtual_env else None, wd=str(request.work_dir), model=qcg_execution_model, model_opts={'srun_opts': ['--overlap']}) def _with_local_open_mpi( self, executable: str, args: List[str], num_processes: int ) -> Tuple[str, List[str]]: """Create OpenMPI mpirun call.""" new_args = ['-n', str(num_processes), executable] new_args.extend(args) return 'mpirun', new_args def _with_local_intel_mpi( self, executable: str, args: List[str], num_processes: int ) -> Tuple[str, List[str]]: """Create IntelMPI mpirun call.""" new_args = ['-n', str(num_processes), executable] new_args.extend(args) return 'mpirun', new_args