Adding multiNode, and putting the scripts into a separate directory
alonkukl committed Sep 15, 2024
1 parent 1e52186 commit 04849ea
Showing 3 changed files with 113 additions and 0 deletions.
52 changes: 52 additions & 0 deletions examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh
@@ -0,0 +1,52 @@
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t <time_to_run>
#SBATCH -n <number_of_nodes>
#SBATCH --gpus=<total number of GPUs, not nodes>
#SBATCH --output=<full_path_to_log_file>

scratch_dir=<temp_dir>
# Make sure the shared log directory used below (and by run_workers_and_managers.sh) exists.
mkdir -p "$scratch_dir/bqskit_logs"

date
uname -a

### load any modules needed and activate the conda environment
module load <module1>
module load <module2>
conda activate <conda-env-name>


echo "starting BQSKit managers on all nodes"
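# srun launches run_workers_and_managers.sh on every allocated node; each instance starts a
# bqskit-manager and appends a line to $managers_started_file once its manager is ready.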
srun run_workers_and_managers.sh <number_of_gpus_per_node> <number_of_workers_per_gpu> &
managers_pid=$!

managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started
n=<number_of_nodes>


# Wait until the managers on all nodes have started (each appends one line to the started file)
while [[ ! -f "$managers_started_file" ]]
do
sleep 0.5
done

while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do
sleep 1
done

echo "starting BQSKit server on main node"
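# scontrol expands $SLURM_JOB_NODELIST into one hostname per line; tr joins them into the
# space-separated list of manager hosts handed to bqskit-server.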
bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log &
server_pid=$!
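
# Writing the file below tells the other nodes that the server is up;
# run_workers_and_managers.sh polls for it before starting its GPU workers.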

uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started

echo "Running your Python command"

python <Your command>

date

echo "Killing the server"
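# kill -2 sends SIGINT so the server shuts down cleanly instead of being killed outright.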
kill -2 $server_pid
61 changes: 61 additions & 0 deletions examples/bqskit_env_scripts/run_workers_and_managers.sh
@@ -0,0 +1,61 @@
#!/bin/bash
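# Launched on every node by the srun call in init_multi_node_multi_gpu_slurm_run.sh.
# Arguments: <number_of_gpus_per_node> <number_of_workers_per_gpu>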

node_id=$(uname -n)
amount_of_gpus=$1
amount_of_workers_per_gpu=$2
total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu))

scratch_dir=<temp_dir>
manager_log_file="$scratch_dir/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log"
server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started"
managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started"

touch $managers_started_file
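
# The manager logs "Started outgoing thread." once it is up; appending this node's name to the
# shared started file lets the batch script count how many managers are ready.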

wait_for_outgoing_thread_in_manager_log() {
while ! grep -q "Started outgoing thread." $manager_log_file; do
sleep 1
done
uname -a >> $managers_started_file
}
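
# NVIDIA MPS (Multi-Process Service) lets the several worker processes assigned to each GPU
# share that GPU concurrently.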

start_mps_servers() {
echo "Starting MPS servers on node $node_id with CUDA $CUDA_VISIBLE_DEVICES"
nvidia-cuda-mps-control -d
}
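
# Give the batch script up to ~10 seconds to report that the BQSKit server has started.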

wait_for_bqskit_server() {
i=0
while [[ ! -f $server_started_file && $i -lt 10 ]]; do
sleep 1
i=$((i+1))
done
}
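
# Start one bqskit-worker process per GPU, pinned with CUDA_VISIBLE_DEVICES; disabling XLA
# preallocation keeps a single process from grabbing all of a GPU's memory up front.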

start_workers() {
echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus"
for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do
XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log &
done
wait
}

stop_mps_servers() {
echo "Stopping MPS servers on node $node_id"
echo quit | nvidia-cuda-mps-control
}
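
# With no GPUs, run a plain CPU manager that spawns its own workers (-n).
# With GPUs, run the manager with -x so it waits for externally launched workers, then start MPS,
# wait for the server, and launch the per-GPU workers.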

if [ $amount_of_gpus -eq 0 ]; then
echo "Will run manager on node $node_id with $amount_of_workers_per_gpu workers"
bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file
echo "Manager finished on node $node_id"
else
echo "Will run manager on node $node_id"
bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file &
wait_for_outgoing_thread_in_manager_log
start_mps_servers
wait_for_bqskit_server
start_workers
echo "Manager and workers finished on node $node_id" >> $manager_log_file
stop_mps_servers
fi
File renamed without changes.
