Adding multiNode, and putting the scripts into a separate directory
alonkukl committed Sep 15, 2024
1 parent 1e52186 commit 04849ea
Showing 3 changed files with 113 additions and 0 deletions.
52 changes: 52 additions & 0 deletions examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh
@@ -0,0 +1,52 @@
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t <time_to_run>
#SBATCH -n <number_of_nodes>
#SBATCH --gpus=<total number of GPUs, not nodes>
#SBATCH --output=<full_path_to_log_file>

scratch_dir=<temp_dir>
# Make sure the shared log directory used below (and by run_workers_and_managers.sh) exists.
mkdir -p "$scratch_dir/bqskit_logs"

date
uname -a

### load any modules needed and activate the conda environment
module load <module1>
module load <module2>
conda activate <conda-env-name>


echo "starting BQSKit managers on all nodes"
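# srun launches run_workers_and_managers.sh on every allocated node; each instance starts a
# bqskit-manager and appends a line to $managers_started_file once its manager is ready.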
srun run_workers_and_managers.sh <number_of_gpus_per_node> <number_of_workers_per_gpu> &
managers_pid=$!

managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started
n=<number_of_nodes>


# Wait until the managers on all nodes have started (each appends one line to the started file)
while [[ ! -f "$managers_started_file" ]]
do
sleep 0.5
done

while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do
sleep 1
done

echo "starting BQSKit server on main node"
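# scontrol expands $SLURM_JOB_NODELIST into one hostname per line; tr joins them into the
# space-separated list of manager hosts handed to bqskit-server.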
bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log &
server_pid=$!
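
# Writing the file below tells the other nodes that the server is up;
# run_workers_and_managers.sh polls for it before starting its GPU workers.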

uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started

echo "Running your Python command"

python <Your command>

date

echo "Killing the server"
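# kill -2 sends SIGINT so the server shuts down cleanly instead of being killed outright.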
kill -2 $server_pid
61 changes: 61 additions & 0 deletions examples/bqskit_env_scripts/run_workers_and_managers.sh
@@ -0,0 +1,61 @@
#!/bin/bash
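# Launched on every node by the srun call in init_multi_node_multi_gpu_slurm_run.sh.
# Arguments: <number_of_gpus_per_node> <number_of_workers_per_gpu>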

node_id=$(uname -n)
amount_of_gpus=$1
amount_of_workers_per_gpu=$2
total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu))

scratch_dir=<temp_dir>
manager_log_file="$scratch_dir/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log"
server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started"
managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started"

touch $managers_started_file
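
# The manager logs "Started outgoing thread." once it is up; appending this node's name to the
# shared started file lets the batch script count how many managers are ready.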

wait_for_outgoing_thread_in_manager_log() {
while ! grep -q "Started outgoing thread." $manager_log_file; do
sleep 1
done
uname -a >> $managers_started_file
}
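
# NVIDIA MPS (Multi-Process Service) lets the several worker processes assigned to each GPU
# share that GPU concurrently.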

start_mps_servers() {
echo "Starting MPS servers on node $node_id with CUDA $CUDA_VISIBLE_DEVICES"
nvidia-cuda-mps-control -d
}
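
# Give the batch script up to ~10 seconds to report that the BQSKit server has started.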

wait_for_bqskit_server() {
i=0
while [[ ! -f $server_started_file && $i -lt 10 ]]; do
sleep 1
i=$((i+1))
done
}
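
# Start one bqskit-worker process per GPU, pinned with CUDA_VISIBLE_DEVICES; disabling XLA
# preallocation keeps a single process from grabbing all of a GPU's memory up front.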

start_workers() {
echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus"
for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do
XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log &
done
wait
}

stop_mps_servers() {
echo "Stopping MPS servers on node $node_id"
echo quit | nvidia-cuda-mps-control
}
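
# With no GPUs, run a plain CPU manager that spawns its own workers (-n).
# With GPUs, run the manager with -x so it waits for externally launched workers, then start MPS,
# wait for the server, and launch the per-GPU workers.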

if [ $amount_of_gpus -eq 0 ]; then
echo "Will run manager on node $node_id with $amount_of_workers_per_gpu workers"
bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file
echo "Manager finished on node $node_id"
else
echo "Will run manager on node $node_id"
bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file &
wait_for_outgoing_thread_in_manager_log
start_mps_servers
wait_for_bqskit_server
start_workers
echo "Manager and workers finished on node $node_id" >> $manager_log_file
stop_mps_servers
fi
File renamed without changes.
