Adding multiNode, and putting the scripts into a separate directory
Showing 3 changed files with 113 additions and 0 deletions.
examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh (52 additions, 0 deletions)
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t <time_to_run>
#SBATCH -n <number_of_nodes>
#SBATCH --gpus=<total number of GPUs, not nodes>
#SBATCH --output=<full_path_to_log_file>

scratch_dir=<temp_dir>

date
uname -a

### load any modules needed and activate the conda environment
module load <module1>
module load <module2>
conda activate <conda-env-name>

echo "starting BQSKit managers on all nodes"
srun run_workers_and_managers.sh <number_of_gpus_per_node> <number_of_workers_per_gpu> &
managers_pid=$!

managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started
n=<number_of_nodes>

# Wait until all the managers have started
while [[ ! -f "$managers_started_file" ]]
do
    sleep 0.5
done

while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do
    sleep 1
done

echo "starting BQSKit server on main node"
bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log &
server_pid=$!

uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started

echo "Running the python command"

python <Your command>

date

echo "Killing the server"
kill -2 $server_pid
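
A hypothetical usage sketch (illustrative values only, not part of this commit): after replacing the <...> placeholders, the batch script is submitted with sbatch, and the run can be followed through the server log in the scratch directory.

# Fill in the <...> placeholders first, then submit the job.
sbatch examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh
squeue -u $USER                                      # confirm the job is queued/running
tail -f <temp_dir>/bqskit_logs/server_<job_id>.log   # follow the BQSKit server log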
run_workers_and_managers.sh (61 additions, 0 deletions)
#!/bin/bash

node_id=$(uname -n)
amount_of_gpus=$1
amount_of_workers_per_gpu=$2
total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu))

scratch_dir=<temp_dir>
manager_log_file="$scratch_dir/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log"
server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started"
managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started"

touch $managers_started_file

wait_for_outgoing_thread_in_manager_log() {
    while ! grep -q "Started outgoing thread." $manager_log_file; do
        sleep 1
    done
    uname -a >> $managers_started_file
}

start_mps_servers() {
    echo "Starting MPS servers on node $node_id with CUDA $CUDA_VISIBLE_DEVICES"
    nvidia-cuda-mps-control -d
}

wait_for_bqskit_server() {
    i=0
    while [[ ! -f $server_started_file && $i -lt 10 ]]; do
        sleep 1
        i=$((i+1))
    done
}

start_workers() {
    echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus"
    for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do
        XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log &
    done
    wait
}

stop_mps_servers() {
    echo "Stopping MPS servers on node $node_id"
    echo quit | nvidia-cuda-mps-control
}

if [ $amount_of_gpus -eq 0 ]; then
    echo "Will run manager on node $node_id with n args of $amount_of_workers_per_gpu"
    bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file
    echo "Manager finished on node $node_id"
else
    echo "Will run manager on node $node_id"
    bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file &
    wait_for_outgoing_thread_in_manager_log
    start_mps_servers
    wait_for_bqskit_server
    start_workers
    echo "Manager and workers finished on node $node_id" >> $manager_log_file
    stop_mps_servers
fi
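
For reference, a sketch of how this helper is driven (hypothetical values, not part of the commit): the batch script above launches one copy per task via srun, passing the per-node GPU count and the workers-per-GPU count as positional arguments. Passing 0 GPUs appears to be the CPU-only path, where the second argument becomes the worker count handed directly to the manager.

# Illustrative invocations only; in practice srun runs this once per task/node.
bash run_workers_and_managers.sh 4 8    # 4 GPUs on this node, 8 workers per GPU (32 GPU-bound workers)
bash run_workers_and_managers.sh 0 32   # no GPUs: the manager is started with -n 32 and no external workers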
File renamed without changes.