generated from VectorInstitute/aieng-template
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathlaunch_server.sh
executable file
·145 lines (125 loc) · 4.6 KB
/
launch_server.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/bin/bash
# ================================= Read Named Args ======================================
#######################################
# Parse "--option value" pairs into the lowercase variables the rest of this
# script reads (e.g. --model-family sets model_family, --time sets walltime).
# Arguments: the script's command-line arguments.
# Outputs:   one error line on stdout for an unrecognised option, or for an
#            option that is not followed by a value.
# Returns:   0 when everything was parsed; on error the script exits with 1.
#######################################
parse_named_args() {
    local target
    while [[ "$#" -gt 0 ]]; do
        # Map the option onto the name of the variable that stores its value.
        case "$1" in
            --model-family) target=model_family ;;
            --model-variant) target=model_variant ;;
            --model-type) target=model_type ;;
            --partition) target=partition ;;
            --qos) target=qos ;;
            --time) target=walltime ;;
            --num-nodes) target=num_nodes ;;
            --num-gpus) target=num_gpus ;;
            --max-model-len) target=max_model_len ;;
            --max-num-seqs) target=max_num_seqs ;;
            --vocab-size) target=vocab_size ;;
            --data-type) target=data_type ;;
            --venv) target=venv ;;
            --log-dir) target=log_dir ;;
            --model-weights-parent-dir) target=model_weights_parent_dir ;;
            --pipeline-parallelism) target=pipeline_parallelism ;;
            --enforce-eager) target=enforce_eager ;;
            *) echo "Unknown parameter passed: $1"; exit 1 ;;
        esac
        # Every option takes exactly one value. Without this check an option
        # given last with nothing after it would be stored as an empty string.
        if [[ "$#" -lt 2 ]]; then
            echo "Error: Missing value for $1 argument."
            exit 1
        fi
        # printf -v assigns to the variable whose name is held in $target; the
        # target itself is not local, so the value is visible script-wide.
        printf -v "$target" '%s' "$2"
        shift 2
    done
}

parse_named_args "$@"
# Names of the variables that must hold a non-empty value before the script
# goes any further.
required_vars=(model_family model_variant model_type partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
# "${required_vars[@]}" (with braces) expands to every element; without the
# braces only the first element followed by a literal "[@]" would be visited.
for var in "${required_vars[@]}"; do
    # ${!var} is indirect expansion: the value of the variable named by $var.
    if [ -z "${!var}" ]; then
        # Report the option the way it is typed on the command line: dashes
        # instead of underscores, and --time for the walltime variable.
        flag="${var//_/-}"
        if [ "$flag" = "walltime" ]; then
            flag="time"
        fi
        echo "Error: Missing required --$flag argument."
        exit 1
    fi
done
# Publish the parsed command-line values as upper-case environment variables so
# that commands started later by this script inherit them. Most names are the
# upper-cased option name; the exceptions are partition -> JOB_PARTITION,
# max_model_len -> VLLM_MAX_MODEL_LEN, vocab_size -> VLLM_MAX_LOGPROBS,
# data_type -> VLLM_DATA_TYPE and venv -> VENV_BASE.
export \
    MODEL_FAMILY="${model_family}" \
    MODEL_VARIANT="${model_variant}" \
    MODEL_TYPE="${model_type}" \
    JOB_PARTITION="${partition}" \
    QOS="${qos}" \
    WALLTIME="${walltime}" \
    NUM_NODES="${num_nodes}" \
    NUM_GPUS="${num_gpus}" \
    VLLM_MAX_MODEL_LEN="${max_model_len}" \
    VLLM_MAX_LOGPROBS="${vocab_size}" \
    VLLM_DATA_TYPE="${data_type}" \
    VENV_BASE="${venv}" \
    LOG_DIR="${log_dir}" \
    MODEL_WEIGHTS_PARENT_DIR="${model_weights_parent_dir}"
# Derive the exported VLLM_TASK value from the model type: LLM and VLM map to
# "generate", Reward_Modeling to "reward", Text_Embedding to "embed". Any other
# model type is reported and ends the script with status 1.
case "$model_type" in
    LLM | VLM)
        export VLLM_TASK="generate"
        ;;
    Reward_Modeling)
        export VLLM_TASK="reward"
        ;;
    Text_Embedding)
        export VLLM_TASK="embed"
        ;;
    *)
        echo "Error: Unknown model_type: $model_type"
        exit 1
        ;;
esac
# Optional settings: export the value given on the command line, or fall back
# to a default when the option was omitted or given as an empty string
# (":-" treats unset and empty alike).
export VLLM_MAX_NUM_SEQS="${max_num_seqs:-256}"
export PIPELINE_PARALLELISM="${pipeline_parallelism:-False}"
export ENFORCE_EAGER="${enforce_eager:-False}"
# ================================= Set default environment variables ======================================
# Slurm job configuration
# The job name is "<family>-<variant>".
export JOB_NAME="$MODEL_FAMILY-$MODEL_VARIANT"
# Special case: the combined name "DeepSeek-R1-None" is shortened to the
# family name alone.
if [ "$JOB_NAME" == "DeepSeek-R1-None" ]; then
    export JOB_NAME="$MODEL_FAMILY"
fi
# The literal value "default" selects a per-family directory under $HOME.
if [ "$LOG_DIR" = "default" ]; then
    export LOG_DIR="$HOME/.vec-inf-logs/$MODEL_FAMILY"
fi
# Quoted so a path containing spaces or glob characters is created as one
# directory; "--" keeps a path that starts with "-" from being read as an option.
mkdir -p -- "$LOG_DIR"
# Model and entrypoint configuration. The API server URL (host, port) is set
# automatically based on the SLURM job.
# SRC_DIR is the directory part of the path this script was invoked with.
SRC_DIR="$(dirname "$0")"
MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
# Variables specific to your working environment; the values below are examples
# for the Vector cluster.
VLLM_MODEL_WEIGHTS="${MODEL_WEIGHTS_PARENT_DIR}/${JOB_NAME}"
# Note: this replaces any LD_LIBRARY_PATH inherited from the caller.
LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
export SRC_DIR MODEL_DIR VLLM_MODEL_WEIGHTS LD_LIBRARY_PATH
# ================================ Validate Inputs & Launch Server =================================
# Set data type to fp16 instead of bf16 for non-Ampere GPUs
fp16_partitions=(t4v1 t4v2)

#######################################
# Tell whether a partition name is one of the entries in fp16_partitions.
# The comparison is an exact string match, so an empty name or a fragment of
# an entry (e.g. "t4") is not treated as a member.
# Globals:   fp16_partitions (read)
# Arguments: $1 - partition name to look up
# Returns:   0 if $1 is listed, 1 otherwise
#######################################
is_fp16_partition() {
    local candidate
    for candidate in "${fp16_partitions[@]}"; do
        if [[ "$candidate" == "$1" ]]; then
            return 0
        fi
    done
    return 1
}

# choose from 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
if is_fp16_partition "$JOB_PARTITION"; then
    # Overrides whatever data type was requested on the command line.
    export VLLM_DATA_TYPE="float16"
    echo "Data type set to due to non-Ampere GPUs used: $VLLM_DATA_TYPE"
fi
# Print the resolved launch settings, one "Label: value" line per setting.
# Each line is a single quoted argument, so a value is printed exactly as
# stored: runs of spaces are kept and glob characters are not expanded against
# the current directory.
printf '%s\n' \
    "Job Name: $JOB_NAME" \
    "Partition: $JOB_PARTITION" \
    "Num Nodes: $NUM_NODES" \
    "GPUs per Node: $NUM_GPUS" \
    "QOS: $QOS" \
    "Walltime: $WALLTIME" \
    "Model Type: $MODEL_TYPE" \
    "Task: $VLLM_TASK" \
    "Data Type: $VLLM_DATA_TYPE" \
    "Max Model Length: $VLLM_MAX_MODEL_LEN" \
    "Max Num Seqs: $VLLM_MAX_NUM_SEQS" \
    "Vocabulary Size: $VLLM_MAX_LOGPROBS" \
    "Pipeline Parallelism: $PIPELINE_PARALLELISM" \
    "Enforce Eager: $ENFORCE_EAGER" \
    "Log Directory: $LOG_DIR" \
    "Model Weights Parent Directory: $MODEL_WEIGHTS_PARENT_DIR"
# Pick the Slurm script: "multinode_vllm.slurm" when more than one node is
# requested, plain "vllm.slurm" otherwise.
is_special=""
if [ "$NUM_NODES" -gt 1 ]; then
    is_special="multinode_"
fi
# Every value is quoted so it reaches sbatch as exactly one argument even when
# it contains spaces or glob characters. "%j" in the log file names is an
# sbatch filename pattern (the job ID). sbatch is the last command, so its exit
# status becomes the exit status of this script.
sbatch --job-name "$JOB_NAME" \
    --partition "$JOB_PARTITION" \
    --nodes "$NUM_NODES" \
    --gres "gpu:$NUM_GPUS" \
    --qos "$QOS" \
    --time "$WALLTIME" \
    --output "$LOG_DIR/$JOB_NAME.%j.out" \
    --error "$LOG_DIR/$JOB_NAME.%j.err" \
    "$SRC_DIR/${is_special}vllm.slurm"