Skip to content

Commit

Permalink
Merge pull request #226 from stanfordnmbl/server
Browse files Browse the repository at this point in the history
Dev: New deployment for new Server
  • Loading branch information
AlbertoCasasOrtiz authored Feb 13, 2025
2 parents f69fe47 + 116e8d4 commit cc976fa
Show file tree
Hide file tree
Showing 11 changed files with 353 additions and 75 deletions.
42 changes: 34 additions & 8 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
import glob
from datetime import datetime, timedelta
import numpy as np
from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials
from utilsAPI import getAPIURL, getWorkerType, getErrorLogBool, getASInstance, unprotect_current_instance, get_number_of_pending_trials
from utilsAuth import getToken
from utils import (getDataDirectory, checkTime, checkResourceUsage,
sendStatusEmail, checkForTrialsWithStatus,
getCommitHash, getHostname, postLocalClientInfo,
postProcessedDuration, makeRequestWithRetry)
postProcessedDuration, makeRequestWithRetry,
writeToErrorLog)

logging.basicConfig(level=logging.INFO)

Expand All @@ -24,6 +25,9 @@
autoScalingInstance = getASInstance()
logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}")

ERROR_LOG = getErrorLogBool()
error_log_path = "/data/error_log.json"

# if true, will delete entire data directory when finished with a trial
isDocker = True

Expand Down Expand Up @@ -160,10 +164,24 @@
time.sleep(0.5)

except Exception as e:
r = makeRequestWithRetry('PATCH',
trial_url, data={"status": "error"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
traceback.print_exc()
try:
r = makeRequestWithRetry('PATCH',
trial_url, data={"status": "error"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

except:
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

# Antoine: Removing this, it is too often causing the machines to stop. Not because
# the machines are failing, but because for instance the video is very long with a lot
Expand All @@ -178,8 +196,16 @@

finally:
# End process duration timer and post duration to database
process_end_time = datetime.now()
postProcessedDuration(trial_url, process_end_time - process_start_time)
try:
process_end_time = datetime.now()
postProcessedDuration(trial_url, process_end_time - process_start_time)
except Exception as e:
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

justProcessed = True

Expand Down
23 changes: 14 additions & 9 deletions docker/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ REPO_NAME := opencap
PROD_BRANCH := main
DEV_BRANCH := dev

# Initialize variables if not passed in
INSTANCE_ID ?= 0
CPU_SET ?= ""

# Determine the branch name
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)

Expand Down Expand Up @@ -68,12 +72,13 @@ endif

.PHONY: run
run:
ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH))
aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com

else ifeq ($(CURRENT_BRANCH),$(DEV_BRANCH))
aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com

endif

OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) docker-compose up
@echo "Usage: sudo make run INSTANCE_ID=<unique_id> CPU_SET=<cpu_set>"
@echo "Defaults: INSTANCE_ID=0, CPU_SET=\"\""

COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \
OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \
OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \
MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \
INSTANCE_ID=$(INSTANCE_ID) \
CPU_SET=$(CPU_SET) \
docker compose up -d
34 changes: 34 additions & 0 deletions docker/check-containers-health.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Succeed (exit status 0) only when `docker ps` lists a container whose name
# is exactly $1 and whose status is "running"; fail otherwise.
is_container_alive() {
    local name=$1
    # grep is the last command in the pipeline, so its exit status is also the
    # function's return value.
    docker ps \
        --filter "name=^/${name}$" \
        --filter "status=running" \
        --format '{{.Names}}' \
        | grep -wq "$name"
}

# Run from this script's own directory so the relative helper paths used below
# (./stop-container.sh, ./start-container.sh) resolve no matter where the
# script is invoked from (e.g. a scheduled job with a different cwd).
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Check instances 0 to 7 and restart any instance that is only partially alive
for n in {0..7}; do
    project="opencap_${n}"

    # An instance is healthy only if all three of its containers are running;
    # stop checking at the first one that is not.
    all_alive=true
    for service in openpose mmpose mobilecap; do
        if ! is_container_alive "${project}-${service}-1"; then
            all_alive=false
            break
        fi
    done

    if [ "$all_alive" = true ]; then
        echo "All containers for instance $n are alive. Skipping."
        continue
    fi

    # Only restart instances that have at least one container (in any state);
    # instances with no containers at all are left alone.
    if docker ps -a --filter "name=^/${project}-(openpose|mmpose|mobilecap)-1$" --format '{{.Names}}' | grep -q "$project"; then
        echo "Some containers for instance $n are not alive. Stopping instance."
        ./stop-container.sh "$n"
        ./start-container.sh "$n"
    else
        echo "No containers for instance $n. Skipping."
    fi

done
25 changes: 22 additions & 3 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ services:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files
openpose:
image: ${OPENPOSE_IMAGE_NAME}
volumes:
Expand All @@ -24,8 +30,14 @@ services:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files
mmpose:
image: ${MMPOSE_IMAGE_NAME}
volumes:
Expand All @@ -35,7 +47,14 @@ services:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files

volumes:
data: {}
51 changes: 51 additions & 0 deletions docker/start-container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Start a single instance, pinned to its own CPU range and to the GPU whose
# index equals the instance number.
# Usage: ./start-container.sh <instance_number>

# Configuration
MAX_INSTANCES=8
CPUS_PER_INSTANCE=14

# Get the total number of CPUs and GPUs available
TOTAL_CPUS=$(nproc)
TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

# Check if an instance number is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <instance_number>"
    echo "Provide the instance number to start (0 to $((MAX_INSTANCES - 1)))."
    exit 1
fi

INSTANCE_NUMBER=$1

# Reject anything that is not a plain non-negative integer before it reaches
# an arithmetic context, where arbitrary text would be evaluated as an
# expression instead of being rejected.
if ! [[ "$INSTANCE_NUMBER" =~ ^[0-9]+$ ]]; then
    echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))."
    exit 1
fi

# Force base 10 so that inputs with leading zeros (e.g. "08") are not parsed
# as invalid octal numbers.
INSTANCE_NUMBER=$((10#$INSTANCE_NUMBER))

# Validate the instance number
if (( INSTANCE_NUMBER >= MAX_INSTANCES )); then
    echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))."
    exit 1
fi

# Compute the CPU range for the selected instance
CPU_START=$(( INSTANCE_NUMBER * CPUS_PER_INSTANCE ))
CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
CPU_SET="${CPU_START}-${CPU_END}"

# Validate resource availability
if (( CPU_START + CPUS_PER_INSTANCE > TOTAL_CPUS )); then
    echo "Error: Not enough CPUs available for instance $INSTANCE_NUMBER."
    exit 1
fi

if (( INSTANCE_NUMBER >= TOTAL_GPUS )); then
    echo "Error: Not enough GPUs available for instance $INSTANCE_NUMBER."
    exit 1
fi

# `make run` needs the Makefile that sits next to this script, so run from the
# script's own directory regardless of the caller's cwd.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Start the specific instance
echo "Starting instance $INSTANCE_NUMBER with CPU_SET=${CPU_SET} and GPU=${INSTANCE_NUMBER}"

# Run docker compose for the specific instance; report a failure instead of
# unconditionally claiming success.
if ! make run INSTANCE_ID="$INSTANCE_NUMBER" CPU_SET="$CPU_SET"; then
    echo "Error: Failed to start instance $INSTANCE_NUMBER."
    exit 1
fi

sleep 10

echo "Instance $INSTANCE_NUMBER started successfully."
60 changes: 60 additions & 0 deletions docker/start-containers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash

# Start instances 0..N-1, each pinned to its own CPU range and to the GPU
# whose index equals the instance id.
# Usage: ./start-containers.sh <number_of_instances>

# Configuration
MAX_INSTANCES=8
CPUS_PER_INSTANCE=14
GPUS_PER_INSTANCE=1

# Get the total number of CPUs and GPUs available
TOTAL_CPUS=$(nproc)
TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

# Read number of instances to start
if [ -z "$1" ]; then
    echo "Usage: $0 <number_of_instances>"
    echo "Provide the number of instances to start (max $MAX_INSTANCES)."
    exit 1
fi

NUM_INSTANCES=$1

# Reject anything that is not a plain non-negative integer before it reaches
# an arithmetic context, where arbitrary text would be evaluated as an
# expression instead of being rejected.
if ! [[ "$NUM_INSTANCES" =~ ^[0-9]+$ ]]; then
    echo "Error: Number of instances must be a non-negative integer (max $MAX_INSTANCES)."
    exit 1
fi

# Force base 10 so that inputs with leading zeros (e.g. "08") are not parsed
# as invalid octal numbers.
NUM_INSTANCES=$((10#$NUM_INSTANCES))

# Validate the number of instances
if (( NUM_INSTANCES > MAX_INSTANCES )); then
    echo "Error: Maximum number of instances is $MAX_INSTANCES."
    exit 1
fi

# Check if there are enough resources
if (( NUM_INSTANCES * CPUS_PER_INSTANCE > TOTAL_CPUS )); then
    echo "Error: Not enough CPUs. Required: $((NUM_INSTANCES * CPUS_PER_INSTANCE)), Available: $TOTAL_CPUS."
    exit 1
fi

if (( NUM_INSTANCES * GPUS_PER_INSTANCE > TOTAL_GPUS )); then
    echo "Error: Not enough GPUs. Required: $((NUM_INSTANCES * GPUS_PER_INSTANCE)), Available: $TOTAL_GPUS."
    exit 1
fi

# `make run` needs the Makefile that sits next to this script, so run from the
# script's own directory regardless of the caller's cwd.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Display summary
echo "Starting $NUM_INSTANCES instances..."
echo "Total CPUs: $TOTAL_CPUS (using $CPUS_PER_INSTANCE per instance)"
echo "Total GPUs: $TOTAL_GPUS (using $GPUS_PER_INSTANCE per instance)"
echo

# Start instances; keep going after a failure so the remaining instances still
# get a chance to start, but remember how many failed.
failed=0
for (( i=0; i<NUM_INSTANCES; i++ )); do
    INSTANCE_ID=$i
    CPU_START=$(( i * CPUS_PER_INSTANCE ))
    CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
    CPU_SET="${CPU_START}-${CPU_END}"

    echo "Starting instance $INSTANCE_ID with CPU_SET=${CPU_SET} and GPU=${INSTANCE_ID}"

    # Run docker compose for each instance
    if ! make run INSTANCE_ID="$INSTANCE_ID" CPU_SET="$CPU_SET"; then
        echo "Error: Failed to start instance $INSTANCE_ID."
        failed=$(( failed + 1 ))
    fi

    sleep 2
done

# Only claim success when every `make run` succeeded
if (( failed > 0 )); then
    echo "Error: $failed of $NUM_INSTANCES instances failed to start."
    exit 1
fi

echo "All instances started successfully."

1 change: 1 addition & 0 deletions docker/stop-all-containers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash

# Stop and remove the containers of every instance slot (0 to 7) by calling
# stop-container.sh once per slot.

# Run from this script's own directory so ./stop-container.sh resolves no
# matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

for i in $(seq 0 7); do
    ./stop-container.sh "$i"
done
25 changes: 25 additions & 0 deletions docker/stop-container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Stop and remove the containers belonging to one instance's compose project.
# Usage: ./stop-container.sh <INSTANCE_ID>
# Exits 0 when `docker compose down` succeeds, otherwise with its exit status.

# Check if INSTANCE_ID is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <INSTANCE_ID>"
    exit 1
fi

INSTANCE_ID=$1
# Must match the COMPOSE_PROJECT_NAME used by the Makefile's `run` target
COMPOSE_PROJECT_NAME="opencap_${INSTANCE_ID}"

echo "Stopping and removing containers for INSTANCE_ID=${INSTANCE_ID}..."

# Stop and remove containers associated with the project. Uses the
# `docker compose` plugin, the same CLI the Makefile uses to start the project,
# so that start and stop agree on how the project is named and resolved.
if docker compose --project-name "$COMPOSE_PROJECT_NAME" down; then
    echo "Successfully stopped and removed containers for INSTANCE_ID=${INSTANCE_ID}."
else
    # In the else branch $? still holds the status of the failed command
    status=$?
    echo "Failed to stop and remove containers for INSTANCE_ID=${INSTANCE_ID}."
    exit "$status"
fi

Loading

0 comments on commit cc976fa

Please sign in to comment.