Merge pull request #226 from stanfordnmbl/server

Dev: New deployment for new Server
stanfordnmbl · Feb 13, 2025 · cc976fa · cc976fa
2 parents f69fe47 + 116e8d4
commit cc976fa
Show file tree

Hide file tree

Showing 11 changed files with 353 additions and 75 deletions.
diff --git a/app.py b/app.py
@@ -9,12 +9,13 @@
 import glob
 from datetime import datetime, timedelta
 import numpy as np
-from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials
+from utilsAPI import getAPIURL, getWorkerType, getErrorLogBool, getASInstance, unprotect_current_instance, get_number_of_pending_trials
 from utilsAuth import getToken
 from utils import (getDataDirectory, checkTime, checkResourceUsage,
                   sendStatusEmail, checkForTrialsWithStatus,
                   getCommitHash, getHostname, postLocalClientInfo,
-                  postProcessedDuration, makeRequestWithRetry)
+                  postProcessedDuration, makeRequestWithRetry,
+                  writeToErrorLog)
 
 logging.basicConfig(level=logging.INFO)
 
@@ -24,6 +25,9 @@
 autoScalingInstance = getASInstance()
 logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}")
 
+ERROR_LOG = getErrorLogBool()
+error_log_path = "/data/error_log.json"
+
 # if true, will delete entire data directory when finished with a trial
 isDocker = True
 
@@ -160,10 +164,24 @@
         time.sleep(0.5)
 
     except Exception as e:
-        r = makeRequestWithRetry('PATCH',
-                                 trial_url, data={"status": "error"},
-                                 headers = {"Authorization": "Token {}".format(API_TOKEN)})
-        traceback.print_exc()
+        try:
+            r = makeRequestWithRetry('PATCH',
+                                     trial_url, data={"status": "error"},
+                                     headers = {"Authorization": "Token {}".format(API_TOKEN)})
+            traceback.print_exc()
+
+            if ERROR_LOG:
+                stack = traceback.format_exc()
+                writeToErrorLog(error_log_path, trial["session"], trial["id"],
+                                e, stack)
+
+        except:
+            traceback.print_exc()
+
+            if ERROR_LOG:
+                stack = traceback.format_exc()
+                writeToErrorLog(error_log_path, trial["session"], trial["id"],
+                                e, stack)
 
         # Antoine: Removing this, it is too often causing the machines to stop. Not because
         # the machines are failing, but because for instance the video is very long with a lot
@@ -178,8 +196,16 @@
 
     finally:
         # End process duration timer and post duration to database
-        process_end_time = datetime.now()
-        postProcessedDuration(trial_url, process_end_time - process_start_time)
+        try:
+            process_end_time = datetime.now()
+            postProcessedDuration(trial_url, process_end_time - process_start_time)
+        except Exception as e:
+            traceback.print_exc()
+
+            if ERROR_LOG:
+                stack = traceback.format_exc()
+                writeToErrorLog(error_log_path, trial["session"], trial["id"],
+                                e, stack)
 
     justProcessed = True
 

diff --git a/docker/Makefile b/docker/Makefile
@@ -3,6 +3,10 @@ REPO_NAME := opencap
 PROD_BRANCH := main
 DEV_BRANCH := dev
 
+# Initialize variables if not passed in
+INSTANCE_ID ?= 0
+CPU_SET ?= ""
+
 # Determine the branch name
 CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
 
@@ -68,12 +72,13 @@ endif
 
 .PHONY: run
 run:
-ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH))
-	aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com
-
-else ifeq ($(CURRENT_BRANCH),$(DEV_BRANCH))
-	aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com
-
-endif
-
-	OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) docker-compose up
+	@echo "Usage: sudo make run INSTANCE_ID=<unique_id> CPU_SET=<cpu_set>"
+	@echo "Defaults: INSTANCE_ID=0, CPU_SET=\"\""	
+
+	COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \
+	OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \
+	OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \
+	MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \
+	INSTANCE_ID=$(INSTANCE_ID) \
+	CPU_SET=$(CPU_SET) \
+	docker compose up -d
diff --git a/docker/check-containers-health.sh b/docker/check-containers-health.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Health check for the OpenCap compose instances 0..7.
+# For each instance: if all three containers are running, nothing is done;
+# if some containers exist but not all are running, the instance is restarted
+# via stop-container.sh and start-container.sh; if none exist, it is skipped.
+
+# Work from this script's directory so the relative helper scripts below are
+# found regardless of the caller's working directory (e.g. cron).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR" || exit 1
+
+# Return 0 if a container with exactly this name is currently running.
+is_container_alive() {
+  local container_name=$1
+  docker ps --filter "name=^/${container_name}$" --filter "status=running" --format '{{.Names}}' | grep -wq "$container_name"
+}
+
+# Loop through instance numbers 0 to 7
+for n in {0..7}; do
+  # Expected container names for compose project opencap_<n>
+  opencap_openpose="opencap_${n}-openpose-1"
+  opencap_mmpose="opencap_${n}-mmpose-1"
+  opencap_mobilecap="opencap_${n}-mobilecap-1"
+
+  # Check if all three containers are alive
+  if is_container_alive "$opencap_openpose" && \
+     is_container_alive "$opencap_mmpose" && \
+     is_container_alive "$opencap_mobilecap"; then
+    echo "All containers for instance $n are alive. Skipping."
+    continue
+  fi
+
+  # Check if any container of this instance exists (running or not)
+  if docker ps -a --filter "name=^/opencap_${n}-(openpose|mmpose|mobilecap)-1$" --format '{{.Names}}' | grep -q "opencap_${n}"; then
+    echo "Some containers for instance $n are not alive. Stopping instance."
+    ./stop-container.sh "$n"
+    ./start-container.sh "$n"
+  else
+    echo "No containers for instance $n. Skipping."
+  fi
+
+done
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
@@ -13,8 +13,14 @@ services:
         reservations:
           devices:
             - driver: nvidia
-              count: 1
               capabilities: [gpu]
+              device_ids: ["${INSTANCE_ID}"]
+    cpuset: "${CPU_SET}"
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "100m"  # Rotate when the log reaches 100MB
+        max-file: "7"    # Keep the last 7 log files
   openpose:
     image: ${OPENPOSE_IMAGE_NAME}
     volumes:
@@ -24,8 +30,14 @@ services:
         reservations:
           devices:
             - driver: nvidia
-              count: 1
               capabilities: [gpu]
+              device_ids: ["${INSTANCE_ID}"]
+    cpuset: "${CPU_SET}"
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "100m"  # Rotate when the log reaches 100MB
+        max-file: "7"    # Keep the last 7 log files
   mmpose:
     image: ${MMPOSE_IMAGE_NAME}
     volumes:
@@ -35,7 +47,14 @@ services:
         reservations:
           devices:
             - driver: nvidia
-              count: 1
               capabilities: [gpu]
+              device_ids: ["${INSTANCE_ID}"]
+    cpuset: "${CPU_SET}"
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "100m"  # Rotate when the log reaches 100MB
+        max-file: "7"    # Keep the last 7 log files
+
 volumes:
   data: {}
diff --git a/docker/start-container.sh b/docker/start-container.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# Start a single OpenCap instance pinned to its own CPU range and GPU.
+# Usage: start-container.sh <instance_number>
+
+# Configuration
+MAX_INSTANCES=8
+CPUS_PER_INSTANCE=14
+GPUS_PER_INSTANCE=1
+
+# Get the total number of CPUs and GPUs available
+TOTAL_CPUS=$(nproc)
+TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+# Check if an instance number is provided
+if [ -z "$1" ]; then
+  echo "Usage: $0 <instance_number>"
+  echo "Provide the instance number to start (0 to $((MAX_INSTANCES - 1)))."
+  exit 1
+fi
+
+# Reject non-numeric input: inside (( )) a bare word is evaluated as a variable
+# name, so it would not be caught by the range check below.
+if ! [[ "$1" =~ ^[0-9]+$ ]]; then
+  echo "Error: Instance number must be a non-negative integer."
+  exit 1
+fi
+
+# Force base 10 so a value with leading zeros (e.g. 07) is not read as octal
+INSTANCE_NUMBER=$((10#$1))
+
+# Validate the instance number
+if (( INSTANCE_NUMBER < 0 || INSTANCE_NUMBER >= MAX_INSTANCES )); then
+  echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))."
+  exit 1
+fi
+
+# Compute CPU and GPU offsets for the selected instance
+CPU_START=$(( INSTANCE_NUMBER * CPUS_PER_INSTANCE ))
+CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
+CPU_SET="${CPU_START}-${CPU_END}"
+
+# Validate resource availability
+if (( CPU_START + CPUS_PER_INSTANCE > TOTAL_CPUS )); then
+  echo "Error: Not enough CPUs available for instance $INSTANCE_NUMBER."
+  exit 1
+fi
+
+if (( INSTANCE_NUMBER >= TOTAL_GPUS )); then
+  echo "Error: Not enough GPUs available for instance $INSTANCE_NUMBER."
+  exit 1
+fi
+
+# Run from this script's directory so `make` picks up the Makefile next to it
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+# Start the specific instance
+echo "Starting instance $INSTANCE_NUMBER with CPU_SET=${CPU_SET} and GPU=${INSTANCE_NUMBER}"
+
+# Run docker compose (via the Makefile's run target) for the specific instance;
+# stop here if it fails instead of reporting success.
+if ! make run INSTANCE_ID="$INSTANCE_NUMBER" CPU_SET="$CPU_SET"; then
+  echo "Error: Failed to start instance $INSTANCE_NUMBER."
+  exit 1
+fi
+
+sleep 10
+
+echo "Instance $INSTANCE_NUMBER started successfully."
diff --git a/docker/start-containers.sh b/docker/start-containers.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#
+# Start the first N OpenCap instances (0 .. N-1), each pinned to its own CPU
+# range and GPU.
+# Usage: start-containers.sh <number_of_instances>
+
+# Configuration
+MAX_INSTANCES=8
+CPUS_PER_INSTANCE=14
+GPUS_PER_INSTANCE=1
+
+# Get the total number of CPUs and GPUs available
+TOTAL_CPUS=$(nproc)
+TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+# Read number of instances to start
+if [ -z "$1" ]; then
+  echo "Usage: $0 <number_of_instances>"
+  echo "Provide the number of instances to start (max $MAX_INSTANCES)."
+  exit 1
+fi
+
+# Reject non-numeric input: inside (( )) a bare word is evaluated as a variable
+# name, so it would not be caught by the checks below.
+if ! [[ "$1" =~ ^[0-9]+$ ]]; then
+  echo "Error: Number of instances must be a non-negative integer."
+  exit 1
+fi
+
+# Force base 10 so a value with leading zeros (e.g. 08) is not read as octal
+NUM_INSTANCES=$((10#$1))
+
+# Validate the number of instances
+if (( NUM_INSTANCES > MAX_INSTANCES )); then
+  echo "Error: Maximum number of instances is $MAX_INSTANCES."
+  exit 1
+fi
+
+# Check if there are enough resources
+if (( NUM_INSTANCES * CPUS_PER_INSTANCE > TOTAL_CPUS )); then
+  echo "Error: Not enough CPUs. Required: $((NUM_INSTANCES * CPUS_PER_INSTANCE)), Available: $TOTAL_CPUS."
+  exit 1
+fi
+
+if (( NUM_INSTANCES * GPUS_PER_INSTANCE > TOTAL_GPUS )); then
+  echo "Error: Not enough GPUs. Required: $((NUM_INSTANCES * GPUS_PER_INSTANCE)), Available: $TOTAL_GPUS."
+  exit 1
+fi
+
+# Run from this script's directory so `make` picks up the Makefile next to it
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+# Display summary
+echo "Starting $NUM_INSTANCES instances..."
+echo "Total CPUs: $TOTAL_CPUS (using $CPUS_PER_INSTANCE per instance)"
+echo "Total GPUs: $TOTAL_GPUS (using $GPUS_PER_INSTANCE per instance)"
+echo
+
+# Start instances
+for (( i=0; i<NUM_INSTANCES; i++ )); do
+  INSTANCE_ID=$i
+  CPU_START=$(( i * CPUS_PER_INSTANCE ))
+  CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
+  CPU_SET="${CPU_START}-${CPU_END}"
+
+  echo "Starting instance $INSTANCE_ID with CPU_SET=${CPU_SET} and GPU=${INSTANCE_ID}"
+
+  # Run docker compose (via the Makefile's run target) for each instance;
+  # abort on the first failure rather than reporting overall success.
+  if ! make run INSTANCE_ID="$INSTANCE_ID" CPU_SET="$CPU_SET"; then
+    echo "Error: Failed to start instance $INSTANCE_ID."
+    exit 1
+  fi
+
+  sleep 2
+done
+
+echo "All instances started successfully."
+
diff --git a/docker/stop-all-containers.sh b/docker/stop-all-containers.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+# Stop and remove the containers of every OpenCap instance (0 to 7).
+
+# Run from this script's directory so ./stop-container.sh resolves
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+for i in $(seq 0 7); do ./stop-container.sh "$i"; done
diff --git a/docker/stop-container.sh b/docker/stop-container.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# Stop and remove the containers of one OpenCap instance
+# (compose project opencap_<INSTANCE_ID>).
+# Usage: stop-container.sh <INSTANCE_ID>
+
+# Check if INSTANCE_ID is provided
+if [ -z "$1" ]; then
+  echo "Usage: $0 <INSTANCE_ID>"
+  exit 1
+fi
+
+INSTANCE_ID=$1
+COMPOSE_PROJECT_NAME="opencap_${INSTANCE_ID}"
+
+echo "Stopping and removing containers for INSTANCE_ID=${INSTANCE_ID}..."
+
+# Stop and remove containers associated with the project. Uses the
+# `docker compose` plugin, matching the Makefile's run target that starts them.
+if docker compose --project-name "$COMPOSE_PROJECT_NAME" down; then
+  echo "Successfully stopped and removed containers for INSTANCE_ID=${INSTANCE_ID}."
+else
+  echo "Failed to stop and remove containers for INSTANCE_ID=${INSTANCE_ID}."
+  # Propagate the failure so callers can detect it
+  exit 1
+fi
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		for i in $(seq 0 7); do ./stop-container.sh $i; done