Skip to content

Commit

Permalink
Add zero downtime deployment (#5338)
Browse files Browse the repository at this point in the history
* Deploy controllers without downtime

* Deploy invokers without downtime

* Deploy schedulers without downtime

* Fix typo

* Fix typo

* Add a disable API to controllers

* Remove unnecessary steps

* Add more logs for container liveness

* Change Set to a thread-safe one

* Use the transaction ID of the activation

* Gracefully shutdown activation client proxy

* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala

Apply suggestion

Co-authored-by: Brendan Doyle <[email protected]>

* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala

Apply suggestion

Co-authored-by: Brendan Doyle <[email protected]>

* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala

Co-authored-by: Brendan Doyle <[email protected]>

* Apply #5334

* Remove akka-http dependency from the invoker reactive

* Exclude the prewarm containers count from the /pool/count route

* Add missing import

* Make it compatible with scala-2.13

In Scala 2.13, mapValues returns a MapView rather than a Map, so the result must be converted explicitly (e.g. with .toMap).

* Fix test cases

* Add container id to the logs of ActivationClientProxy

Co-authored-by: Brendan Doyle <[email protected]>
  • Loading branch information
style95 and bdoyle0182 authored Nov 1, 2022
1 parent 74ca61c commit 651a2e9
Show file tree
Hide file tree
Showing 38 changed files with 607 additions and 233 deletions.
13 changes: 11 additions & 2 deletions ansible/group_vars/all
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ controller:
authentication:
spi: "{{ controller_authentication_spi | default('') }}"
loglevel: "{{ controller_loglevel | default(whisk_loglevel) | default('INFO') }}"
username: "{{ controller_username | default('controller.user') }}"
password: "{{ controller_password | default('controller.pass') }}"
entitlement:
spi: "{{ controller_entitlement_spi | default('') }}"
protocol: "{{ controller_protocol | default('https') }}"
Expand All @@ -126,6 +128,10 @@ controller:
password: "openwhisk"
name: "{{ __controller_ssl_keyPrefix }}openwhisk-keystore.p12"
extraEnv: "{{ controller_extraEnv | default({}) }}"
deployment:
ignore_error: "{{ controller_deployment_ignore_error | default('False') }}"
retries: "{{ controller_deployment_retries | default(180) }}"
delay: "{{ controller_deployment_delay | default(5) }}"

jmx:
basePortController: 15000
Expand Down Expand Up @@ -234,6 +240,10 @@ invoker:
creationMaxPeek: "{{ container_creation_max_peek | default(500) }}"
reactiveSpi: "{{ invokerReactive_spi | default('') }}"
serverSpi: "{{ invokerServer_spi | default('') }}"
deployment:
ignore_error: "{{ invoker_deployment_ignore_error | default('False') }}"
retries: "{{ invoker_deployment_retries | default(180) }}"
delay: "{{ invoker_deployment_delay | default(5) }}"

userLogs:
spi: "{{ userLogs_spi | default('org.apache.openwhisk.core.containerpool.logging.DockerToActivationLogStoreProvider') }}"
Expand Down Expand Up @@ -450,8 +460,7 @@ metrics:
user_events: "{{ user_events_enabled | default(false) | lower }}"

zeroDowntimeDeployment:
enabled: "{{ zerodowntime_deployment_switch | default(true) }}"
solution: "{{ zerodowntime_deployment_solution | default('apicall') }}"
enabled: "{{ zerodowntime_deployment_switch | default(false) }}"

etcd:
version: "{{ etcd_version | default('v3.4.0') }}"
Expand Down
83 changes: 83 additions & 0 deletions ansible/roles/controller/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@
"CONFIG_whisk_info_date": "{{ whisk.version.date }}"
"CONFIG_whisk_info_buildNo": "{{ docker.image.tag }}"
"CONFIG_whisk_cluster_name": "{{ whisk.cluster_name | lower }}"
"CONFIG_whisk_controller_username": "{{ controller.username }}"
"CONFIG_whisk_controller_password": "{{ controller.password }}"

"KAFKA_HOSTS": "{{ kafka_connect_string }}"
"CONFIG_whisk_kafka_replicationFactor":
Expand Down Expand Up @@ -363,6 +365,53 @@
include_tasks: "lean.yml"
when: lean

# Before redeploying the controller, remove that controller instance from nginx's upstream configuration
- name: remove the controller from nginx's upstream configuration
shell:
docker exec -t nginx sh -c "sed -i \"s/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
delegate_to: "{{ item }}"
with_items: "{{ groups['edge'] }}"
when: zeroDowntimeDeployment.enabled == true

- name: wait some time for controllers fire all existing triggers
shell: sleep 5s
when: zeroDowntimeDeployment.enabled == true

- name: wait until {{ controller_name }} executes all existing activations
uri:
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/activation/count"
validate_certs: no
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
return_content: yes
user: "{{ controller.username }}"
password: "{{ controller.password }}"
force_basic_auth: yes
register: result
until: result.content == '0'
retries: "{{ controller.deployment.retries }}"
delay: "{{ controller.deployment.delay }}"
when: zeroDowntimeDeployment.enabled == true
ignore_errors: "{{ controller.deployment.ignore_error }}"

- name: Disable {{ controller_name }} before remove controller
uri:
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + groups['controllers'].index(inventory_hostname) }}/disable"
validate_certs: no
client_key: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
client_cert: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
method: POST
status_code: 200
user: "{{ controller.username }}"
password: "{{ controller.password }}"
force_basic_auth: yes
ignore_errors: "{{ controller.deployment.ignore_error }}"
when: zeroDowntimeDeployment.enabled == true

- name: wait some time for controller to gracefully shutdown the consumer for activation ack
shell: sleep 5s
when: zeroDowntimeDeployment.enabled == true

- name: (re)start controller
docker_container:
name: "{{ controller_name }}"
Expand Down Expand Up @@ -397,3 +446,37 @@
until: result.status == 200
retries: 12
delay: 10

- name: warm up activation path
uri:
url:
"{{controller.protocol}}://{{ lookup('file', '{{ catalog_auth_key }}')}}@{{ansible_host}}:{{controller_port}}/api/v1/namespaces/_/actions/invokerHealthTestAction{{controller_index}}?blocking=false&result=false"
validate_certs: "no"
client_key:
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
client_cert:
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
method: POST
ignore_errors: True

- name: wait for all invokers in {{ controller_name }} to become up
uri:
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/invokers"
validate_certs: no
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
return_content: yes
register: invokerStatus
until: invokerStatus.json|length >= 1 and "unhealthy" not in invokerStatus.content
retries: 14
delay: 5
when: zeroDowntimeDeployment.enabled == true

# Once all invokers have reported their status to the controller, add the controller instance back to nginx if at least one invoker is up
- name: Add the controller back to nginx's upstream configuration when there exist at least one healthy invoker
shell:
docker exec -t nginx sh -c "sed -i \"s/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
delegate_to: "{{ item }}"
with_items: "{{ groups['edge'] }}"
ignore_errors: True
when: zeroDowntimeDeployment.enabled == true and "up" in invokerStatus.content
33 changes: 33 additions & 0 deletions ansible/roles/invoker/tasks/clean.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,37 @@
invoker_name: "{{ name_prefix ~ ((invoker_index_base | int) + host_group.index(inventory_hostname)) }}"
invoker_index: "{{ (invoker_index_base | int) + host_group.index(inventory_hostname) }}"

- name: disable invoker{{ groups['invokers'].index(inventory_hostname) }}
uri:
url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/disable"
validate_certs: no
client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
method: POST
status_code: 200
user: "{{ invoker.username }}"
password: "{{ invoker.password }}"
force_basic_auth: yes
ignore_errors: "{{ invoker.deployment.ignore_error }}"
when: zeroDowntimeDeployment.enabled == true and enable_scheduler

- name: wait invoker{{ groups['invokers'].index(inventory_hostname) }} to clean up all existing containers
uri:
url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/pool/count"
validate_certs: no
client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
user: "{{ invoker.username }}"
password: "{{ invoker.password }}"
force_basic_auth: yes
return_content: yes
register: result
until: result.content == '0'
retries: "{{ invoker.deployment.retries }}"
delay: "{{ invoker.deployment.delay }}"
when: zeroDowntimeDeployment.enabled == true and enable_scheduler
ignore_errors: "{{ invoker.deployment.ignore_error }}"

- name: remove invoker
docker_container:
name: "{{ invoker_name }}"
Expand Down Expand Up @@ -59,12 +90,14 @@
path: "{{ whisk_logs_dir }}/{{ invoker_name }}"
state: absent
become: "{{ logs.dir.become }}"
when: mode == "clean"

- name: remove invoker conf directory
file:
path: "{{ invoker.confdir }}/{{ invoker_name }}"
state: absent
become: "{{ invoker.dir.become }}"
when: mode == "clean"

# Workaround for orphaned ifstate.veth* files on Ubuntu 14.04
# See https://github.com/moby/moby/issues/22513
Expand Down
6 changes: 6 additions & 0 deletions ansible/roles/invoker/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
---
# This role installs invokers.

###
# When the zero-downtime-deployment is enabled, clean.yml is used to gracefully shut down the invoker.
#
- import_tasks: clean.yml
when: zeroDowntimeDeployment.enabled == true and enable_scheduler

- import_tasks: docker_login.yml

- name: get invoker name and index
Expand Down
21 changes: 6 additions & 15 deletions ansible/roles/schedulers/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,6 @@
include_tasks: "{{ item }}.yml"
with_items: "{{ scheduler_plugins | default([]) }}"

- name: Judge current scheduler whether deployed
shell: echo $(docker ps | grep {{ scheduler_name }} | wc -l)
register: schedulerDeployed
when: zeroDowntimeDeployment.enabled == true

- name: disable scheduler{{ groups['schedulers'].index(inventory_hostname) }} before redeploy scheduler
uri:
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/disable"
Expand All @@ -295,27 +290,23 @@
password: "{{ scheduler.password }}"
force_basic_auth: yes
ignore_errors: "{{ scheduler.deployment_ignore_error }}"
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
when: zeroDowntimeDeployment.enabled == true

- name: wait until all queue and create queue task is finished before redeploy scheduler when using apicall solution or half solution
- name: wait until all activation is finished before redeploy scheduler
uri:
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/queue/total"
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/activation/count"
validate_certs: no
return_content: yes
user: "{{ scheduler.username }}"
password: "{{ scheduler.password }}"
force_basic_auth: yes
register: totalQueue
until: totalQueue.content == "0"
register: result
until: result.content == "0"
retries: 180
delay: 5
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
when: zeroDowntimeDeployment.enabled == true
ignore_errors: "{{ scheduler.deployment_ignore_error }}"

- name: wait until all queue and create queue task is finished before redeploy scheduler using sleep solution
shell: sleep 120s
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" and zeroDowntimeDeployment.solution == 'sleep'

- name: (re)start scheduler
docker_container:
name: "{{ scheduler_name }}"
Expand Down
4 changes: 4 additions & 0 deletions ansible/templates/whisk.properties.j2
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,15 @@ edge.host.apiport=443
kafkaras.host.port={{ kafka.ras.port }}
redis.host.port={{ redis.port }}
invoker.hosts.basePort={{ invoker.port }}
invoker.username={{ invoker.username }}
invoker.password={{ invoker.password }}

controller.hosts={{ groups["controllers"] | map('extract', hostvars, 'ansible_host') | list | join(",") }}
controller.host.basePort={{ controller.basePort }}
controller.instances={{ controller.instances }}
controller.protocol={{ controller.protocol }}
controller.username={{ controller.username }}
controller.password={{ controller.password }}

invoker.container.network=bridge
invoker.container.policy={{ invoker_container_policy_name | default()}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ object ConfigKeys {

val dataManagementServiceRetryInterval = "whisk.scheduler.data-management-service.retry-interval"

val whiskControllerUsername = "whisk.controller.username"
val whiskControllerPassword = "whisk.controller.password"

val whiskSchedulerUsername = "whisk.scheduler.username"
val whiskSchedulerPassword = "whisk.scheduler.password"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,13 @@ object InvokerResourceMessage extends DefaultJsonProtocol {
* ...
* ]
*/
object StatusQuery
object GetState

case class StatusData(invocationNamespace: String, fqn: String, waitingActivation: Int, status: String, data: String)
case class StatusData(invocationNamespace: String,
fqn: String,
waitingActivation: List[ActivationId],
status: String,
data: String)
extends Message {

override def serialize: String = StatusData.serdes.write(this).compactPrint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ package org.apache.openwhisk.core.service
import akka.actor.{Actor, ActorRef, ActorSystem, Props}
import com.ibm.etcd.api.Event.EventType
import com.ibm.etcd.client.kv.WatchUpdate
import org.apache.openwhisk.common.Logging
import org.apache.openwhisk.common.{GracefulShutdown, Logging}
import org.apache.openwhisk.core.etcd.EtcdClient
import org.apache.openwhisk.core.etcd.EtcdType._

import scala.collection.JavaConverters._
import scala.collection.concurrent.TrieMap

Expand Down Expand Up @@ -141,6 +142,13 @@ class WatcherService(etcdClient: EtcdClient)(implicit logging: Logging, actorSys
// always send WatcherClosed back to the sender if it needs feedback
if (request.needFeedback)
sender ! WatcherClosed(request.watchKey, request.isPrefix)

case GracefulShutdown =>
watcher.close()
putWatchers.clear()
deleteWatchers.clear()
prefixPutWatchers.clear()
prefixDeleteWatchers.clear()
}
}

Expand Down
4 changes: 4 additions & 0 deletions core/controller/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,8 @@ whisk{
file-system : true
dir-path : "/swagger-ui/"
}
controller {
username: "controller.user"
password: "controller.pass"
}
}
Loading

0 comments on commit 651a2e9

Please sign in to comment.