From ad4ab1126e2aa6f3dd4e53418264a14da816c573 Mon Sep 17 00:00:00 2001 From: Maximilian Schubert Date: Mon, 18 Dec 2023 14:36:04 +0100 Subject: [PATCH] feat: add metrics; fix latency check; latency time in sec (#45) * feat: add metrics; fix latency check; latency time in sec * docs: add metrics docu * feat: mutex for metrics; rename metric * feat: match up checks; mutex for metrics * feat: register collectors without error handling and no panic; split reconcileChecks --- README.md | 140 +++++++++++++++++++++++++---------- go.mod | 9 +++ go.sum | 26 ++++++- pkg/checks/checks.go | 9 ++- pkg/checks/checks_moq.go | 54 ++++++++++++-- pkg/checks/health.go | 80 ++++++++++++++------ pkg/checks/health_test.go | 5 +- pkg/checks/latency.go | 144 ++++++++++++++++++++++++++++-------- pkg/checks/latency_test.go | 63 +++++++++++++--- pkg/sparrow/api.go | 7 ++ pkg/sparrow/api_test.go | 10 ++- pkg/sparrow/metrics.go | 52 +++++++++++++ pkg/sparrow/metrics_moq.go | 68 +++++++++++++++++ pkg/sparrow/metrics_test.go | 70 ++++++++++++++++++ pkg/sparrow/run.go | 114 ++++++++++++++++++---------- pkg/sparrow/run_test.go | 5 ++ 16 files changed, 697 insertions(+), 159 deletions(-) create mode 100644 pkg/sparrow/metrics.go create mode 100644 pkg/sparrow/metrics_moq.go create mode 100644 pkg/sparrow/metrics_test.go diff --git a/README.md b/README.md index cbad09e4..be19162c 100644 --- a/README.md +++ b/README.md @@ -18,34 +18,41 @@ - [Loader](#loader) - [Runtime](#runtime) - [Check: Health](#check-health) + - [Health Metrics](#health-metrics) - [Check: Latency](#check-latency) - - [API](#api) + - [Latency Metrics](#latency-metrics) +- [API](#api) +- [Metrics](#metrics) - [Code of Conduct](#code-of-conduct) - [Working Language](#working-language) - [Support and Feedback](#support-and-feedback) - [How to Contribute](#how-to-contribute) - [Licensing](#licensing) - -The `sparrow` is an infrastructure monitoring tool. The binary includes several checks (e.g. 
health check) that will be executed periodically. +The `sparrow` is an infrastructure monitoring tool. The binary includes several checks (e.g. health check) that will be +executed periodically. ## About this component -The `sparrow` performs several checks to monitor the health of the infrastructure and network from its point of view. The following checks are available: +The `sparrow` performs several checks to monitor the health of the infrastructure and network from its point of view. +The following checks are available: -1. Health check - `health`: The `sparrow` is able perform an http-based (HTTP/1.1) health check to provided endpoints. The `sparrow` will expose its own health check endpoint as well. +1. Health check - `health`: The `sparrow` is able to perform an HTTP-based (HTTP/1.1) health check to the provided + endpoints. The `sparrow` will expose its own health check endpoint as well. -2. Latency check - `latency`: The `sparrow` is able to communicate with other `sparrow` instances to calculate the time a request takes to the target and back. The check is http (HTTP/1.1) based as well. +2. Latency check - `latency`: The `sparrow` is able to communicate with other `sparrow` instances to calculate the time + a request takes to the target and back. The check is http (HTTP/1.1) based as well. ## Installation -The `sparrow` is provided as an small binary & a container image. +The `sparrow` is provided as a small binary & a container image. Please see the [release notes](https://github.com/caas-team/sparrow/releases) for to get the latest version. ### Binary -The binary is available for several distributions. Currently the binary needs to be installed from a provided bundle or source. +The binary is available for several distributions. Currently, the binary needs to be installed from a provided bundle or +source. 
```sh curl https://github.com/caas-team/sparrow/releases/download/v${RELEASE_VERSION}/sparrow_${RELEASE_VERSION}_linux_amd64.tar.gz -Lo sparrow.tar.gz @@ -53,29 +60,33 @@ curl https://github.com/caas-team/sparrow/releases/download/v${RELEASE_VERSION}/ ``` For example release `v0.0.1`: + ```sh curl https://github.com/caas-team/sparrow/releases/download/v0.0.1/sparrow_0.0.1_linux_amd64.tar.gz -Lo sparrow.tar.gz curl https://github.com/caas-team/sparrow/releases/download/v0.0.1/sparrow_0.0.1_checksums.txt -Lo checksums.txt ``` Extract the binary: + ```sh tar -xf sparrow.tar.gz ``` ### Container Image -The [sparrow container images](https://github.com/caas-team/sparrow/pkgs/container/sparrow) for dedicated [release](https://github.com/caas-team/sparrow/releases) can be found in the GitHub registry. +The [sparrow container images](https://github.com/caas-team/sparrow/pkgs/container/sparrow) for +dedicated [release](https://github.com/caas-team/sparrow/releases) can be found in the GitHub registry. ### Helm -Sparrow can be install via Helm Chart. The chart is provided in the GitHub registry: +Sparrow can be installed via Helm Chart. The chart is provided in the GitHub registry: ```sh helm -n sparrow upgrade -i sparrow oci://ghcr.io/caas-team/charts/sparrow --version 1.0.0 --create-namespace ``` -The default settings are fine for a local running configuration. With the default Helm values the sparrow loader uses a runtime configuration that is provided in a ConfigMap. The ConfigMap can be set by defining the `runtimeConfig` section. +The default settings are fine for a local running configuration. With the default Helm values, the sparrow loader uses a +runtime configuration that is provided in a ConfigMap. The ConfigMap can be set by defining the `runtimeConfig` section. To be able to load the configuration during the runtime dynamically, the sparrow loader needs to be set to type `http`. 
@@ -86,8 +97,9 @@ startupConfig: loaderType: http loaderHttpUrl: https://url-to-runtime-config.de/api/config%2Eyaml -runtimeConfig: {} +runtimeConfig: { } ``` + For all available value options see [Chart README](./chart/README.md). Additionally check out the sparrow [configuration](#configuration) variants. @@ -102,22 +114,25 @@ Run a `sparrow` container by using e.g. `docker run ghcr.io/caas-team/sparrow`. Pass the available configuration arguments to the container e.g. `docker run ghcr.io/caas-team/sparrow --help`. -Start the instance using a mounted startup configuration file e.g. `docker run -v /config:/config ghcr.io/caas-team/sparrow --config /config/config.yaml`. +Start the instance using a mounted startup configuration file +e.g. `docker run -v /config:/config ghcr.io/caas-team/sparrow --config /config/config.yaml`. ## Configuration -The configuration is divided into two parts. The startup configuration and the runtime configuration. The startup configuration is a technical configuration to configure the `sparrow` instance itself. The runtime configuration will be loaded by the `loader` from a remote endpoint. This configuration consist of the checks configuration. +The configuration is divided into two parts. The startup configuration and the runtime configuration. The startup +configuration is a technical configuration to configure the `sparrow` instance itself. The runtime configuration will be +loaded by the `loader` from a remote endpoint. This configuration consists of the checks' configuration. ### Startup -The available configuration options can found in the [CLI flag documentation](docs/sparrow.md). +The available configuration options can be found in the [CLI flag documentation](docs/sparrow.md). The `sparrow` is able to get the startup configuration from different sources as follows. Priority of configuration (high to low): 1. CLI flags -2. Environment variables +2. Environment variables 3. Defined configuration file 4. 
Default configuration file @@ -130,12 +145,18 @@ The loader can be selected by specifying the `loaderType` configuration paramete The default loader is an `http` loader that is able to get the runtime configuration from a remote endpoint. Available loader: -- `http`: The default. Loads configuration from a remote endpoint. Token authentication is available. Additional configuration parameter have the prefix `loaderHttp`. -- `file` (experimental): Loads configuration once from a local file. Additional configuration parameter have the prefix `loaderFile`. This is just for development purposes. + +- `http`: The default. Loads configuration from a remote endpoint. Token authentication is available. Additional + configuration parameters have the prefix `loaderHttp`. +- `file` (experimental): Loads configuration once from a local file. Additional configuration parameters have the + prefix `loaderFile`. This is just for development purposes. ### Runtime -Besides the technical startup configuration the configuration for the `sparrow` checks is loaded dynamically from an http endpoint. The `loader` is able to load the configuration dynamically during the runtime. Checks can be enabled, disabled and configured. The available loader confutation options for the startup configuration can be found in [here](sparrow_run.md) +Besides the technical startup configuration the configuration for the `sparrow` checks is loaded dynamically from an +HTTP endpoint. The `loader` is able to load the configuration dynamically during the runtime. Checks can be enabled, +disabled and configured. The available loader configuration options for the startup configuration can be found +in [here](sparrow_run.md) Example format of a runtime configuration: @@ -152,8 +173,11 @@ checks: Available configuration options: - `checks.health.enabled` (boolean): Currently not used. -- `checks.health.targets` (list of strings): List of targets to send health probe. Needs to be a valid url.
Can be another `sparrow` instance. Use health endpoint, e.g. `https://sparrow-dns.telekom.de/checks/health`. The remote `sparrow` instance needs the `healthEndpoint` enabled. -- `checks.health.healthEndpoint` (boolean): Needs to be activated when the `sparrow` should expose its own health endpoint. Mandatory if another `sparrow` instance wants perform a health check. +- `checks.health.targets` (list of strings): List of targets to send health probe. Needs to be a valid url. Can be + another `sparrow` instance. Use health endpoint, e.g. `https://sparrow-dns.telekom.de/checks/health`. The + remote `sparrow` instance needs the `healthEndpoint` enabled. +- `checks.health.healthEndpoint` (boolean): Needs to be activated when the `sparrow` should expose its own health + endpoint. Mandatory if another `sparrow` instance wants to perform a health check. Example configuration: @@ -166,21 +190,31 @@ checks: healthEndpoint: false ``` +#### Health Metrics + +- `sparrow_health_up` + - Type: Gauge + - Description: Health of targets + - Labelled with `target` + ### Check: Latency Available configuration options: - `checks` - - `latency` - - `enabled` (boolean): Currently not used. - - `interval` (integer): Interval in seconds to perform the latency check. - - `timeout` (integer): Timeout in seconds for the latency check. - - `retry` - - `count` (integer): Number of retries for the latency check. - - `delay` (integer): Delay in seconds between retries for the latency check. - - `targets` (list of strings): List of targets to send latency probe. Needs to be a valid url. Can be another `sparrow` instance. Use latency endpoint, e.g. `https://sparrow-dns.telekom.de/checks/latency`. The remote `sparrow` instance needs the `latencyEndpoint` enabled. - - `latencyEndpoint` (boolean): Needs to be activated when the `sparrow` should expose its own latency endpoint. Mandatory if another `sparrow` instance wants perform a latency check. 
-Example configuration: + - `latency` + - `enabled` (boolean): Currently not used. + - `interval` (integer): Interval in seconds to perform the latency check. + - `timeout` (integer): Timeout in seconds for the latency check. + - `retry` + - `count` (integer): Number of retries for the latency check. + - `delay` (integer): Delay in seconds between retries for the latency check. + - `targets` (list of strings): List of targets to send latency probe. Needs to be a valid url. Can be + another `sparrow` instance. Use latency endpoint, e.g. `https://sparrow-dns.telekom.de/checks/latency`. The + remote `sparrow` instance needs the `latencyEndpoint` enabled. + - `latencyEndpoint` (boolean): Needs to be activated when the `sparrow` should expose its own latency endpoint. + Mandatory if another `sparrow` instance wants to perform a latency check. + Example configuration: ```yaml checks: @@ -196,13 +230,38 @@ checks: - https://google.com/ ``` -### API +#### Latency Metrics + +- `sparrow_latency_duration_seconds` + - Type: Gauge + - Description: Latency with status information of targets + - Labelled with `target` and `status` + +- `sparrow_latency_count` + - Type: Counter + - Description: Count of latency checks done + - Labelled with `target` + +- `sparrow_latency_duration` + - Type: Histogram + - Description: Latency of targets in seconds + - Labelled with `target` + +## API + +The `sparrow` exposes an API that does provide access to the check results. Each check will register its own endpoint +at `/v1/metrics/{check-name}`. The API definition will be exposed at `/openapi` + +## Metrics -The `sparrow` exposes an API that does provide access to the check results. Each check will register its own endpoint at `/v1/metrics/{check-name}`. The API definition will be exposed at `/openapi` +The `sparrow` is providing a `/metrics` endpoint to expose application metrics. Besides metrics about runtime +information the sparrow also provides `Check` specific metrics.
See the Checks section for more information. ## Code of Conduct -This project has adopted the [Contributor Covenant](https://www.contributor-covenant.org/) in version 2.1 as our code of conduct. Please see the details in our [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). All contributors must abide by the code of conduct. +This project has adopted the [Contributor Covenant](https://www.contributor-covenant.org/) in version 2.1 as our code of +conduct. Please see the details in our [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). All contributors must abide by the code +of conduct. ## Working Language @@ -218,19 +277,24 @@ The application itself and all end-user facing content will be made available in The following channels are available for discussions, feedback, and support requests: | Type | Channel | -| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| | **Issues** | | ## How to Contribute -Contribution and feedback is encouraged and always welcome. For more information about how to contribute, the project structure, as well as additional contribution information, see our [Contribution Guidelines](./CONTRIBUTING.md). By participating in this project, you agree to abide by its [Code of Conduct](./CODE_OF_CONDUCT.md) at all times. +Contribution and feedback is encouraged and always welcome. For more information about how to contribute, the project +structure, as well as additional contribution information, see our [Contribution Guidelines](./CONTRIBUTING.md). By +participating in this project, you agree to abide by its [Code of Conduct](./CODE_OF_CONDUCT.md) at all times. ## Licensing Copyright (c) 2023 Deutsche Telekom IT GmbH. 
-Licensed under the **Apache License, Version 2.0** (the "License"); you may not use this file except in compliance with the License. +Licensed under the **Apache License, Version 2.0** (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at . -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the [LICENSE](./LICENSE) for the specific language governing permissions and limitations under the License. +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an " +AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the [LICENSE](./LICENSE) for +the specific language governing permissions and limitations under the License. diff --git a/go.mod b/go.mod index 38bc5df6..42af17d3 100644 --- a/go.mod +++ b/go.mod @@ -14,21 +14,29 @@ require ( ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/go-openapi/jsonpointer v0.20.0 // indirect github.com/go-openapi/swag v0.22.4 // indirect + github.com/golang/protobuf v1.5.3 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/invopop/yaml v0.2.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect 
github.com/perimeterx/marshmallow v1.1.5 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.17.0 + github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect + github.com/prometheus/common v0.44.0 // indirect + github.com/prometheus/procfs v0.11.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.3.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect @@ -41,5 +49,6 @@ require ( golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa // indirect golang.org/x/sys v0.14.0 // indirect golang.org/x/text v0.14.0 // indirect + google.golang.org/protobuf v1.31.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect ) diff --git a/go.sum b/go.sum index 7f66ce71..6d94bae7 100644 --- a/go.sum +++ b/go.sum @@ -38,7 +38,11 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test 
v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= @@ -100,6 +104,9 @@ github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvq github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -111,6 +118,7 @@ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= @@ -160,6 +168,8 @@ github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0V github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7 
h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g= github.com/maxatome/go-testdeep v1.12.0/go.mod h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -175,10 +185,18 @@ github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qR github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= +github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 h1:v7DLqVdK4VrYkVD5diGdl4sxJurKJEMnODWRJlxV9oM= +github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= +github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= +github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= +github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= +github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= 
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= -github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.3.0 h1:zT7VEGWC2DTflmccN/5T1etyKvxSxpHsjb9cJvm4SvQ= @@ -506,6 +524,10 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= diff --git a/pkg/checks/checks.go b/pkg/checks/checks.go index d495ab12..4853d85e 100644 --- a/pkg/checks/checks.go +++ b/pkg/checks/checks.go @@ -24,6 +24,7 @@ import ( "time" 
"github.com/getkin/kin-openapi/openapi3" + "github.com/prometheus/client_golang/prometheus" "github.com/caas-team/sparrow/pkg/api" ) @@ -56,12 +57,14 @@ type Check interface { // the check with a specific HTTP client, which can be used for network requests // during the check's execution SetClient(c *http.Client) - // Should return an openapi3.SchemaRef of the result type returned by the check + // Schema returns an openapi3.SchemaRef of the result type returned by the check Schema() (*openapi3.SchemaRef, error) - // Allows the check to register a handler on sparrows http server at runtime + // RegisterHandler allows the check to register a handler on sparrows http server at runtime RegisterHandler(ctx context.Context, router *api.RoutingTree) - // Allows the check to deregister a handler on sparrows http server at runtime + // DeregisterHandler allows the check to deregister a handler on sparrows http server at runtime DeregisterHandler(ctx context.Context, router *api.RoutingTree) + // GetMetricCollectors allows the check to provide prometheus metric collectors + GetMetricCollectors() []prometheus.Collector } type Result struct { diff --git a/pkg/checks/checks_moq.go b/pkg/checks/checks_moq.go index 69c23cb1..664c81b5 100644 --- a/pkg/checks/checks_moq.go +++ b/pkg/checks/checks_moq.go @@ -7,6 +7,7 @@ import ( "context" "github.com/caas-team/sparrow/pkg/api" "github.com/getkin/kin-openapi/openapi3" + "github.com/prometheus/client_golang/prometheus" "net/http" "sync" ) @@ -24,6 +25,9 @@ var _ Check = &CheckMock{} // DeregisterHandlerFunc: func(ctx context.Context, router *api.RoutingTree) { // panic("mock out the DeregisterHandler method") // }, +// GetMetricCollectorsFunc: func() []prometheus.Collector { +// panic("mock out the GetMetricCollectors method") +// }, // RegisterHandlerFunc: func(ctx context.Context, router *api.RoutingTree) { // panic("mock out the RegisterHandler method") // }, @@ -55,6 +59,9 @@ type CheckMock struct { // DeregisterHandlerFunc mocks
the DeregisterHandler method. DeregisterHandlerFunc func(ctx context.Context, router *api.RoutingTree) + // GetMetricCollectorsFunc mocks the GetMetricCollectors method. + GetMetricCollectorsFunc func() []prometheus.Collector + // RegisterHandlerFunc mocks the RegisterHandler method. RegisterHandlerFunc func(ctx context.Context, router *api.RoutingTree) @@ -85,6 +92,9 @@ type CheckMock struct { // Router is the router argument value. Router *api.RoutingTree } + // GetMetricCollectors holds details about calls to the GetMetricCollectors method. + GetMetricCollectors []struct { + } // RegisterHandler holds details about calls to the RegisterHandler method. RegisterHandler []struct { // Ctx is the ctx argument value. @@ -125,14 +135,15 @@ type CheckMock struct { CResult chan<- Result } } - lockDeregisterHandler sync.RWMutex - lockRegisterHandler sync.RWMutex - lockRun sync.RWMutex - lockSchema sync.RWMutex - lockSetClient sync.RWMutex - lockSetConfig sync.RWMutex - lockShutdown sync.RWMutex - lockStartup sync.RWMutex + lockDeregisterHandler sync.RWMutex + lockGetMetricCollectors sync.RWMutex + lockRegisterHandler sync.RWMutex + lockRun sync.RWMutex + lockSchema sync.RWMutex + lockSetClient sync.RWMutex + lockSetConfig sync.RWMutex + lockShutdown sync.RWMutex + lockStartup sync.RWMutex } // DeregisterHandler calls DeregisterHandlerFunc. @@ -171,6 +182,33 @@ func (mock *CheckMock) DeregisterHandlerCalls() []struct { return calls } +// GetMetricCollectors calls GetMetricCollectorsFunc. 
+func (mock *CheckMock) GetMetricCollectors() []prometheus.Collector { + if mock.GetMetricCollectorsFunc == nil { + panic("CheckMock.GetMetricCollectorsFunc: method is nil but Check.GetMetricCollectors was just called") + } + callInfo := struct { + }{} + mock.lockGetMetricCollectors.Lock() + mock.calls.GetMetricCollectors = append(mock.calls.GetMetricCollectors, callInfo) + mock.lockGetMetricCollectors.Unlock() + return mock.GetMetricCollectorsFunc() +} + +// GetMetricCollectorsCalls gets all the calls that were made to GetMetricCollectors. +// Check the length with: +// +// len(mockedCheck.GetMetricCollectorsCalls()) +func (mock *CheckMock) GetMetricCollectorsCalls() []struct { +} { + var calls []struct { + } + mock.lockGetMetricCollectors.RLock() + calls = mock.calls.GetMetricCollectors + mock.lockGetMetricCollectors.RUnlock() + return calls +} + // RegisterHandler calls RegisterHandlerFunc. func (mock *CheckMock) RegisterHandler(ctx context.Context, router *api.RoutingTree) { if mock.RegisterHandlerFunc == nil { diff --git a/pkg/checks/health.go b/pkg/checks/health.go index 8180970c..dd0fb0bb 100644 --- a/pkg/checks/health.go +++ b/pkg/checks/health.go @@ -30,14 +30,21 @@ import ( "github.com/caas-team/sparrow/pkg/api" "github.com/getkin/kin-openapi/openapi3" "github.com/mitchellh/mapstructure" + "github.com/prometheus/client_golang/prometheus" ) +var stateMapping = map[int]string{ + 0: "unhealthy", + 1: "healthy", +} + // Health is a check that measures the availability of an endpoint type Health struct { - route string - config HealthConfig - c chan<- Result - done chan bool + route string + config HealthConfig + c chan<- Result + done chan bool + metrics healthMetrics } // HealthConfig contains the health check config @@ -52,6 +59,11 @@ type healthData struct { Targets []Target `json:"targets"` } +// Defined metric collectors of health check +type healthMetrics struct { + health *prometheus.GaugeVec +} + type Target struct { Target string `json:"target"` 
Status string `json:"status"` @@ -60,10 +72,11 @@ type Target struct { // NewHealthCheck creates a new HealthCheck func NewHealthCheck() Check { return &Health{ - route: "health", - config: HealthConfig{}, - c: nil, - done: make(chan bool, 1), + route: "health", + config: HealthConfig{}, + metrics: newHealthMetrics(), + c: nil, + done: make(chan bool, 1), } } @@ -149,10 +162,33 @@ func (h *Health) DeregisterHandler(_ context.Context, router *api.RoutingTree) { router.Remove(http.MethodGet, h.route) } +// NewHealthMetrics initializes metric collectors of the health check +func newHealthMetrics() healthMetrics { + return healthMetrics{ + health: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "sparrow_health_up", + Help: "Health of targets", + }, + []string{ + "target", + }, + ), + } +} + +// GetMetricCollectors returns all metric collectors of check +func (h *Health) GetMetricCollectors() []prometheus.Collector { + return []prometheus.Collector{ + h.metrics.health, + } +} + // check performs a health check using a retry function // to get the health status for all targets func (h *Health) check(ctx context.Context) healthData { - log := logger.FromContext(ctx) + log := logger.FromContext(ctx).WithGroup("check") + log.Debug("Checking health") if len(h.config.Targets) == 0 { log.Debug("No targets defined") return healthData{} @@ -163,8 +199,8 @@ func (h *Health) check(ctx context.Context) healthData { var wg sync.WaitGroup var mu sync.Mutex - for _, target := range h.config.Targets { - target := target + for _, t := range h.config.Targets { + target := t wg.Add(1) l := log.With("target", target) @@ -177,28 +213,28 @@ func (h *Health) check(ctx context.Context) healthData { go func() { defer wg.Done() + state := 1 - targetData := Target{ - Target: target, - Status: "healthy", - } - - l.Debug("Starting retry routine to get health of target") + l.Debug("Starting retry routine to get health status") if err := getHealthRetry(ctx); err != nil { - targetData.Status = 
"unhealthy" + state = 0 } - l.Debug("Successfully got health status of target", "status", targetData.Status) + l.Debug("Successfully got health status of target", "status", stateMapping[state]) mu.Lock() - hd.Targets = append(hd.Targets, targetData) - mu.Unlock() + defer mu.Unlock() + hd.Targets = append(hd.Targets, Target{ + Target: target, + Status: stateMapping[state], + }) + h.metrics.health.WithLabelValues(target).Set(float64(state)) }() } log.Debug("Waiting for all routines to finish") wg.Wait() - log.Info("Successfully got health status from all targets") + log.Debug("Successfully got health status from all targets") return hd } diff --git a/pkg/checks/health_test.go b/pkg/checks/health_test.go index 84a15a2e..a1dff995 100644 --- a/pkg/checks/health_test.go +++ b/pkg/checks/health_test.go @@ -80,7 +80,9 @@ func TestHealth_SetConfig(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - h := &Health{} + h := &Health{ + metrics: newHealthMetrics(), + } if err := h.SetConfig(context.Background(), tt.inputConfig); (err != nil) != tt.wantErr { t.Errorf("Health.SetConfig() error = %v, wantErr %v", err, tt.wantErr) @@ -262,6 +264,7 @@ func TestHealth_Check(t *testing.T) { config: HealthConfig{ Targets: tt.targets, }, + metrics: newHealthMetrics(), } got := h.check(tt.ctx) assert.Equal(t, len(got.Targets), len(tt.want.Targets), "Amount of targets is not equal") diff --git a/pkg/checks/latency.go b/pkg/checks/latency.go index 524bb354..77280015 100644 --- a/pkg/checks/latency.go +++ b/pkg/checks/latency.go @@ -1,3 +1,21 @@ +// sparrow +// (C) 2023, Deutsche Telekom IT GmbH +// +// Deutsche Telekom IT GmbH and all other contributors / +// copyright owners license this file to you under the Apache +// License, Version 2.0 (the "License"); you may not use this +// file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package checks import ( @@ -5,11 +23,13 @@ import ( "fmt" "io" "net/http" + "strconv" "sync" "time" "github.com/getkin/kin-openapi/openapi3" "github.com/mitchellh/mapstructure" + "github.com/prometheus/client_golang/prometheus" "github.com/caas-team/sparrow/internal/helper" "github.com/caas-team/sparrow/internal/logger" @@ -20,20 +40,22 @@ var _ Check = (*Latency)(nil) func NewLatencyCheck() Check { return &Latency{ - mu: sync.Mutex{}, - cfg: LatencyConfig{}, - c: nil, - done: make(chan bool, 1), - client: &http.Client{}, + mu: sync.Mutex{}, + cfg: LatencyConfig{}, + c: nil, + done: make(chan bool, 1), + client: &http.Client{}, + metrics: newLatencyMetrics(), } } type Latency struct { - cfg LatencyConfig - mu sync.Mutex - c chan<- Result - done chan bool - client *http.Client + cfg LatencyConfig + mu sync.Mutex + c chan<- Result + done chan bool + client *http.Client + metrics latencyMetrics } type LatencyConfig struct { @@ -46,7 +68,14 @@ type LatencyConfig struct { type LatencyResult struct { Code int `json:"code"` Error *string `json:"error"` - Total int64 `json:"total"` + Total float64 `json:"total"` +} + +// Defined metric collectors of latency check +type latencyMetrics struct { + latencyDuration *prometheus.GaugeVec + latencyCount *prometheus.CounterVec + latencyHistogram *prometheus.HistogramVec } func (l *Latency) Run(ctx context.Context) error { @@ -129,6 +158,49 @@ func (l *Latency) Handler(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) } +// newLatencyMetrics initializes metric collectors of the latency
check +func newLatencyMetrics() latencyMetrics { + return latencyMetrics{ + latencyDuration: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "sparrow_latency_duration_seconds", + Help: "Latency with status information of targets", + }, + []string{ + "target", + "status", + }, + ), + latencyCount: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "sparrow_latency_count", + Help: "Count of latency checks done", + }, + []string{ + "target", + }, + ), + latencyHistogram: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "sparrow_latency_duration", + Help: "Latency of targets in seconds", + }, + []string{ + "target", + }, + ), + } +} + +// GetMetricCollectors returns all metric collectors of check +func (h *Latency) GetMetricCollectors() []prometheus.Collector { + return []prometheus.Collector{ + h.metrics.latencyDuration, + h.metrics.latencyCount, + h.metrics.latencyHistogram, + } +} + func (l *Latency) check(ctx context.Context) map[string]LatencyResult { log := logger.FromContext(ctx).WithGroup("check") log.Debug("Checking latency") @@ -136,6 +208,7 @@ func (l *Latency) check(ctx context.Context) map[string]LatencyResult { log.Debug("No targets defined") return map[string]LatencyResult{} } + log.Debug("Getting latency status for each target in separate routine", "amount", len(l.cfg.Targets)) var mu sync.Mutex var wg sync.WaitGroup @@ -144,29 +217,41 @@ func (l *Latency) check(ctx context.Context) map[string]LatencyResult { l.mu.Lock() l.client.Timeout = l.cfg.Timeout * time.Second l.mu.Unlock() - for _, tar := range l.cfg.Targets { - target := tar + for _, t := range l.cfg.Targets { + target := t wg.Add(1) - go func(target string) { + lo := log.With("target", target) + + getLatencyRetry := helper.Retry(func(ctx context.Context) error { + res := getLatency(ctx, l.client, target) + mu.Lock() + defer mu.Unlock() + results[target] = res + + return nil + }, l.cfg.Retry) + + go func() { defer wg.Done() - lo := log.With("target", target) - 
lo.Debug("Starting retry routine to get latency", "target", target) - - err := helper.Retry(func(ctx context.Context) error { - lo.Debug("Getting latency", "timing out in", l.client.Timeout.String()) - res := getLatency(ctx, l.client, target) - mu.Lock() - defer mu.Unlock() - results[target] = res - return nil - }, l.cfg.Retry)(ctx) - if err != nil { + + lo.Debug("Starting retry routine to get latency status") + if err := getLatencyRetry(ctx); err != nil { lo.Error("Error while checking latency", "error", err) } - }(target) + + lo.Debug("Successfully got latency status of target") + mu.Lock() + defer mu.Unlock() + l.metrics.latencyDuration.WithLabelValues(target, strconv.Itoa(results[target].Code)).Set(results[target].Total) + l.metrics.latencyHistogram.WithLabelValues(target).Observe(results[target].Total) + l.metrics.latencyCount.WithLabelValues(target).Inc() + }() } + + log.Debug("Waiting for all routines to finish") wg.Wait() + log.Debug("Successfully got latency status from all targets") return results } @@ -191,14 +276,13 @@ func getLatency(ctx context.Context, client *http.Client, url string) LatencyRes res.Error = &errval return res } + end := time.Now() res.Code = resp.StatusCode defer func(Body io.ReadCloser) { _ = Body.Close() }(resp.Body) - end := time.Now() - - res.Total = end.Sub(start).Milliseconds() + res.Total = end.Sub(start).Seconds() return res } diff --git a/pkg/checks/latency_test.go b/pkg/checks/latency_test.go index 9a9eeda3..54ddab5c 100644 --- a/pkg/checks/latency_test.go +++ b/pkg/checks/latency_test.go @@ -1,3 +1,21 @@ +// sparrow +// (C) 2023, Deutsche Telekom IT GmbH +// +// Deutsche Telekom IT GmbH and all other contributors / +// copyright owners license this file to you under the Apache +// License, Version 2.0 (the "License"); you may not use this +// file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package checks import ( @@ -24,9 +42,9 @@ func stringPointer(s string) *string { return &s } -func TestLatency_Run(t *testing.T) { +func TestLatency_Run(t *testing.T) { //nolint:gocyclo httpmock.Activate() - t.Cleanup(httpmock.DeactivateAndReset) + defer httpmock.DeactivateAndReset() tests := []struct { name string @@ -116,11 +134,11 @@ func TestLatency_Run(t *testing.T) { t.Fatalf("Latency.Startup() error = %v", err) } - c.SetClient(&http.Client{Transport: httpmock.DefaultTransport}) - err = c.SetConfig(tt.ctx, LatencyConfig{ - Targets: tt.targets, - Interval: time.Second * 120, - Timeout: time.Second * 5, + c.SetClient(&http.Client{}) + err = c.SetConfig(tt.ctx, map[string]any{ + "targets": tt.targets, + "interval": 1, + "timeout": 5, }) if err != nil { t.Fatalf("Latency.SetConfig() error = %v", err) @@ -144,9 +162,31 @@ func TestLatency_Run(t *testing.T) { result := <-results assert.IsType(t, tt.want.Data, result.Data) - if !reflect.DeepEqual(result.Data, tt.want.Data) { - t.Errorf("Latency.Run() = %v, want %v", result.Data, tt.want.Data) + + got := result.Data.(map[string]LatencyResult) + expected := result.Data.(map[string]LatencyResult) + if len(got) != len(expected) { + t.Errorf("Length of Latency.Run() result set (%v) does not match length of expected result set (%v)", len(got), len(expected)) + } + + for key, resultObj := range got { + if expected[key].Code != resultObj.Code { + t.Errorf("Result Code of %q = %v, want %v", key, resultObj.Code, expected[key].Code) + } + if expected[key].Error != resultObj.Error { + t.Errorf("Result 
Error of %q = %v, want %v", key, resultObj.Error, expected[key].Error) + } + if key != timeoutURL { + if resultObj.Total <= 0 || resultObj.Total >= 1 { + t.Errorf("Result Total time of %q = %v, want in between 0 and 1", key, resultObj.Total) + } + } else { + if resultObj.Total != 0 { + t.Errorf("Result Total time of %q = %v, want %v since an timeout occurred", key, resultObj.Total, 0) + } + } } + if result.Err != tt.want.Err { t.Errorf("Latency.Run() = %v, want %v", result.Err, tt.want.Err) } @@ -251,8 +291,9 @@ func TestLatency_check(t *testing.T) { } l := &Latency{ - cfg: LatencyConfig{Targets: tt.targets, Interval: time.Second * 120, Timeout: time.Second * 1}, - client: &http.Client{Transport: httpmock.DefaultTransport}, + cfg: LatencyConfig{Targets: tt.targets, Interval: time.Second * 120, Timeout: time.Second * 1}, + client: &http.Client{Transport: httpmock.DefaultTransport}, + metrics: newLatencyMetrics(), } got := l.check(tt.ctx) diff --git a/pkg/sparrow/api.go b/pkg/sparrow/api.go index 77ebe99d..d3554264 100644 --- a/pkg/sparrow/api.go +++ b/pkg/sparrow/api.go @@ -29,6 +29,7 @@ import ( "github.com/caas-team/sparrow/internal/logger" "github.com/getkin/kin-openapi/openapi3" "github.com/go-chi/chi/v5" + "github.com/prometheus/client_golang/prometheus/promhttp" "gopkg.in/yaml.v3" ) @@ -58,6 +59,12 @@ func (s *Sparrow) register(ctx context.Context) { // Handles internal api // handlers are (de)registered by the checks themselves s.router.HandleFunc("/checks/*", s.handleChecks) + + s.router.Handle("/metrics", + promhttp.HandlerFor( + s.metrics.GetRegistry(), + promhttp.HandlerOpts{Registry: s.metrics.GetRegistry()}, + )) } // Serves the data api. 
diff --git a/pkg/sparrow/api_test.go b/pkg/sparrow/api_test.go index d5198a50..fead34c9 100644 --- a/pkg/sparrow/api_test.go +++ b/pkg/sparrow/api_test.go @@ -42,12 +42,13 @@ import ( func TestSparrow_register(t *testing.T) { r := chi.NewRouter() s := Sparrow{ - router: r, + router: r, + metrics: NewMetrics(), } s.register(context.Background()) - expectedRoutes := []string{"/openapi.yaml", "/v1/metrics/{checkName}", "/checks/*"} + expectedRoutes := []string{"/openapi.yaml", "/v1/metrics/{checkName}", "/checks/*", "/metrics"} routes := r.Routes() for _, route := range expectedRoutes { found := 0 @@ -67,8 +68,9 @@ func TestSparrow_register(t *testing.T) { func TestSparrow_api_shutdownWhenContextCanceled(t *testing.T) { s := Sparrow{ - cfg: &config.Config{Api: config.ApiConfig{ListeningAddress: ":8080"}}, - router: chi.NewRouter(), + cfg: &config.Config{Api: config.ApiConfig{ListeningAddress: ":8080"}}, + router: chi.NewRouter(), + metrics: NewMetrics(), } ctx, cancel := context.WithCancel(context.Background()) cancel() diff --git a/pkg/sparrow/metrics.go b/pkg/sparrow/metrics.go new file mode 100644 index 00000000..5241c7fc --- /dev/null +++ b/pkg/sparrow/metrics.go @@ -0,0 +1,52 @@ +// sparrow +// (C) 2023, Deutsche Telekom IT GmbH +// +// Deutsche Telekom IT GmbH and all other contributors / +// copyright owners license this file to you under the Apache +// License, Version 2.0 (the "License"); you may not use this +// file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package sparrow + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) + +//go:generate moq -out metrics_moq.go . Metrics +type Metrics interface { + // GetRegistry returns the prometheus registry instance + // containing the registered prometheus collectors + GetRegistry() *prometheus.Registry +} + +type PrometheusMetrics struct { + registry *prometheus.Registry +} + +// NewMetrics initializes the metrics and returns the PrometheusMetrics +func NewMetrics() Metrics { + registry := prometheus.NewRegistry() + + registry.MustRegister( + collectors.NewGoCollector(), + collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), + ) + + return &PrometheusMetrics{registry: registry} +} + +// GetRegistry returns the registry to register prometheus metrics +func (m *PrometheusMetrics) GetRegistry() *prometheus.Registry { + return m.registry +} diff --git a/pkg/sparrow/metrics_moq.go b/pkg/sparrow/metrics_moq.go new file mode 100644 index 00000000..19cc3e8b --- /dev/null +++ b/pkg/sparrow/metrics_moq.go @@ -0,0 +1,68 @@ +// Code generated by moq; DO NOT EDIT. +// github.com/matryer/moq + +package sparrow + +import ( + "github.com/prometheus/client_golang/prometheus" + "sync" +) + +// Ensure, that MetricsMock does implement Metrics. +// If this is not the case, regenerate this file with moq. +var _ Metrics = &MetricsMock{} + +// MetricsMock is a mock implementation of Metrics. +// +// func TestSomethingThatUsesMetrics(t *testing.T) { +// +// // make and configure a mocked Metrics +// mockedMetrics := &MetricsMock{ +// GetRegistryFunc: func() *prometheus.Registry { +// panic("mock out the GetRegistry method") +// }, +// } +// +// // use mockedMetrics in code that requires Metrics +// // and then make assertions. +// +// } +type MetricsMock struct { + // GetRegistryFunc mocks the GetRegistry method. + GetRegistryFunc func() *prometheus.Registry + + // calls tracks calls to the methods.
+ calls struct { + // GetRegistry holds details about calls to the GetRegistry method. + GetRegistry []struct { + } + } + lockGetRegistry sync.RWMutex +} + +// GetRegistry calls GetRegistryFunc. +func (mock *MetricsMock) GetRegistry() *prometheus.Registry { + if mock.GetRegistryFunc == nil { + panic("MetricsMock.GetRegistryFunc: method is nil but Metrics.GetRegistry was just called") + } + callInfo := struct { + }{} + mock.lockGetRegistry.Lock() + mock.calls.GetRegistry = append(mock.calls.GetRegistry, callInfo) + mock.lockGetRegistry.Unlock() + return mock.GetRegistryFunc() +} + +// GetRegistryCalls gets all the calls that were made to GetRegistry. +// Check the length with: +// +// len(mockedMetrics.GetRegistryCalls()) +func (mock *MetricsMock) GetRegistryCalls() []struct { +} { + var calls []struct { + } + mock.lockGetRegistry.RLock() + calls = mock.calls.GetRegistry + mock.lockGetRegistry.RUnlock() + return calls +} diff --git a/pkg/sparrow/metrics_test.go b/pkg/sparrow/metrics_test.go new file mode 100644 index 00000000..1e4878cd --- /dev/null +++ b/pkg/sparrow/metrics_test.go @@ -0,0 +1,70 @@ +// sparrow +// (C) 2023, Deutsche Telekom IT GmbH +// +// Deutsche Telekom IT GmbH and all other contributors / +// copyright owners license this file to you under the Apache +// License, Version 2.0 (the "License"); you may not use this +// file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package sparrow + +import ( + "reflect" + "testing" + + "github.com/prometheus/client_golang/prometheus" +) + +func TestPrometheusMetrics_GetRegistry(t *testing.T) { + type fields struct { + registry *prometheus.Registry + } + tests := []struct { + name string + fields fields + want *prometheus.Registry + }{ + { + name: "simple registry", + fields: fields{ + registry: prometheus.NewRegistry(), + }, + want: prometheus.NewRegistry(), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := &PrometheusMetrics{ + registry: tt.fields.registry, + } + if got := m.GetRegistry(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("PrometheusMetrics.GetRegistry() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestNewMetrics(t *testing.T) { + testMetrics := NewMetrics() + testGauge := prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "TEST_GAUGE", + }, + ) + + t.Run("Register a collector", func(t *testing.T) { + testMetrics.(*PrometheusMetrics).registry.MustRegister( + testGauge, + ) + }) +} diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index 0e43f57d..f9fc31ba 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -39,6 +39,8 @@ type Sparrow struct { checks map[string]checks.Check client *http.Client + metrics Metrics + resultFanIn map[string]chan checks.Result cResult chan checks.ResultDTO @@ -56,6 +58,7 @@ func New(cfg *config.Config) *Sparrow { db: db.NewInMemory(), checks: make(map[string]checks.Check), client: &http.Client{}, + metrics: NewMetrics(), resultFanIn: make(map[string]chan checks.Result), cResult: make(chan checks.ResultDTO, 1), cfg: cfg, @@ -107,6 +110,7 @@ func (s *Sparrow) ReconcileChecks(ctx context.Context) { for name, checkCfg := range s.cfg.Checks { name := name log := logger.FromContext(ctx).With("name", name) + if existingCheck, ok := s.checks[name]; ok { // Check already registered, reset config err := existingCheck.SetConfig(ctx, checkCfg) @@ -115,61 +119,91 @@ func (s *Sparrow) ReconcileChecks(ctx 
context.Context) { } continue } + // Check is a new Check and needs to be registered - getRegisteredCheck := checks.RegisteredChecks[name] - if getRegisteredCheck == nil { - log.WarnContext(ctx, "Check is not registered") + s.registerCheck(ctx, name, checkCfg) + } + + for existingCheckName, existingCheck := range s.checks { + if _, ok := s.cfg.Checks[existingCheckName]; ok { + // Check is known check continue } - check := getRegisteredCheck() - s.checks[name] = check - // Create a fan in a channel for the check - checkChan := make(chan checks.Result, 1) - s.resultFanIn[name] = checkChan + // Check has been removed from config + s.unregisterCheck(ctx, existingCheckName, existingCheck) + } +} - check.SetClient(s.client) - err := check.SetConfig(ctx, checkCfg) - if err != nil { - log.ErrorContext(ctx, "Failed to set config for check", "name", name, "error", err) - } - go fanInResults(checkChan, s.cResult, name) - err = check.Startup(ctx, checkChan) - if err != nil { - log.ErrorContext(ctx, "Failed to startup check", "name", name, "error", err) - close(checkChan) - continue - } - check.RegisterHandler(ctx, s.routingTree) +// registerCheck registers and executes a new check +func (s *Sparrow) registerCheck(ctx context.Context, name string, checkCfg any) { + log := logger.FromContext(ctx).With("name", name) - go func() { - err := check.Run(ctx) - if err != nil { - log.ErrorContext(ctx, "Failed to run check", "name", name, "error", err) - } - }() + getRegisteredCheck := checks.RegisteredChecks[name] + if getRegisteredCheck == nil { + log.WarnContext(ctx, "Check is not registered") + return } + check := getRegisteredCheck() + s.checks[name] = check - for existingCheckName, existingCheck := range s.checks { - log := logger.FromContext(ctx).With("checkName", existingCheckName) - if _, ok := s.cfg.Checks[existingCheckName]; ok { - continue + // Create a fan in a channel for the check + checkChan := make(chan checks.Result, 1) + s.resultFanIn[name] = checkChan + + 
check.SetClient(s.client) + err := check.SetConfig(ctx, checkCfg) + if err != nil { + log.ErrorContext(ctx, "Failed to set config for check", "error", err) + } + go fanInResults(checkChan, s.cResult, name) + err = check.Startup(ctx, checkChan) + if err != nil { + log.ErrorContext(ctx, "Failed to startup check", "error", err) + close(checkChan) + return + } + check.RegisterHandler(ctx, s.routingTree) + + // Add prometheus collectors of check to registry + for _, collector := range check.GetMetricCollectors() { + if err := s.metrics.GetRegistry().Register(collector); err != nil { + log.ErrorContext(ctx, "Could not add metrics collector to registry") } + } - // Check has been removed from config; shutdown and remove - existingCheck.DeregisterHandler(ctx, s.routingTree) - err := existingCheck.Shutdown(ctx) + go func() { + err := check.Run(ctx) if err != nil { - log.ErrorContext(ctx, "Failed to shutdown check", "error", err) + log.ErrorContext(ctx, "Failed to run check", "error", err) } - if c, ok := s.resultFanIn[existingCheckName]; ok { - // close fan in the channel if it exists - close(c) - delete(s.resultFanIn, existingCheckName) + }() +} + +// unregisterCheck removes the check from sparrow and performs a soft shutdown for the check +func (s *Sparrow) unregisterCheck(ctx context.Context, name string, check checks.Check) { + log := logger.FromContext(ctx).With("name", name) + // Check has been removed from config; shutdown and remove + check.DeregisterHandler(ctx, s.routingTree) + + // Remove prometheus collectors of check from registry + for _, metricsCollector := range check.GetMetricCollectors() { + if !s.metrics.GetRegistry().Unregister(metricsCollector) { + log.ErrorContext(ctx, "Could not remove metrics collector from registry") } + } - delete(s.checks, existingCheckName) + err := check.Shutdown(ctx) + if err != nil { + log.ErrorContext(ctx, "Failed to shutdown check", "error", err) } + if c, ok := s.resultFanIn[name]; ok { + // close fan in the channel if it
exists + close(c) + delete(s.resultFanIn, name) + } + + delete(s.checks, name) } // This is a fan in for the checks. diff --git a/pkg/sparrow/run_test.go b/pkg/sparrow/run_test.go index 053737c1..87081027 100644 --- a/pkg/sparrow/run_test.go +++ b/pkg/sparrow/run_test.go @@ -26,6 +26,7 @@ import ( "time" "github.com/getkin/kin-openapi/openapi3" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/caas-team/sparrow/internal/logger" @@ -58,6 +59,9 @@ func TestSparrow_ReconcileChecks(t *testing.T) { RegisterHandlerFunc: func(ctx context.Context, router *api.RoutingTree) {}, DeregisterHandlerFunc: func(ctx context.Context, router *api.RoutingTree) {}, SetClientFunc: func(c *http.Client) {}, + GetMetricCollectorsFunc: func() []prometheus.Collector { + return []prometheus.Collector{} + }, } checks.RegisteredChecks = map[string]func() checks.Check{ @@ -148,6 +152,7 @@ func TestSparrow_ReconcileChecks(t *testing.T) { cfg: tt.fields.cfg, cCfgChecks: tt.fields.cCfgChecks, db: tt.fields.db, + metrics: NewMetrics(), } // Send new config to channel