Merge pull request 'feat: alert manager set-up for all services' (#129) from feat-alert-manager into master
Reviewed-on: https://gitea.freeleaps.mathmast.com/freeleaps/freeleaps-ops/pulls/129
This commit is contained in:
commit
7938fab3b0
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the central-storage service.
Rendered only when .Values.centralStorage.prometheusRule.enabled is truthy.
The values key is `centralStorage` (camelCase): a hyphenated key such as
`central-storage` cannot be used in a Go-template field chain and makes the
whole chart fail to parse.
*/}}
{{- if .Values.centralStorage.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.centralStorage.prometheusRule.name }}
  namespace: {{ .Values.centralStorage.prometheusRule.namespace | quote }}
  {{- with .Values.centralStorage.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- /* A single rule group, named after the resource, is emitted only when rules are configured. */}}
    {{- with .Values.centralStorage.prometheusRule.rules }}
    - name: {{ $.Values.centralStorage.prometheusRule.name }}
      rules:
        {{- range . }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for }}
          {{- end }}
          {{- if .labels }}
          labels:
            {{- toYaml .labels | nindent 12 }}
          {{- end }}
          {{- if .annotations }}
          annotations:
            {{- toYaml .annotations | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -119,3 +119,30 @@ centralStorage:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the central-storage service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-central-storage
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsCentralStorageServiceDown
      expr: 'up{job="central-storage-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: central-storage-service
      annotations:
        summary: 'Freeleaps Central Storage service is down (instance {{ $labels.instance }})'
        description: Freeleaps Central Storage service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsCentralStorageServiceHighErrorRate
      expr: 'rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: central-storage-service
      annotations:
        summary: 'High error rate in freeleaps central storage service (instance {{ $labels.instance }})'
        description: 'Freeleaps Central Storage service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -109,3 +109,30 @@ centralStorage:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the central-storage service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-central-storage
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsCentralStorageServiceDown
      expr: 'up{job="central-storage-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: central-storage-service
      annotations:
        summary: 'Freeleaps Central Storage service is down (instance {{ $labels.instance }})'
        description: Freeleaps Central Storage service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsCentralStorageServiceHighErrorRate
      expr: 'rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: central-storage-service
      annotations:
        summary: 'High error rate in freeleaps central storage service (instance {{ $labels.instance }})'
        description: 'Freeleaps Central Storage service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
37
freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml
Normal file
37
freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml
Normal file
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the chat service.
Nothing is rendered unless .Values.chat.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.chat.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -154,3 +154,30 @@ chat:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the chat service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-chat
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -145,3 +145,31 @@ chat:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the chat service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-chat
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the content service.
Nothing is rendered unless .Values.content.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.content.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -115,3 +115,30 @@ content:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the content service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-content
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -106,3 +106,30 @@ content:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the content service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-content
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the devops service.
Nothing is rendered unless .Values.devops.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.devops.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -120,3 +120,30 @@ devops:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the devops service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-devops
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -97,3 +97,30 @@ devops:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the devops service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-devops
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the devsvc service.
Nothing is rendered unless .Values.devsvc.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.devsvc.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -147,3 +147,30 @@ devsvc:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the devsvc service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-devsvc
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -138,3 +138,30 @@ devsvc:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the devsvc service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-devsvc
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the freeleaps service.
Nothing is rendered unless .Values.freeleaps.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.freeleaps.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -141,3 +141,30 @@ freeleaps:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the freeleaps service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-freeleaps
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    # The job selector must be freeleaps-service; it previously queried
    # devops-service, so this alert tracked the wrong service.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -132,3 +132,30 @@ freeleaps:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the freeleaps service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-freeleaps
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the notification service.
Nothing is rendered unless .Values.notification.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.notification.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -143,3 +143,30 @@ notification:
|
||||
remoteRef:
|
||||
key: freeleaps-alpha-twilio-auth-token
|
||||
type: Secret
|
||||
# Alerting rules for the notification service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-notification
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -129,3 +129,30 @@ notification:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the notification service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-notification
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- /*
PrometheusRule for the payment service.
Nothing is rendered unless .Values.payment.prometheusRule.enabled is truthy.
All configured rules go into one group named after the resource.
*/}}
{{- $rule := .Values.payment.prometheusRule }}
{{- if $rule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $rule.name }}
  namespace: {{ $rule.namespace | quote }}
  {{- with $rule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $rule.rules }}
    - name: {{ $rule.name }}
      rules:
        {{- range $rule.rules }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- with .for }}
          for: {{ . }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||
@ -115,3 +115,30 @@ payment:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the payment service (alpha values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-alpha-payment
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
@ -106,3 +106,30 @@ payment:
|
||||
controlledResources:
|
||||
- cpu
|
||||
- memory
|
||||
# Alerting rules for the payment service (prod values).
# The chart's PrometheusRule template renders this block only when `enabled` is true.
prometheusRule:
  # Used as both the PrometheusRule resource name and the rule-group name.
  name: freeleaps-prod-payment
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # presumably matched by the Prometheus operator's rule selector — confirm
    release: kube-prometheus-stack
  rules:
    # Scrape target for the service has reported down for 1 minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Per-series 5xx request rate over a 5m window stays above 0.1/s for 5 minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
|
||||
Loading…
Reference in New Issue
Block a user