Merge pull request 'feat: alert manager set-up for all services' (#129) from feat-alert-manager into master

Reviewed-on: https://gitea.freeleaps.mathmast.com/freeleaps/freeleaps-ops/pulls/129
This commit is contained in:
Easonzyc 2025-10-21 01:42:59 +00:00
commit 7938fab3b0
24 changed files with 729 additions and 0 deletions

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.centralStorage.prometheusRule. Rendered only
when `enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource.
The values key is `centralStorage` (camelCase): a hyphenated selector such as
`.Values.central-storage` is not a valid Go template field chain and fails to
parse. Templated scalars are piped through `quote` so the YAML parser cannot
re-type them.
*/}}
{{- if .Values.centralStorage.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.centralStorage.prometheusRule.name | quote }}
  namespace: {{ .Values.centralStorage.prometheusRule.namespace | quote }}
  {{- with .Values.centralStorage.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.centralStorage.prometheusRule.rules }}
    - name: {{ $.Values.centralStorage.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -119,3 +119,30 @@ centralStorage:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-central-storage
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsCentralStorageServiceDown
      expr: 'up{job="central-storage-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: central-storage-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Central Storage service is down (instance {{ $labels.instance }})'
        description: Freeleaps Central Storage service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsCentralStorageServiceHighErrorRate
      expr: 'rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: central-storage-service
      annotations:
        summary: 'High error rate in freeleaps central storage service (instance {{ $labels.instance }})'
        description: 'Freeleaps Central Storage service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -109,3 +109,30 @@ centralStorage:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-central-storage
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsCentralStorageServiceDown
      expr: 'up{job="central-storage-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: central-storage-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Central Storage service is down (instance {{ $labels.instance }})'
        description: Freeleaps Central Storage service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsCentralStorageServiceHighErrorRate
      expr: 'rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: central-storage-service
      annotations:
        summary: 'High error rate in freeleaps central storage service (instance {{ $labels.instance }})'
        description: 'Freeleaps Central Storage service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.chat.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.chat.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.chat.prometheusRule.name | quote }}
  namespace: {{ .Values.chat.prometheusRule.namespace | quote }}
  {{- with .Values.chat.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.chat.prometheusRule.rules }}
    - name: {{ $.Values.chat.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -154,3 +154,30 @@ chat:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-chat
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -145,3 +145,31 @@ chat:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-chat
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.content.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.content.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.content.prometheusRule.name | quote }}
  namespace: {{ .Values.content.prometheusRule.namespace | quote }}
  {{- with .Values.content.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.content.prometheusRule.rules }}
    - name: {{ $.Values.content.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -115,3 +115,30 @@ content:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-content
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -106,3 +106,30 @@ content:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-content
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.devops.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.devops.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.devops.prometheusRule.name | quote }}
  namespace: {{ .Values.devops.prometheusRule.namespace | quote }}
  {{- with .Values.devops.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.devops.prometheusRule.rules }}
    - name: {{ $.Values.devops.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -120,3 +120,30 @@ devops:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-devops
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -97,3 +97,30 @@ devops:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-devops
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.devsvc.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.devsvc.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.devsvc.prometheusRule.name | quote }}
  namespace: {{ .Values.devsvc.prometheusRule.namespace | quote }}
  {{- with .Values.devsvc.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.devsvc.prometheusRule.rules }}
    - name: {{ $.Values.devsvc.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -147,3 +147,30 @@ devsvc:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-devsvc
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -138,3 +138,30 @@ devsvc:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-devsvc
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.freeleaps.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.freeleaps.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.freeleaps.prometheusRule.name | quote }}
  namespace: {{ .Values.freeleaps.prometheusRule.namespace | quote }}
  {{- with .Values.freeleaps.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.freeleaps.prometheusRule.rules }}
    - name: {{ $.Values.freeleaps.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -141,3 +141,30 @@ freeleaps:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-freeleaps
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    # The selector must target this service's own job; it previously pointed
    # at "devops-service", so the alert tracked the wrong service.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -132,3 +132,30 @@ freeleaps:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-freeleaps
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.notification.prometheusRule. Rendered only
when `enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.notification.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.notification.prometheusRule.name | quote }}
  namespace: {{ .Values.notification.prometheusRule.namespace | quote }}
  {{- with .Values.notification.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.notification.prometheusRule.rules }}
    - name: {{ $.Values.notification.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -143,3 +143,30 @@ notification:
remoteRef:
key: freeleaps-alpha-twilio-auth-token
type: Secret
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-notification
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -129,3 +129,30 @@ notification:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-notification
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule driven by .Values.payment.prometheusRule. Rendered only when
`enabled` is truthy; each entry of `rules` becomes one alerting rule in a
single group named after the resource. Templated scalars are piped through
`quote` so the YAML parser cannot re-type them.
*/}}
{{- if .Values.payment.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.payment.prometheusRule.name | quote }}
  namespace: {{ .Values.payment.prometheusRule.namespace | quote }}
  {{- with .Values.payment.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.payment.prometheusRule.rules }}
    - name: {{ $.Values.payment.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- with .labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -115,3 +115,30 @@ payment:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-alpha-payment
  # The template renders the PrometheusRule only when this is true.
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -106,3 +106,30 @@ payment:
controlledResources:
- cpu
- memory
prometheusRule:
  # Name of the PrometheusRule object; the template in this change also uses
  # it as the rule-group name. (Typo "freepeals" corrected to "freeleaps".)
  name: freeleaps-prod-payment
  # The template renders the PrometheusRule only when this is true.
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # Presumably matched by the Prometheus Operator rule selector - verify
    # against the kube-prometheus-stack installation.
    release: kube-prometheus-stack
  rules:
    # Fires when the target has reported up == 0 for one minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        # "{{ ... }}" is Prometheus alert templating, kept as a literal string.
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
    # Fires when a series' 5xx request rate stays above 0.1/s for five minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7