首次提交:初始化项目
This commit is contained in:
34
009-基础设施/006-monitoring-grafana/deploy.sh
Normal file
34
009-基础设施/006-monitoring-grafana/deploy.sh
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash
#
# Deploys the kube-prometheus-stack Helm chart (Prometheus, Grafana,
# Alertmanager) into the "monitoring" namespace, using the values.yaml that
# lives next to this script.
#
# Requirements: helm and kubectl on PATH, with the current kube context
#               pointing at the target cluster.
# Usage:        bash deploy.sh        (safe to run repeatedly)

# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a broken step can never be followed by the
# "deployment finished" banner at the end.
set -euo pipefail

# Fail early with a clear message when a required CLI is missing.
for required_tool in helm kubectl; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "error: required command not found: ${required_tool}" >&2
    exit 1
  fi
done

# Resolve values.yaml relative to this script instead of the caller's
# working directory, so the script can be started from anywhere.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
values_file="${script_dir}/values.yaml"
if [[ ! -f "$values_file" ]]; then
  echo "error: values file not found: ${values_file}" >&2
  exit 1
fi

# Add the Prometheus community Helm repository.
echo "添加 Prometheus Helm 仓库..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Create the namespace. Rendering the manifest client-side and piping it to
# `kubectl apply` makes this step a no-op when the namespace already exists
# (a plain `kubectl create namespace` fails with AlreadyExists on re-runs).
echo "创建 monitoring 命名空间..."
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -

# Install kube-prometheus-stack (bundles Prometheus, Grafana, Alertmanager).
# `upgrade --install` installs on the first run and upgrades afterwards, so
# re-running the script does not fail with "cannot re-use a name".
# `--wait` blocks until the chart's workloads are ready, which also
# guarantees that the Grafana pod exists before `kubectl wait` runs below.
echo "安装 kube-prometheus-stack..."
helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  -f "$values_file" \
  --wait \
  --timeout 10m

# Wait for the deployment to finish.
# NOTE(review): only the Grafana pod is awaited here although the message
# mentions Prometheus as well — confirm whether that is intentional.
echo "等待 Prometheus 和 Grafana 启动..."
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=grafana -n monitoring --timeout=300s

# Show the resulting state.
echo ""
echo "监控系统部署完成!"
kubectl get pods -n monitoring
kubectl get svc -n monitoring

# Print access information.
# NOTE(review): ingress.yaml routes grafana.u6.net3w.com and
# prometheus.u6.net3w.com, not the *.local names printed below — confirm
# which hostnames are the intended ones.
echo ""
echo "访问信息:"
echo " Grafana: http://grafana.local (需要配置 Ingress)"
echo " 默认用户名: admin"
echo " 默认密码: prom-operator"
echo ""
echo " Prometheus: http://prometheus.local (需要配置 Ingress)"
|
||||
59
009-基础设施/006-monitoring-grafana/ingress.yaml
Normal file
59
009-基础设施/006-monitoring-grafana/ingress.yaml
Normal file
@@ -0,0 +1,59 @@
|
||||
# Ingress that routes grafana.u6.net3w.com to the Grafana service created by
# the kube-prometheus-stack release, via Traefik's "web" entrypoint.
# NOTE(review): "web" is presumably Traefik's plain-HTTP entrypoint, i.e. no
# TLS is configured here — confirm before exposing this outside a trusted
# network.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: grafana.u6.net3w.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: kube-prometheus-stack-grafana
                port:
                  number: 80
|
||||
---
|
||||
# Ingress that routes prometheus.u6.net3w.com to the Prometheus service
# (port 9090) of the kube-prometheus-stack release, via Traefik's "web"
# entrypoint.
# NOTE(review): no authentication or TLS is configured in this manifest —
# confirm that the Prometheus UI may be reachable this way.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: prometheus.u6.net3w.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: kube-prometheus-stack-prometheus
                port:
                  number: 9090
|
||||
---
|
||||
# Ingress that routes alertmanager.u6.net3w.com to the Alertmanager service
# (port 9093) of the kube-prometheus-stack release, via Traefik's "web"
# entrypoint.
# NOTE(review): no authentication or TLS is configured in this manifest —
# confirm that the Alertmanager UI may be reachable this way.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: alertmanager.u6.net3w.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: kube-prometheus-stack-alertmanager
                port:
                  number: 9093
|
||||
241
009-基础设施/006-monitoring-grafana/readme.md
Normal file
241
009-基础设施/006-monitoring-grafana/readme.md
Normal file
@@ -0,0 +1,241 @@
|
||||
# Prometheus + Grafana 监控系统
|
||||
|
||||
## 组件说明
|
||||
|
||||
### Prometheus
|
||||
- **功能**: 时间序列数据库,收集和存储指标数据
|
||||
- **存储**: 20Gi Longhorn 卷
|
||||
- **数据保留**: 15 天
|
||||
- **访问**: http://prometheus.local
|
||||
|
||||
### Grafana
|
||||
- **功能**: 可视化仪表板
|
||||
- **存储**: 5Gi Longhorn 卷
|
||||
- **默认用户**: admin
|
||||
- **默认密码**: prom-operator
|
||||
- **访问**: http://grafana.local
|
||||
|
||||
### Alertmanager
|
||||
- **功能**: 告警管理和通知
|
||||
- **存储**: 5Gi Longhorn 卷
|
||||
- **访问**: http://alertmanager.local
|
||||
|
||||
### Node Exporter
|
||||
- **功能**: 收集节点级别的系统指标(CPU、内存、磁盘等)
|
||||
|
||||
### Kube State Metrics
|
||||
- **功能**: 收集 Kubernetes 资源状态指标
|
||||
|
||||
## 部署方式
|
||||
|
||||
```bash
|
||||
bash deploy.sh
|
||||
```
|
||||
|
||||
## 部署后配置
|
||||
|
||||
### 1. 应用 Ingress
|
||||
```bash
|
||||
kubectl apply -f ingress.yaml
|
||||
```
|
||||
|
||||
### 2. 配置 /etc/hosts
|
||||
```
|
||||
<节点IP> grafana.local
|
||||
<节点IP> prometheus.local
|
||||
<节点IP> alertmanager.local
|
||||
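```

> 注意: ingress.yaml 中实际配置的 host 是 `grafana.u6.net3w.com`、`prometheus.u6.net3w.com` 和 `alertmanager.u6.net3w.com`。如果要通过上面的 `*.local` 域名访问,需要先把 ingress.yaml 中的 host 改成对应的 `*.local` 名称;否则请改用 `*.u6.net3w.com` 域名,并为其配置 DNS 或 hosts 解析。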
|
||||
|
||||
### 3. 访问 Grafana
|
||||
1. 打开浏览器访问: http://grafana.local
|
||||
2. 使用默认凭证登录:
|
||||
- 用户名: admin
|
||||
- 密码: prom-operator
|
||||
3. 首次登录后建议修改密码
|
||||
|
||||
## 预置仪表板
|
||||
|
||||
Grafana 已预装多个仪表板:
|
||||
|
||||
1. **Kubernetes / Compute Resources / Cluster**
|
||||
- 集群整体资源使用情况
|
||||
|
||||
2. **Kubernetes / Compute Resources / Namespace (Pods)**
|
||||
- 按命名空间查看 Pod 资源使用
|
||||
|
||||
3. **Kubernetes / Compute Resources / Node (Pods)**
|
||||
- 按节点查看 Pod 资源使用
|
||||
|
||||
4. **Kubernetes / Networking / Cluster**
|
||||
- 集群网络流量统计
|
||||
|
||||
5. **Node Exporter / Nodes**
|
||||
- 节点详细指标(CPU、内存、磁盘、网络)
|
||||
|
||||
## 监控目标
|
||||
|
||||
系统会自动监控:
|
||||
|
||||
- ✅ Kubernetes API Server
|
||||
- ✅ Kubelet
|
||||
- ✅ Node Exporter (节点指标)
|
||||
- ✅ Kube State Metrics (K8s 资源状态)
|
||||
- ✅ CoreDNS
|
||||
- ✅ Prometheus 自身
|
||||
- ✅ Grafana
|
||||
|
||||
## 添加自定义监控
|
||||
|
||||
### 监控 Redis
|
||||
|
||||
创建 ServiceMonitor:
|
||||
|
||||
```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: redis-monitor
  namespace: monitoring
  labels:
    # kube-prometheus-stack 默认只会选择带有 release: <Helm release 名称> 标签的
    # ServiceMonitor,缺少该标签时 Prometheus 不会采集这个目标
    release: kube-prometheus-stack
spec:
  selector:
    matchLabels:
      app: redis
  namespaceSelector:
    matchNames:
      - redis
  endpoints:
    - port: redis
      interval: 30s
```
|
||||
|
||||
### 监控 PostgreSQL
|
||||
|
||||
需要部署 postgres-exporter:
|
||||
|
||||
```bash
|
||||
helm install postgres-exporter prometheus-community/prometheus-postgres-exporter \
|
||||
--namespace postgresql \
|
||||
--set config.datasource.host=postgresql-service.postgresql.svc.cluster.local \
|
||||
--set config.datasource.user=postgres \
|
||||
--set config.datasource.password=postgres123
|
||||
```
|
||||
|
||||
## 告警配置
|
||||
|
||||
### 查看告警规则
|
||||
```bash
|
||||
kubectl get prometheusrules -n monitoring
|
||||
```
|
||||
|
||||
### 自定义告警规则
|
||||
|
||||
创建 PrometheusRule:
|
||||
|
||||
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: custom-alerts
  namespace: monitoring
  labels:
    # kube-prometheus-stack 默认只会加载带有 release: <Helm release 名称> 标签的
    # PrometheusRule,缺少该标签时规则不会生效
    release: kube-prometheus-stack
spec:
  groups:
    - name: custom
      interval: 30s
      rules:
        - alert: HighMemoryUsage
          expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "节点内存使用率超过 90%"
            description: "节点 {{ $labels.instance }} 内存使用率为 {{ $value | humanizePercentage }}"
```
|
||||
|
||||
## 配置告警通知
|
||||
|
||||
编辑 Alertmanager 配置:
|
||||
|
||||
```bash
|
||||
kubectl edit secret alertmanager-kube-prometheus-stack-alertmanager -n monitoring
|
||||
```
|
||||
|
||||
添加邮件、Slack、钉钉等通知渠道。
|
||||
|
||||
## 数据持久化
|
||||
|
||||
所有数据都存储在 Longhorn 卷上:
|
||||
- Prometheus 数据: 20Gi
|
||||
- Grafana 配置: 5Gi
|
||||
- Alertmanager 数据: 5Gi
|
||||
|
||||
可以通过 Longhorn UI 创建快照和备份到 S3。
|
||||
|
||||
## 常用操作
|
||||
|
||||
### 查看 Prometheus 目标
|
||||
访问: http://prometheus.local/targets
|
||||
|
||||
### 查看告警
|
||||
访问: http://alertmanager.local
|
||||
|
||||
### 导入自定义仪表板
|
||||
1. 访问 Grafana
|
||||
2. 点击 "+" -> "Import"
|
||||
3. 输入仪表板 ID 或上传 JSON
|
||||
|
||||
推荐仪表板:
|
||||
- Node Exporter Full: 1860
|
||||
- Kubernetes Cluster Monitoring: 7249
|
||||
- Longhorn: 13032
|
||||
|
||||
### 查看日志
|
||||
```bash
|
||||
# Prometheus 日志
|
||||
kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus -f
|
||||
|
||||
# Grafana 日志
|
||||
kubectl logs -n monitoring -l app.kubernetes.io/name=grafana -f
|
||||
```
|
||||
|
||||
## 性能优化
|
||||
|
||||
### 调整数据保留时间
|
||||
编辑 values.yaml 中的 `retention` 参数,然后:
|
||||
```bash
|
||||
helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \
|
||||
--namespace monitoring -f values.yaml
|
||||
```
|
||||
|
||||
### 调整采集间隔
|
||||
默认采集间隔为 30 秒,可以在 ServiceMonitor 中调整。
|
||||
|
||||
## 故障排查
|
||||
|
||||
### Prometheus 无法采集数据
|
||||
```bash
|
||||
# 检查 ServiceMonitor
|
||||
kubectl get servicemonitor -A
|
||||
|
||||
# 检查 Prometheus 配置
|
||||
kubectl get prometheus -n monitoring -o yaml
|
||||
```
|
||||
|
||||
### Grafana 无法连接 Prometheus
|
||||
检查 Grafana 数据源配置:
|
||||
1. 登录 Grafana
|
||||
2. Configuration -> Data Sources
|
||||
3. 确认 Prometheus URL 正确
|
||||
|
||||
## 卸载
|
||||
|
||||
```bash
|
||||
helm uninstall kube-prometheus-stack -n monitoring
|
||||
kubectl delete namespace monitoring
|
||||
```
|
||||
|
||||
## 参考资源
|
||||
|
||||
- Prometheus 文档: https://prometheus.io/docs/
|
||||
- Grafana 文档: https://grafana.com/docs/
|
||||
- kube-prometheus-stack: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
|
||||
89
009-基础设施/006-monitoring-grafana/values.yaml
Normal file
89
009-基础设施/006-monitoring-grafana/values.yaml
Normal file
@@ -0,0 +1,89 @@
|
||||
# Prometheus Operator configuration: deploy the operator with the chart.
prometheusOperator:
  enabled: true
|
||||
|
||||
# Prometheus configuration.
prometheus:
  enabled: true
  prometheusSpec:
    # Keep metrics for 15 days.
    retention: 15d
    # Persist the TSDB on a 20Gi volume from the "longhorn" storage class.
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 20Gi
    # Container resource requests and limits for the Prometheus server.
    resources:
      requests:
        memory: 512Mi
        cpu: 250m
      limits:
        memory: 2Gi
        cpu: 1000m
|
||||
|
||||
# Grafana configuration.
grafana:
  enabled: true
  # NOTE(review): the admin password is committed in plain text. deploy.sh and
  # the README print the same value, so it is left unchanged here; consider
  # sourcing it from a Kubernetes Secret and rotating it after first login.
  adminPassword: prom-operator
  # Persist Grafana data on a 5Gi volume from the "longhorn" storage class.
  persistence:
    enabled: true
    storageClassName: longhorn
    size: 5Gi
  # Container resource requests and limits for Grafana.
  resources:
    requests:
      memory: 256Mi
      cpu: 100m
    limits:
      memory: 512Mi
      cpu: 500m
|
||||
|
||||
# Alertmanager configuration.
alertmanager:
  enabled: true
  alertmanagerSpec:
    # Persist Alertmanager data on a 5Gi volume from the "longhorn" storage
    # class.
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 5Gi
|
||||
|
||||
# Node Exporter (collects node-level metrics).
nodeExporter:
  enabled: true
|
||||
|
||||
# Kube State Metrics (collects Kubernetes resource metrics).
kubeStateMetrics:
  enabled: true
|
||||
|
||||
# Default monitoring rules: create the chart's built-in rule groups and
# switch each listed group on.
# NOTE(review): the set of accepted rule-group keys differs between chart
# versions and deploy.sh does not pin a chart version — verify these names
# against the values.yaml of the chart version actually installed.
defaultRules:
  create: true
  rules:
    alertmanager: true
    etcd: true
    configReloaders: true
    general: true
    k8s: true
    kubeApiserverAvailability: true
    kubeApiserverSlos: true
    kubelet: true
    kubeProxy: true
    kubePrometheusGeneral: true
    kubePrometheusNodeRecording: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    kubeScheduler: true
    kubeStateMetrics: true
    network: true
    node: true
    nodeExporterAlerting: true
    nodeExporterRecording: true
    prometheus: true
    prometheusOperator: true
|
||||
Reference in New Issue
Block a user