跳到主要内容

14 篇博文 含有标签「Prometheus」

查看所有标签

二进制安装 Prome 生态

· 阅读需 7 分钟

Prometheus 二进制安装

下载解压二进制程序

mkdir /apps  && cd /apps

#wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
wget https://github.com/prometheus/prometheus/releases/download/v2.35.0/prometheus-2.35.0.linux-amd64.tar.gz
tar xf prometheus-2.35.0.linux-amd64.tar.gz

# 创建软连接
ln -sv prometheus-2.35.0.linux-amd64 /apps/prometheus

# 检查配置文件
/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml

创建 prometheus service 启动脚本

vim /etc/systemd/system/prometheus.service

[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml

[Install]
WantedBy=multi-user.target

启动 prometheus 服务

systemctl daemon-reload
systemctl restart prometheus
systemctl enable prometheus

node exporter 二进制安装

下载解压二进制程序

cd /apps 
wget https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz
tar xf node_exporter-1.3.1.linux-amd64.tar.gz

# 创建软连接
ln -sv /apps/node_exporter-1.3.1.linux-amd64 /apps/node_exporter

创建 node-exporter service 启动脚本

vim /etc/systemd/system/node-exporter.service

[Unit]
Description=Prometheus Node Exporter
After=network.target

[Service]
ExecStart=/apps/node_exporter/node_exporter

[Install]
WantedBy=multi-user.target

启动 node exporter 服务

systemctl daemon-reload
systemctl restart node-exporter
systemctl enable node-exporter.service

添加node节点数据收集

vim /apps/prometheus/prometheus.yml

  - job_name: 'prometheus-node'
    static_configs:
      - targets: ['192.168.15.100:9100']

重启服务

systemctl restart prometheus.service

Alertmanager 二进制安装

下载解压二进制程序

cd /apps 
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
tar xf alertmanager-0.24.0.linux-amd64.tar.gz

# 创建软连接
ln -sv /apps/alertmanager-0.24.0.linux-amd64 /apps/alertmanager

创建 alertmanager service 启动脚本

vim /etc/systemd/system/alertmanager.service

[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
After=network.target
[Service]
Type=simple
User=root
ExecStart=/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target

编辑配置文件 (邮件)

vim /apps/alertmanager/alertmanager.yml

global:
resolve_timeout: 5m
smtp_from: 'xxxx@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: 'xxxx@qq.com'
smtp_auth_password: 'uuxxxxdvnxzbiaf'
smtp_require_tls: false
smtp_hello: '@qq.com'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 2m
repeat_interval: 5m
receiver: 'web.hook'
# #receiver: 'default-receiver' #其他的告警发送给default-receiver
# routes: #将critical的报警发送给myalertname
# - receiver: myalertname
# group_wait: 10s
receivers:
- name: 'web.hook'
# webhook_configs:
# - url: 'http://127.0.0.1:5001/'
email_configs:
- to: 'xxxx@qq.com'
inhibit_rules:
- source_match: #源匹配级别,当匹配成功发出通知,但是其他'alertname','dev','instance'产生的warning级别的告警通知将被抑制
severity: 'critical' #报警的事件级别
target_match:
severity: 'warning' #匹配目标为新产生的目标告警为'warning' 将被抑制
equal: ['alertname', 'dev', 'instance']

编辑配置文件 (企业微信)

vim /apps/alertmanager/alertmanager.yml

global:
resolve_timeout: 5m

route:
group_by: ['alertname']
group_wait: 10s # 初次发送告警延时
group_interval: 10s # 同一分组已发送过通知后,组内出现新告警时再次发送通知的等待时间
repeat_interval: 60m # 告警重发时间
receiver: 'wechat'

receivers:
- name: 'wechat'
webhook_configs:
- url: 'http://172.20.254.138:9080/wechatbot' # adapter
send_resolved: true

inhibit_rules:
- source_match: #源匹配级别,当匹配成功发出通知,但是其他'alertname','dev','instance'产生的warning级别的告警通知将被抑制
severity: 'critical' #报警的事件级别
target_match:
severity: 'warning' #匹配目标为新产生的目标告警为'warning' 将被抑制
equal: ['alertname', 'dev', 'instance']

启动 Alertmanager服务

systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager

安装 Adapter 适配器

git clone https://github.com/lckei/prometheus-wechatbot-webhook.git
编辑 Dockerfile
cat Dockerfile
FROM alpine:latest
LABEL maintainer="kei"
ENV VERSION 1.0
WORKDIR /apps
ADD src/app /apps/app
RUN chmod +x /apps/app
#ADD src/wechatbot.tmpl /apps/wechatbot.tmpl
ADD src/wechatbot2.tmpl /apps/wechatbot.tmpl
EXPOSE 9080
CMD ["/apps/app"]
编辑告警模版文件
{{ define "wechatbot.url.api" }}https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=2730f396-a070-4618-aedb-d290ad132ffc{{end}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
==========异常告警==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.description}};{{$alert.Annotations.summary}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
==========异常恢复==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.description}};{{$alert.Annotations.summary}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- end }}
构建 Adapter
docker build -t wechatbot:v1 .
运行 Adapter
docker run -d --name wechatbot --restart=always \
-v /etc/localtime:/etc/localtime \
-v "$(pwd)/src/wechatbot.tmpl":/apps/wechatbot.tmpl \
-p 9080:9080 wechatbot:v1

rpm 安装 grafana

安装 mysql

# 下载mysql源安装包
wget http://dev.mysql.com/get/mysql57-community-release-el7-8.noarch.rpm

# 安装mysql源
yum localinstall mysql57-community-release-el7-8.noarch.rpm -y

# 检查mysql源是否安装成功
yum repolist enabled | grep "mysql.*-community.*"

# 安装MySQL (5.7需绕过验证)

yum install mysql-community-server -y --nogpgcheck


# 3、启动MySQL服务
systemctl start mysqld

# 查看MySQL的启动状态
systemctl status mysqld


#4、开机启动
systemctl enable mysqld
systemctl daemon-reload

# 5、修改root本地登录密码
# mysql安装完成之后,在/var/log/mysqld.log文件中给root生成了一个默认密码。通过下面的方式找到root默认密码,然后登录mysql进行修改:
grep 'temporary password' /var/log/mysqld.log
mysql -uroot -p

# mysql5.7默认安装了密码安全检查插件(validate_password),默认密码检查策略要求密码必须包含:大小写字母、数字和特殊符号,
# 并且长度不能少于8位。否则会提示ERROR 1819 (HY000): Your password does not satisfy the current policy requirements错误

# 如果不需要密码策略,在 my.cnf 文件中添加如下配置禁用即可:
# 配置默认编码为utf8
# 关闭客户端dns反解

echo -e "validate_password = off\ncharacter_set_server=utf8\ninit_connect='SET NAMES utf8'\nskip-name-resolve\n" >> /etc/my.cnf
systemctl restart mysqld

mysql -uroot -p

## 授权
alter user 'root'@'localhost' identified by '123123';

grant all privileges on *.* to root@'%' identified by '123123' with grant option;
flush privileges;

下载安装包

cd /apps
# 地址 https://grafana.com/grafana/download
wget https://dl.grafana.com/oss/release/grafana-7.5.1-1.x86_64.rpm
yum install grafana-7.5.1-1.x86_64.rpm

# 下载grafana8
wget https://dl.grafana.com/oss/release/grafana-8.5.1-1.x86_64.rpm
yum install -y grafana-8.5.1-1.x86_64.rpm

# 下载grafana10 (建议安装10及以上的,UI更漂亮~)
wget https://dl.grafana.com/oss/release/grafana-10.2.1-1.x86_64.rpm
yum install -y grafana-10.2.1-1.x86_64.rpm

在 mysql 中创建数据库

CREATE DATABASE IF NOT EXISTS grafana DEFAULT CHARSET utf8 COLLATE utf8_general_ci;

修改配置文件

vim /etc/grafana/grafana.ini

type = mysql
host = 127.0.0.1:3306
name = grafana
user = root
password = 123123

启动服务

systemctl start grafana-server
systemctl enable grafana-server
systemctl status grafana-server

# 查看日志
tail -f /var/log/grafana/grafana.log

Blackbox_exporter 监控实现

· 阅读需 3 分钟

Blackbox_exporter 监控实现

https://prometheus.io/download/#blackbox_exporter

HTTP/HTTPS : URL/API 可用性检测

TCP :端口监听检测

ICMP :主机存活检测

DNS :域名解析

部署 blackbox exporter

wget  https://github.com/prometheus/blackbox_exporter/releases/download/v0.19.0/blackbox_exporter-0.19.0.linux-amd64.tar.gz

tar xf blackbox_exporter-0.19.0.linux-amd64.tar.gz

ln -sv /apps/blackbox_exporter-0.19.0.linux-amd64 /apps/blackbox_exporter

创建 blackbox exporter 启动文件

vim /etc/systemd/system/blackbox-exporter.service

[Unit]
Description=Prometheus Blackbox Exporter
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/blackbox_exporter/blackbox_exporter \
--config.file=/apps/blackbox_exporter/blackbox.yml \
--web.listen-address=:9115
Restart=on-failure

[Install]
WantedBy=multi-user.target

启动并设置开机自启动

systemctl restart blackbox-exporter.service && systemctl enable blackbox-exporter.service

blackbox exporter 实现URL监控

prometheus 调用 blackbox exporter 实现对 URL/ICMP 的监控

Prometheus URL 监控配置

vim /apps/prometheus/prometheus.yml

#网站监控
- job_name: 'http_status'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets: ['http://www.baidu.com', 'https://yuanshisen.com']
labels:
instance: http_status
group: web
relabel_configs:
- source_labels: [__address__] #relabel:将 __address__(当前目标地址)的值写入 __param_target 标签,作为 blackbox exporter 的 target 参数
target_label: __param_target #探测目标(如 http://www.baidu.com),其值来自 __address__
- source_labels: [__param_target] #监控目标
target_label: url #将监控目标的值写入 url 标签
- target_label: __address__
replacement: 192.168.15.110:9115

配置生效

/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule file found

# 重启服务
systemctl restart prometheus.service

# 访问浏览器
192.168.15.110:9115

blackbox exporter 实现 ICMP 监控

Prometheus ICMP 监控配置

vim /apps/prometheus/prometheus.yml

#icmp 检测
- job_name: 'ping_status'
metrics_path: /probe
params:
module: [icmp]
static_configs:
- targets: ['192.168.3.15', "223.6.6.6"]
labels:
instance: 'ping_status'
group: 'icmp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip #将 __param_target 的值写入 ip 标签
- target_label: __address__
replacement: 192.168.15.110:9115

配置生效

/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule file found

# 重启服务
systemctl restart prometheus.service

# 访问浏览器
192.168.15.110:9115

blackbox exporter 实现端口监控

端口监控配置

vim /apps/prometheus/prometheus.yml

#端口监控
- job_name: 'port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ['192.168.15.100:9100', '192.168.15.100:80','192.168.15.100:22']
labels:
instance: 'port_status'
group: 'port'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip
- target_label: __address__
replacement: 192.168.15.110:9115

配置生效

/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule file found

# 重启服务
systemctl restart prometheus.service

# 访问浏览器
192.168.15.110:9115

grafana 导入模版

  • 13587(对于web url 监控展示的效果好,其他弱一些)
  • 9965 (除了web url展示的不全面,整体都比较全面)

Kube-state-metrics 部署

· 阅读需 2 分钟

部署 kube-state-metrics

https://github.com/kubernetes/kube-state-metrics

Kube-state-metrics : 通过监听 API Server 生成有关资源对象的状态指标,比如 Deploy、node、pod,它只提供metrics数据并不存储,所以我们需要用 Prometheus 抓取这些数据然后存储,主要关注的一些业务相关的元数据,具体举几个例子:

Deployment、Pod、副本状态等,调度了多少replicas? 现在可用的有哪些? 多少个 Pod 是 running/stopped/terminated 状态?Pod 重启了多少次?目前有多少 job 在运行中 等等 ...

镜像

https://hub.docker.com/r/bitnami/kube-state-metrics

https://quay.io/repository/coreos/kube-state-metrics?tag=latest&tab=tags

指标

https://xie.infoq.cn/article/9e1fff6306649e65480a96bb1

部署 kube-state-metrics

vim kube-state-metrics-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: bitnami/kube-state-metrics:2.2.4
ports:
- containerPort: 8080

---
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
verbs: ["list", "watch"]
- apiGroups: ["extensions"]
resources: ["daemonsets", "deployments", "replicasets"]
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources: ["cronjobs", "jobs"]
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources: ["horizontalpodautoscalers"]
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system

---
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: 'true'
name: kube-state-metrics
namespace: kube-system
labels:
app: kube-state-metrics
spec:
type: NodePort
ports:
- name: kube-state-metrics
port: 8080
targetPort: 8080
nodePort: 31666
protocol: TCP
selector:
app: kube-state-metrics

kubectl create -f kube-state-metrics-deploy.yaml

prometheus 采集数据

vim /apps/prometheus/prometheus.yml

  - job_name: "kube-state-metrics"
    static_configs:
      - targets: ["192.168.15.111:31666"]
grafana 导入模板

三个选择

  • 13332 (自我感觉这个符合我的审美)
  • 13824
  • 14518