Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: Server Busy
expr: sum by(appName) (join_queue_count) - sum by(appName) (join_queue_count offset 1m) > 10
for: 30s
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}'
summary: server busy的次数
|
ok
|
|
4.336s ago
|
242.6us |
alert: No Available Coordinator
expr: sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"}) - sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"} offset 5m) > 0
for: 5m
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.kubernetes_namespace }} {{ $labels.appName }} 5分钟内出现了 no avail coordinator, 当前值 {{ $value }}'
summary: no avail coordinator 的次数
|
ok
|
|
4.336s ago
|
105.4us |
alert: Abnormal prod busy android count
expr: sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"}) < 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 如连续报警请检查 prepay instances
summary: funplus-singapore prod busy android 数量异常
|
ok
|
|
4.336s ago
|
291.2us |
alert: Prepay instance not ready
expr: not_ready_count - (not_ready_count offset 1m) > 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 请检查 prepay instances in cluster {{ $labels.clusterName}}
summary: prepay instance in not ready state
|
ok
|
|
4.336s ago
|
52.94us |
alert: Apiserver is down?
expr: absent(apiserver_request_total) == 1
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: apiserver may be unhealthy
summary: no apiserver_request_total metrics
|
ok
|
|
4.336s ago
|
2.139ms |
|
14.377s ago |
4.52ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: node:runtime:seconds
expr: label_join(time() - node_boot_time_seconds, "computed", ",", "instance")
|
ok
|
|
10.85s ago
|
227.8us |
record: node:cpu_count
expr: label_join(count by(instance) (count by(cpu, instance) (node_cpu_seconds_total{mode="system"})), "computed", ",", "instance")
|
ok
|
|
10.85s ago
|
276.3us |
record: node:cpu_usage_ratio:1m
expr: label_join(1 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))), "computed", ",", "instance")
|
ok
|
|
10.85s ago
|
286.9us |
record: node:memory_usage_ratio
expr: label_join(((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes)), "computed", ",", "instance")
|
ok
|
|
10.849s ago
|
333us |
record: node:memory_usage_size
expr: label_join(node_memory_MemTotal_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes), "computed", ",", "instance")
|
ok
|
|
10.849s ago
|
251.1us |
record: node:cpu_usage_ratio_system:1m
expr: label_join(avg by(instance) (irate(node_cpu_seconds_total{mode="system"}[1m])), "computed", ",", "instance")
|
ok
|
|
10.849s ago
|
265.7us |
record: node:cpu_usage_ratio_user:1m
expr: label_join(avg by(instance) (irate(node_cpu_seconds_total{mode="user"}[1m])), "computed", ",", "instance")
|
ok
|
|
10.849s ago
|
240.6us |
record: node:cpu_usage_ratio_iowait:1m
expr: label_join(avg by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[1m])), "computed", ",", "instance")
|
ok
|
|
10.848s ago
|
213.6us |
record: node:network_receive:5m
expr: label_join(irate(node_network_receive_bytes_total[5m]) * 8, "computed", ",", "instance")
|
ok
|
|
10.848s ago
|
1.472ms |
record: node:network_transmit:5m
expr: label_join(irate(node_network_transmit_bytes_total[5m]) * 8, "computed", ",", "instance")
|
ok
|
|
10.847s ago
|
1.466ms |
record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
72.54us |
record: node:disk_write_iops:1m
expr: label_join(irate(node_disk_writes_completed_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
48.79us |
record: node:disk_read:1m
expr: label_join(irate(node_disk_read_bytes_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
39.14us |
record: node:disk_write:1m
expr: label_join(irate(node_disk_written_bytes_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
93.93us |
record: node:read_disk_io_time:1m
expr: label_join(irate(node_disk_read_time_seconds_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
113.9us |
record: node:write_disk_io_time:1m
expr: label_join(irate(node_disk_write_time_seconds_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
90.5us |
record: node:disk_io_time:1m
expr: label_join(irate(node_disk_io_time_seconds_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
81.32us |
record: node:tcp_activeopens:1m
expr: label_join(irate(node_netstat_Tcp_ActiveOpens[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
234.8us |
record: node:tcp_passiveopens:1m
expr: label_join(irate(node_netstat_Tcp_PassiveOpens[1m]), "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
219.1us |
record: node:filesystem_usage_size
expr: label_join(node_filesystem_size_bytes{fstype=~"ext4|xfs"} - node_filesystem_avail_bytes{fstype=~"ext4|xfs"}, "computed", ",", "instance")
|
ok
|
|
10.845s ago
|
636.4us |
record: node:filesystem_usage_ratio
expr: label_join(1 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}), "computed", ",", "instance")
|
ok
|
|
10.844s ago
|
681.4us |
record: node:gpu_used
expr: label_join(sum by(instance) (container_accelerator_memory_used_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}) / count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}), "computed", ",", "instance")
|
ok
|
|
10.844s ago
|
108.3us |
record: node:gpu_allocate
expr: label_join(sum by(instance) (container_accelerator_memory_total_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}) / count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}), "computed", ",", "instance")
|
ok
|
|
10.844s ago
|
79.41us |
record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]), "computed", ",", "instance")
|
ok
|
|
10.844s ago
|
73.75us |
record: node:cpu_usage_ratio:1m
expr: label_join(1 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])), "computed", ",", "instance")
|
ok
|
|
10.844s ago
|
256.8us |
record: node:load1
expr: label_join(node_load1, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
111.3us |
record: node:load5
expr: label_join(node_load5, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
105us |
record: node:load15
expr: label_join(node_load15, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
98.97us |
record: node:mem_total
expr: label_join(node_memory_MemTotal_bytes, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
82.02us |
record: node:mem_free
expr: label_join(node_memory_MemFree_bytes, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
100.9us |
record: node:mem_available
expr: label_join(node_memory_MemAvailable_bytes, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
113.7us |
record: node:mem_cache
expr: label_join(node_memory_Cached_bytes, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
98.57us |
record: node:mem_buffers
expr: label_join(node_memory_Buffers_bytes, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
81.49us |
record: node:gpu_used
expr: label_join(container_accelerator_memory_used_bytes{container_name="node-gpu-exporter"}, "computed", ",", "instance")
|
ok
|
|
10.843s ago
|
49.61us |
|
7.262s ago |
7.939ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: pod:run_time:seconds
expr: label_join(time() - kube_pod_created, "computed", ",", "pod")
|
ok
|
|
7.262s ago
|
207.5us |
record: pod:filesystem_usage:bytes
expr: label_join(sum by(pod_name, namespace) (container_fs_usage_bytes{container_name!="POD",image!=""}), "computed", ",", "pod_name")
|
ok
|
|
7.262s ago
|
2.157ms |
record: pod:cpu_reuqest
expr: label_join(sum by(pod, namespace) (kube_pod_container_resource_requests{resource="cpu"}), "computed", ",", "pod")
|
ok
|
|
7.26s ago
|
69.36us |
record: pod:memory_request
expr: label_join(sum by(pod, namespace) (kube_pod_container_resource_requests{resource="memory"}), "computed", ",", "pod")
|
ok
|
|
7.26s ago
|
50.48us |
record: pod:cpu_usage
expr: label_join(sum by(namespace, pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",image!=""}[1m])), "computed", ",", "pod_name")
|
ok
|
|
7.26s ago
|
2.938ms |
record: pod:memory_usage_of_requests
expr: label_join(sum by(pod_name) (container_memory_rss{container_name!="POD",image!=""}), "computed", ",", "pod_name")
|
ok
|
|
7.257s ago
|
2.047ms |
record: pod:gpu_smutil
expr: label_join(sum without(container) (nvidia_gpu_process_smutil), "computed", ",", "pod_name")
|
ok
|
|
7.255s ago
|
127.7us |
record: pod:gpu_memutil
expr: label_join(sum without(container) (nvidia_gpu_process_memutil), "computed", ",", "pod_name")
|
ok
|
|
7.255s ago
|
87.71us |
record: pod:gpu_decutil
expr: label_join(sum without(container) (nvidia_gpu_process_decutil), "computed", ",", "pod_name")
|
ok
|
|
7.255s ago
|
58.39us |
record: pod:gpu_encutil
expr: label_join(sum without(container) (nvidia_gpu_process_encutil), "computed", ",", "pod_name")
|
ok
|
|
7.255s ago
|
95.37us |
record: pod:gpu_graph
expr: label_join(sum without(container) (nvidia_gpu_process_graph), "computed", ",", "pod_name")
|
ok
|
|
7.255s ago
|
79.65us |