| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: Server Busy
expr: sum by(appName) (join_queue_count) - sum by(appName) (join_queue_count offset 1m) > 10
for: 30s
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}'
summary: server busy的次数
|
ok
|
|
8.698s ago
|
280.1us |
| alert: No Available Coordinator
expr: sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"}) - sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"} offset 5m) > 0
for: 5m
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.kubernetes_namespace }} {{ $labels.appName }} 5分钟内出现了 no avail coordinator, 当前值 {{ $value }}'
summary: no avail coordinator 的次数
|
ok
|
|
8.698s ago
|
106.1us |
| alert: Abnormal prod busy android count
expr: sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"}) < 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 如连续报警请检查 prepay instances
summary: funplus-singapore prod busy android 数量异常
|
ok
|
|
8.698s ago
|
358.8us |
| alert: Prepay instance not ready
expr: not_ready_count - (not_ready_count offset 1m) > 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 请检查 prepay instances in cluster {{ $labels.clusterName}}
summary: prepay instance in not ready state
|
ok
|
|
8.698s ago
|
52.52us |
| alert: Apiserver is down?
expr: absent(apiserver_request_total) == 1
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: apiserver may be unhealthy
summary: no apiserver_request_total metrics
|
ok
|
|
8.698s ago
|
1.927ms |
|
3.739s ago |
3.713ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| record: node:runtime:seconds
expr: label_join(time()
- node_boot_time_seconds, "computed", ",", "instance")
|
ok
|
|
215ms ago
|
211.2us |
| record: node:cpu_count
expr: label_join(count
by(instance) (count by(cpu, instance) (node_cpu_seconds_total{mode="system"})),
"computed", ",", "instance")
|
ok
|
|
215ms ago
|
194.5us |
| record: node:cpu_usage_ratio:1m
expr: label_join(1
- (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))), "computed",
",", "instance")
|
ok
|
|
215ms ago
|
223.2us |
| record: node:memory_usage_ratio
expr: label_join(((node_memory_MemTotal_bytes
- node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
/ (node_memory_MemTotal_bytes)), "computed", ",", "instance")
|
ok
|
|
215ms ago
|
299.3us |
| record: node:memory_usage_size
expr: label_join(node_memory_MemTotal_bytes
- (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes),
"computed", ",", "instance")
|
ok
|
|
215ms ago
|
241.8us |
| record: node:cpu_usage_ratio_system:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="system"}[1m])), "computed",
",", "instance")
|
ok
|
|
215ms ago
|
332.6us |
| record: node:cpu_usage_ratio_user:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="user"}[1m])), "computed",
",", "instance")
|
ok
|
|
214ms ago
|
349.8us |
| record: node:cpu_usage_ratio_iowait:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[1m])), "computed",
",", "instance")
|
ok
|
|
214ms ago
|
355.7us |
| record: node:network_receive:5m
expr: label_join(irate(node_network_receive_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
214ms ago
|
1.292ms |
| record: node:network_transmit:5m
expr: label_join(irate(node_network_transmit_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
212ms ago
|
1.306ms |
| record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
78.43us |
| record: node:disk_write_iops:1m
expr: label_join(irate(node_disk_writes_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
45.07us |
| record: node:disk_read:1m
expr: label_join(irate(node_disk_read_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
72.89us |
| record: node:disk_write:1m
expr: label_join(irate(node_disk_written_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
52.66us |
| record: node:read_disk_io_time:1m
expr: label_join(irate(node_disk_read_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
37.55us |
| record: node:write_disk_io_time:1m
expr: label_join(irate(node_disk_write_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
41.93us |
| record: node:disk_io_time:1m
expr: label_join(irate(node_disk_io_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
35.5us |
| record: node:tcp_activeopens:1m
expr: label_join(irate(node_netstat_Tcp_ActiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
112.3us |
| record: node:tcp_passiveopens:1m
expr: label_join(irate(node_netstat_Tcp_PassiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
116.2us |
| record: node:filesystem_usage_size
expr: label_join(node_filesystem_size_bytes{fstype=~"ext4|xfs"}
- node_filesystem_avail_bytes{fstype=~"ext4|xfs"}, "computed", ",",
"instance")
|
ok
|
|
211ms ago
|
616.9us |
| record: node:filesystem_usage_ratio
expr: label_join(1
- (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}),
"computed", ",", "instance")
|
ok
|
|
211ms ago
|
567.1us |
| record: node:gpu_used
expr: label_join(sum
by(instance) (container_accelerator_memory_used_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
103.5us |
| record: node:gpu_allocate
expr: label_join(sum
by(instance) (container_accelerator_memory_total_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
74.87us |
| record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
63.18us |
| record: node:cpu_usage_ratio:1m
expr: label_join(1
- avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])), "computed",
",", "instance")
|
ok
|
|
210ms ago
|
189.4us |
| record: node:load1
expr: label_join(node_load1,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
94.77us |
| record: node:load5
expr: label_join(node_load5,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
93.39us |
| record: node:load15
expr: label_join(node_load15,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
81.71us |
| record: node:mem_total
expr: label_join(node_memory_MemTotal_bytes,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
80.3us |
| record: node:mem_free
expr: label_join(node_memory_MemFree_bytes,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
87.6us |
| record: node:mem_available
expr: label_join(node_memory_MemAvailable_bytes,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
84.47us |
| record: node:mem_cache
expr: label_join(node_memory_Cached_bytes,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
87.28us |
| record: node:mem_buffers
expr: label_join(node_memory_Buffers_bytes,
"computed", ",", "instance")
|
ok
|
|
210ms ago
|
82.44us |
| record: node:gpu_used
expr: label_join(container_accelerator_memory_used_bytes{container_name="node-gpu-exporter"}, "computed", ",", "instance")
|
ok
|
|
210ms ago
|
44.2us |
|
11.628s ago |
4.4ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| record: pod:run_time:seconds
expr: label_join(time()
- kube_pod_created, "computed", ",", "pod")
|
ok
|
|
11.628s ago
|
132.7us |
| record: pod:filesystem_usage:bytes
expr: label_join(sum
by(pod_name, namespace) (container_fs_usage_bytes{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
11.628s ago
|
1.051ms |
| record: pod:cpu_reuqest
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="cpu"}),
"computed", ",", "pod")
|
ok
|
|
11.627s ago
|
85.93us |
| record: pod:memory_request
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="memory"}),
"computed", ",", "pod")
|
ok
|
|
11.627s ago
|
54.08us |
| record: pod:cpu_usage
expr: label_join(sum
by(namespace, pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",image!=""}[1m])),
"computed", ",", "pod_name")
|
ok
|
|
11.628s ago
|
1.411ms |
| record: pod:memory_usage_of_requests
expr: label_join(sum
by(pod_name) (container_memory_rss{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
11.626s ago
|
1.359ms |
| record: pod:gpu_smutil
expr: label_join(sum
without(container) (nvidia_gpu_process_smutil), "computed", ",",
"pod_name")
|
ok
|
|
11.625s ago
|
84.38us |
| record: pod:gpu_memutil
expr: label_join(sum
without(container) (nvidia_gpu_process_memutil), "computed", ",",
"pod_name")
|
ok
|
|
11.625s ago
|
60.2us |
| record: pod:gpu_decutil
expr: label_join(sum
without(container) (nvidia_gpu_process_decutil), "computed", ",",
"pod_name")
|
ok
|
|
11.625s ago
|
50.99us |
| record: pod:gpu_encutil
expr: label_join(sum
without(container) (nvidia_gpu_process_encutil), "computed", ",",
"pod_name")
|
ok
|
|
11.625s ago
|
42.58us |
| record: pod:gpu_graph
expr: label_join(sum
without(container) (nvidia_gpu_process_graph), "computed", ",",
"pod_name")
|
ok
|
|
11.625s ago
|
48.29us |