Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: Server Busy
expr: sum
by(appName) (join_queue_count) - sum by(appName) (join_queue_count offset 1m) >
10
for: 30s
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}'
summary: server busy的次数
|
ok
|
|
4.503s ago
|
256.7us |
alert: No Available Coordinator
expr: sum
by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"})
- sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"}
offset 5m) > 0
for: 5m
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.kubernetes_namespace }} {{ $labels.appName }} 5分钟内出现了 no
avail coordinator, 当前值 {{ $value }}'
summary: no avail coordinator 的次数
|
ok
|
|
4.503s ago
|
107.6us |
alert: Abnormal prod busy android count
expr: sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"})
< 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 如连续报警请检查 prepay instances
summary: funplus-singapore prod busy android 数量异常
|
ok
|
|
4.503s ago
|
426.7us |
alert: Prepay instance not ready
expr: not_ready_count
- (not_ready_count offset 1m) > 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 请检查 prepay instances in cluster {{ $labels.clusterName}}
summary: prepay instance in not ready state
|
ok
|
|
4.502s ago
|
71.33us |
alert: Apiserver is down?
expr: absent(apiserver_request_total)
== 1
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: apiserver may be unhealthy
summary: no apiserver_request_total metrics
|
ok
|
|
4.502s ago
|
2.555ms |
|
14.543s ago |
5.105ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: node:runtime:seconds
expr: label_join(time()
- node_boot_time_seconds, "computed", ",", "instance")
|
ok
|
|
11.017s ago
|
226.8us |
record: node:cpu_count
expr: label_join(count
by(instance) (count by(cpu, instance) (node_cpu_seconds_total{mode="system"})),
"computed", ",", "instance")
|
ok
|
|
11.016s ago
|
282.6us |
record: node:cpu_usage_ratio:1m
expr: label_join(1
- (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))), "computed",
",", "instance")
|
ok
|
|
11.016s ago
|
304.1us |
record: node:memory_usage_ratio
expr: label_join(((node_memory_MemTotal_bytes
- node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
/ (node_memory_MemTotal_bytes)), "computed", ",", "instance")
|
ok
|
|
11.016s ago
|
347.9us |
record: node:memory_usage_size
expr: label_join(node_memory_MemTotal_bytes
- (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes),
"computed", ",", "instance")
|
ok
|
|
11.016s ago
|
276.9us |
record: node:cpu_usage_ratio_system:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="system"}[1m])), "computed",
",", "instance")
|
ok
|
|
11.015s ago
|
276.6us |
record: node:cpu_usage_ratio_user:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="user"}[1m])), "computed",
",", "instance")
|
ok
|
|
11.015s ago
|
266.5us |
record: node:cpu_usage_ratio_iowait:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[1m])), "computed",
",", "instance")
|
ok
|
|
11.015s ago
|
252.1us |
record: node:network_receive:5m
expr: label_join(irate(node_network_receive_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
11.015s ago
|
1.565ms |
record: node:network_transmit:5m
expr: label_join(irate(node_network_transmit_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
11.013s ago
|
2.38ms |
record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
91.93us |
record: node:disk_write_iops:1m
expr: label_join(irate(node_disk_writes_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
72.02us |
record: node:disk_read:1m
expr: label_join(irate(node_disk_read_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
53.16us |
record: node:disk_write:1m
expr: label_join(irate(node_disk_written_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
42.61us |
record: node:read_disk_io_time:1m
expr: label_join(irate(node_disk_read_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
52.59us |
record: node:write_disk_io_time:1m
expr: label_join(irate(node_disk_write_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
41.78us |
record: node:disk_io_time:1m
expr: label_join(irate(node_disk_io_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
51.39us |
record: node:tcp_activeopens:1m
expr: label_join(irate(node_netstat_Tcp_ActiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
132.1us |
record: node:tcp_passiveopens:1m
expr: label_join(irate(node_netstat_Tcp_PassiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
11.011s ago
|
133.5us |
record: node:filesystem_usage_size
expr: label_join(node_filesystem_size_bytes{fstype=~"ext4|xfs"}
- node_filesystem_avail_bytes{fstype=~"ext4|xfs"}, "computed", ",",
"instance")
|
ok
|
|
11.011s ago
|
695.4us |
record: node:filesystem_usage_ratio
expr: label_join(1
- (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}),
"computed", ",", "instance")
|
ok
|
|
11.01s ago
|
693.9us |
record: node:gpu_used
expr: label_join(sum
by(instance) (container_accelerator_memory_used_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
11.01s ago
|
100.7us |
record: node:gpu_allocate
expr: label_join(sum
by(instance) (container_accelerator_memory_total_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
75.85us |
record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
56.95us |
record: node:cpu_usage_ratio:1m
expr: label_join(1
- avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])), "computed",
",", "instance")
|
ok
|
|
11.009s ago
|
318.1us |
record: node:load1
expr: label_join(node_load1,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
126.4us |
record: node:load5
expr: label_join(node_load5,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
138.4us |
record: node:load15
expr: label_join(node_load15,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
107.9us |
record: node:mem_total
expr: label_join(node_memory_MemTotal_bytes,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
112.2us |
record: node:mem_free
expr: label_join(node_memory_MemFree_bytes,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
104.6us |
record: node:mem_available
expr: label_join(node_memory_MemAvailable_bytes,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
98.64us |
record: node:mem_cache
expr: label_join(node_memory_Cached_bytes,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
103.7us |
record: node:mem_buffers
expr: label_join(node_memory_Buffers_bytes,
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
100.7us |
record: node:gpu_used
expr: label_join(container_accelerator_memory_used_bytes{container_name="node-gpu-exporter"},
"computed", ",", "instance")
|
ok
|
|
11.009s ago
|
58.93us |
|
7.429s ago |
5.977ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: pod:run_time:seconds
expr: label_join(time()
- kube_pod_created, "computed", ",", "pod")
|
ok
|
|
7.429s ago
|
170.4us |
record: pod:filesystem_usage:bytes
expr: label_join(sum
by(pod_name, namespace) (container_fs_usage_bytes{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
7.429s ago
|
1.428ms |
record: pod:cpu_request
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="cpu"}),
"computed", ",", "pod")
|
ok
|
|
7.427s ago
|
87.01us |
record: pod:memory_request
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="memory"}),
"computed", ",", "pod")
|
ok
|
|
7.427s ago
|
50.81us |
record: pod:cpu_usage
expr: label_join(sum
by(namespace, pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",image!=""}[1m])),
"computed", ",", "pod_name")
|
ok
|
|
7.427s ago
|
1.97ms |
record: pod:memory_usage_of_requests
expr: label_join(sum
by(pod_name) (container_memory_rss{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
7.425s ago
|
1.839ms |
record: pod:gpu_smutil
expr: label_join(sum
without(container) (nvidia_gpu_process_smutil), "computed", ",",
"pod_name")
|
ok
|
|
7.423s ago
|
111.3us |
record: pod:gpu_memutil
expr: label_join(sum
without(container) (nvidia_gpu_process_memutil), "computed", ",",
"pod_name")
|
ok
|
|
7.423s ago
|
64.11us |
record: pod:gpu_decutil
expr: label_join(sum
without(container) (nvidia_gpu_process_decutil), "computed", ",",
"pod_name")
|
ok
|
|
7.423s ago
|
51.35us |
record: pod:gpu_encutil
expr: label_join(sum
without(container) (nvidia_gpu_process_encutil), "computed", ",",
"pod_name")
|
ok
|
|
7.423s ago
|
63.69us |
record: pod:gpu_graph
expr: label_join(sum
without(container) (nvidia_gpu_process_graph), "computed", ",",
"pod_name")
|
ok
|
|
7.423s ago
|
122.2us |