| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: Server
Busy
expr: sum
by(appName) (join_queue_count) - sum by(appName) (join_queue_count offset 1m) >
10
for: 30s
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}'
summary: server busy的次数
|
ok
|
|
10.836s ago
|
237us |
| alert: No
Available Coordinator
expr: sum
by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"})
- sum by(appName, kubernetes_namespace) (game_busy_count{source="no avail coordinator",trigger="user"}
offset 5m) > 0
for: 5m
labels:
app: android
severity: warning
type: application
annotations:
description: '{{ $labels.kubernetes_namespace }} {{ $labels.appName }} 5分钟内出现了 no
avail coordinator, 当前值 {{ $value }}'
summary: no avail coordinator 的次数
|
ok
|
|
10.836s ago
|
113.2us |
| alert: Abnormal
prod busy android count
expr: sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"})
< 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 如连续报警请检查 prepay instances
summary: funplus-singapore prod busy android 数量异常
|
ok
|
|
10.836s ago
|
514.5us |
| alert: Prepay
instance not ready
expr: not_ready_count
- (not_ready_count offset 1m) > 0
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: 请检查 prepay instances in cluster {{ $labels.clusterName}}
summary: prepay instance in not ready state
|
ok
|
|
10.835s ago
|
53.25us |
| alert: Apiserver
is down?
expr: absent(apiserver_request_total)
== 1
for: 3m
labels:
app: android
severity: warning
type: application
annotations:
description: apiserver may be unhealthy
summary: no apiserver_request_total metrics
|
ok
|
|
10.835s ago
|
3.397ms |
|
5.876s ago |
6.217ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: node:runtime:seconds
expr: label_join(time()
- node_boot_time_seconds, "computed", ",", "instance")
|
ok
|
|
2.349s ago
|
242.4us |
| record: node:cpu_count
expr: label_join(count
by(instance) (count by(cpu, instance) (node_cpu_seconds_total{mode="system"})),
"computed", ",", "instance")
|
ok
|
|
2.349s ago
|
420.1us |
| record: node:cpu_usage_ratio:1m
expr: label_join(1
- (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))), "computed",
",", "instance")
|
ok
|
|
2.348s ago
|
515.7us |
| record: node:memory_usage_ratio
expr: label_join(((node_memory_MemTotal_bytes
- node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
/ (node_memory_MemTotal_bytes)), "computed", ",", "instance")
|
ok
|
|
2.348s ago
|
635.3us |
| record: node:memory_usage_size
expr: label_join(node_memory_MemTotal_bytes
- (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes),
"computed", ",", "instance")
|
ok
|
|
2.347s ago
|
426.1us |
| record: node:cpu_usage_ratio_system:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="system"}[1m])), "computed",
",", "instance")
|
ok
|
|
2.347s ago
|
486.7us |
| record: node:cpu_usage_ratio_user:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="user"}[1m])), "computed",
",", "instance")
|
ok
|
|
2.346s ago
|
388.1us |
| record: node:cpu_usage_ratio_iowait:1m
expr: label_join(avg
by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[1m])), "computed",
",", "instance")
|
ok
|
|
2.346s ago
|
345.9us |
| record: node:network_receive:5m
expr: label_join(irate(node_network_receive_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
2.346s ago
|
2.333ms |
| record: node:network_transmit:5m
expr: label_join(irate(node_network_transmit_bytes_total[5m])
* 8, "computed", ",", "instance")
|
ok
|
|
2.344s ago
|
2.395ms |
| record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
83.85us |
| record: node:disk_write_iops:1m
expr: label_join(irate(node_disk_writes_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
63.16us |
| record: node:disk_read:1m
expr: label_join(irate(node_disk_read_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
55.63us |
| record: node:disk_write:1m
expr: label_join(irate(node_disk_written_bytes_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
56.51us |
| record: node:read_disk_io_time:1m
expr: label_join(irate(node_disk_read_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
52.88us |
| record: node:write_disk_io_time:1m
expr: label_join(irate(node_disk_write_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
49.73us |
| record: node:disk_io_time:1m
expr: label_join(irate(node_disk_io_time_seconds_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
56.51us |
| record: node:tcp_activeopens:1m
expr: label_join(irate(node_netstat_Tcp_ActiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
183.5us |
| record: node:tcp_passiveopens:1m
expr: label_join(irate(node_netstat_Tcp_PassiveOpens[1m]),
"computed", ",", "instance")
|
ok
|
|
2.341s ago
|
182.3us |
| record: node:filesystem_usage_size
expr: label_join(node_filesystem_size_bytes{fstype=~"ext4|xfs"}
- node_filesystem_avail_bytes{fstype=~"ext4|xfs"}, "computed", ",",
"instance")
|
ok
|
|
2.341s ago
|
1.384ms |
| record: node:filesystem_usage_ratio
expr: label_join(1
- (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}),
"computed", ",", "instance")
|
ok
|
|
2.34s ago
|
1.103ms |
| record: node:gpu_used
expr: label_join(sum
by(instance) (container_accelerator_memory_used_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
2.339s ago
|
120.1us |
| record: node:gpu_allocate
expr: label_join(sum
by(instance) (container_accelerator_memory_total_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
/ count by(instance) (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
"computed", ",", "instance")
|
ok
|
|
2.339s ago
|
105.8us |
| record: node:disk_read_iops:1m
expr: label_join(irate(node_disk_reads_completed_total[1m]),
"computed", ",", "instance")
|
ok
|
|
2.339s ago
|
64.97us |
| record: node:cpu_usage_ratio:1m
expr: label_join(1
- avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])), "computed",
",", "instance")
|
ok
|
|
2.339s ago
|
407.3us |
| record: node:load1
expr: label_join(node_load1,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
185.1us |
| record: node:load5
expr: label_join(node_load5,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
167.1us |
| record: node:load15
expr: label_join(node_load15,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
158.2us |
| record: node:mem_total
expr: label_join(node_memory_MemTotal_bytes,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
124us |
| record: node:mem_free
expr: label_join(node_memory_MemFree_bytes,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
149.2us |
| record: node:mem_available
expr: label_join(node_memory_MemAvailable_bytes,
"computed", ",", "instance")
|
ok
|
|
2.338s ago
|
539.2us |
| record: node:mem_cache
expr: label_join(node_memory_Cached_bytes,
"computed", ",", "instance")
|
ok
|
|
2.337s ago
|
179.3us |
| record: node:mem_buffers
expr: label_join(node_memory_Buffers_bytes,
"computed", ",", "instance")
|
ok
|
|
2.337s ago
|
149.7us |
| record: node:gpu_used
expr: label_join(container_accelerator_memory_used_bytes{container_name="node-gpu-exporter"},
"computed", ",", "instance")
|
ok
|
|
2.337s ago
|
95.09us |
|
13.761s ago |
3.713ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: pod:run_time:seconds
expr: label_join(time()
- kube_pod_created, "computed", ",", "pod")
|
ok
|
|
13.761s ago
|
154us |
| record: pod:filesystem_usage:bytes
expr: label_join(sum
by(pod_name, namespace) (container_fs_usage_bytes{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
13.761s ago
|
1.046ms |
| record: pod:cpu_request
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="cpu"}),
"computed", ",", "pod")
|
ok
|
|
13.76s ago
|
67.87us |
| record: pod:memory_request
expr: label_join(sum
by(pod, namespace) (kube_pod_container_resource_requests{resource="memory"}),
"computed", ",", "pod")
|
ok
|
|
13.76s ago
|
44.31us |
| record: pod:cpu_usage
expr: label_join(sum
by(namespace, pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",image!=""}[1m])),
"computed", ",", "pod_name")
|
ok
|
|
13.76s ago
|
1.247ms |
| record: pod:memory_usage_of_requests
expr: label_join(sum
by(pod_name) (container_memory_rss{container_name!="POD",image!=""}),
"computed", ",", "pod_name")
|
ok
|
|
13.759s ago
|
887.5us |
| record: pod:gpu_smutil
expr: label_join(sum
without(container) (nvidia_gpu_process_smutil), "computed", ",",
"pod_name")
|
ok
|
|
13.758s ago
|
68.36us |
| record: pod:gpu_memutil
expr: label_join(sum
without(container) (nvidia_gpu_process_memutil), "computed", ",",
"pod_name")
|
ok
|
|
13.758s ago
|
41.25us |
| record: pod:gpu_decutil
expr: label_join(sum
without(container) (nvidia_gpu_process_decutil), "computed", ",",
"pod_name")
|
ok
|
|
13.758s ago
|
42.53us |
| record: pod:gpu_encutil
expr: label_join(sum
without(container) (nvidia_gpu_process_encutil), "computed", ",",
"pod_name")
|
ok
|
|
13.758s ago
|
43.94us |
| record: pod:gpu_graph
expr: label_join(sum
without(container) (nvidia_gpu_process_graph), "computed", ",",
"pod_name")
|
ok
|
|
13.758s ago
|
50.86us |