You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1484 lines
51 KiB

# Zabbix configuration export (format version 7.0) that carries the
# "Etcd by HTTP" template.
zabbix_export:
  version: '7.0'
  # Template group that the template's `groups` list refers to by name.
  template_groups:
    - uuid: a571c0d144b14fd4a87a9d9b2aa9fcd6
      name: Templates/Applications
  templates:
    - uuid: b25b8b517a4743c48037bfa10af3dc3c
      template: 'Etcd by HTTP'
      name: 'Etcd by HTTP'
      description: |
        Get Etcd metrics by HTTP agent.
        Generated by official Zabbix template tool "Templator" 2.0.0
      vendor:
        name: Zabbix
        version: 7.0-0
      groups:
        - name: Templates/Applications
items:
# Cluster version string, read from the `etcdcluster` field of the document
# returned by the etcd.get_version master item.
- uuid: 32a59c8e93e141d6a471266df6dbfbd3
  name: 'Etcd: Cluster version'
  type: DEPENDENT
  key: etcd.cluster.version
  delay: '0'
  history: 7d
  trends: '0'
  value_type: CHAR
  description: 'The version of the `etcd cluster`.'
  preprocessing:
    - type: JSONPATH
      parameters:
        - $.etcdcluster
    # Unchanged values are discarded, with a 1d heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 1d
  master_item:
    key: etcd.get_version
  tags:
    - tag: component
      value: application
  triggers:
    # Compares the two most recent values and requires the latest to be
    # non-empty; the problem is closed manually.
    - uuid: 7c87424c4fb34c56bc9b976755b4ec02
      expression: 'last(/Etcd by HTTP/etcd.cluster.version,#1)<>last(/Etcd by HTTP/etcd.cluster.version,#2) and length(last(/Etcd by HTTP/etcd.cluster.version))>0'
      name: 'Etcd: Cluster version has changed'
      event_name: 'Etcd: Cluster version has changed (new version: {ITEM.VALUE})'
      priority: INFO
      description: 'Etcd version has changed. Acknowledge to close the problem manually.'
      manual_close: 'YES'
      tags:
        - tag: scope
          value: notice
# Per-second change of the `process_cpu_seconds_total` counter taken from
# the etcd.get_metrics master item.
- uuid: 04b0fa552b7d4267b4c5b67ee82ef5f1
  name: 'Etcd: CPU'
  type: DEPENDENT
  key: etcd.cpu.util
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: s
  description: 'The total user and system CPU time spent in seconds.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_cpu_seconds_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: cpu
# Database size in bytes: the raw value of
# `etcd_mvcc_db_total_size_in_bytes` from the etcd.get_metrics master item.
- uuid: bfa5bd42637642808802f7b2485a0c4d
  name: 'Etcd: DB size'
  type: DEPENDENT
  key: etcd.db.size
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: B
  description: 'The total size of the underlying database.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_mvcc_db_total_size_in_bytes
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: database
# Delete rate: per-second change of the `etcd_mvcc_delete_total` counter.
- uuid: a5bc7ffb090641ab92f537b38b6055e5
  name: 'Etcd: Deletes per second'
  type: DEPENDENT
  key: etcd.delete.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of deletes seen by this member per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_mvcc_delete_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: operations
# Pending events: the raw value of `etcd_debugging_mvcc_pending_events_total`.
# No rate step is applied, despite the ".rate" suffix in the item key.
- uuid: 183843bd93f84dc887a03fb638b2d323
  name: 'Etcd: Pending events'
  type: DEPENDENT
  key: etcd.events.sent.rate
  delay: '0'
  history: 7d
  description: 'The total number of pending events to be sent.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_debugging_mvcc_pending_events_total
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: events
# Master item: fetches the `/metrics` endpoint with basic authentication.
# History and trends are disabled ('0'); dependent items consume the text.
- uuid: 34ffab33275a400ab88e5217dee5ef96
  name: 'Etcd: Get node metrics'
  type: HTTP_AGENT
  key: etcd.get_metrics
  history: '0'
  trends: '0'
  value_type: TEXT
  authtype: BASIC
  username: '{$ETCD.USER}'
  password: '{$ETCD.PASSWORD}'
  url: '{$ETCD.SCHEME}://{HOST.CONN}:{$ETCD.PORT}/metrics'
  tags:
    - tag: component
      value: raw
# Master item: fetches the `/version` endpoint; the version items that name
# etcd.get_version as their master item parse this text.
- uuid: fdf7593420ab42b2a5af8f8b8030b517
  name: 'Etcd: Get version'
  type: HTTP_AGENT
  key: etcd.get_version
  history: '0'
  trends: '0'
  value_type: TEXT
  # Same basic-auth settings as the other HTTP agent items of this template
  # (etcd.get_metrics, etcd.health), which query the same scheme/host/port;
  # without them only the version items would fail on a protected endpoint.
  authtype: BASIC
  username: '{$ETCD.USER}'
  password: '{$ETCD.PASSWORD}'
  url: '{$ETCD.SCHEME}://{HOST.CONN}:{$ETCD.PORT}/version'
  tags:
    - tag: component
      value: application
# Received gRPC message rate: every `grpc_server_msg_received_total` series
# is converted to JSON, summed in JavaScript, then turned into a rate.
- uuid: d23baf75628043e193ba0a607e1b4215
  name: 'Etcd: RPCs received per second'
  type: DEPENDENT
  key: etcd.grpc.received.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of RPC stream messages received on the server.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - grpc_server_msg_received_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: grpc
# Sent gRPC message rate: every `grpc_server_msg_sent_total` series is
# converted to JSON, summed in JavaScript, then turned into a rate.
- uuid: eda81182710e47e1b5f2f21bb05b4775
  name: 'Etcd: RPCs sent per second'
  type: DEPENDENT
  key: etcd.grpc.sent.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of gRPC stream messages sent by the server.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - grpc_server_msg_sent_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: grpc
# Started RPC rate: every `grpc_server_started_total` series is converted
# to JSON, summed in JavaScript, then turned into a rate.
- uuid: bf59a130b20d480d93eb9330750e8e28
  name: 'Etcd: RPCs started per second'
  type: DEPENDENT
  key: etcd.grpc.started.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of RPCs started on the server.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - grpc_server_started_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: grpc
# Leader presence flag: the raw value of `etcd_server_has_leader`,
# displayed through the 'Etcd leader' value map.
- uuid: e03575f4c472410eb6fbcf731ac6aab2
  name: 'Etcd: Server has a leader'
  type: DEPENDENT
  key: etcd.has.leader
  delay: '0'
  history: 7d
  description: |
    It defines - whether or not a leader exists:
    1 - it exists;
    0 - it does not.
  valuemap:
    name: 'Etcd leader'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_has_leader
        - value
        - ''
    # Unchanged values are discarded, with a 10m heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 10m
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: leader
  triggers:
    # Problem while the latest value is 0.
    - uuid: 20165719d521453bb239d818ac57805c
      expression: 'last(/Etcd by HTTP/etcd.has.leader)=0'
      name: 'Etcd: Member has no leader'
      priority: AVERAGE
      description: 'If a member does not have a leader, it is totally unavailable.'
      tags:
        - tag: scope
          value: availability
# Health check: queries `/health` with basic authentication, extracts
# `$.health` and converts the boolean to a number for the
# 'Etcd healthcheck' value map.
- uuid: 3fde4db8b9684ba4b56ba915e48957b5
  name: 'Etcd: Node health'
  type: HTTP_AGENT
  key: etcd.health
  history: 7d
  authtype: BASIC
  username: '{$ETCD.USER}'
  password: '{$ETCD.PASSWORD}'
  valuemap:
    name: 'Etcd healthcheck'
  preprocessing:
    - type: JSONPATH
      parameters:
        - $.health
    # If the conversion step fails, the custom value 0 is used instead.
    - type: BOOL_TO_DECIMAL
      parameters:
        - ''
      error_handler: CUSTOM_VALUE
      error_handler_params: '0'
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 10m
  url: '{$ETCD.SCHEME}://{HOST.CONN}:{$ETCD.PORT}/health'
  tags:
    - tag: component
      value: health
  triggers:
    # Depends on 'Etcd: Service is unavailable' (the TCP port check trigger).
    - uuid: 6acda7bdc9df4a4ab5b7cca76c6369f0
      expression: 'last(/Etcd by HTTP/etcd.health)=0'
      name: 'Etcd: Node healthcheck failed'
      opdata: 'Current state: {ITEM.LASTVALUE1}'
      priority: AVERAGE
      description: 'See more details on https://etcd.io/docs/v3.5/op-guide/monitoring/#health-check.'
      dependencies:
        - name: 'Etcd: Service is unavailable'
          expression: 'last(/Etcd by HTTP/net.tcp.service["{$ETCD.SCHEME}","{HOST.CONN}","{$ETCD.PORT}"])=0'
      tags:
        - tag: scope
          value: availability
# HTTP 4xx failure rate: `etcd_http_failed_total` series whose `code` label
# matches "4.+" are summed and converted to a per-second rate.
- uuid: 923a408dd4514e808b6e2137a94f8140
  name: 'Etcd: HTTP 4XX'
  type: DEPENDENT
  key: etcd.http.requests.4xx.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of handled failures of requests (non-watches), by the method (`GET/PUT` etc.), and the code `4XX`.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - 'etcd_http_failed_total{code=~"4.+"}'
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: http
    - tag: http-code
      value: 4xx
# HTTP 5xx failure rate: `etcd_http_failed_total` series whose `code` label
# matches "5.+" are summed and converted to a per-second rate.
- uuid: c0f27d4bfba344079a31ce8c10b22683
  name: 'Etcd: HTTP 5XX'
  type: DEPENDENT
  key: etcd.http.requests.5xx.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of handled failures of requests (non-watches), by the method (`GET/PUT` etc.), and the code `5XX`.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - 'etcd_http_failed_total{code=~"5.+"}'
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: http
    - tag: http-code
      value: 5xx
  triggers:
    # Problem when the 5-minute minimum of the rate exceeds the macro.
    - uuid: 0302957e0f6b43389546e1cfb958ed9c
      expression: 'min(/Etcd by HTTP/etcd.http.requests.5xx.rate,5m)>{$ETCD.HTTP.FAIL.MAX.WARN}'
      name: 'Etcd: Too many HTTP requests failures'
      event_name: 'Etcd: Too many HTTP requests failures (over {$ETCD.HTTP.FAIL.MAX.WARN} for 5m)'
      priority: WARNING
      description: 'Too many requests failed on `etcd` instance with the `5xx HTTP code`.'
      tags:
        - tag: scope
          value: availability
# Received HTTP request rate: every `etcd_http_received_total` series is
# summed and converted to a per-second rate.
- uuid: 2a19db1c58ee4a509061fcb1b557c1a3
  name: 'Etcd: HTTP requests received'
  type: DEPENDENT
  key: etcd.http.requests.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of requests received into the system (successfully parsed and `authd`).'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - etcd_http_received_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: http
# Leadership flag of this member: the raw value of `etcd_server_is_leader`,
# displayed through the 'Etcd leader' value map.
- uuid: b3760811472440baad6a338f481ba13a
  name: 'Etcd: Server is a leader'
  type: DEPENDENT
  key: etcd.is.leader
  delay: '0'
  history: 7d
  description: |
    It defines - whether or not this member is a leader:
    1 - it is;
    0 - otherwise.
  valuemap:
    name: 'Etcd leader'
  preprocessing:
    # If the pattern step fails, the custom value 0 is used instead.
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_is_leader
        - value
        - ''
      error_handler: CUSTOM_VALUE
      error_handler_params: '0'
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 10m
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: leader
  triggers:
    # No-data check over 30m on this item; depends on the TCP port trigger.
    - uuid: 44d66eb1a332418daf4c3a1110db5458
      expression: 'nodata(/Etcd by HTTP/etcd.is.leader,30m)=1'
      name: 'Etcd: Failed to fetch info data'
      event_name: 'Etcd: Failed to fetch info data (or no data for 30m)'
      priority: WARNING
      description: 'Zabbix has not received any data for items for the last 30 minutes.'
      manual_close: 'YES'
      dependencies:
        - name: 'Etcd: Service is unavailable'
          expression: 'last(/Etcd by HTTP/net.tcp.service["{$ETCD.SCHEME}","{HOST.CONN}","{$ETCD.PORT}"])=0'
      tags:
        - tag: scope
          value: notice
# Compaction rate: per-second change of
# `etcd_debugging_mvcc_db_compaction_keys_total`.
- uuid: ecd1ae9c038f4fc2b720ad562ced0191
  name: 'Etcd: Keys compacted per second'
  type: DEPENDENT
  key: etcd.keys.compacted.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of DB keys compacted per second.'
  preprocessing:
    # If the pattern step fails, the custom value 0 is used instead.
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_debugging_mvcc_db_compaction_keys_total
        - value
        - ''
      error_handler: CUSTOM_VALUE
      error_handler_params: '0'
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: keys
# Key expiry rate: per-second change of `etcd_debugging_store_expires_total`.
- uuid: a3f910efb0a04cc494c07b8703f9d2ec
  name: 'Etcd: Keys expired per second'
  type: DEPENDENT
  key: etcd.keys.expired.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of expired keys per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_debugging_store_expires_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: keys
# Key count: the raw value of `etcd_debugging_mvcc_keys_total`.
- uuid: fbda737014544cf1bcf544a48aa6e48b
  name: 'Etcd: Keys total'
  type: DEPENDENT
  key: etcd.keys.total
  delay: '0'
  history: 7d
  description: 'The total number of keys.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_debugging_mvcc_keys_total
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: keys
# Leader change counter: the raw value of
# `etcd_server_leader_changes_seen_total` from the etcd.get_metrics master item.
- uuid: e45ba61d99b8432b86f5797a2cfdb416
  name: 'Etcd: Leader changes'
  type: DEPENDENT
  key: etcd.leader.changes
  delay: '0'
  history: 7d
  description: 'The number of leader changes the member has seen since its start.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_leader_changes_seen_total
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: leader
  triggers:
    # Problem when the counter grew by more than the macro within 15m
    # (max minus min over the window).
    - uuid: 1ea623927179489890a5a73eeb8177f7
      expression: '(max(/Etcd by HTTP/etcd.leader.changes,15m)-min(/Etcd by HTTP/etcd.leader.changes,15m))>{$ETCD.LEADER.CHANGES.MAX.WARN}'
      name: 'Etcd: Instance has seen too many leader changes'
      # The scalar must end with a single closing quote: a trailing `'''`
      # is read as an escaped apostrophe plus the terminator, which appends
      # a stray "'" to the event name.
      event_name: 'Etcd: Instance has seen too many leader changes (over {$ETCD.LEADER.CHANGES.MAX.WARN} for 15m)'
      priority: WARNING
      description: 'Rapid leadership changes impact the performance of `etcd` significantly. It also signals that the leader is unstable, perhaps due to network connectivity issues or excessive load hitting the `etcd cluster`.'
      tags:
        - tag: scope
          value: availability
# File descriptor limit: the raw value of `process_max_fds`. The
# template-level open-files trigger divides etcd.open.fds by this item.
- uuid: 348e15d2ec3a4bb88e2ca371f96c2f00
  name: 'Etcd: Maximum open file descriptors'
  type: DEPENDENT
  key: etcd.max.fds
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The Maximum number of open file descriptors.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_max_fds
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: fds
# Client gRPC inbound traffic: per-second change of
# `etcd_network_client_grpc_received_bytes_total`.
- uuid: d016b8674ebd4251943f2e94b22f5ff2
  name: 'Etcd: Client gRPC received bytes per second'
  type: DEPENDENT
  key: etcd.network.grpc.received.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: Bps
  description: 'The number of bytes received from gRPC clients per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_network_client_grpc_received_bytes_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: grpc
# Client gRPC outbound traffic: per-second change of
# `etcd_network_client_grpc_sent_bytes_total`.
- uuid: e50d2d088c6448dbb3ecaeebc3b2b8f1
  name: 'Etcd: Client gRPC sent bytes per second'
  type: DEPENDENT
  key: etcd.network.grpc.sent.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: Bps
  # The counter tracks bytes sent TO gRPC clients; the previous wording
  # ("sent from gRPC clients") duplicated the meaning of the received item.
  description: 'The number of bytes sent to gRPC clients per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_network_client_grpc_sent_bytes_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: grpc
# Open file descriptors: the raw value of `process_open_fds`. Used as the
# numerator of the template-level open-files trigger.
- uuid: d5099d4cdb3044ba95935c2aea2b6352
  name: 'Etcd: Open file descriptors'
  type: DEPENDENT
  key: etcd.open.fds
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of open file descriptors.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_open_fds
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: fds
# Applied proposal rate: per-second change of
# `etcd_server_proposals_applied_total`.
- uuid: a2927b1e85af41cab9c28b1b79c229ea
  name: 'Etcd: Proposals applied per second'
  type: DEPENDENT
  key: etcd.proposals.applied.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of consensus proposals applied.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_proposals_applied_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: proposals
# Committed proposal rate: per-second change of
# `etcd_server_proposals_committed_total`.
- uuid: e829f3df055e42dfbce5f27eb7ca487c
  name: 'Etcd: Proposals committed per second'
  type: DEPENDENT
  key: etcd.proposals.committed.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of consensus proposals committed.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_proposals_committed_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: proposals
# Failed proposal rate: per-second change of
# `etcd_server_proposals_failed_total`.
- uuid: 6b45b99526394a219d31b5c22cb98c85
  name: 'Etcd: Proposals failed per second'
  type: DEPENDENT
  key: etcd.proposals.failed.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  description: 'The number of failed proposals seen.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_proposals_failed_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: proposals
  triggers:
    # Problem when the 5-minute minimum of the rate exceeds the macro.
    - uuid: 432cea8bddd742ca98528be2fbc7e35e
      expression: 'min(/Etcd by HTTP/etcd.proposals.failed.rate,5m)>{$ETCD.PROPOSAL.FAIL.MAX.WARN}'
      name: 'Etcd: Too many proposal failures'
      # The scalar must end with a single closing quote: a trailing `'''`
      # is read as an escaped apostrophe plus the terminator, which appends
      # a stray "'" to the event name.
      event_name: 'Etcd: Too many proposal failures (over {$ETCD.PROPOSAL.FAIL.MAX.WARN} for 5m)'
      priority: WARNING
      description: 'Normally related to two issues: temporary failures related to a leader election or longer downtime caused by a loss of quorum in the cluster.'
      tags:
        - tag: scope
          value: performance
# Pending proposals: the raw value of `etcd_server_proposals_pending`.
- uuid: 1c506ff69e7b4564a6d95fd35b1a11fd
  name: 'Etcd: Proposals pending'
  type: DEPENDENT
  key: etcd.proposals.pending
  delay: '0'
  history: 7d
  description: 'The current number of pending proposals to commit.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_server_proposals_pending
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: proposals
  triggers:
    # Problem when the 5-minute minimum of the value exceeds the macro.
    - uuid: 5feefc4dd5d14fe2b56dd63029b57026
      expression: 'min(/Etcd by HTTP/etcd.proposals.pending,5m)>{$ETCD.PROPOSAL.PENDING.MAX.WARN}'
      name: 'Etcd: Too many proposals are queued to commit'
      # The scalar must end with a single closing quote: a trailing `'''`
      # is read as an escaped apostrophe plus the terminator, which appends
      # a stray "'" to the event name.
      event_name: 'Etcd: Too many proposals are queued to commit (over {$ETCD.PROPOSAL.PENDING.MAX.WARN} for 5m)'
      priority: WARNING
      description: 'Rising pending proposals suggests there is a high client load, or the member cannot commit proposals.'
      tags:
        - tag: scope
          value: performance
# Put rate: per-second change of the `etcd_mvcc_put_total` counter.
- uuid: bd7398507c274bfab53339380df16761
  name: 'Etcd: PUT per second'
  type: DEPENDENT
  key: etcd.put.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of puts seen by this member per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_mvcc_put_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: operations
# Range rate: per-second change of `etcd_debugging_mvcc_range_total`.
- uuid: b744c07f3290467b96b21ea38ad5d497
  name: 'Etcd: Range per second'
  type: DEPENDENT
  key: etcd.range.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of ranges seen by this member per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - etcd_debugging_mvcc_range_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: operations
# Store read rate: every `etcd_debugging_store_reads_total` series is
# summed and converted to a per-second rate.
- uuid: 88c91b36eca94fd2b357a67d171dc621
  name: 'Etcd: Reads per second'
  type: DEPENDENT
  key: etcd.reads.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of read actions by `get/getRecursive`, local to this member.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - etcd_debugging_store_reads_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          //calculates total reads
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: store
# Resident memory in bytes: the raw value of `process_resident_memory_bytes`.
- uuid: 4b881e32094e4f478c5d0849cb5d07a7
  name: 'Etcd: Resident memory'
  type: DEPENDENT
  key: etcd.res.bytes
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: B
  description: 'The size of resident memory expressed in bytes.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_resident_memory_bytes
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: memory
# Server version string, read from the `etcdserver` field of the document
# returned by the etcd.get_version master item.
- uuid: dee9ed8897cf4d3582957707ea09cdf8
  name: 'Etcd: Server version'
  type: DEPENDENT
  key: etcd.server.version
  delay: '0'
  history: 7d
  trends: '0'
  value_type: CHAR
  description: 'The version of the `etcd server`.'
  preprocessing:
    - type: JSONPATH
      parameters:
        - $.etcdserver
    # Unchanged values are discarded, with a 1d heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 1d
  master_item:
    key: etcd.get_version
  tags:
    - tag: component
      value: application
  triggers:
    # Compares the two most recent values and requires the latest to be
    # non-empty; the problem is closed manually.
    - uuid: cfb2fc467b224ef694d59b5c081ed965
      expression: 'last(/Etcd by HTTP/etcd.server.version,#1)<>last(/Etcd by HTTP/etcd.server.version,#2) and length(last(/Etcd by HTTP/etcd.server.version))>0'
      name: 'Etcd: Server version has changed'
      event_name: 'Etcd: Server version has changed (new version: {ITEM.VALUE})'
      priority: INFO
      description: 'Etcd version has changed. Acknowledge to close the problem manually.'
      manual_close: 'YES'
      tags:
        - tag: scope
          value: notice
# Transaction rate: per-second change of the transaction counter taken
# from the etcd.get_metrics master item.
- uuid: b14c787c716146e990bc388d277a2803
  name: 'Etcd: Transaction per second'
  type: DEPENDENT
  key: etcd.txn.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of transactions seen by this member per second.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        # Must be the txn counter. The range counter
        # (etcd_debugging_mvcc_range_total) is already collected by
        # etcd.range.rate, so reading it here duplicated that item.
        # NOTE(review): newer etcd releases also expose this counter as
        # `etcd_mvcc_txn_total` - confirm which name the monitored
        # version provides.
        - etcd_debugging_mvcc_txn_total
        - value
        - ''
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: transactions
# Uptime in seconds: the current Unix time minus `process_start_time_seconds`.
- uuid: 98ec9085d621446aa462efc86cf93905
  name: 'Etcd: Uptime'
  type: DEPENDENT
  key: etcd.uptime
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: s
  description: '`Etcd` server uptime.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_start_time_seconds
        - value
        - ''
    # Subtracts the start timestamp from Date.now() floored to whole seconds.
    - type: JAVASCRIPT
      parameters:
        - |
          //use boottime to calculate uptime
          return (Math.floor(Date.now()/1000)-Number(value));
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: application
  triggers:
    # Problem while the latest uptime is below 10m; closed manually.
    - uuid: 6843369969f5410a840494104d71fe1f
      expression: 'last(/Etcd by HTTP/etcd.uptime)<10m'
      name: 'Etcd: Host has been restarted'
      event_name: 'Etcd: {HOST.NAME} has been restarted (uptime < 10m)'
      priority: INFO
      description: 'Uptime is less than 10 minutes.'
      manual_close: 'YES'
      tags:
        - tag: scope
          value: notice
# Virtual memory in bytes: the raw value of `process_virtual_memory_bytes`.
- uuid: c35810b8b7bc4a62970b5293fb2d8fb2
  name: 'Etcd: Virtual memory'
  type: DEPENDENT
  key: etcd.virtual.bytes
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: B
  description: 'The size of virtual memory expressed in bytes.'
  preprocessing:
    - type: PROMETHEUS_PATTERN
      parameters:
        - process_virtual_memory_bytes
        - value
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: memory
# Store write rate: every `etcd_debugging_store_writes_total` series is
# summed and converted to a per-second rate.
- uuid: 16c041fc189248bfaaa5826ffaf38459
  name: 'Etcd: Writes per second'
  type: DEPENDENT
  key: etcd.writes.rate
  delay: '0'
  history: 7d
  value_type: FLOAT
  units: rps
  description: 'The number of writes (e.g., `set/compareAndDelete`) seen by this member.'
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - etcd_debugging_store_writes_total
    # Adds up the `value` field of every element, starting from 0.
    - type: JAVASCRIPT
      parameters:
        - |
          var valueArr = JSON.parse(value);
          return valueArr.reduce(function(acc,obj){
          return acc + parseFloat(obj['value'])
          },0);
    - type: CHANGE_PER_SECOND
      parameters:
        - ''
  master_item:
    key: etcd.get_metrics
  tags:
    - tag: component
      value: store
# Simple check of the etcd TCP port, displayed through the 'Service state'
# value map. Its trigger is the dependency named by the health and
# no-data triggers of this template.
- uuid: a0f94f429b99432e86d15ffa74d6eada
  name: 'Etcd: Service''s TCP port state'
  type: SIMPLE
  key: 'net.tcp.service["{$ETCD.SCHEME}","{HOST.CONN}","{$ETCD.PORT}"]'
  history: 7d
  valuemap:
    name: 'Service state'
  preprocessing:
    # Unchanged values are discarded, with a 10m heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 10m
  tags:
    - tag: component
      value: health
    - tag: component
      value: network
  triggers:
    # Problem while the latest check result is 0; closed manually.
    - uuid: 74164f0783ae4227ba44f3e865fee3bd
      expression: 'last(/Etcd by HTTP/net.tcp.service["{$ETCD.SCHEME}","{HOST.CONN}","{$ETCD.PORT}"])=0'
      name: 'Etcd: Service is unavailable'
      priority: AVERAGE
      manual_close: 'YES'
      tags:
        - tag: scope
          value: availability
discovery_rules:
# Low-level discovery of gRPC status codes. The rule's JavaScript step
# emits one {#GRPC.CODE} per distinct `grpc_code` label found in the
# `grpc_server_handled_total` series of the etcd.get_metrics master item.
- uuid: 5e6121383e5d4f3eb1150a2068a4633b
  name: 'gRPC codes discovery'
  type: DEPENDENT
  key: etcd.grpc_code.discovery
  delay: '0'
  # A code is kept when it does not match the NOT_MATCHES macro AND
  # satisfies the MATCHES macro condition.
  filter:
    evaltype: AND
    conditions:
      - macro: '{#GRPC.CODE}'
        value: '{$ETCD.GRPC_CODE.NOT_MATCHES}'
        operator: NOT_MATCHES_REGEX
        formulaid: A
      - macro: '{#GRPC.CODE}'
        value: '{$ETCD.GRPC_CODE.MATCHES}'
        formulaid: B
  item_prototypes:
    # Per-code completion rate: matching series are summed and converted
    # to a per-second rate.
    - uuid: 7d316cbec2ce4718ac133d90b7a89585
      name: 'Etcd: RPCs completed with code {#GRPC.CODE}'
      type: DEPENDENT
      key: 'etcd.grpc.handled.rate[{#GRPC.CODE}]'
      delay: '0'
      history: 7d
      value_type: FLOAT
      units: rps
      description: 'The number of RPCs completed on the server with grpc_code {#GRPC.CODE}.'
      preprocessing:
        - type: PROMETHEUS_TO_JSON
          parameters:
            # {#GRPC.CODE} is built from the `grpc_code` label by the
            # discovery script below, so the selector must filter on
            # `grpc_code`; filtering on `grpc_method` never matched a
            # discovered code.
            - 'grpc_server_handled_total{grpc_code="{#GRPC.CODE}"}'
        # Adds up the `value` field of every element, starting from 0.
        - type: JAVASCRIPT
          parameters:
            - |
              var valueArr = JSON.parse(value);
              return valueArr.reduce(function(acc,obj){
              return acc + parseFloat(obj['value'])
              },0);
        - type: CHANGE_PER_SECOND
          parameters:
            - ''
      master_item:
        key: etcd.get_metrics
      tags:
        - tag: component
          value: grpc
        - tag: grpc-code
          value: '{#GRPC.CODE}'
      trigger_prototypes:
        - uuid: 459b6ee5735047d597a6a4ab41b76e21
          expression: 'min(/Etcd by HTTP/etcd.grpc.handled.rate[{#GRPC.CODE}],5m)>{$ETCD.GRPC.ERRORS.MAX.WARN}'
          name: 'Etcd: Too many failed gRPC requests with code: {#GRPC.CODE}'
          event_name: 'Etcd: Too many failed gRPC requests with code: {#GRPC.CODE} (over {$ETCD.GRPC.ERRORS.MAX.WARN} in 5m)'
          # Not discovered by default: the "trigger" override below turns
          # discovery on only for codes matching
          # {$ETCD.GRPC_CODE.TRIGGER.MATCHES}. Without this line the
          # override has no effect and a "failed requests" trigger is
          # created for every discovered code.
          discover: NO_DISCOVER
          priority: WARNING
          tags:
            - tag: scope
              value: availability
  master_item:
    key: etcd.get_metrics
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - grpc_server_handled_total
    # Collects each distinct `labels.grpc_code` once, in first-seen order,
    # and returns the list as LLD JSON.
    - type: JAVASCRIPT
      parameters:
        - |
          var data = JSON.parse(value),
          lookup = {},
          result = [];
          for (var item, i = 0; item = data[i++];) {
          var code = item.labels.grpc_code;
          if (!(code in lookup)) {
          lookup[code] = 1;
          result.push({ "{#GRPC.CODE}": code });
          }
          }
          return JSON.stringify(result);
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 1h
  overrides:
    # Enables and discovers the trigger prototype whose name contains
    # 'Too many failed gRPC requests' for codes matching the macro.
    - name: trigger
      step: '1'
      filter:
        conditions:
          - macro: '{#GRPC.CODE}'
            value: '{$ETCD.GRPC_CODE.TRIGGER.MATCHES}'
            formulaid: A
      operations:
        - operationobject: TRIGGER_PROTOTYPE
          operator: LIKE
          value: 'Too many failed gRPC requests'
          status: ENABLED
          discover: DISCOVER
# Low-level discovery of cluster peers. {#ETCD.PEER} is taken from the `To`
# label of the `etcd_network_peer_sent_bytes_total` series found in the
# etcd.get_metrics master item.
- uuid: b7b527ee30b84a569afcd1f85b705810
  name: 'Peers discovery'
  type: DEPENDENT
  key: etcd.peer.discovery
  delay: '0'
  item_prototypes:
    # Inbound peer traffic: selected by the `From` label.
    - uuid: 4129aa7b8acf4ca3b5476461fe5275c9
      name: 'Etcd: Etcd peer {#ETCD.PEER}: Bytes received'
      type: DEPENDENT
      key: 'etcd.bytes.received.rate[{#ETCD.PEER}]'
      delay: '0'
      history: 7d
      value_type: FLOAT
      units: Bps
      description: 'The number of bytes received from a peer with the ID `{#ETCD.PEER}`.'
      preprocessing:
        # If the pattern step fails, the custom value 0 is used instead.
        - type: PROMETHEUS_PATTERN
          parameters:
            - 'etcd_network_peer_received_bytes_total{From="{#ETCD.PEER}"}'
            - value
            - ''
          error_handler: CUSTOM_VALUE
          error_handler_params: '0'
        - type: CHANGE_PER_SECOND
          parameters:
            - ''
      master_item:
        key: etcd.get_metrics
      tags:
        - tag: component
          value: network
        - tag: component
          value: peers
        - tag: peer
          value: '{#ETCD.PEER}'
    # Outbound peer traffic: selected by the `To` label.
    - uuid: 8f5fecbabe474baaab40df46879401af
      name: 'Etcd: Etcd peer {#ETCD.PEER}: Bytes sent'
      type: DEPENDENT
      key: 'etcd.bytes.sent.rate[{#ETCD.PEER}]'
      delay: '0'
      history: 7d
      value_type: FLOAT
      units: Bps
      description: 'The number of bytes sent to a peer with the ID `{#ETCD.PEER}`.'
      preprocessing:
        - type: PROMETHEUS_PATTERN
          parameters:
            - 'etcd_network_peer_sent_bytes_total{To="{#ETCD.PEER}"}'
            - value
            - ''
          error_handler: CUSTOM_VALUE
          error_handler_params: '0'
        - type: CHANGE_PER_SECOND
          parameters:
            - ''
      master_item:
        key: etcd.get_metrics
      tags:
        - tag: component
          value: network
        - tag: component
          value: peers
        - tag: peer
          value: '{#ETCD.PEER}'
    - uuid: 2521ccfc16fc43069001883b85aa0243
      name: 'Etcd: Etcd peer {#ETCD.PEER}: Receive failures'
      type: DEPENDENT
      key: 'etcd.received.fail.rate[{#ETCD.PEER}]'
      delay: '0'
      history: 7d
      value_type: FLOAT
      units: rps
      description: 'The number of received failures from a peer with the ID `{#ETCD.PEER}`.'
      preprocessing:
        - type: PROMETHEUS_PATTERN
          parameters:
            # Receive-side series are selected by the `From` label, as in
            # the "Bytes received" prototype above; a `To` selector never
            # matches, so the error handler pinned this item at 0.
            - 'etcd_network_peer_received_failures_total{From="{#ETCD.PEER}"}'
            - value
            - ''
          error_handler: CUSTOM_VALUE
          error_handler_params: '0'
        - type: CHANGE_PER_SECOND
          parameters:
            - ''
      master_item:
        key: etcd.get_metrics
      tags:
        - tag: component
          value: network
        - tag: component
          value: peers
        - tag: peer
          value: '{#ETCD.PEER}'
    - uuid: 5756f1a16e5c42b79f6d6225c5382599
      name: 'Etcd: Etcd peer {#ETCD.PEER}: Send failures'
      type: DEPENDENT
      key: 'etcd.sent.fail.rate[{#ETCD.PEER}]'
      delay: '0'
      history: 7d
      value_type: FLOAT
      units: rps
      description: 'The number of sent failures from a peer with the ID `{#ETCD.PEER}`.'
      preprocessing:
        - type: PROMETHEUS_PATTERN
          parameters:
            - 'etcd_network_peer_sent_failures_total{To="{#ETCD.PEER}"}'
            - value
            - ''
          error_handler: CUSTOM_VALUE
          error_handler_params: '0'
        - type: CHANGE_PER_SECOND
          parameters:
            - ''
      master_item:
        key: etcd.get_metrics
      tags:
        - tag: component
          value: network
        - tag: component
          value: peers
        - tag: peer
          value: '{#ETCD.PEER}'
  master_item:
    key: etcd.get_metrics
  lld_macro_paths:
    - lld_macro: '{#ETCD.PEER}'
      path: $.labels.To
  preprocessing:
    - type: PROMETHEUS_TO_JSON
      parameters:
        - etcd_network_peer_sent_bytes_total
# Template-level tags.
tags:
  - tag: class
    value: application
  - tag: target
    value: etcd
# User macros referenced by the items, triggers and discovery filters of
# this template. {$ETCD.USER} and {$ETCD.PASSWORD} carry no default value.
macros:
  - macro: '{$ETCD.GRPC.ERRORS.MAX.WARN}'
    value: '1'
    description: 'The maximum number of gRPC request failures.'
  - macro: '{$ETCD.GRPC_CODE.MATCHES}'
    value: '.*'
    description: 'The filter of discoverable gRPC codes. See more details on https://github.com/grpc/grpc/blob/master/doc/statuscodes.md.'
  - macro: '{$ETCD.GRPC_CODE.NOT_MATCHES}'
    value: CHANGE_IF_NEEDED
    description: 'The filter to exclude discovered gRPC codes. See more details on https://github.com/grpc/grpc/blob/master/doc/statuscodes.md.'
  - macro: '{$ETCD.GRPC_CODE.TRIGGER.MATCHES}'
    value: Aborted|Unavailable
    description: 'The filter of discoverable gRPC codes, which will create triggers.'
  - macro: '{$ETCD.HTTP.FAIL.MAX.WARN}'
    value: '2'
    description: 'The maximum number of HTTP request failures.'
  - macro: '{$ETCD.LEADER.CHANGES.MAX.WARN}'
    value: '5'
    description: 'The maximum number of leader changes.'
  - macro: '{$ETCD.OPEN.FDS.MAX.WARN}'
    value: '90'
    description: 'The maximum percentage of used file descriptors.'
  - macro: '{$ETCD.PASSWORD}'
  - macro: '{$ETCD.PORT}'
    value: '2379'
    description: 'The port of `etcd` API endpoint.'
  - macro: '{$ETCD.PROPOSAL.FAIL.MAX.WARN}'
    value: '2'
    description: 'The maximum number of proposal failures.'
  - macro: '{$ETCD.PROPOSAL.PENDING.MAX.WARN}'
    value: '5'
    description: 'The maximum number of proposals in queue.'
  - macro: '{$ETCD.SCHEME}'
    value: http
    description: 'The request scheme which may be `http` or `https`.'
  - macro: '{$ETCD.USER}'
# One dashboard page with seven graph widgets; each widget references a
# graph of this template by host and graph name.
dashboards:
  - uuid: 5b0ffbb731cd4415a2edbc74978e0276
    name: 'Etcd Overview'
    pages:
      - widgets:
          # No x/y keys on this widget.
          - type: graph
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: Proposals rate'
          - type: graph
            x: '12'
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: Read/Write rate'
          - type: graph
            'y': '5'
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: gRPC client traffic'
          - type: graph
            x: '12'
            'y': '5'
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: HTTP requests rate'
          - type: graph
            'y': '10'
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: gRPC requests rate'
          - type: graph
            x: '12'
            'y': '10'
            width: '12'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: Memory usage'
          # The only widget with width 24.
          - type: graph
            'y': '15'
            width: '24'
            height: '5'
            fields:
              - type: GRAPH
                name: graphid
                value:
                  host: 'Etcd by HTTP'
                  name: 'Etcd: File descriptors'
# Value maps that items of this template reference by name; each maps the
# stored values 0 and 1 to display text.
valuemaps:
  - uuid: f25e21a70baa4e009bdbcb44acb1a22e
    name: 'Etcd healthcheck'
    mappings:
      - value: '0'
        newvalue: Failed
      - value: '1'
        newvalue: Ok
  - uuid: 7bcaf8a520e24613a96d49e63a91a55b
    name: 'Etcd leader'
    mappings:
      - value: '0'
        newvalue: 'No'
      - value: '1'
        newvalue: 'Yes'
  - uuid: 1735a8d251b24c3fbab32e766064536b
    name: 'Service state'
    mappings:
      - value: '0'
        newvalue: Down
      - value: '1'
        newvalue: Up
# Trigger that spans two items: the 5-minute minimum of etcd.open.fds,
# divided by the latest etcd.max.fds and multiplied by 100, is compared
# with {$ETCD.OPEN.FDS.MAX.WARN}.
triggers:
  - uuid: c45583928d204c04ad8884115e1e35c5
    expression: 'min(/Etcd by HTTP/etcd.open.fds,5m)/last(/Etcd by HTTP/etcd.max.fds)*100>{$ETCD.OPEN.FDS.MAX.WARN}'
    name: 'Etcd: Current number of open files is too high'
    event_name: 'Etcd: Current number of open files is too high (over {$ETCD.OPEN.FDS.MAX.WARN}% for 5m)'
    priority: WARNING
    description: |
      Heavy usage of a file descriptor (i.e., near the limit of the process's file descriptor) indicates a potential file descriptor exhaustion issue.
      If the file descriptors are exhausted, `etcd` may panic because it cannot create new WAL files.
    tags:
      - tag: scope
        value: capacity
# Graph definitions; the dashboard widgets reference these graphs by name.
# Each graph item points at an item key of the 'Etcd by HTTP' template.
graphs:
  - uuid: 18baccd03c0f4814a42d32b51334787d
    name: 'Etcd: File descriptors'
    graph_items:
      - drawtype: GRADIENT_LINE
        color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.open.fds
      - sortorder: '1'
        drawtype: BOLD_LINE
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.max.fds
  - uuid: eefd07cf30d84cc4b84f802468363200
    name: 'Etcd: gRPC client traffic'
    graph_items:
      - color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.network.grpc.received.rate
      - sortorder: '1'
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.network.grpc.sent.rate
  - uuid: c53ee0dba42d4a1f8afedbe0f6e42785
    name: 'Etcd: gRPC requests rate'
    graph_items:
      - color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.grpc.received.rate
      - sortorder: '1'
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.grpc.sent.rate
      - sortorder: '2'
        color: 00611C
        item:
          host: 'Etcd by HTTP'
          key: etcd.grpc.started.rate
  - uuid: 520ff92815d84e0f84e9296d249c04ae
    name: 'Etcd: HTTP requests rate'
    graph_items:
      - color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.http.requests.4xx.rate
      - sortorder: '1'
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.http.requests.5xx.rate
      - sortorder: '2'
        color: 00611C
        item:
          host: 'Etcd by HTTP'
          key: etcd.http.requests.rate
  - uuid: 90af5b2f75b7402693bad7a8f371ab8e
    name: 'Etcd: Memory usage'
    graph_items:
      - drawtype: GRADIENT_LINE
        color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.res.bytes
      - sortorder: '1'
        drawtype: GRADIENT_LINE
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.virtual.bytes
  - uuid: 59cd15292ad04ebd902a7d3080b53838
    name: 'Etcd: Proposals rate'
    graph_items:
      - color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.proposals.failed.rate
      - sortorder: '1'
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.proposals.committed.rate
      - sortorder: '2'
        color: 00611C
        item:
          host: 'Etcd by HTTP'
          key: etcd.proposals.applied.rate
      # The pending count is the only series drawn on the right-hand axis.
      - sortorder: '3'
        drawtype: BOLD_LINE
        color: F7941D
        yaxisside: RIGHT
        item:
          host: 'Etcd by HTTP'
          key: etcd.proposals.pending
  - uuid: b374fab55bcc452e9279214ddb2c8024
    name: 'Etcd: Read/Write rate'
    graph_items:
      - color: 199C0D
        item:
          host: 'Etcd by HTTP'
          key: etcd.reads.rate
      - sortorder: '1'
        color: F63100
        item:
          host: 'Etcd by HTTP'
          key: etcd.writes.rate