You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1607 lines
64 KiB
1607 lines
64 KiB
zabbix_export:
|
|
version: '7.0'
|
|
template_groups:
|
|
- uuid: a571c0d144b14fd4a87a9d9b2aa9fcd6
|
|
name: Templates/Applications
|
|
templates:
|
|
- uuid: e129aeba7c814bf189772cf5919b4bbb
|
|
template: 'Hadoop by HTTP'
|
|
name: 'Hadoop by HTTP'
|
|
description: |
|
|
The template gets the Hadoop metrics from cluster's hosts (ResourceManager, NodeManagers, NameNode, DataNodes) by HTTP agent. You should define the IP address (or FQDN) and Web-UI port for the ResourceManager in {$HADOOP.RESOURCEMANAGER.HOST} and {$HADOOP.RESOURCEMANAGER.PORT} macros and for the NameNode in {$HADOOP.NAMENODE.HOST} and {$HADOOP.NAMENODE.PORT} macros respectively. Macros can be set in the template or overridden at the host level.
|
|
|
|
You can discuss this template or leave feedback on our forum https://www.zabbix.com/forum/zabbix-suggestions-and-feedback/413459-discussion-thread-for-official-zabbix-template-hadoop
|
|
|
|
Generated by official Zabbix template tool "Templator" 2.0.0
|
|
vendor:
|
|
name: Zabbix
|
|
version: 7.0-0
|
|
groups:
|
|
- name: Templates/Applications
|
|
items:
|
|
- uuid: d2d19ac9d1eb434c98a55cbf76c27850
|
|
name: 'Hadoop: Get DataNodes states'
|
|
type: HTTP_AGENT
|
|
key: hadoop.datanodes.get
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
preprocessing:
|
|
- type: JAVASCRIPT
|
|
parameters:
|
|
- |
|
|
try {
|
|
parsed = JSON.parse(value);
|
|
var result = [];
|
|
|
|
function getNodes(nodes, state) {
|
|
Object.keys(nodes).forEach(function (field) {
|
|
var Node = {};
|
|
Node['HostName'] = field || '';
|
|
Node['adminState'] = nodes[field].adminState || '';
|
|
Node['operState'] = state || '';
|
|
Node['version'] = nodes[field].version || '';
|
|
result.push(Node);
|
|
});
|
|
}
|
|
|
|
getNodes(JSON.parse(parsed.beans[0].LiveNodes), 'Live');
|
|
getNodes(JSON.parse(parsed.beans[0].DeadNodes), 'Dead');
|
|
getNodes(JSON.parse(parsed.beans[0].DecomNodes), 'Decommission');
|
|
getNodes(JSON.parse(parsed.beans[0].EnteringMaintenanceNodes), 'Maintenance');
|
|
|
|
return JSON.stringify(result);
|
|
}
|
|
catch (error) {
|
|
throw 'Failed to process response received from Hadoop';
|
|
}
|
|
url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo'
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 2cb55b7ed9cd41878dc985497f45e084
|
|
name: 'NameNode: Total blocks'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.blocks_total
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Count of blocks tracked by NameNode.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].BlocksTotal.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 1d098dc6fa134053b6c6be0e7618092e
|
|
name: 'NameNode: Blocks allocable'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.block_capacity
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Maximum number of blocks allocable.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].BlockCapacity.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 26ca0bbd18e04b49b9eb8d2a74f4fd15
|
|
name: 'NameNode: Capacity remaining'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.capacity_remaining
|
|
delay: '0'
|
|
history: 7d
|
|
units: B
|
|
description: 'Available capacity.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].CapacityRemaining.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: c73c2b6c24b846e49bdb68c3f5a01419
|
|
name: 'NameNode: Corrupt blocks'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.corrupt_blocks
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of corrupt blocks.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].CorruptBlocks.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 82198b21427a4e39a173369db42d9de3
|
|
name: 'NameNode: Total files'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.files_total
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Total count of files tracked by the NameNode.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].FilesTotal.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 687406d06ce94a8291b2e72bb2f8bec4
|
|
name: 'Hadoop: Get NameNode stats'
|
|
type: HTTP_AGENT
|
|
key: hadoop.namenode.get
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx'
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: ea72dc1574f348d19432a1a922b4ed35
|
|
name: 'NameNode: Get info'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.info
|
|
delay: '0'
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name=~''Hadoop:service=NameNode,name=*'')]'
|
|
error_handler: CUSTOM_VALUE
|
|
error_handler_params: '[]'
|
|
master_item:
|
|
key: hadoop.namenode.get
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 30ee7e09067e4f00a4f26ad6c00454b2
|
|
name: 'NameNode: Missing blocks'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.missing_blocks
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of missing blocks.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].MissingBlocks.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
triggers:
|
|
- uuid: 3b92daaaddb74105a5e57c4b381e3060
|
|
expression: 'min(/Hadoop by HTTP/hadoop.namenode.missing_blocks,15m)>0'
|
|
name: 'NameNode: Cluster has missing blocks'
|
|
priority: AVERAGE
|
|
description: 'A missing block is far worse than a corrupt block, because a missing block cannot be recovered by copying a replica.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 3473bad0a7c94c8b9fd35cd4398e6215
|
|
name: 'NameNode: Dead DataNodes'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.num_dead_data_nodes
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Count of dead DataNodes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].NumDeadDataNodes.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
triggers:
|
|
- uuid: b2d1a26791aa4b16865b4410c50c7ceb
|
|
expression: 'min(/Hadoop by HTTP/hadoop.namenode.num_dead_data_nodes,5m)>0'
|
|
name: 'NameNode: Cluster has DataNodes in Dead state'
|
|
priority: AVERAGE
|
|
description: 'The death of a DataNode causes a flurry of network activity, as the NameNode initiates replication of blocks lost on the dead nodes.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 398a8c95db3248b684f222fe7b912fe3
|
|
name: 'NameNode: Alive DataNodes'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.num_live_data_nodes
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Count of alive DataNodes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].NumLiveDataNodes.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 15bcb22fdc7f4e2c8f24560ef641d63d
|
|
name: 'NameNode: Stale DataNodes'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.num_stale_data_nodes
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'DataNodes that do not send a heartbeat within 30 seconds are marked as "stale".'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].StaleDataNodes.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: b72d54b849fc48fd8e7cdacd75943c23
|
|
name: 'NameNode: Percent block pool used'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.percent_block_pool_used
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=NameNodeInfo'')].PercentBlockPoolUsed.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 3cfbf084a31b479c91be356556d43c0d
|
|
name: 'NameNode: Percent capacity remaining'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.percent_remaining
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: '%'
|
|
description: 'Available capacity in percent.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=NameNodeInfo'')].PercentRemaining.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
triggers:
|
|
- uuid: 3104295848c5497085f397b8f3e06ef6
|
|
expression: 'max(/Hadoop by HTTP/hadoop.namenode.percent_remaining,15m)<{$HADOOP.CAPACITY_REMAINING.MIN.WARN}'
|
|
name: 'NameNode: Cluster capacity remaining is low'
|
|
event_name: 'NameNode: Cluster capacity remaining is low (below {$HADOOP.CAPACITY_REMAINING.MIN.WARN}% for 15m)'
|
|
priority: WARNING
|
|
description: 'A good practice is to ensure that disk use never exceeds 80 percent capacity.'
|
|
tags:
|
|
- tag: scope
|
|
value: capacity
|
|
- uuid: a9e6c1e2f9544c71844785b4baa9c017
|
|
name: 'NameNode: RPC queue & processing time'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.rpc_processing_time_avg
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
description: 'Average time spent on processing RPC requests.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=RpcActivityForPort9000'')].RpcProcessingTimeAvgTime.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 9f00149ef0c2444ebbc9327b24acd7b9
|
|
name: 'NameNode: Total load'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.total_load
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'The current number of concurrent file accesses (read/write) across all DataNodes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].TotalLoad.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 6abfe537a36646a0b10fe2c72586d249
|
|
name: 'NameNode: Transactions since last checkpoint'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.transactions_since_last_checkpoint
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Total number of transactions since last checkpoint.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].TransactionsSinceLastCheckpoint.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 249098bbeb7a43cdac59f1297ca95104
|
|
name: 'NameNode: Under-replicated blocks'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.under_replicated_blocks
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'The number of blocks with insufficient replication.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].UnderReplicatedBlocks.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
- uuid: 7e8769eb77304b6f9c6e1d5bbd420fd0
|
|
name: 'NameNode: Uptime'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.uptime
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()'
|
|
- type: MULTIPLIER
|
|
parameters:
|
|
- '0.001'
|
|
master_item:
|
|
key: hadoop.namenode.get
|
|
tags:
|
|
- tag: component
|
|
value: system
|
|
triggers:
|
|
- uuid: 9fac0ae651ab40a08551945eb0a93b68
|
|
expression: 'nodata(/Hadoop by HTTP/hadoop.namenode.uptime,30m)=1'
|
|
name: 'NameNode: Failed to fetch NameNode API page'
|
|
event_name: 'NameNode: Failed to fetch NameNode API page (or no data for 30m)'
|
|
priority: WARNING
|
|
description: 'Zabbix has not received any data for items for the last 30 minutes.'
|
|
manual_close: 'YES'
|
|
dependencies:
|
|
- name: 'NameNode: Service is unavailable'
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0'
|
|
tags:
|
|
- tag: scope
|
|
value: availability
|
|
- uuid: 84d866bc0dc3486d9c5dc9beefec8d31
|
|
expression: 'last(/Hadoop by HTTP/hadoop.namenode.uptime)<10m'
|
|
name: 'NameNode: Service has been restarted'
|
|
event_name: 'NameNode: Service has been restarted (uptime < 10m)'
|
|
priority: INFO
|
|
description: 'Uptime is less than 10 minutes.'
|
|
manual_close: 'YES'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 396eb8f791d54254b08ddee553d3d944
|
|
name: 'NameNode: Failed volumes'
|
|
type: DEPENDENT
|
|
key: hadoop.namenode.volume_failures_total
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of failed volumes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].VolumeFailuresTotal.first()'
|
|
master_item:
|
|
key: hadoop.namenode.info
|
|
tags:
|
|
- tag: component
|
|
value: namenode
|
|
triggers:
|
|
- uuid: fcf791b6d0594dbb9ddfc3f93bc94825
|
|
expression: 'min(/Hadoop by HTTP/hadoop.namenode.volume_failures_total,15m)>0'
|
|
name: 'NameNode: Cluster has volume failures'
|
|
priority: AVERAGE
|
|
description: 'HDFS now allows for disks to fail in place, without affecting DataNode operations, until a threshold value is reached. This is set on each DataNode via the dfs.datanode.failed.volumes.tolerated property; it defaults to 0, meaning that any volume failure will shut down the DataNode; on a production cluster where DataNodes typically have 6, 8, or 12 disks, setting this parameter to 1 or 2 is typically the best practice.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 6d7546c5d15d4e478b2e87e35d5306b0
|
|
name: 'Hadoop: Get NodeManagers states'
|
|
type: HTTP_AGENT
|
|
key: hadoop.nodemanagers.get
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
preprocessing:
|
|
- type: JAVASCRIPT
|
|
parameters:
|
|
- 'return JSON.stringify(JSON.parse(JSON.parse(value).beans[0].LiveNodeManagers))'
|
|
url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo'
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: e693cff98ec74cc198ec6b5e973f116c
|
|
name: 'Hadoop: Get ResourceManager stats'
|
|
type: HTTP_AGENT
|
|
key: hadoop.resourcemanager.get
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx'
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 5b9200a5a39c41c2b4b88e7d41d90e7b
|
|
name: 'ResourceManager: Get info'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.info
|
|
delay: '0'
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name=~''Hadoop:service=ResourceManager,name=*'')]'
|
|
error_handler: CUSTOM_VALUE
|
|
error_handler_params: '[]'
|
|
master_item:
|
|
key: hadoop.resourcemanager.get
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 63d4fe7384044027b08b99698355fd8b
|
|
name: 'ResourceManager: Active NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_active_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Active NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumActiveNMs.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
triggers:
|
|
- uuid: eb02a30f45394e4d84d9d7239002ed40
|
|
expression: 'max(/Hadoop by HTTP/hadoop.resourcemanager.num_active_nm,5m)=0'
|
|
name: 'ResourceManager: Cluster has no active NodeManagers'
|
|
priority: HIGH
|
|
description: 'Cluster is unable to execute any jobs without at least one NodeManager.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 3fccfdd8738544ca8969ade842430fc8
|
|
name: 'ResourceManager: Decommissioned NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_decommissioned_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Decommissioned NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumDecommissionedNMs.first()'
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: 9aad193a9e074575878e44aa96ff4237
|
|
name: 'ResourceManager: Decommissioning NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_decommissioning_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Decommissioning NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumDecommissioningNMs.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: c4bbf5295b2a44619e2b641468071f9b
|
|
name: 'ResourceManager: Lost NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_lost_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Lost NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumLostNMs.first()'
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: b7791ce30e8f4aa7b5eea2ee7ca7eef9
|
|
name: 'ResourceManager: Rebooted NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_rebooted_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Rebooted NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumRebootedNMs.first()'
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: 666152b3bf544a29b9e58a9f417c0ab8
|
|
name: 'ResourceManager: Shutdown NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_shutdown_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Shutdown NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumShutdownNMs.first()'
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: e6aa4b4b29414f2fb1f06bd536552c1c
|
|
name: 'ResourceManager: Unhealthy NMs'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.num_unhealthy_nm
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of Unhealthy NodeManagers.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumUnhealthyNMs.first()'
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
triggers:
|
|
- uuid: 0f35a0fa7a404559a3df225b906f0653
|
|
expression: 'min(/Hadoop by HTTP/hadoop.resourcemanager.num_unhealthy_nm,15m)>0'
|
|
name: 'ResourceManager: Cluster has unhealthy NodeManagers'
|
|
priority: AVERAGE
|
|
description: 'YARN considers any node with disk utilization exceeding the value specified under the property yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage (in yarn-site.xml) to be unhealthy. Ample disk space is critical to ensure uninterrupted operation of a Hadoop cluster, and large numbers of unhealthyNodes (the number to alert on depends on the size of your cluster) should be quickly investigated and resolved.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: c4c3195326e34ebcb57e5039beffce7c
|
|
name: 'ResourceManager: RPC queue & processing time'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.rpc_processing_time_avg
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
description: 'Average time spent on processing RPC requests.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.name==''Hadoop:service=ResourceManager,name=RpcActivityForPort8031'')].RpcProcessingTimeAvgTime.first()'
|
|
master_item:
|
|
key: hadoop.resourcemanager.info
|
|
tags:
|
|
- tag: component
|
|
value: resourcemanager
|
|
- uuid: 4e74ca69a84d441e95e2c20afd25fada
|
|
name: 'ResourceManager: Uptime'
|
|
type: DEPENDENT
|
|
key: hadoop.resourcemanager.uptime
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()'
|
|
- type: MULTIPLIER
|
|
parameters:
|
|
- '0.001'
|
|
master_item:
|
|
key: hadoop.resourcemanager.get
|
|
tags:
|
|
- tag: component
|
|
value: system
|
|
triggers:
|
|
- uuid: 7d4d026992344602a199966a8308a571
|
|
expression: 'nodata(/Hadoop by HTTP/hadoop.resourcemanager.uptime,30m)=1'
|
|
name: 'ResourceManager: Failed to fetch ResourceManager API page'
|
|
event_name: 'ResourceManager: Failed to fetch ResourceManager API page (or no data for 30m)'
|
|
priority: WARNING
|
|
description: 'Zabbix has not received any data for items for the last 30 minutes.'
|
|
manual_close: 'YES'
|
|
dependencies:
|
|
- name: 'ResourceManager: Service is unavailable'
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0'
|
|
tags:
|
|
- tag: scope
|
|
value: availability
|
|
- uuid: ade7cc30a4184ef89ed896bae56e0b18
|
|
expression: 'last(/Hadoop by HTTP/hadoop.resourcemanager.uptime)<10m'
|
|
name: 'ResourceManager: Service has been restarted'
|
|
event_name: 'ResourceManager: Service has been restarted (uptime < 10m)'
|
|
priority: INFO
|
|
description: 'Uptime is less than 10 minutes.'
|
|
manual_close: 'YES'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 66a87b21d32c436bb2d2eb23ec328f91
|
|
name: 'NameNode: Service response time'
|
|
type: SIMPLE
|
|
key: 'net.tcp.service.perf["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
description: 'Hadoop NameNode API performance.'
|
|
tags:
|
|
- tag: component
|
|
value: network
|
|
triggers:
|
|
- uuid: 4e4a6ab28fe5492d8fe4e291b8a586dc
|
|
expression: 'min(/Hadoop by HTTP/net.tcp.service.perf["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"],5m)>{$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN}'
|
|
name: 'NameNode: Service response time is too high'
|
|
event_name: 'NameNode: Service response time is too high (over {$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN} for 5m)'
|
|
priority: WARNING
|
|
manual_close: 'YES'
|
|
dependencies:
|
|
- name: 'NameNode: Service is unavailable'
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0'
|
|
tags:
|
|
- tag: scope
|
|
value: performance
|
|
- uuid: 98b11f1156dc472fbce27ca053e01d4e
|
|
name: 'ResourceManager: Service response time'
|
|
type: SIMPLE
|
|
key: 'net.tcp.service.perf["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
description: 'Hadoop ResourceManager API performance.'
|
|
tags:
|
|
- tag: component
|
|
value: network
|
|
triggers:
|
|
- uuid: e8e55f4c7e9e4823927a8c1345d3b941
|
|
expression: 'min(/Hadoop by HTTP/net.tcp.service.perf["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"],5m)>{$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN}'
|
|
name: 'ResourceManager: Service response time is too high'
|
|
event_name: 'ResourceManager: Service response time is too high (over {$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN} for 5m)'
|
|
priority: WARNING
|
|
manual_close: 'YES'
|
|
dependencies:
|
|
- name: 'ResourceManager: Service is unavailable'
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0'
|
|
tags:
|
|
- tag: scope
|
|
value: performance
|
|
- uuid: 2c52d856e07e4524abf3c2ae4b47c6b6
|
|
name: 'NameNode: Service status'
|
|
type: SIMPLE
|
|
key: 'net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]'
|
|
history: 7d
|
|
description: 'Hadoop NameNode API port availability.'
|
|
valuemap:
|
|
name: 'Service state'
|
|
preprocessing:
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 10m
|
|
tags:
|
|
- tag: component
|
|
value: health
|
|
- tag: component
|
|
value: network
|
|
triggers:
|
|
- uuid: f7e16c4ec91e4c04b13b73ee817c71d7
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0'
|
|
name: 'NameNode: Service is unavailable'
|
|
priority: AVERAGE
|
|
manual_close: 'YES'
|
|
tags:
|
|
- tag: scope
|
|
value: availability
|
|
- uuid: 615b75c42ebe471da798a0613667d499
|
|
name: 'ResourceManager: Service status'
|
|
type: SIMPLE
|
|
key: 'net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]'
|
|
history: 7d
|
|
description: 'Hadoop ResourceManager API port availability.'
|
|
valuemap:
|
|
name: 'Service state'
|
|
preprocessing:
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 10m
|
|
tags:
|
|
- tag: component
|
|
value: health
|
|
- tag: component
|
|
value: network
|
|
triggers:
|
|
- uuid: a9ac7ede0c004fe18ab9f1fee36ad2b2
|
|
expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0'
|
|
name: 'ResourceManager: Service is unavailable'
|
|
priority: AVERAGE
|
|
manual_close: 'YES'
|
|
tags:
|
|
- tag: scope
|
|
value: availability
|
|
discovery_rules:
|
|
- uuid: 0f05e90a6fc547d18f291ae2264db9d1
|
|
name: 'Data node discovery'
|
|
type: HTTP_AGENT
|
|
key: hadoop.datanode.discovery
|
|
delay: 1h
|
|
item_prototypes:
|
|
- uuid: ef570f8b37c545bd880b7df20bd19f06
|
|
name: '{#HOSTNAME}: Admin state'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.admin_state[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
trends: '0'
|
|
value_type: CHAR
|
|
description: 'Administrative state.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- $.adminState
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: 'hadoop.datanode.raw_info[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: datanode
|
|
- uuid: 14904ca75991456784d2082c14b7ec88
|
|
name: '{#HOSTNAME}: Used'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.dfs_used[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
units: B
|
|
description: 'Used disk space.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].DfsUsed.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: capacity
|
|
- uuid: 6d2d030b3ddb41a394faede737329bbb
|
|
name: 'Hadoop DataNode {#HOSTNAME}: Get stats'
|
|
type: HTTP_AGENT
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
url: '{#INFOADDR}/jmx'
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 01bc20e53e314089a55b270961062c00
|
|
name: '{#HOSTNAME}: JVM Garbage collection time'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.jvm.gc_time[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
units: '!ms'
|
|
description: 'The JVM garbage collection time in milliseconds.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''Hadoop:service=DataNode,name=JvmMetrics'')].GcTimeMillis.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: datanode
|
|
- uuid: 4cae9eef95f24810a6607de5348b7b54
|
|
name: '{#HOSTNAME}: JVM Heap usage'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.jvm.mem_heap_used[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: '!MB'
|
|
description: 'The JVM heap usage in MBytes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''Hadoop:service=DataNode,name=JvmMetrics'')].MemHeapUsedM.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: capacity
|
|
- uuid: dc30742dba2e4e5d99ca237615ffaef3
|
|
name: '{#HOSTNAME}: JVM Threads'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.jvm.threads[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'The number of JVM threads.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''java.lang:type=Threading'')].ThreadCount.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: datanode
|
|
- uuid: 57c00b46aef94c018806cdae43adfab5
|
|
name: '{#HOSTNAME}: Number of failed volumes'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.numfailedvolumes[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
description: 'Number of failed storage volumes.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].NumFailedVolumes.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: datanode
|
|
- uuid: a6541492d4f7426b8016d1a8932b87ce
|
|
name: '{#HOSTNAME}: Oper state'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.oper_state[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
trends: '0'
|
|
value_type: CHAR
|
|
description: 'Operational state.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- $.operState
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: 'hadoop.datanode.raw_info[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: datanode
|
|
trigger_prototypes:
|
|
- uuid: 9f657289a04041e5bcaa1947f62f607d
|
|
expression: 'last(/Hadoop by HTTP/hadoop.datanode.oper_state[{#HOSTNAME}])<>"Live"'
|
|
name: '{#HOSTNAME}: DataNode has state {ITEM.VALUE}.'
|
|
priority: AVERAGE
|
|
description: 'The state is different from normal.'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: e1f9badba66147e6aaa2f895e6638fb9
|
|
name: 'Hadoop DataNode {#HOSTNAME}: Get raw info'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.raw_info[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: '0'
|
|
trends: '0'
|
|
value_type: TEXT
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.[?(@.HostName==''{#HOSTNAME}'')].first()'
|
|
error_handler: DISCARD_VALUE
|
|
master_item:
|
|
key: hadoop.datanodes.get
|
|
tags:
|
|
- tag: component
|
|
value: raw
|
|
- uuid: 5a46ec3c89eb40d4ad57cec2080c66f8
|
|
name: '{#HOSTNAME}: Remaining'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.remaining[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
units: B
|
|
description: 'Remaining disk space.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].Remaining.first()'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: capacity
|
|
- uuid: 2ac19ff8ee7f480f9974be56ab06eaaf
|
|
name: '{#HOSTNAME}: Uptime'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.uptime[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
value_type: FLOAT
|
|
units: s
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()'
|
|
- type: MULTIPLIER
|
|
parameters:
|
|
- '0.001'
|
|
master_item:
|
|
key: 'hadoop.datanode.get[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: system
|
|
trigger_prototypes:
|
|
- uuid: 3eccb9daf76f4bde88b424cf6f2d21f6
|
|
expression: 'nodata(/Hadoop by HTTP/hadoop.datanode.uptime[{#HOSTNAME}],30m)=1'
|
|
name: '{#HOSTNAME}: Failed to fetch DataNode API page'
|
|
event_name: '{#HOSTNAME}: Failed to fetch DataNode API page (or no data for 30m)'
|
|
priority: WARNING
|
|
description: 'Zabbix has not received any data for items for the last 30 minutes.'
|
|
manual_close: 'YES'
|
|
dependencies:
|
|
- name: '{#HOSTNAME}: DataNode has state {ITEM.VALUE}.'
|
|
expression: 'last(/Hadoop by HTTP/hadoop.datanode.oper_state[{#HOSTNAME}])<>"Live"'
|
|
tags:
|
|
- tag: scope
|
|
value: availability
|
|
- uuid: e40298d300764251abcf93d5df3d9a67
|
|
expression: 'last(/Hadoop by HTTP/hadoop.datanode.uptime[{#HOSTNAME}])<10m'
|
|
name: '{#HOSTNAME}: Service has been restarted'
|
|
event_name: '{#HOSTNAME}: Service has been restarted (uptime < 10m)'
|
|
priority: INFO
|
|
description: 'Uptime is less than 10 minutes.'
|
|
manual_close: 'YES'
|
|
tags:
|
|
- tag: scope
|
|
value: notice
|
|
- uuid: 62b4ca9b1e8a43aa89fbeb78ac16c8cf
|
|
name: '{#HOSTNAME}: Version'
|
|
type: DEPENDENT
|
|
key: 'hadoop.datanode.version[{#HOSTNAME}]'
|
|
delay: '0'
|
|
history: 7d
|
|
trends: '0'
|
|
value_type: CHAR
|
|
description: 'DataNode software version.'
|
|
preprocessing:
|
|
- type: JSONPATH
|
|
parameters:
|
|
- $.version
|
|
- type: DISCARD_UNCHANGED_HEARTBEAT
|
|
parameters:
|
|
- 6h
|
|
master_item:
|
|
key: 'hadoop.datanode.raw_info[{#HOSTNAME}]'
|
|
tags:
|
|
- tag: component
|
|
value: system
|
|
graph_prototypes:
|
|
- uuid: c497416bcce1416ebcede7fc491ccdba
|
|
name: '{#HOSTNAME}: DataNode {#HOSTNAME} DFS size'
|
|
type: STACKED
|
|
graph_items:
|
|
- drawtype: FILLED_REGION
|
|
color: 199C0D
|
|
item:
|
|
host: 'Hadoop by HTTP'
|
|
key: 'hadoop.datanode.dfs_used[{#HOSTNAME}]'
|
|
- sortorder: '1'
|
|
drawtype: FILLED_REGION
|
|
color: F63100
|
|
item:
|
|
host: 'Hadoop by HTTP'
|
|
key: 'hadoop.datanode.remaining[{#HOSTNAME}]'
|
|
url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo'
|
|
# Converts the NameNodeInfo JMX bean into LLD rows, one
# {#HOSTNAME}/{#INFOADDR} pair per DataNode.
preprocessing:
  - type: JAVASCRIPT
    parameters:
      - |
        try {
          // Declared locally so nothing leaks into the script's global scope.
          var bean = JSON.parse(value).beans[0],
            groups = ['LiveNodes', 'DeadNodes', 'DecomNodes', 'EnteringMaintenanceNodes'],
            seen = {},
            result = [];

          // LiveNodes is treated as mandatory: without it the response is not
          // a usable NameNodeInfo bean and discovery must fail, not return [].
          if (typeof bean.LiveNodes !== 'string') {
            throw 'LiveNodes attribute is missing';
          }

          // Each group attribute is itself a JSON-encoded object keyed by host name.
          function getNodes(raw) {
            // A group may be absent (presumably depending on the Hadoop
            // release); JSON.parse(undefined) would abort the whole discovery.
            if (typeof raw !== 'string' || raw === '') {
              return;
            }

            var nodes = JSON.parse(raw) || {};

            Object.keys(nodes).forEach(function (field) {
              // The same host may be listed in more than one group; emit it once
              // so the rule never produces two rows with the same {#HOSTNAME}.
              if (Object.prototype.hasOwnProperty.call(seen, field)) {
                return;
              }
              seen[field] = true;

              var Node = {};
              Node['{#HOSTNAME}'] = field || '';
              Node['{#INFOADDR}'] = nodes[field].infoAddr || '';
              result.push(Node);
            });
          }

          groups.forEach(function (group) {
            getNodes(bean[group]);
          });

          return JSON.stringify(result);
        }
        catch (error) {
          throw 'Failed to process response received from Hadoop.';
        }
|
|
# LLD rule for NodeManagers; polled over HTTP once per hour.
- uuid: de2d5f97843345668bc0b8c8336b9c14
  name: "Node manager discovery"
  key: hadoop.nodemanager.discovery
  type: HTTP_AGENT
  delay: 1h
|
|
item_prototypes:
|
|
# AvailableMemoryMB taken from this node's raw-info master item.
- uuid: ffa4704e099a4f1a8b49add245938501
  name: "{#HOSTNAME}: Available memory"
  key: "hadoop.nodemanager.availablememory[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.raw_info[{#HOSTNAME}]"
  delay: "0"
  history: 7d
  # The leading "!" is part of the unit string and must be kept verbatim.
  units: "!MB"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.AvailableMemoryMB"
  tags:
    - tag: component
      value: memory
|
|
# ContainerLaunchDurationAvgTime from the NodeManagerMetrics bean of the node's JMX dump.
- uuid: e8d0ea2c96b643f899e370ab73c5c262
  name: "{#HOSTNAME}: Container launch avg duration"
  key: "hadoop.nodemanager.container_launch_duration_avg[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  value_type: FLOAT
  history: 7d
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='Hadoop:service=NodeManager,name=NodeManagerMetrics')].ContainerLaunchDurationAvgTime.first()"
  tags:
    - tag: component
      value: nodemanager
|
|
# Master item: fetches the node's /jmx page; the raw text is not stored
# (history and trends are 0) and only feeds the dependent items.
- uuid: 23c89dfb26a34b77bf34fcf543f719f2
  name: "Hadoop NodeManager {#HOSTNAME}: Get stats"
  key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  type: HTTP_AGENT
  url: "{#NODEHTTPADDRESS}/jmx"
  value_type: TEXT
  history: "0"
  trends: "0"
  tags:
    - tag: component
      value: raw
|
|
# GcTimeMillis from the NodeManager JvmMetrics bean.
- uuid: 82e289c999a246a6bd1feb85349d0348
  name: "{#HOSTNAME}: JVM Garbage collection time"
  description: "The JVM garbage collection time in milliseconds."
  key: "hadoop.nodemanager.jvm.gc_time[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  history: 7d
  units: "!ms"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='Hadoop:service=NodeManager,name=JvmMetrics')].GcTimeMillis.first()"
  tags:
    - tag: component
      value: nodemanager
|
|
# MemHeapUsedM from the NodeManager JvmMetrics bean.
- uuid: 4032f0a266c44b34896e8179bbed2419
  name: "{#HOSTNAME}: JVM Heap usage"
  description: "The JVM heap usage in MBytes."
  key: "hadoop.nodemanager.jvm.mem_heap_used[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  value_type: FLOAT
  history: 7d
  units: "!MB"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='Hadoop:service=NodeManager,name=JvmMetrics')].MemHeapUsedM.first()"
  tags:
    - tag: component
      value: nodemanager
|
|
# ThreadCount from the java.lang Threading bean of the node's JMX dump.
- uuid: d7485913b2db4e31a8f02f63f8c18913
  name: "{#HOSTNAME}: JVM Threads"
  description: "The number of JVM threads."
  key: "hadoop.nodemanager.jvm.threads[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  history: 7d
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='java.lang:type=Threading')].ThreadCount.first()"
  tags:
    - tag: component
      value: nodemanager
|
|
# NumContainers taken from this node's raw-info master item.
# The value is a count, so it is stored with the default numeric value type
# (no `value_type: CHAR`, no `trends: '0'`), like the sibling memory counters
# read from the same master item. This allows trends, graphs and numeric
# trigger functions to be used on it.
- uuid: 662cafd31e194db8808c75789bf712eb
  name: '{#HOSTNAME}: Number of containers'
  type: DEPENDENT
  key: 'hadoop.nodemanager.numcontainers[{#HOSTNAME}]'
  delay: '0'
  history: 7d
  preprocessing:
    - type: JSONPATH
      parameters:
        - $.NumContainers
  master_item:
    key: 'hadoop.nodemanager.raw_info[{#HOSTNAME}]'
  tags:
    - tag: component
      value: nodemanager
|
|
# Intermediate master item: picks this node's entry (matched on HostName) out of
# hadoop.nodemanagers.get. The text itself is not stored (history/trends are 0).
- uuid: a8b11a7fb3244792abf4ffa9461e4712
  name: "Hadoop NodeManager {#HOSTNAME}: Get raw info"
  key: "hadoop.nodemanager.raw_info[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: hadoop.nodemanagers.get
  delay: "0"
  value_type: TEXT
  history: "0"
  trends: "0"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.[?(@.HostName=='{#HOSTNAME}')].first()"
      # If the JSONPath step fails, the value is dropped rather than
      # turning the item unsupported.
      error_handler: DISCARD_VALUE
  tags:
    - tag: component
      value: raw
|
|
# RpcProcessingTimeAvgTime from the RpcActivityForPort8040 bean.
# NOTE(review): the bean name hard-codes port 8040; a NodeManager exposing this
# bean under another port would not match. Confirm against the deployment.
- uuid: 01a5bcdbfc1c4a84a471738998aed372
  name: "{#HOSTNAME}: RPC queue & processing time"
  description: "Average time spent on processing RPC requests."
  key: "hadoop.nodemanager.rpc_processing_time_avg[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  value_type: FLOAT
  history: 7d
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='Hadoop:service=NodeManager,name=RpcActivityForPort8040')].RpcProcessingTimeAvgTime.first()"
  tags:
    - tag: component
      value: nodemanager
|
|
# Node state string from the raw-info master item, plus a trigger that fires
# whenever the last value is anything other than "RUNNING".
- uuid: bab9c705d31e42ce9af65b396e18504b
  name: "{#HOSTNAME}: State"
  description: "State of the node - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONING, DECOMMISSIONED, LOST, REBOOTED, SHUTDOWN."
  key: "hadoop.nodemanager.state[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.raw_info[{#HOSTNAME}]"
  delay: "0"
  value_type: CHAR
  history: 7d
  trends: "0"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.State"
    # Store a value only when it changes, or at least once per 6h heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 6h
  tags:
    - tag: component
      value: nodemanager
  trigger_prototypes:
    - uuid: 8752a292093347fcb16d3f06dd97c5c3
      name: "{#HOSTNAME}: NodeManager has state {ITEM.VALUE}."
      description: "The state is different from normal."
      expression: 'last(/Hadoop by HTTP/hadoop.nodemanager.state[{#HOSTNAME}])<>"RUNNING"'
      priority: AVERAGE
      tags:
        - tag: scope
          value: notice
|
|
# Uptime from the java.lang Runtime bean, scaled by 0.001 and reported in
# seconds (the source value is presumably in milliseconds).
- uuid: f8f6799130d34848a7dfb65815939c48
  name: "{#HOSTNAME}: Uptime"
  key: "hadoop.nodemanager.uptime[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.get[{#HOSTNAME}]"
  delay: "0"
  value_type: FLOAT
  history: 7d
  units: s
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.beans[?(@.name=='java.lang:type=Runtime')].Uptime.first()"
    - type: MULTIPLIER
      parameters:
        - "0.001"
  tags:
    - tag: component
      value: system
  trigger_prototypes:
    # No uptime value for 30 minutes. Depends on the node-state trigger.
    - uuid: 6f8a6308d4334dd9bebe7af2fa3fb831
      name: "{#HOSTNAME}: Failed to fetch NodeManager API page"
      event_name: "{#HOSTNAME}: Failed to fetch NodeManager API page (or no data for 30m)"
      description: "Zabbix has not received any data for items for the last 30 minutes."
      expression: "nodata(/Hadoop by HTTP/hadoop.nodemanager.uptime[{#HOSTNAME}],30m)=1"
      priority: WARNING
      manual_close: "YES"
      dependencies:
        - name: "{#HOSTNAME}: NodeManager has state {ITEM.VALUE}."
          expression: 'last(/Hadoop by HTTP/hadoop.nodemanager.state[{#HOSTNAME}])<>"RUNNING"'
      tags:
        - tag: scope
          value: availability
    # Last uptime value below 10 minutes.
    - uuid: 05f3cf8ed34f4a708df508f0e50e119d
      name: "{#HOSTNAME}: Service has been restarted"
      event_name: "{#HOSTNAME}: Service has been restarted (uptime < 10m)"
      description: "Uptime is less than 10 minutes."
      expression: "last(/Hadoop by HTTP/hadoop.nodemanager.uptime[{#HOSTNAME}])<10m"
      priority: INFO
      manual_close: "YES"
      tags:
        - tag: scope
          value: notice
|
|
# UsedMemoryMB taken from this node's raw-info master item.
- uuid: d92b66e61a5244a995693ab8aedee96e
  name: "{#HOSTNAME}: Used memory"
  key: "hadoop.nodemanager.usedmemory[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.raw_info[{#HOSTNAME}]"
  delay: "0"
  history: 7d
  units: "!MB"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.UsedMemoryMB"
  tags:
    - tag: component
      value: memory
|
|
# NodeManagerVersion taken from this node's raw-info master item.
- uuid: c4d46de2d6d341f5a2c1826236f94e5e
  name: "{#HOSTNAME}: Version"
  key: "hadoop.nodemanager.version[{#HOSTNAME}]"
  type: DEPENDENT
  master_item:
    key: "hadoop.nodemanager.raw_info[{#HOSTNAME}]"
  delay: "0"
  value_type: CHAR
  history: 7d
  trends: "0"
  preprocessing:
    - type: JSONPATH
      parameters:
        - "$.NodeManagerVersion"
    # Store a value only when it changes, or at least once per 6h heartbeat.
    - type: DISCARD_UNCHANGED_HEARTBEAT
      parameters:
        - 6h
  tags:
    - tag: component
      value: system
|
|
url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo'
|
|
# Converts the RMNMInfo JMX bean into LLD rows, one
# {#HOSTNAME}/{#NODEHTTPADDRESS} pair per entry of LiveNodeManagers.
preprocessing:
  - type: JAVASCRIPT
    parameters:
      - |
        try {
          // Declared locally so nothing leaks into the script's global scope.
          var parsed = JSON.parse(value),
            seen = {},
            result = [];

          // LiveNodeManagers is itself a JSON-encoded string inside the bean.
          function getNodes(nodes) {
            Object.keys(nodes).forEach(function (field) {
              var host = nodes[field].HostName || '';

              // Emit each host once so the rule never produces two rows with
              // the same {#HOSTNAME} (all prototype keys are built from it).
              if (Object.prototype.hasOwnProperty.call(seen, host)) {
                return;
              }
              seen[host] = true;

              var Node = {};
              Node['{#HOSTNAME}'] = host;
              Node['{#NODEHTTPADDRESS}'] = nodes[field].NodeHTTPAddress || '';
              result.push(Node);
            });
          }

          getNodes(JSON.parse(parsed.beans[0].LiveNodeManagers));

          return JSON.stringify(result);
        }
        catch (error) {
          throw 'Failed to process response received from Hadoop.';
        }
|
|
# Template-level tags.
tags:
  - value: application
    tag: class
  - value: hadoop
    tag: target
|
|
# User macros with their template defaults.
macros:
  # Threshold used by a trigger expression (remaining capacity, percent).
  - macro: "{$HADOOP.CAPACITY_REMAINING.MIN.WARN}"
    description: "The Hadoop cluster capacity remaining percent for trigger expression."
    value: "20"

  # NameNode endpoint and response-time threshold.
  - macro: "{$HADOOP.NAMENODE.HOST}"
    description: "The Hadoop NameNode host IP address or FQDN."
    value: NameNode
  - macro: "{$HADOOP.NAMENODE.PORT}"
    description: "The Hadoop NameNode Web-UI port."
    value: "9870"
  - macro: "{$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN}"
    description: "The Hadoop NameNode API page maximum response time in seconds for trigger expression."
    value: 10s

  # ResourceManager endpoint and response-time threshold.
  - macro: "{$HADOOP.RESOURCEMANAGER.HOST}"
    description: "The Hadoop ResourceManager host IP address or FQDN."
    value: ResourceManager
  - macro: "{$HADOOP.RESOURCEMANAGER.PORT}"
    description: "The Hadoop ResourceManager Web-UI port."
    value: "8088"
  - macro: "{$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN}"
    description: "The Hadoop ResourceManager API page maximum response time in seconds for trigger expression."
    value: 10s
|
|
# Single-page overview dashboard: a top row of six item widgets (NameNode on the
# left, ResourceManager on the right) above two rows of graph widgets.
# The "y" key stays quoted so YAML 1.1 parsers do not read it as a boolean.
dashboards:
  - uuid: 474829439a064a6ba1c33d8d81e6e832
    name: "Hadoop: Overview"
    pages:
      - name: Main
        widgets:
          # Simple graph of the NameNode RPC processing time item.
          - type: graph
            "y": "2"
            width: "12"
            height: "5"
            fields:
              - name: source_type
                type: INTEGER
                value: "1"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: hadoop.namenode.rpc_processing_time_avg

          # Simple graph of the ResourceManager RPC processing time item.
          - type: graph
            x: "12"
            "y": "2"
            width: "12"
            height: "5"
            fields:
              - name: source_type
                type: INTEGER
                value: "1"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: hadoop.resourcemanager.rpc_processing_time_avg

          # Template graph "NameNode: DataNodes".
          - type: graph
            "y": "7"
            width: "12"
            height: "5"
            fields:
              - name: graphid
                type: GRAPH
                value:
                  host: "Hadoop by HTTP"
                  name: "NameNode: DataNodes"

          # Template graph "NameNode: NMs".
          - type: graph
            x: "12"
            "y": "7"
            width: "12"
            height: "5"
            fields:
              - name: graphid
                type: GRAPH
                value:
                  host: "Hadoop by HTTP"
                  name: "NameNode: NMs"

          - type: item
            name: "NameNode response time"
            x: "8"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "4"
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: 'net.tcp.service.perf["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]'

          - type: item
            name: "NameNode status"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "4"
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: 'net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]'

          - type: item
            name: "NameNode uptime"
            x: "4"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: hadoop.namenode.uptime

          - type: item
            name: "ResourceManager response time"
            x: "20"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "4"
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: 'net.tcp.service.perf["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]'

          - type: item
            name: "ResourceManager status"
            x: "12"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "4"
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: 'net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]'

          - type: item
            name: "ResourceManager uptime"
            x: "16"
            width: "4"
            fields:
              - name: show
                type: INTEGER
                value: "2"
              - name: itemid
                type: ITEM
                value:
                  host: "Hadoop by HTTP"
                  key: hadoop.resourcemanager.uptime
|
|
# Maps the numeric service-check result to a label: 0 -> Down, 1 -> Up.
valuemaps:
  - uuid: 6c967c4df18d4c7ebb0fd4be17df292a
    name: "Service state"
    mappings:
      - value: "0"
        newvalue: Down
      - value: "1"
        newvalue: Up
|
|
graphs:
|
|
# Live / dead / stale DataNode counts as reported by the NameNode items.
- uuid: 632a641116194105983cc581b1bd890d
  name: "NameNode: DataNodes"
  graph_items:
    # First series (no explicit sortorder): live nodes.
    - item:
        host: "Hadoop by HTTP"
        key: hadoop.namenode.num_live_data_nodes
      color: "199C0D"
      calc_fnc: ALL
    - sortorder: "1"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.namenode.num_dead_data_nodes
      color: "F63100"
      calc_fnc: ALL
    - sortorder: "2"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.namenode.num_stale_data_nodes
      color: "00611C"
      calc_fnc: ALL
|
|
# NodeManager counts by state. Despite the "NameNode" prefix in the graph name,
# every series is a hadoop.resourcemanager.* item; the name is kept unchanged
# because the dashboard references this graph by name.
- uuid: 96ef586f2f554d028efcbb75b4c7024b
  name: "NameNode: NMs"
  graph_items:
    # First series (no explicit sortorder): active NodeManagers.
    - item:
        host: "Hadoop by HTTP"
        key: hadoop.resourcemanager.num_active_nm
      color: "199C0D"
      calc_fnc: ALL
    - sortorder: "1"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.resourcemanager.num_lost_nm
      color: "F63100"
      calc_fnc: ALL
    - sortorder: "2"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.resourcemanager.num_rebooted_nm
      color: "00611C"
      calc_fnc: ALL
    - sortorder: "3"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.resourcemanager.num_shutdown_nm
      color: "F7941D"
      calc_fnc: ALL
    - sortorder: "4"
      item:
        host: "Hadoop by HTTP"
        key: hadoop.resourcemanager.num_unhealthy_nm
      color: "FC6EA3"
      calc_fnc: ALL
|