说明:
这篇文章主要对nagios常见配置文件进行说明。其中nagios常用的配置文件(及目录)主要有以下5个:
cgi.cfg(控制cgi访问的配置文件),nagios.cfg(Nagios主配置文件),resource.cfg(resource.cfg定义了一些变量,以便被其他文件引用,如$USER1$),objects(objects是一个目录,用于定义Nagios对象),servers(servers是自己创建的目录,Nagios可以加载该目录下所有配置文件,常用于定义各远程服务器监控信息,需要在nagios.cfg中配置)
其中:
objects目录下文件:
commands.cfg #命令定义配置文件,里面定义的命令可以被其他文件引用
contacts.cfg #联系人和联系人组配置文件
localhost.cfg #监控本地机器的配置文件
printer.cfg #监控打印机的一个示例配置文件(默认未启用)
switch.cfg #监控路由器的一个示例配置文件(默认未启用)
templates.cfg #模板配置文件,在此可以定义模板,在其他文件中引用
timeperiods.cfg #定义监控时间段的配置文件
windows.cfg #监控Windows的一个示例配置文件(默认未启用)
./servers目录下文件:
hostgroup.cfg #自己创建的主机群组配置文件
192.168.0.2.cfg #自己创建的监控远程Linux主机的配置文件
原理:

nagios主要是监控一台主机的各种信息,如硬盘,负载,端口等。这些在nagios里被定义为一个个服务(为了与主机提供的服务相区别,我这里用"项目"这个词),而实现每个监控项目,则需要通过commands.cfg文件中定义的命令。
为了不必重复定义一些项目,Nagios引入了一个模板配置文件(templates.cfg),将一些共性的属性定义成模板,以便于多次引用。
我们现在有一个监控项目是监控一台机器的web服务是否正常, 我们需要哪些元素呢?
最重要的有下面三点:首先是监控哪台机器,然后是这个监控要用什么命令实现,最后就是出了问题的时候要通知哪个联系人:
a.首先应该在commands.cfg中定义监控远程服务和资源的命令,以及如何发送邮件的命令。大部分监控远程服务和资源的命令通过/usr/local/nagios/libexec下的脚本实现,如ping命令为check_ping。/usr/local/nagios/libexec下的脚本命令的使用方法可以通过-h参数查看;
b.在contacts.cfg文件中定义联系人和联系人组,在timeperiods.cfg中定义监控时间段;
c.服务器监控配置文件中引用前面定义的元素来监控服务器状态。
具体:
1.resource.cfg配置文件,定义了些变量,以便被其他文件引用,如$USER1$
# vi /usr/local/nagios/etc/resource.cfg
$USER1$=/usr/local/nagios/libexec //定义$USER1$变量,设置插件路径
2.commands.cfg配置文件,主要定义监控服务器各服务的命令
# vi /usr/local/nagios/etc/objects/commands.cfg
define command{ //定义check-host-alive命令
command_name check-host-alive //命令名称
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5 //具体命令
}
define command{ //定义nrpe命令
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
3.contacts.cfg,定义联系人,联系人组等信息。
# vi /usr/local/nagios/etc/objects/contacts.cfg
#定义联系人,多个联系人,需写多个define contact {} 段
define contact{
contact_name nagiosadmin ; Short name of user
use generic-contact ; Inherit default values from generic-contact template (defined above)//generic-contact在templates.cfg中定义。
alias Nagios Admin ; Full name of user
email test@gmaile.com ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
}
define contact {
}
#定义联系人组
define contactgroup{
contactgroup_name admins
alias Nagios Administrators
members nagiosadmin #在此可以加入多个联系人,中间用逗号隔开
}
4.timeperiods.cfg,定义监控的时间段
# vi /usr/local/nagios/etc/objects/timeperiods.cfg
define timeperiod{
timeperiod_name 24x7 #监控所有时间段(7*24小时)
alias 24 Hours A Day, 7 Days A Week
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}
5.templates.cfg 通用模板定义,包括监控主机模板generic-host,linux-server;监控服务模板local-service;联系人模板generic-contact(并非真正的联系人,真正的联系人在contacts.cfg中定义)。很重要!!!
# vi /usr/local/nagios/etc/objects/templates.cfg
#联系人模板generic-contact
define contact{
name generic-contact ; The name of this contact template
service_notification_period 24x7 ; service notifications can be sent anytime
host_notification_period 24x7 ; host notifications can be sent anytime
service_notification_options w,u,c,r,f,s ; send notifications for all service states, flapping events, and scheduled downtime events
host_notification_options d,u,r,f,s ; send notifications for all host states, flapping events, and scheduled downtime events
service_notification_commands notify-service-by-email ; send service notifications via email
host_notification_commands notify-host-by-email ; send host notifications via email
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL CONTACT, JUST A TEMPLATE!
}
service_notification_period 24x7 //服务出了状况通知的时间段,这个时间段就是上面在timeperiods.cfg中定义的.
host_notification_period 24x7 //主机出了状况通知的时间段, 这个时间段就是上面在timeperiods.cfg中定义的
service_notification_options w,u,c,r //当服务出现w—报警(warning),u—未知(unknown),c—严重(critical),或者r—从异常情况恢复正常,在这四种情况下通知联系人(上面模板中另外还有f—状态抖动(flapping)和s—计划停机(scheduled downtime)两项).
host_notification_options d,u,r //当主机出现d—宕机(down),u—返回不可达(unreachable),r—从异常情况恢复正常,在这3种情况下通知联系人(上面模板中另外还有f—状态抖动(flapping)和s—计划停机(scheduled downtime)两项)
service_notification_commands notify-service-by-email //服务出问题通知采用的命令notify-service-by-email,这个命令是在commands.cfg中定义的,作用是给联系人发邮件.
host_notification_commands notify-host-by-email //同上,主机出问题时采用的也是发邮件的方式通知联系人
#定义generic-host主机模板
define host{
name generic-host ; The name of this host template
notifications_enabled 1 ; Host notifications are enabled
event_handler_enabled 1 ; Host event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
notification_period 24x7 ; Send host notifications at any time
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}
#定义Linux主机模板
define host{
name linux-server ; The name of this host template
use generic-host ; This template inherits other values from the generic-host template
check_period 24x7 ; By default, Linux hosts are checked round the clock
check_interval 5 ; Actively check the host every 5 minutes
retry_interval 1 ; Schedule host check retries at 1 minute intervals
max_check_attempts 10 ; Check each Linux host 10 times (max)
check_command check-host-alive ; Default command to check Linux hosts
notification_period workhours ; Linux admins hate to be woken up, so we only notify during the day
; Note that the notification_period variable is being overridden from
; the value that is inherited from the generic-host template!
notification_interval 120 ; Resend notifications every 2 hours
notification_options d,u,r ; Only send notifications for specific host states
contact_groups admins ; Notifications get sent to the admins by default
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}
6.定义被监控远程主机及被监控远程主机服务
# mkdir /usr/local/nagios/etc/servers
# vi nagios.cfg
cfg_dir=/usr/local/nagios/etc/servers //取消该行前面的#注释,在nagios.cfg配置文件中启用对/usr/local/nagios/etc/servers/目录下配置文件的引用。
#远程Linux主机监控文件,如果监控多台主机只需简单复制修改即可。我们应该牢记192.168.0.2.cfg用到的命令在commands.cfg中定义,在commands.cfg中定义的命令用到/usr/local/nagios/libexec下的插件(命令)。
# vi /usr/local/nagios/etc/servers/192.168.0.2.cfg
#定义主机
define host{
use linux-server ; Name of host template to use
; This host definition will inherit all variables that are defined
; in (or inherited by) the linux-server host template definition.
host_name 192.168.0.2 //host_name这个参数可自定义,不需要是/etc/hosts的主机名
alias 192.168.0.2 //alias这个参数也可自定义,可以和host_name不一样
address 192.168.0.2 //IP地址
}
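#定义被监控远程主机服务,如ping远程Linux主机(注意:service中的host_name必须与某个define host中定义的host_name一致;下面示例中的sectop、kslvs1需替换为实际定义的主机名,如本文件中的192.168.0.2)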
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description PING
check_command check_ping!100.0,20%!500.0,60% ;check_ping命令在commands.cfg中定义,后跟两个参数,命令及参数间用!分割。
}
#定义远程Linux主机存活判断
define service{
use generic-service ; Name of service template to use
host_name kslvs1
service_description PING
check_command check_nrpe218!check_ping
}
#检查远程Linux主机根分区使用情况,check_nrpe命令必须在/usr/local/nagios/etc/objects/commands.cfg中定义(默认未定义)
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description Root Partition
check_command check_nrpe!check_disk_root
}
#检查远程Linux主机的登录人数
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description Current Users
check_command check_nrpe!check_users
}
#检查远程Linux的主机的负载
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description Current Load
check_command check_nrpe!check_load
}
#检查远程Linux主机swap分区使用情况
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description Swap Usage
check_command check_nrpe!check_swap
}
#检查远程Linux主机的SSH服务
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description SSH
check_command check_ssh
notifications_enabled 0
}
#检查远程Linux主机的HTTP服务
define service{
use generic-service ; Name of service template to use
host_name sectop
service_description HTTP
check_command check_http
notifications_enabled 0
}
附录:
暂空!