为了提高系统的可靠性和稳定性,及时发现并解决潜在问题,机房存储设备除需要开启监控外,还需要定期进行巡检,检查监控无法发现的问题。
由于公司业务系统多,机构分散等原因,会有比较多的存储设备,同时因采购策略的不同,采购到的存储设备会有多种型号,导致日常巡检耗时耗力,亟需引入自动化处理。
目前我们用到的存储设备包括了NetApp、HP、IBM、联想及浪潮等几家公司的产品,绝大部分都可以通过ssh进行登录,有些不用登录,可以通过命令远程查看存储信息。因为涉及的设备很多,且品牌多,使用Python编写巡检脚本会更合适。
另外我们日常办公使用飞书,可通过设置Linux crontab,把执行的python结果推送到飞书群(需提前在飞书群设置好群机器人),知会到各相关人员,有问题可方便快速处理。
结论:Python + ssh(存储命令)+ Linux crontab + 飞书
在确认进行自动化巡检前,我们需要确认以下信息:
●巡检范围:所有可接入的存储设备
●巡检指标:确定需要巡检的指标。主要包括各种组件状态,比如系统状态,磁盘状态及日志状态等,具体需要根据存储特性决定。
●巡检频率:设定合适的巡检周期,配合监控,设置为每天巡检一次较为合适。
实现代码如下:
import paramiko
import subprocess
import requests
import json
# Shared report buffer: every check appends its findings here and main()
# joins the entries into the single text that is pushed to Feishu.
messages = []
# Feishu group-bot webhook URL (the value below is a placeholder string).
FEISHU_WEBHOOK_URL = "群机器人链接"
def send_feishu_message(message, timeout=10):
    """Post *message* as a plain-text message to the Feishu group bot.

    Args:
        message: Text to deliver.
        timeout: Seconds to wait for the webhook; without it a stalled
            connection would block the scheduled job indefinitely.

    A non-200 response is recorded in ``messages`` (as before) and also
    written to stderr, because ``messages`` has normally already been sent
    by the time this function runs. Network errors raised by ``requests``
    propagate to the caller.
    """
    import sys  # local import: not part of the file's import header

    headers = {"Content-Type": "application/json"}
    payload = {
        "msg_type": "text",
        "content": {"text": message},
    }
    response = requests.post(
        FEISHU_WEBHOOK_URL,
        headers=headers,
        data=json.dumps(payload),
        timeout=timeout,
    )
    if response.status_code != 200:
        failure = f"发送飞书消息失败: {response.text}"
        messages.append(failure)
        print(failure, file=sys.stderr)
# Log in to the device over SSH and execute a remote command.
def run_remote_command(ssh_client, command):
    """Execute *command* through *ssh_client* and return its decoded output.

    Args:
        ssh_client: Object exposing ``exec_command(command)`` that returns
            ``(stdin, stdout, stderr)`` streams with a ``read()`` method.
        command: Command line to run on the remote device.

    Returns:
        Tuple ``(stdout_text, stderr_text)``.
    """
    _stdin, stdout, stderr = ssh_client.exec_command(command)
    # Undecodable bytes are replaced instead of raising, so one odd byte in
    # the device output cannot abort the whole inspection.
    out_text = stdout.read().decode(errors="replace")
    err_text = stderr.read().decode(errors="replace")
    return out_text, err_text
def check_netapp_system_health(ssh_client):
    """Report the overall NetApp health from ``system health status show``.

    The finding is appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "system health status show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value, so the error must also be put
        # into the report or it would vanish silently.
        messages.append(error)
        return error
    lines = stdout.strip().split('\n')
    # The status is read from the third output line; when the output is
    # shorter than that, fall back to the raw text so it shows up as abnormal
    # instead of raising IndexError.
    status = lines[2].strip() if len(lines) > 2 else stdout.strip()
    if status != "ok":
        messages.append(f"\n*****系统健康状态异常,状态为{status}\n")
    else:
        messages.append("系统健康状态ok")
    return None
def check_netapp_subsystem_health(ssh_client):
    """Report every row of ``system health subsystem show`` not marked ``ok``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "system health subsystem show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:-1] drops the first two lines and the last one (presumably header
    # and footer rows of the command output).
    for line in stdout.strip().split('\n')[2:-1]:
        parts = line.split()
        # Columns 0 and 1 are both read below, so at least two are required.
        if len(parts) < 2:
            continue
        if parts[1] != "ok":
            messages.append(f"\n*****子系统{parts[0]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有子系统状态ok")
    return None
def check_netapp_chassis_health(ssh_client):
    """Report every row of ``system chassis show`` whose status is not ``ok``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "system chassis show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:-1] drops the first two lines and the last one (presumably header
    # and footer rows of the command output).
    for line in stdout.strip().split('\n')[2:-1]:
        parts = line.split()
        # Column 2 is used in the message, so at least three are required.
        if len(parts) < 3:
            continue
        if parts[1] != "ok":
            messages.append(f"\n*****机箱{parts[2]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有机箱状态ok")
    return None
def check_netapp_controller_health(ssh_client):
    """Report every row of ``system controller show`` not marked ``ok``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "system controller show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:-1] drops the first two lines and the last one (presumably header
    # and footer rows of the command output).
    for line in stdout.strip().split('\n')[2:-1]:
        parts = line.split()
        # The status is read from column 4, so at least five are required.
        if len(parts) < 5:
            continue
        if parts[4] != "ok":
            messages.append(f"\n*****控制器{parts[0]}异常,状态为{parts[4]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有控制器状态ok")
    return None
def check_netapp_node_health(ssh_client):
    """Report every row of ``system node show`` whose column 1 is not ``true``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "system node show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:-1] drops the first two lines and the last one (presumably header
    # and footer rows of the command output).
    for line in stdout.strip().split('\n')[2:-1]:
        parts = line.split()
        # Rows with fewer than four columns are not treated as node rows.
        if len(parts) < 4:
            continue
        if parts[1] != "true":
            messages.append(f"\n*****节点{parts[0]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有节点状态OK")
    return None
def check_netapp_network_health(ssh_client):
    """Report every row of ``network interface show`` that is not ``up/up``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "network interface show")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [3:-1] drops the first three lines and the last one (presumably header
    # and footer rows of the command output).
    for line in stdout.strip().split('\n')[3:-1]:
        parts = line.split()
        # Rows with fewer than four columns are not treated as interface rows.
        if len(parts) < 4:
            continue
        if parts[1] != "up/up":
            messages.append(f"\n*****网络接口{parts[0]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有网络接口状态up")
    return None
def check_hpe_node_state(ssh_client):
    """Report every row of ``shownode`` whose column 2 is not ``OK``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "shownode")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:] drops the first two lines (presumably header rows).
    for line in stdout.strip().split('\n')[2:]:
        parts = line.split()
        # Rows with fewer than four columns are not treated as node rows.
        if len(parts) < 4:
            continue
        if parts[2] != "OK":
            messages.append(f"\n*****节点{parts[0]}异常,状态为{parts[2]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有节点状态OK")
    return None
def check_hpe_battery_state(ssh_client):
    """Report every row of ``showbattery`` whose column 4 is not ``OK``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "showbattery")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        # The status is read from column 4, so at least five are required.
        if len(parts) < 5:
            continue
        if parts[4] != "OK":
            messages.append(f"\n*****电池{parts[0]}异常,状态为{parts[4]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有电池状态OK")
    return None
def check_hpe_disk_state(ssh_client):
    """Report every row of ``showpd`` whose column 4 is not ``normal``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "showpd")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [2:-2] drops the first two and the last two lines (presumably header
    # and summary rows of the command output).
    for line in stdout.strip().split('\n')[2:-2]:
        parts = line.split()
        # The status is read from column 4, so at least five are required.
        if len(parts) < 5:
            continue
        if parts[4] != "normal":
            messages.append(f"\n*****机箱{parts[1]}磁盘类型{parts[2]}异常,状态为{parts[4]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有磁盘状态正常")
    return None
from datetime import datetime, timedelta
def check_hpe_alert_state(ssh_client):
    """Report ``showalert -oneline`` entries dated within the last seven days.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "showalert -oneline")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    one_week_ago = datetime.now() - timedelta(days=7)
    recent_alert_found = False
    # [1:-2] drops the first line and the last two (presumably header and
    # summary rows of the command output).
    for line in stdout.strip().split('\n')[1:-2]:
        parts = line.split()
        if len(parts) < 6:
            continue
        # Everything from column 6 onward is free text; re-join it into one
        # field (empty when the row has exactly six columns).
        reason = ' '.join(parts[6:])
        alert_date = parts[3]
        try:
            alert_time = datetime.strptime(alert_date, "%Y-%m-%d")
        except ValueError:
            # Column 3 is not a date on this row; skip it.
            continue
        if alert_time > one_week_ago:
            messages.append(f"\n*****告警时间:{alert_date},原因:{reason}\n")
            recent_alert_found = True
    if not recent_alert_found:
        messages.append("无告警日志")
    return None
def check_ibm_bootdrive_health(ssh_client):
    """Report every row of ``lsnodebootdrive`` whose column 4 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsnodebootdrive")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 6:
            continue
        if parts[4] != "online":
            # Column 7 is only informational; a short row must not raise
            # IndexError and hide the offline drive.
            enclosure_serial = parts[7] if len(parts) > 7 else "未知"
            messages.append(
                f"\n*****启动盘序列号{parts[5]}状态为{parts[4]},机柜序列号为{enclosure_serial}\n"
            )
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有启动盘状态正常")
    return None
def check_ibm_node_health(ssh_client):
    """Report every row of ``lsnode`` whose column 3 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsnode")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 6:
            continue
        if parts[3] != "online":
            # Column 10 is only informational; a short row must not raise
            # IndexError and hide the offline node.
            enclosure_serial = parts[10] if len(parts) > 10 else "未知"
            messages.append(
                f"\n*****节点{parts[1]}状态为{parts[3]},机柜序列号为{enclosure_serial}\n"
            )
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有节点状态正常")
    return None
def check_ibm_nodebattery_health(ssh_client):
    """Report every row of ``lsnodebattery`` whose column 3 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsnodebattery")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        # The status is read from column 3, so at least four are required.
        if len(parts) < 4:
            continue
        if parts[3] != "online":
            messages.append(f"\n*****节点电池{parts[1]}状态为{parts[3]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有节点电池状态正常")
    return None
def check_ibm_mdisk_health(ssh_client):
    """Report every row of ``lsmdisk`` whose column 2 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsmdisk")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 3:
            continue
        if parts[2] != "online":
            messages.append(f"\n*****磁盘{parts[1]}状态为{parts[2]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有磁盘状态正常")
    return None
def check_ibm_eventlog(ssh_client):
    """Report ``lseventlog |grep alert`` entries from the last seven days.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lseventlog |grep alert")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    one_week_ago = datetime.now() - timedelta(days=7)
    recent_alert_found = False
    for line in stdout.strip().split('\n'):
        parts = line.split()
        if len(parts) < 3:
            continue
        # Everything from column 8 onward is free text. Slicing (rather than
        # indexing parts[8]) keeps rows with fewer than eight columns from
        # raising IndexError.
        description = ' '.join(parts[8:])
        try:
            event_time = datetime.strptime(parts[1], "%y%m%d%H%M%S")
        except ValueError:
            # Column 1 is not a timestamp on this row; skip it instead of
            # aborting the remaining checks for this host.
            continue
        if event_time > one_week_ago:
            messages.append(f"\n*****告警时间:{event_time},{description}\n")
            recent_alert_found = True
    if not recent_alert_found:
        messages.append("日志正常")
    return None
def check_ibm_copy_state(ssh_client, consistency_groups=("CG_esdb", "CG_newwmsdb")):
    """Report ``lsrcrelationship`` rows that are not ``consistent_synchronized``.

    Only rows whose column 12 is one of *consistency_groups* are examined.
    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.
        consistency_groups: Names matched against column 12; defaults to the
            two groups that were previously hard-coded.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsrcrelationship")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        # The state is read from column 13, so at least fourteen are required.
        if len(parts) < 14:
            continue
        if parts[12] in consistency_groups and parts[13] != "consistent_synchronized":
            messages.append(f"\n*****磁盘{parts[5]}复制状态异常,状态为{parts[13]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有磁盘复制状态正常")
    return None
def check_langchao_disk_health(ssh_client):
    """Report every row of ``lsdrive`` whose column 1 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsdrive")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 5:
            continue
        if parts[1] != "online":
            messages.append(f"\n*****磁盘类型{parts[2]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有磁盘状态正常")
    return None
def check_langchao_enclosure_health(ssh_client):
    """Report every row of ``lsenclosure`` whose column 1 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsenclosure")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 5:
            continue
        if parts[1] != "online":
            messages.append(f"\n*****机柜{parts[2]}异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有机柜状态正常")
    return None
def check_langchao_port_health(ssh_client):
    """Report ``lsportsas`` rows with column 1 equal to ``1`` that are not online.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lsportsas")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        if len(parts) < 8:
            continue
        # Only rows whose column 1 is "1" are checked; status is column 6.
        if parts[1] == "1" and parts[6] != "online":
            messages.append(f"\n*****节点端口{parts[4]}异常,状态为{parts[6]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有节点端口状态正常")
    return None
def check_langchao_host_state(ssh_client):
    """Report every row of ``lshost`` whose column 4 is not ``online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ssh_client: Connected client accepted by ``run_remote_command``.

    Returns:
        The error text when the command wrote to stderr, otherwise ``None``.
    """
    stdout, stderr = run_remote_command(ssh_client, "lshost")
    if stderr:
        error = f"错误: {stderr}"
        # Callers discard the return value; record the error in the report.
        messages.append(error)
        return error
    abnormal_found = False
    # [1:] drops the first line (presumably the header row).
    for line in stdout.strip().split('\n')[1:]:
        parts = line.split()
        # The status is read from column 4, so at least five are required.
        if len(parts) < 5:
            continue
        if parts[4] != "online":
            messages.append(f"\n*****主机{parts[1]}端口异常,状态为{parts[4]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有主机端口状态正常")
    return None
# No login session needed: query the remote array directly with SMcli.
def run_smcli_command(ip, username, password, command, timeout=120):
    """Run one SMcli script command against the array at *ip*.

    Args:
        ip: Address of the storage array.
        username: SMcli user name (``-u``).
        password: SMcli password (``-p``).
        command: Script command passed with ``-c``.
        timeout: Seconds to wait before giving up, so that a hung SMcli
            cannot block the scheduled inspection forever.

    Returns:
        The stripped standard output on success, otherwise a human-readable
        error string (failures are reported in-band, never raised).
    """
    # NOTE(review): the password is passed on the command line and may be
    # visible to other local users in the process list.
    smcli_command = [
        'SMcli.bat',
        ip,
        '-c', command,
        '-u', username,
        '-p', password,
        '-k', 'trust',
    ]
    try:
        result = subprocess.run(
            smcli_command,
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
    except subprocess.CalledProcessError as e:
        return f"命令执行失败(返回码 {e.returncode}):\n{e.stderr}"
    except subprocess.TimeoutExpired:
        return f"命令执行失败(超时 {timeout} 秒)"
    except FileNotFoundError:
        return "错误: SMcli 未找到,请确认路径或安装"
    except Exception as e:
        # Last-resort boundary: callers expect a string, not an exception.
        return f"未知错误: {str(e)}"
    return result.stdout.strip()  # standard output of the command
def show_all_drives(ip, username, password):
    """Report drives from ``show allDrives;`` whose status is not ``Optimal``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ip: Address of the storage array.
        username: SMcli user name.
        password: SMcli password.
    """
    command = "show allDrives;"
    output = run_smcli_command(ip, username, password, command)
    # run_smcli_command reports failures in-band as text with these prefixes.
    # Without this check a failed command would parse to zero rows and be
    # reported as "all drives normal".
    if output.startswith(("命令执行失败", "错误: SMcli", "未知错误:")):
        messages.append(f"\n*****{output}\n")
        return
    abnormal_found = False
    # NOTE(review): only output lines 17..25 are inspected; presumably this
    # is where the drive table sits for the current array — confirm if the
    # drive count changes.
    for line in output.strip().split('\n')[17:26]:
        parts = line.split()
        if len(parts) < 10:
            continue
        if parts[2] != "Optimal":
            messages.append(f"\n*****磁盘位置{parts[1]}状态为{parts[2]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有磁盘状态正常")
def show_all_volumes_summary(ip, username, password):
    """Report rows of ``show allvolumes summary;`` that are not ``Optimal``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ip: Address of the storage array.
        username: SMcli user name.
        password: SMcli password.
    """
    command = "show allvolumes summary;"
    output = run_smcli_command(ip, username, password, command)
    # run_smcli_command reports failures in-band as text with these prefixes.
    # Without this check a failed command would parse to zero rows and be
    # reported as "all volume groups normal".
    if output.startswith(("命令执行失败", "错误: SMcli", "未知错误:")):
        messages.append(f"\n*****{output}\n")
        return
    abnormal_found = False
    # [11:] drops the first eleven lines (presumably banner and header text).
    for line in output.strip().split('\n')[11:]:
        parts = line.split()
        if len(parts) < 6:
            continue
        if parts[3] != "Optimal":
            messages.append(f"\n*****卷组{parts[0]}状态为{parts[3]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("所有卷组状态正常")
def show_all_controllerA_summary(ip, username, password):
    """Report controller A when ``show controller[a] summary;`` is not ``Online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ip: Address of the storage array.
        username: SMcli user name.
        password: SMcli password.
    """
    command = "show controller[a] summary;"
    output = run_smcli_command(ip, username, password, command)
    # run_smcli_command reports failures in-band as text with these prefixes.
    # Without this check a failed command would be reported as "normal".
    if output.startswith(("命令执行失败", "错误: SMcli", "未知错误:")):
        messages.append(f"\n*****{output}\n")
        return
    abnormal_found = False
    # Only output line 9 is inspected (presumably the status line).
    for line in output.strip().split('\n')[9:10]:
        parts = line.split()
        # The status is read from column 1, so at least two are required.
        if len(parts) < 2:
            continue
        if parts[1] != "Online":
            messages.append(f"\n*****控制器A异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("控制器A状态正常")
def show_all_controllerB_summary(ip, username, password):
    """Report controller B when ``show controller[b] summary;`` is not ``Online``.

    Findings are appended to the module-level ``messages`` list.

    Args:
        ip: Address of the storage array.
        username: SMcli user name.
        password: SMcli password.
    """
    command = "show controller[b] summary;"
    output = run_smcli_command(ip, username, password, command)
    # run_smcli_command reports failures in-band as text with these prefixes.
    # Without this check a failed command would be reported as "normal".
    if output.startswith(("命令执行失败", "错误: SMcli", "未知错误:")):
        messages.append(f"\n*****{output}\n")
        return
    abnormal_found = False
    # Only output line 9 is inspected (presumably the status line).
    for line in output.strip().split('\n')[9:10]:
        parts = line.split()
        # The status is read from column 1, so at least two are required.
        if len(parts) < 2:
            continue
        if parts[1] != "Online":
            messages.append(f"\n*****控制器B异常,状态为{parts[1]}\n")
            abnormal_found = True
    if not abnormal_found:
        messages.append("控制器B状态正常")
def _run_ssh_checks(host, checks, connect_timeout=10):
    """Run every callable in *checks* over one SSH session to *host*.

    The session is closed even when connecting or a check raises.
    """
    ssh_client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; acceptable only on a trusted management network.
    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh_client.connect(
            host['hostname'],
            host['port'],
            host['username'],
            host['password'],
            # Bound the TCP connect so an unreachable device cannot hang
            # the scheduled run.
            timeout=connect_timeout,
        )
        for check in checks:
            check(ssh_client)
    finally:
        ssh_client.close()


def main():
    """Inspect every configured storage device and push the report to Feishu.

    For each host a header line is appended to ``messages``, the checks that
    match its ``storage_type`` are run, and any exception is recorded as a
    failure line so the remaining hosts are still inspected. The joined
    report is sent once at the end.
    """
    hosts = [
        {'hostname': '', 'port': 22, 'username': '', 'password': '、', 'alias': '', 'storage_type': 'Netapp'},
        {'hostname': '', 'port': 22, 'username': '', 'password': '', 'alias': '', 'storage_type': 'HPE'},
        {'hostname': '', 'port': 22, 'username': '', 'password': '', 'alias': '', 'storage_type': 'HP'},
        {'hostname': '', 'port': 22, 'username': '', 'password': '', 'alias': '', 'storage_type': 'IBM'},
        {'hostname': '', 'port': 22, 'username': '', 'password': '', 'alias': '', 'storage_type': '浪潮'},
        {"ip": "", "username": "", "password": "", "alias": "", 'storage_type': 'lenovo'},
    ]
    for host in hosts:
        hostname_or_ip = host.get('hostname', host.get('ip'))
        messages.append(f"\n=== {host['alias']} {hostname_or_ip} 巡检结果 ===")
        storage_type = host['storage_type']
        # The SSH client now lives inside _run_ssh_checks, so hosts that never
        # open an SSH session (lenovo, unsupported types) no longer reach a
        # close() on an unbound or stale client.
        try:
            if storage_type == 'Netapp':
                _run_ssh_checks(host, (
                    check_netapp_system_health,
                    check_netapp_subsystem_health,
                    check_netapp_chassis_health,
                    check_netapp_controller_health,
                    check_netapp_node_health,
                    check_netapp_network_health,
                ))
            elif storage_type == 'HPE':
                _run_ssh_checks(host, (
                    check_hpe_node_state,
                    check_hpe_battery_state,
                    check_hpe_disk_state,
                    check_hpe_alert_state,
                ))
            elif storage_type == 'HP':
                # NOTE(review): check_hp_disk_health is not defined in the
                # visible source; if it is missing, the NameError is caught
                # below and reported for this host.
                _run_ssh_checks(host, (check_hp_disk_health,))
            elif storage_type == '浪潮':
                # NOTE(review): the IBM event-log check is reused here;
                # presumably the CLI is compatible — confirm.
                _run_ssh_checks(host, (
                    check_langchao_disk_health,
                    check_langchao_enclosure_health,
                    check_ibm_eventlog,
                    check_langchao_port_health,
                    check_langchao_host_state,
                ))
            elif storage_type == 'IBM':
                _run_ssh_checks(host, (
                    check_ibm_bootdrive_health,
                    check_ibm_node_health,
                    check_ibm_nodebattery_health,
                    check_ibm_mdisk_health,
                    check_ibm_eventlog,
                    check_ibm_copy_state,
                ))
            elif storage_type == 'lenovo':
                # Queried through SMcli; no SSH session is involved.
                show_all_drives(host["ip"], host["username"], host["password"])
                show_all_volumes_summary(host["ip"], host["username"], host["password"])
                show_all_controllerA_summary(host["ip"], host["username"], host["password"])
                show_all_controllerB_summary(host["ip"], host["username"], host["password"])
            else:
                messages.append("暂不支持该存储类型")
        except Exception as e:
            # Top-level boundary: one failing host must not stop the others.
            messages.append(f"连接失败: {e}")
    send_feishu_message(" ".join(messages))


if __name__ == "__main__":
    main()
结果如下:
=== XXX系统存储a.b.c.d巡检结果 === 系统健康状态ok 所有子系统状态ok 所有机箱状态ok 所有控制器状态ok 所有节点状态OK 所有网络接口状态up
=== XXX系统存储a.b.c.d巡检结果 === 所有磁盘状态正常 所有机柜状态正常 日志正常 所有节点端口状态正常 所有主机端口状态正常
由于产品特性,可能有些核心组件巡查项无法实现,需要我们后续通过其它方法来实现,不断完善巡检内容。
版权说明:如非注明,本站文章均为 扬州驻场服务-网络设备调试-监控维修-南京泽同信息科技有限公司 原创,转载请注明出处和附带本文链接。
请在这里放置你的在线分享代码