优化硬盘检测逻辑避免唤醒休眠硬盘

This commit is contained in:
xiaochao
2025-07-28 14:10:23 +08:00
parent 30b1b7d271
commit 25348fff9b
3 changed files with 225 additions and 125 deletions

View File

@@ -313,11 +313,34 @@ class FlynasCoordinator(DataUpdateCoordinator):
if connection_id is not None: if connection_id is not None:
await self.release_ssh_connection(connection_id) await self.release_ssh_connection(connection_id)
async def ping_system(self) -> bool:
    """Cheaply probe whether the configured host answers a single ping.

    Returns:
        True when ``self.host`` is ``localhost``/``127.0.0.1`` or when the
        ``ping`` command exits with status 0. False when ``ping`` exits
        non-zero, when the probe exceeds the overall timeout, or when the
        command cannot be launched at all.
    """
    # The local machine is always considered reachable; no process needed.
    if self.host in ('localhost', '127.0.0.1'):
        return True

    proc = None
    try:
        # "-c 1" sends exactly one echo request so the probe stays light.
        proc = await asyncio.create_subprocess_exec(
            'ping', '-c', '1', '-W', '1', self.host,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        # Hard cap of 2 seconds in case ping itself stalls (e.g. on DNS).
        await asyncio.wait_for(proc.wait(), timeout=2)
        return proc.returncode == 0
    except Exception:
        # Best-effort probe: any failure (timeout, missing binary, ...)
        # is reported as "offline" rather than propagated to the caller.
        return False
    finally:
        # wait_for() only cancels our wait on timeout; the child keeps
        # running. Kill and reap it so repeated probes do not pile up
        # stray ping processes.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The child exited between the check and the kill.
                pass
            await proc.wait()
async def _monitor_system_status(self): async def _monitor_system_status(self):
"""系统离线时轮询检测状态""" """系统离线时轮询检测状态"""
self._debug_log(f"启动系统状态监控,每{self._retry_interval}秒检测一次") self._debug_log(f"启动系统状态监控,每{self._retry_interval}秒检测一次")
# 使用指数退避策略,避免频繁检测
check_interval = self._retry_interval
max_interval = 300 # 最大5分钟检测一次
while True: while True:
await asyncio.sleep(self._retry_interval) await asyncio.sleep(check_interval)
if await self.ping_system(): if await self.ping_system():
self._info_log("检测到系统已开机,触发重新加载") self._info_log("检测到系统已开机,触发重新加载")
@@ -326,24 +349,10 @@ class FlynasCoordinator(DataUpdateCoordinator):
self.hass.config_entries.async_reload(self.config_entry.entry_id) self.hass.config_entries.async_reload(self.config_entry.entry_id)
) )
break break
else:
async def ping_system(self) -> bool: # 系统仍然离线,增加检测间隔(指数退避)
"""轻量级系统状态检测""" check_interval = min(check_interval * 1.5, max_interval)
# 对于本地主机直接返回True self._debug_log(f"系统仍离线,下次检测间隔: {check_interval}")
if self.host in ['localhost', '127.0.0.1']:
return True
try:
# 使用异步ping检测
proc = await asyncio.create_subprocess_exec(
'ping', '-c', '1', '-W', '1', self.host,
stdout=asyncio.subprocess.DEVNULL,
stderr=asyncio.subprocess.DEVNULL
)
await proc.wait()
return proc.returncode == 0
except Exception:
return False
async def _async_update_data(self): async def _async_update_data(self):
"""数据更新入口,优化命令执行频率""" """数据更新入口,优化命令执行频率"""

View File

@@ -14,6 +14,7 @@ class DiskManager:
self.disk_full_info_cache = {} # 缓存磁盘完整信息 self.disk_full_info_cache = {} # 缓存磁盘完整信息
self.first_run = True # 首次运行标志 self.first_run = True # 首次运行标志
self.initial_detection_done = False # 首次完整检测完成标志 self.initial_detection_done = False # 首次完整检测完成标志
self.disk_io_stats_cache = {} # 缓存磁盘I/O统计信息
def extract_value(self, text: str, patterns, default="未知", format_func=None): def extract_value(self, text: str, patterns, default="未知", format_func=None):
if not text: if not text:
@@ -38,10 +39,9 @@ class DiskManager:
async def check_disk_active(self, device: str, window: int = 30) -> bool: async def check_disk_active(self, device: str, window: int = 30) -> bool:
"""检查硬盘在指定时间窗口内是否有活动""" """检查硬盘在指定时间窗口内是否有活动"""
try: try:
# 正确的路径是 /sys/block/{device}/stat
stat_path = f"/sys/block/{device}/stat" stat_path = f"/sys/block/{device}/stat"
# 读取统计文件 # 读取当前统计文件
stat_output = await self.coordinator.run_command(f"cat {stat_path} 2>/dev/null") stat_output = await self.coordinator.run_command(f"cat {stat_path} 2>/dev/null")
if not stat_output: if not stat_output:
self.logger.debug(f"无法读取 {stat_path},默认返回活跃状态") self.logger.debug(f"无法读取 {stat_path},默认返回活跃状态")
@@ -53,51 +53,147 @@ class DiskManager:
self.logger.debug(f"无效的统计信息格式:{stat_output}") self.logger.debug(f"无效的统计信息格式:{stat_output}")
return True return True
# 关键字段当前正在进行的I/O操作数量第9个字段索引8 try:
in_flight = int(stats[8]) # /sys/block/{device}/stat 字段说明:
# 0: read I/Os requests 读请求次数
# 1: read I/Os merged 读请求合并次数
# 2: read sectors 读扇区数
# 3: read ticks 读操作耗时(ms)
# 4: write I/Os requests 写请求次数
# 5: write I/Os merged 写请求合并次数
# 6: write sectors 写扇区数
# 7: write ticks 写操作耗时(ms)
# 8: in_flight 当前进行中的I/O请求数
# 9: io_ticks I/O活动时间(ms)
# 10: time_in_queue 队列中的总时间(ms)
# 如果当前有I/O操作直接返回活跃状态 current_stats = {
if in_flight > 0: 'read_ios': int(stats[0]),
'write_ios': int(stats[4]),
'in_flight': int(stats[8]),
'io_ticks': int(stats[9])
}
# 如果当前有正在进行的I/O操作直接返回活跃状态
if current_stats['in_flight'] > 0:
self.logger.debug(f"磁盘 {device} 有正在进行的I/O操作: {current_stats['in_flight']}")
self.disk_io_stats_cache[device] = current_stats
return True
# 检查是否有缓存的统计信息
cached_stats = self.disk_io_stats_cache.get(device)
if cached_stats:
# 比较I/O请求次数的变化
read_ios_diff = current_stats['read_ios'] - cached_stats['read_ios']
write_ios_diff = current_stats['write_ios'] - cached_stats['write_ios']
io_ticks_diff = current_stats['io_ticks'] - cached_stats['io_ticks']
self.logger.debug(f"磁盘 {device} I/O变化: 读={read_ios_diff}, 写={write_ios_diff}, 活动时间={io_ticks_diff}ms")
# 如果在检测窗口内有I/O活动认为磁盘活跃
if read_ios_diff > 0 or write_ios_diff > 0 or io_ticks_diff > 100: # 100ms内的活动
self.logger.debug(f"磁盘 {device} 在窗口期内有I/O活动")
self.disk_io_stats_cache[device] = current_stats
return True
# 检查io_ticks是否表明最近有活动
# io_ticks是累积值如果在合理范围内增长说明有轻微活动
if io_ticks_diff > 0 and io_ticks_diff < window * 1000: # 在窗口时间内的轻微活动
self.logger.debug(f"磁盘 {device} 有轻微I/O活动")
self.disk_io_stats_cache[device] = current_stats
return True
else:
# 首次检测,保存当前状态并认为活跃
self.logger.debug(f"磁盘 {device} 首次检测,保存统计信息")
self.disk_io_stats_cache[device] = current_stats
return True
# 更新缓存
self.disk_io_stats_cache[device] = current_stats
# 检查硬盘电源状态
power_state = await self.get_disk_power_state(device)
if power_state in ["standby", "sleep", "idle"]:
self.logger.debug(f"磁盘 {device} 处于省电状态: {power_state}")
return False
# 所有检查都通过,返回非活跃状态
self.logger.debug(f"磁盘 {device} 判定为非活跃状态")
return False
except (ValueError, IndexError) as e:
self.logger.debug(f"解析统计信息失败: {e}")
return True return True
# 检查I/O操作时间第10个字段索引9 - io_ticks单位毫秒
io_ticks = int(stats[9])
# 如果设备在窗口时间内有I/O活动返回活跃状态
if io_ticks > window * 1000:
return True
# 所有检查都通过,返回非活跃状态
return False
except Exception as e: except Exception as e:
self.logger.error(f"检测硬盘活动状态失败: {str(e)}", exc_info=True) self.logger.error(f"检测硬盘活动状态失败: {str(e)}")
return True # 出错时默认执行检测 return True # 出错时默认执行检测
async def get_disk_activity(self, device: str) -> str: async def get_disk_power_state(self, device: str) -> str:
"""获取硬盘活动状态(活动中/空闲中/休眠中)""" """获取硬盘电源状态"""
try: try:
# 检查硬盘是否处于休眠状态 # 检查 SCSI 设备状态
state_path = f"/sys/block/{device}/device/state" state_path = f"/sys/block/{device}/device/state"
state_output = await self.coordinator.run_command(f"cat {state_path} 2>/dev/null || echo 'unknown'") state_output = await self.coordinator.run_command(f"cat {state_path} 2>/dev/null || echo 'unknown'")
state = state_output.strip().lower() state = state_output.strip().lower()
if state in ["standby", "sleep"]: if state in ["running", "active"]:
return "active"
elif state in ["standby", "sleep"]:
return state
# 对于某些设备尝试通过hdparm检查状态非侵入性
hdparm_output = await self.coordinator.run_command(f"hdparm -C /dev/{device} 2>/dev/null || echo 'unknown'")
if "standby" in hdparm_output.lower():
return "standby"
elif "sleeping" in hdparm_output.lower():
return "sleep"
elif "active/idle" in hdparm_output.lower():
return "active"
return "unknown"
except Exception as e:
self.logger.debug(f"获取磁盘 {device} 电源状态失败: {e}")
return "unknown"
async def get_disk_activity(self, device: str) -> str:
"""获取硬盘活动状态(活动中/空闲中/休眠中)"""
try:
# 先检查电源状态
power_state = await self.get_disk_power_state(device)
if power_state in ["standby", "sleep"]:
return "休眠中" return "休眠中"
# 检查最近一分钟内的硬盘活动 # 检查最近的I/O活动
stat_path = f"/sys/block/{device}/stat" stat_path = f"/sys/block/{device}/stat"
stat_output = await self.coordinator.run_command(f"cat {stat_path}") stat_output = await self.coordinator.run_command(f"cat {stat_path} 2>/dev/null")
stats = stat_output.split()
if len(stats) >= 11: if stat_output:
# 第9个字段是最近完成的读操作数 stats = stat_output.split()
# 第10个字段是最近完成的写操作数 if len(stats) >= 11:
recent_reads = int(stats[8]) try:
recent_writes = int(stats[9]) in_flight = int(stats[8]) # 当前进行中的I/O
if recent_reads > 0 or recent_writes > 0: # 如果有正在进行的I/O返回活动中
return "活动中" if in_flight > 0:
return "活动中"
# 检查缓存的统计信息来判断近期活动
cached_stats = self.disk_io_stats_cache.get(device)
if cached_stats:
current_read_ios = int(stats[0])
current_write_ios = int(stats[4])
read_diff = current_read_ios - cached_stats.get('read_ios', 0)
write_diff = current_write_ios - cached_stats.get('write_ios', 0)
if read_diff > 0 or write_diff > 0:
return "活动中"
except (ValueError, IndexError):
pass
return "空闲中" return "空闲中"

View File

@@ -293,89 +293,84 @@ class SystemManager:
return {} return {}
async def get_vol_usage(self) -> dict: async def get_vol_usage(self) -> dict:
"""获取 /vol* 开头的存储卷使用信息""" """获取 /vol* 开头的存储卷使用信息,避免唤醒休眠磁盘"""
try: try:
# 优先使用字节单位 # 首先检查哪些卷是活跃的,避免访问休眠磁盘
df_output = await self.coordinator.run_command("df -B 1 /vol* 2>/dev/null") active_vols = await self.check_active_volumes()
if df_output:
return self.parse_df_bytes(df_output)
df_output = await self.coordinator.run_command("df -h /vol*") if active_vols:
if df_output: # 只查询活跃的卷,避免使用通配符可能唤醒所有磁盘
return self.parse_df_human_readable(df_output) vol_list = " ".join(active_vols)
df_output = await self.coordinator.run_command(f"df -B 1 {vol_list} 2>/dev/null")
if df_output:
return self.parse_df_bytes(df_output)
df_output = await self.coordinator.run_command(f"df -h {vol_list} 2>/dev/null")
if df_output:
return self.parse_df_human_readable(df_output)
# 如果没有活跃卷或者上述方法失败,使用缓存或者返回空
return {} return {}
except Exception as e: except Exception as e:
self.logger.error("获取存储卷信息失败: %s", str(e)) self.logger.error("获取存储卷信息失败: %s", str(e))
return {} return {}
def parse_df_bytes(self, df_output: str) -> dict: async def check_active_volumes(self) -> list:
volumes = {} """检查当前活跃的存储卷,避免唤醒休眠磁盘"""
for line in df_output.splitlines()[1:]: try:
parts = line.split() # 获取所有挂载点,这个操作不会访问磁盘内容
if len(parts) < 6: mount_output = await self.coordinator.run_command("mount | grep '/vol'")
continue active_vols = []
mount_point = parts[-1] for line in mount_output.splitlines():
# 只处理 /vol 开头的挂载点 if '/vol' in line:
if not mount_point.startswith("/vol"): # 提取挂载点
continue parts = line.split()
for part in parts:
if part.startswith('/vol'):
# 检查这个卷对应的磁盘是否活跃
if await self.is_volume_disk_active(part):
active_vols.append(part)
break
try: self._debug_log(f"检测到活跃存储卷: {active_vols}")
size_bytes = int(parts[1]) return active_vols
used_bytes = int(parts[2])
avail_bytes = int(parts[3])
use_percent = parts[4]
def bytes_to_human(b): except Exception as e:
for unit in ['', 'K', 'M', 'G', 'T']: self._debug_log(f"检查活跃存储卷失败: {e}")
if abs(b) < 1024.0: return []
return f"{b:.1f}{unit}"
b /= 1024.0
return f"{b:.1f}P"
volumes[mount_point] = { async def is_volume_disk_active(self, mount_point: str) -> bool:
"filesystem": parts[0], """检查存储卷对应的磁盘是否活跃"""
"size": bytes_to_human(size_bytes), try:
"used": bytes_to_human(used_bytes), # 获取挂载点对应的设备
"available": bytes_to_human(avail_bytes), device_output = await self.coordinator.run_command(f"findmnt -n -o SOURCE {mount_point} 2>/dev/null")
"use_percent": use_percent if not device_output:
} return False
except (ValueError, IndexError) as e:
self.logger.debug("解析存储卷行失败: %s - %s", line, str(e))
continue
return volumes device = device_output.strip()
# 提取设备名(去掉分区号)
import re
device_match = re.search(r'/dev/([a-zA-Z]+)', device)
if device_match:
device_name = device_match.group(1)
def parse_df_human_readable(self, df_output: str) -> dict: # 检查设备的I/O统计不直接访问磁盘
volumes = {} stat_path = f"/sys/block/{device_name}/stat"
for line in df_output.splitlines()[1:]: stat_output = await self.coordinator.run_command(f"cat {stat_path} 2>/dev/null")
parts = line.split()
if len(parts) < 6:
continue
mount_point = parts[-1] if stat_output:
if not mount_point.startswith("/vol"): stats = stat_output.split()
continue if len(stats) >= 9:
in_flight = int(stats[8]) # 当前进行中的I/O
return in_flight > 0 # 有I/O活动认为是活跃的
try: return False
size = parts[1]
used = parts[2]
avail = parts[3]
use_percent = parts[4]
volumes[mount_point] = { except Exception as e:
"filesystem": parts[0], self._debug_log(f"检查卷磁盘活跃状态失败 {mount_point}: {e}")
"size": size, return False
"used": used,
"available": avail,
"use_percent": use_percent
}
except (ValueError, IndexError) as e:
self.logger.debug("解析存储卷行失败: %s - %s", line, str(e))
continue
return volumes
async def reboot_system(self): async def reboot_system(self):
"""重启系统""" """重启系统"""