# system_resources.py (20 KB)
# System resource monitor: samples CPU, memory, disk, network and GPU metrics
# and stores them in MySQL.
  1. import psutil
  2. import time
  3. import os
  4. import socket
  5. import pymysql
  6. import platform
  7. from datetime import datetime
  8. from config import mysql_config
  9. # Try GPU
  10. try:
  11. import pynvml
  12. pynvml.nvmlInit()
  13. GPU_AVAILABLE = True
  14. except Exception:
  15. GPU_AVAILABLE = False
  16. # 雪花算法ID生成器
  17. class SnowflakeIdGenerator:
  18. def __init__(self, datacenter_id=1, worker_id=1):
  19. self.datacenter_id = datacenter_id
  20. self.worker_id = worker_id
  21. self.sequence = 0
  22. self.last_timestamp = -1
  23. # 各部分位数
  24. self.datacenter_id_bits = 5
  25. self.worker_id_bits = 5
  26. self.sequence_bits = 12
  27. # 最大值
  28. self.max_datacenter_id = -1 ^ (-1 << self.datacenter_id_bits)
  29. self.max_worker_id = -1 ^ (-1 << self.worker_id_bits)
  30. self.max_sequence = -1 ^ (-1 << self.sequence_bits)
  31. # 位移
  32. self.worker_id_shift = self.sequence_bits
  33. self.datacenter_id_shift = self.sequence_bits + self.worker_id_bits
  34. self.timestamp_shift = self.sequence_bits + self.worker_id_bits + self.datacenter_id_bits
  35. # 起始时间戳 (2020-01-01)
  36. self.epoch = 1577836800000
  37. def _current_millis(self):
  38. return int(time.time() * 1000)
  39. def _wait_next_millis(self, last_timestamp):
  40. timestamp = self._current_millis()
  41. while timestamp <= last_timestamp:
  42. timestamp = self._current_millis()
  43. return timestamp
  44. def generate_id(self):
  45. timestamp = self._current_millis()
  46. if timestamp < self.last_timestamp:
  47. raise Exception("Clock moved backwards!")
  48. if timestamp == self.last_timestamp:
  49. self.sequence = (self.sequence + 1) & self.max_sequence
  50. if self.sequence == 0:
  51. timestamp = self._wait_next_millis(self.last_timestamp)
  52. else:
  53. self.sequence = 0
  54. self.last_timestamp = timestamp
  55. return ((timestamp - self.epoch) << self.timestamp_shift) | \
  56. (self.datacenter_id << self.datacenter_id_shift) | \
  57. (self.worker_id << self.worker_id_shift) | \
  58. self.sequence
# Global ID generator: a single module-level instance (default datacenter and
# worker IDs) whose generate_id() supplies the primary key of every inserted row.
id_generator = SnowflakeIdGenerator()
  61. def mb(v):
  62. return round(v / 1024 / 1024, 2)
  63. def cpu_info():
  64. return {
  65. "usage": psutil.cpu_percent(interval=1),
  66. "cores": psutil.cpu_count(logical=True),
  67. "load": os.getloadavg()
  68. }
  69. def memory_info():
  70. m = psutil.virtual_memory()
  71. return {
  72. "used": mb(m.used),
  73. "total": mb(m.total),
  74. "percent": m.percent
  75. }
  76. def disk_info(prev=None, interval=1):
  77. now = psutil.disk_io_counters()
  78. if prev is None:
  79. return now, {"read_mb": 0, "write_mb": 0}
  80. read_speed = (now.read_bytes - prev.read_bytes) / interval / 1024 / 1024
  81. write_speed = (now.write_bytes - prev.write_bytes) / interval / 1024 / 1024
  82. return now, {
  83. "read_mb": round(read_speed, 2),
  84. "write_mb": round(write_speed, 2)
  85. }
  86. def get_disk_usage():
  87. """获取磁盘使用情况"""
  88. disk = psutil.disk_usage('/')
  89. return {
  90. "total": round(disk.total / 1024 / 1024 / 1024, 2), # GB
  91. "used": round(disk.used / 1024 / 1024 / 1024, 2), # GB
  92. "free": round(disk.free / 1024 / 1024 / 1024, 2), # GB
  93. "percent": disk.percent
  94. }
  95. def disk_io_info(prev=None, interval=1):
  96. """获取磁盘IO信息,包括IOPS"""
  97. now = psutil.disk_io_counters()
  98. if prev is None:
  99. return now, {
  100. "read_mb": 0,
  101. "write_mb": 0,
  102. "read_iops": 0,
  103. "write_iops": 0
  104. }
  105. read_speed = (now.read_bytes - prev.read_bytes) / interval / 1024 / 1024
  106. write_speed = (now.write_bytes - prev.write_bytes) / interval / 1024 / 1024
  107. read_iops = int((now.read_count - prev.read_count) / interval)
  108. write_iops = int((now.write_count - prev.write_count) / interval)
  109. return now, {
  110. "read_mb": round(read_speed, 2),
  111. "write_mb": round(write_speed, 2),
  112. "read_iops": read_iops,
  113. "write_iops": write_iops
  114. }
  115. def network_speed(prev, interval):
  116. now = psutil.net_io_counters()
  117. up = (now.bytes_sent - prev.bytes_sent) / interval
  118. down = (now.bytes_recv - prev.bytes_recv) / interval
  119. return now, round(up / 1024 / 1024, 2), round(down / 1024 / 1024, 2)
  120. def gpu_info():
  121. if not GPU_AVAILABLE:
  122. return []
  123. gpus = []
  124. for i in range(pynvml.nvmlDeviceGetCount()):
  125. h = pynvml.nvmlDeviceGetHandleByIndex(i)
  126. mem = pynvml.nvmlDeviceGetMemoryInfo(h)
  127. util = pynvml.nvmlDeviceGetUtilizationRates(h)
  128. temp = pynvml.nvmlDeviceGetTemperature(
  129. h, pynvml.NVML_TEMPERATURE_GPU
  130. )
  131. gpus.append({
  132. "id": i,
  133. "util": util.gpu,
  134. "mem_used": mb(mem.used),
  135. "mem_total": mb(mem.total),
  136. "temp": temp
  137. })
  138. return gpus
  139. def get_local_ip():
  140. """获取本机内网IP地址和网卡名称"""
  141. try:
  142. # 创建一个UDP socket
  143. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  144. # 连接到外部地址(不需要真正连接,只是为了获取本地IP)
  145. s.connect(("8.8.8.8", 80))
  146. local_ip = s.getsockname()[0]
  147. s.close()
  148. # 获取网卡名称
  149. net_if_addrs = psutil.net_if_addrs()
  150. interface_name = "Unknown"
  151. for interface, addrs in net_if_addrs.items():
  152. for addr in addrs:
  153. if addr.family == socket.AF_INET and addr.address == local_ip:
  154. interface_name = interface
  155. break
  156. if interface_name != "Unknown":
  157. break
  158. return interface_name, local_ip
  159. except Exception:
  160. # 如果上述方法失败,尝试获取hostname对应的IP
  161. try:
  162. hostname = socket.gethostname()
  163. local_ip = socket.gethostbyname(hostname)
  164. # 尝试获取网卡名称
  165. net_if_addrs = psutil.net_if_addrs()
  166. interface_name = "Unknown"
  167. for interface, addrs in net_if_addrs.items():
  168. for addr in addrs:
  169. if addr.family == socket.AF_INET and addr.address == local_ip:
  170. interface_name = interface
  171. break
  172. if interface_name != "Unknown":
  173. break
  174. return interface_name, local_ip
  175. except Exception:
  176. return "Unknown", "Unable to get IP"
  177. def get_db_connection():
  178. """获取数据库连接"""
  179. return pymysql.connect(**mysql_config)
  180. def get_server_base_info():
  181. """获取服务器基础信息"""
  182. try:
  183. # 获取主机名
  184. server_name = socket.gethostname()
  185. if not server_name:
  186. server_name = "服务"
  187. # 获取操作系统信息
  188. os_type = platform.system() # Linux, Windows, Darwin
  189. os_version = platform.release() # 内核版本
  190. # 尝试获取更详细的系统信息
  191. try:
  192. if os_type == "Linux":
  193. # 尝试读取 /etc/os-release
  194. if os.path.exists("/etc/os-release"):
  195. with open("/etc/os-release", "r") as f:
  196. for line in f:
  197. if line.startswith("PRETTY_NAME="):
  198. os_version = line.split("=")[1].strip().strip('"')
  199. break
  200. except:
  201. pass
  202. # 服务器类型默认为物理机
  203. server_type = "物理机"
  204. # 尝试判断是否为虚拟机或容器
  205. try:
  206. # 检查是否在Docker容器中
  207. if os.path.exists("/.dockerenv"):
  208. server_type = "容器"
  209. # 检查是否为虚拟机
  210. elif os.path.exists("/sys/class/dmi/id/product_name"):
  211. with open("/sys/class/dmi/id/product_name", "r") as f:
  212. product = f.read().lower()
  213. if "virtual" in product or "vmware" in product or "kvm" in product:
  214. server_type = "虚拟机"
  215. except:
  216. pass
  217. return {
  218. "server_name": server_name,
  219. "server_type": server_type,
  220. "os_type": os_type,
  221. "os_version": os_version
  222. }
  223. except Exception as e:
  224. print(f"Error getting server base info: {e}")
  225. return {
  226. "server_name": "Unknown",
  227. "server_type": "物理机",
  228. "os_type": "Linux",
  229. "os_version": "Unknown"
  230. }
  231. def get_or_create_server_id(server_ip):
  232. """获取或创建服务器ID"""
  233. try:
  234. conn = get_db_connection()
  235. cursor = conn.cursor()
  236. # 查询是否存在该IP的服务器记录
  237. cursor.execute("SELECT id, server_name FROM server_base_info WHERE server_ip = %s", (server_ip,))
  238. result = cursor.fetchone()
  239. if result:
  240. # 已存在,返回ID
  241. server_id = result[0]
  242. server_name = result[1]
  243. else:
  244. # 不存在,创建新记录
  245. server_id = id_generator.generate_id()
  246. base_info = get_server_base_info()
  247. server_name = base_info['server_name']
  248. cursor.execute(
  249. """INSERT INTO server_base_info
  250. (id, server_ip, server_name, server_type, os_type, os_version)
  251. VALUES (%s, %s, %s, %s, %s, %s)""",
  252. (server_id, server_ip, base_info['server_name'],
  253. base_info['server_type'], base_info['os_type'], base_info['os_version'])
  254. )
  255. conn.commit()
  256. print(f"Created new server base info: {base_info['server_name']} ({server_ip})")
  257. cursor.close()
  258. conn.close()
  259. return server_id, server_name
  260. except Exception as e:
  261. print(f"Error getting or creating server ID: {e}")
  262. return None
  263. def save_to_database(server_ip, interface_name, cpu, mem, disk_usage, disk_io, up, down, gpus, save_disk_usage=True):
  264. """保存监控数据到数据库
  265. Args:
  266. save_disk_usage: 是否保存磁盘使用数据,默认True。设为False可跳过磁盘使用数据的保存
  267. """
  268. conn = None
  269. try:
  270. from datetime import datetime
  271. # 格式化为字符串
  272. current_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  273. # 获取或创建服务器ID
  274. server_id, server_name = get_or_create_server_id(server_ip)
  275. if server_id is None:
  276. return False
  277. conn = get_db_connection()
  278. cursor = conn.cursor()
  279. # 插入CPU数据
  280. cpu_id = id_generator.generate_id()
  281. cursor.execute(
  282. "INSERT INTO server_cpu_monitor (id, server_id, server_ip, cpu_usage, cpu_core, server_name, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s)",
  283. (cpu_id, server_id, server_ip, cpu['usage'], cpu['cores'], server_name, current_time_str)
  284. )
  285. # 插入内存数据
  286. mem_id = id_generator.generate_id()
  287. mem_free = mem['total'] - mem['used']
  288. cursor.execute(
  289. "INSERT INTO server_memory_monitor (id, server_id, server_ip, mem_total, mem_used, mem_free, mem_usage, server_name, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
  290. (mem_id, server_id, server_ip, int(mem['total']), int(mem['used']), int(mem_free), mem['percent'], server_name, current_time_str)
  291. )
  292. # 插入磁盘使用数据
  293. if save_disk_usage:
  294. disk_id = id_generator.generate_id()
  295. cursor.execute(
  296. "INSERT INTO server_disk_monitor (id, server_id, server_ip, disk_total, disk_used, disk_free, disk_usage, server_name, disk_path, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
  297. (disk_id, server_id, server_ip, int(disk_usage['total']), int(disk_usage['used']), int(disk_usage['free']), disk_usage['percent'], server_name, "/opt", current_time_str)
  298. )
  299. # 插入磁盘IO数据
  300. io_id = id_generator.generate_id()
  301. cursor.execute(
  302. "INSERT INTO server_io_monitor (id, server_id, server_ip, io_read_mb, io_write_mb, io_read_iops, io_write_iops, server_name, disk_path, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
  303. (io_id, server_id, server_ip, disk_io['read_mb'], disk_io['write_mb'], disk_io['read_iops'], disk_io['write_iops'], server_name, "/opt", current_time_str)
  304. )
  305. # 插入网络上行数据
  306. upload_id = id_generator.generate_id()
  307. cursor.execute(
  308. "INSERT INTO server_net_upload_monitor (id, server_id, server_ip, net_card, upload_speed, server_name, upload_packets, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
  309. (upload_id, server_id, server_ip, interface_name, up, server_name, 0, current_time_str)
  310. )
  311. # 插入网络下行数据
  312. download_id = id_generator.generate_id()
  313. cursor.execute(
  314. "INSERT INTO server_net_download_monitor (id, server_id, server_ip, net_card, download_speed, server_name, download_packets, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
  315. (download_id, server_id, server_ip, interface_name, down, server_name, 0, current_time_str)
  316. )
  317. # 插入GPU数据
  318. gpu_count = 0
  319. for g in gpus:
  320. try:
  321. gpu_id = id_generator.generate_id()
  322. cursor.execute(
  323. "INSERT INTO server_gpu_monitor (id, server_id, server_ip, gpu_card_no, gpu_usage, gpu_mem_total, gpu_mem_used, gpu_temp, server_name, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
  324. (gpu_id, server_id, server_ip, g['id'], g['util'], int(g['mem_total']), int(g['mem_used']), g['temp'], server_name, current_time_str)
  325. )
  326. gpu_count += 1
  327. print(f"DEBUG: GPU{g['id']} data inserted successfully (ID: {gpu_id})")
  328. except Exception as gpu_error:
  329. print(f"Error inserting GPU{g['id']} data: {gpu_error}")
  330. print(f"GPU data: {g}")
  331. conn.commit()
  332. print(f"DEBUG: Transaction committed. {gpu_count} GPU records saved.")
  333. cursor.close()
  334. conn.close()
  335. # 调试信息:显示GPU数据保存情况
  336. if gpus and gpu_count == 0:
  337. print(f"WARNING: {len(gpus)} GPUs detected but 0 saved to database!")
  338. elif gpus:
  339. print(f"INFO: Successfully saved {gpu_count}/{len(gpus)} GPU records to database")
  340. return True
  341. except Exception as e:
  342. print(f"Database error: {e}")
  343. import traceback
  344. traceback.print_exc()
  345. if conn:
  346. try:
  347. conn.rollback()
  348. conn.close()
  349. except:
  350. pass
  351. return False
  352. def cleanup_old_data(days=7):
  353. """清理指定天数之前的监控数据
  354. Args:
  355. days: 保留最近多少天的数据,默认7天
  356. """
  357. try:
  358. from datetime import datetime, timedelta
  359. conn = get_db_connection()
  360. cursor = conn.cursor()
  361. # 计算截止日期
  362. cutoff_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d %H:%M:%S')
  363. # 需要清理的表列表
  364. tables = [
  365. 'server_cpu_monitor',
  366. 'server_memory_monitor',
  367. 'server_disk_monitor',
  368. 'server_io_monitor',
  369. 'server_net_upload_monitor',
  370. 'server_net_download_monitor',
  371. 'server_gpu_monitor'
  372. ]
  373. # 记录清理数据数量(便于调试)
  374. total_deleted = 0
  375. for table in tables:
  376. try:
  377. # 找到创建时间小于截止日期的数据并删除
  378. cursor.execute(f"DELETE FROM {table} WHERE create_time < %s", (cutoff_date,))
  379. deleted = cursor.rowcount
  380. total_deleted += deleted
  381. print(f"Cleaned {deleted} records from {table}")
  382. except Exception as e:
  383. print(f"Error cleaning {table}: {e}")
  384. # 关闭事务
  385. conn.commit()
  386. cursor.close()
  387. conn.close()
  388. print(f"Total cleaned: {total_deleted} records (older than {days} days)")
  389. return True
  390. except Exception as e:
  391. print(f"Cleanup error: {e}")
  392. return False
  393. def monitor(interval=5, disk_usage_interval=3600, cleanup_interval=86400):
  394. """系统资源监控主循环
  395. Args:
  396. interval: 常规监控数据采集间隔(秒),默认5秒
  397. disk_usage_interval: 磁盘使用数据更新间隔(秒),默认3600秒(1小时)
  398. cleanup_interval: 数据清理间隔(秒),默认86400秒(24小时)
  399. """
  400. interface_name, local_ip = get_local_ip()
  401. print("System resource monitor started (Ctrl+C to exit)")
  402. print(f"Network Interface: {interface_name} | Local IP: {local_ip}")
  403. print(f"Recording interval: {interval} seconds")
  404. print(f"Disk usage update interval: {disk_usage_interval} seconds ({disk_usage_interval/3600:.1f} hours)")
  405. print(f"Data cleanup interval: {cleanup_interval} seconds ({cleanup_interval/86400:.1f} days)\n")
  406. prev_net = psutil.net_io_counters()
  407. prev_disk = psutil.disk_io_counters()
  408. # 记录上次更新磁盘使用数据的时间
  409. last_disk_usage_update = time.time()
  410. # 记录上次清理数据的时间
  411. last_cleanup_time = time.time()
  412. while True:
  413. time.sleep(interval)
  414. current_time = time.time()
  415. ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  416. cpu = cpu_info()
  417. mem = memory_info()
  418. disk_usage = get_disk_usage()
  419. prev_disk, disk_io = disk_io_info(prev_disk, interval)
  420. prev_net, up, down = network_speed(prev_net, interval)
  421. gpus = gpu_info()
  422. # 判断是否需要更新磁盘使用数据
  423. should_save_disk_usage = (current_time - last_disk_usage_update) >= disk_usage_interval
  424. # 保存到数据库
  425. db_success = save_to_database(
  426. local_ip, interface_name, cpu, mem, disk_usage, disk_io, up, down, gpus,
  427. save_disk_usage=should_save_disk_usage
  428. )
  429. # 更新磁盘使用数据的时间戳
  430. if should_save_disk_usage and db_success:
  431. last_disk_usage_update = current_time
  432. disk_status = "Disk usage saved"
  433. else:
  434. disk_status = "Disk usage skipped"
  435. db_status = f"Saved to DB ({disk_status})" if db_success else "DB Error"
  436. # 判断是否需要清理旧数据
  437. if (current_time - last_cleanup_time) >= cleanup_interval:
  438. print("\n" + "=" * 80)
  439. print("Starting data cleanup...")
  440. cleanup_success = cleanup_old_data(days=7)
  441. if cleanup_success:
  442. last_cleanup_time = current_time
  443. print("Data cleanup completed")
  444. else:
  445. print("Data cleanup failed")
  446. print("=" * 80 + "\n")
  447. print("=" * 80)
  448. print(f"Time: {ts} | Interface: {interface_name} | IP: {local_ip} | {db_status}")
  449. print(f"CPU: {cpu['usage']}% | Cores: {cpu['cores']} | Load: {cpu['load']}")
  450. print(f"Memory: {mem['used']}MB / {mem['total']}MB ({mem['percent']}%)")
  451. print(f"Disk: {disk_usage['used']}GB / {disk_usage['total']}GB ({disk_usage['percent']}%)")
  452. print(f"Disk IO: Read {disk_io['read_mb']}MB/s | Write {disk_io['write_mb']}MB/s | "
  453. f"IOPS: R={disk_io['read_iops']} W={disk_io['write_iops']}")
  454. print(f"Network: Up {up} MB/s | Down {down} MB/s")
  455. if gpus:
  456. for g in gpus:
  457. print(
  458. f"GPU{g['id']}: {g['util']}% | "
  459. f"{g['mem_used']}/{g['mem_total']}MB | "
  460. f"{g['temp']}°C"
  461. )
  462. else:
  463. print("GPU: Not available")
  464. print("=" * 80)
  465. if __name__ == "__main__":
  466. monitor()