goheung/app/AI_modules/gpu_utils/remote_gpu.py
2026-02-02 19:07:53 +09:00

231 lines
7.4 KiB
Python

# Remote GPU Connection
"""
원격 GPU 서버 연결 유틸리티
지원:
- SSH를 통한 원격 GPU 서버 연결
- 파일 전송 (SCP)
- 원격 학습 실행
"""
import os
import subprocess
from pathlib import Path
class RemoteGPUClient:
    """Client for a remote GPU server, driven through the ``ssh``/``scp`` binaries.

    Every operation spawns a fresh ``ssh`` or ``scp`` subprocess; no
    persistent session is kept between calls.

    Example:
        client = RemoteGPUClient(
            host='192.168.1.100',
            username='user',
            key_path='~/.ssh/id_rsa'
        )
        client.connect()
        client.check_gpu()
        client.upload_file('local_data.zip', '/remote/path/')
        client.run_training('python train.py')
    """

    def __init__(self, host, username='root', port=22, key_path=None, password=None):
        """Store the connection settings; nothing is contacted yet.

        Args:
            host: Remote server IP address or host name.
            username: SSH user name.
            port: SSH port.
            key_path: Path to the SSH private key file (``~`` is expanded
                when the command line is built).
            password: Password for key-less logins. It is only stored on the
                instance; no method of this class passes it to ssh/scp.
        """
        self.host = host
        self.username = username
        self.port = port
        self.key_path = key_path
        self.password = password
        # Outcome of the most recent connect() call.
        self.connected = False

    def _identity_args(self):
        """Return the ``-i <key>`` arguments, or an empty list without a key."""
        if not self.key_path:
            return []
        return ['-i', os.path.expanduser(self.key_path)]

    def _remote_target(self, remote_path):
        """Return the ``user@host:path`` form that scp expects."""
        return f'{self.username}@{self.host}:{remote_path}'

    def _get_ssh_cmd(self):
        """Return the base ssh argument list (port, optional key, user@host)."""
        return [
            'ssh', '-p', str(self.port),
            *self._identity_args(),
            f'{self.username}@{self.host}',
        ]

    def _get_scp_cmd(self):
        """Return the base scp argument list (scp takes the port as ``-P``)."""
        return ['scp', '-P', str(self.port), *self._identity_args()]

    def connect(self):
        """Test the connection by running ``echo Connected`` on the server.

        Returns:
            True when ssh exits with status 0, otherwise False.
            ``self.connected`` always mirrors the returned value.
        """
        # Reset first so a failed re-check does not leave a stale True behind.
        self.connected = False
        try:
            result = subprocess.run(
                self._get_ssh_cmd() + ['echo', 'Connected'],
                capture_output=True, text=True, timeout=30,
            )
        except subprocess.TimeoutExpired:
            print("[Connection Failed] Timeout")
            return False
        except Exception as e:
            # Best-effort probe: report the failure instead of raising.
            print(f"[Connection Failed] {e}")
            return False

        if result.returncode != 0:
            print(f"[Connection Failed] {result.stderr}")
            return False

        self.connected = True
        print(f"[Connected] {self.username}@{self.host}:{self.port}")
        return True

    def check_gpu(self):
        """Run ``nvidia-smi`` on the server and print its output.

        Returns:
            The captured stdout on success, or None when the command fails,
            times out (30 s) or ssh cannot be started.
        """
        try:
            result = subprocess.run(
                self._get_ssh_cmd() + ['nvidia-smi'],
                capture_output=True, text=True, timeout=30,
            )
        except subprocess.TimeoutExpired:
            print("[GPU Check Failed] Timeout")
            return None
        except OSError as e:
            print(f"[GPU Check Failed] {e}")
            return None

        if result.returncode != 0:
            print(f"[GPU Check Failed] {result.stderr}")
            return None

        print("[Remote GPU Status]")
        print(result.stdout)
        return result.stdout

    def run_command(self, command):
        """Run a command string on the server and echo its output.

        No timeout is applied, so the call blocks until ssh exits.

        Args:
            command: Command line handed to ssh as a single argument.

        Returns:
            Tuple ``(success, stdout, stderr)`` where ``success`` is True
            when the exit status is 0.
        """
        result = subprocess.run(
            self._get_ssh_cmd() + [command], capture_output=True, text=True
        )
        print(f"[Remote Command] {command}")
        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print(f"[Error] {result.stderr}")
        return result.returncode == 0, result.stdout, result.stderr

    def upload_file(self, local_path, remote_path):
        """Upload a file with scp.

        Args:
            local_path: Source path on this machine.
            remote_path: Destination path on the server.

        Returns:
            True when scp exits with status 0, otherwise False.
        """
        cmd = self._get_scp_cmd() + [local_path, self._remote_target(remote_path)]
        print(f"[Uploading] {local_path} -> {remote_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"[Upload Failed] {result.stderr}")
            return False
        print("[Upload Complete]")
        return True

    def download_file(self, remote_path, local_path):
        """Download a file with scp.

        Args:
            remote_path: Source path on the server.
            local_path: Destination path on this machine.

        Returns:
            True when scp exits with status 0, otherwise False.
        """
        cmd = self._get_scp_cmd() + [self._remote_target(remote_path), local_path]
        print(f"[Downloading] {remote_path} -> {local_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"[Download Failed] {result.stderr}")
            return False
        print("[Download Complete]")
        return True

    def run_training(self, command, working_dir=None, background=False):
        """Run a training command on the server.

        ``command`` and ``working_dir`` are interpolated into the remote
        command line as-is (no quoting is applied).

        Args:
            command: Command to execute (e.g. 'python train.py').
            working_dir: Directory to ``cd`` into before running the command.
            background: When True, wrap the command in ``nohup ... &`` and
                redirect its output to ``training.log``.

        Returns:
            The ``(success, stdout, stderr)`` tuple from ``run_command``.
        """
        # nohup has to wrap the training command itself, not the `cd`:
        # `nohup cd <dir>` cannot change the directory of the shell that goes
        # on to run the command. So wrap first, then prepend the `cd`.
        if background:
            command = f'nohup {command} > training.log 2>&1 &'
        if working_dir:
            command = f'cd {working_dir} && {command}'
        return self.run_command(command)
class SSHGPURunner:
    """Runs GPU training inside a fixed project directory on a remote host.

    Wraps a ``RemoteGPUClient`` and adds project syncing (rsync), log
    inspection and result download relative to ``remote_project_dir``.

    Example:
        runner = SSHGPURunner(
            host='gpu-server.example.com',
            username='user',
            remote_project_dir='/home/user/projects/goheung'
        )
        # Sync the project, then run training
        runner.sync_project('./app')
        runner.run('python -m app.AI_modules.DeepLabV3.train')
        runner.download_results('./models/')
    """

    def __init__(self, host, username, remote_project_dir, port=22, key_path=None):
        """Create the underlying client and remember the remote project root.

        Args:
            host: Remote server IP address or host name.
            username: SSH user name.
            remote_project_dir: Project directory on the server; remote
                commands and paths are built relative to it.
            port: SSH port.
            key_path: Path to the SSH private key file.
        """
        self.client = RemoteGPUClient(host, username, port, key_path)
        self.remote_project_dir = remote_project_dir

    def connect(self):
        """Test the connection; returns the client's ``connect()`` result."""
        return self.client.connect()

    def _build_rsync_cmd(self, local_dir):
        """Return the rsync argument list that mirrors ``local_dir`` remotely."""
        # Build the ssh transport string up front instead of patching a list
        # slot by index afterwards.
        ssh_transport = f'ssh -p {self.client.port}'
        if self.client.key_path:
            # Expand '~' locally rather than relying on the ssh binary to do it.
            ssh_transport += f' -i {os.path.expanduser(self.client.key_path)}'
        destination = (
            f'{self.client.username}@{self.client.host}:{self.remote_project_dir}/'
        )
        return [
            'rsync', '-avz', '--progress',
            '-e', ssh_transport,
            f'{local_dir}/',
            destination,
        ]

    def sync_project(self, local_dir):
        """Sync a local project folder to the remote project directory (rsync).

        Args:
            local_dir: Local project directory; its contents are copied into
                ``remote_project_dir``.

        Returns:
            True when rsync exits with status 0, otherwise False.
        """
        cmd = self._build_rsync_cmd(local_dir)
        print(f"[Syncing] {local_dir} -> {self.remote_project_dir}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"[Sync Failed] {result.stderr}")
            return False
        print("[Sync Complete]")
        return True

    def run(self, command):
        """Run a training command from the remote project directory.

        Returns:
            Whatever ``client.run_training`` returns.
        """
        return self.client.run_training(command, working_dir=self.remote_project_dir)

    def run_background(self, command, log_file='training.log'):
        """Run a training command in the background from the project directory.

        Args:
            command: Command to execute.
            log_file: File that receives the command's stdout and stderr.

        Returns:
            Whatever ``client.run_training`` returns.
        """
        # The nohup wrapper is built here (not via run_training's background
        # flag) so that the log file name can be chosen by the caller.
        full_command = f'nohup {command} > {log_file} 2>&1 &'
        return self.client.run_training(full_command, working_dir=self.remote_project_dir)

    def check_training_log(self, log_file='training.log', lines=50):
        """Show the last ``lines`` lines of a log file in the project directory.

        Returns:
            Whatever ``client.run_command`` returns.
        """
        command = f'tail -n {lines} {self.remote_project_dir}/{log_file}'
        return self.client.run_command(command)

    def download_results(self, local_dir, remote_pattern='*.h5'):
        """Download files matching ``remote_pattern`` from the project directory.

        Args:
            local_dir: Local destination handed to scp.
            remote_pattern: File name or pattern, joined onto
                ``remote_project_dir``.

        Returns:
            Whatever ``client.download_file`` returns.
        """
        remote_path = f'{self.remote_project_dir}/{remote_pattern}'
        return self.client.download_file(remote_path, local_dir)

    def download_model(self, model_name, local_path):
        """Download one model file from ``<remote_project_dir>/models/``.

        Returns:
            Whatever ``client.download_file`` returns.
        """
        remote_path = f'{self.remote_project_dir}/models/{model_name}'
        return self.client.download_file(remote_path, local_path)