231 lines
7.4 KiB
Python
231 lines
7.4 KiB
Python
# Remote GPU Connection
"""
Remote GPU server connection utilities.

Supports:
- Connecting to a remote GPU server over SSH
- File transfer (SCP)
- Running training remotely
"""
|
|
import os
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
|
|
class RemoteGPUClient:
    """Client for a remote GPU server, driven through the local ``ssh``/``scp`` binaries.

    Every operation spawns a fresh ``ssh`` or ``scp`` subprocess; no
    persistent connection is kept.

    Example:
        client = RemoteGPUClient(
            host='192.168.1.100',
            username='user',
            key_path='~/.ssh/id_rsa'
        )
        client.connect()
        client.check_gpu()
        client.upload_file('local_data.zip', '/remote/path/')
        client.run_training('python train.py')
    """

    def __init__(self, host, username='root', port=22, key_path=None, password=None):
        """Store the connection settings; nothing is executed here.

        Args:
            host: IP address or host name of the remote server.
            username: SSH user name.
            port: SSH port.
            key_path: Path to the SSH private key file (``~`` is expanded
                when commands are built).
            password: Password for when no key is available. It is only
                stored: no method of this class passes it to ssh/scp.
        """
        self.host = host
        self.username = username
        self.port = port
        self.key_path = key_path
        self.password = password
        self.connected = False

    def _get_ssh_cmd(self):
        """Return the base ``ssh`` argv (port, optional identity file, user@host)."""
        cmd = ['ssh', '-p', str(self.port)]
        if self.key_path:
            cmd.extend(['-i', os.path.expanduser(self.key_path)])
        cmd.append(f'{self.username}@{self.host}')
        return cmd

    def _get_scp_cmd(self):
        """Return the base ``scp`` argv (port, optional identity file), without paths."""
        # scp takes the port as upper-case -P, unlike ssh's -p.
        cmd = ['scp', '-P', str(self.port)]
        if self.key_path:
            cmd.extend(['-i', os.path.expanduser(self.key_path)])
        return cmd

    def connect(self):
        """Test the connection by running ``echo Connected`` on the remote host.

        Updates ``self.connected`` to reflect the outcome of this attempt.

        Returns:
            True if the remote command exited with status 0, False on a
            non-zero exit, a 30-second timeout, or any other error.
        """
        # Reset first so a stale True from an earlier success cannot survive
        # a failed re-check.
        self.connected = False
        cmd = self._get_ssh_cmd() + ['echo', 'Connected']
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        except subprocess.TimeoutExpired:
            print("[Connection Failed] Timeout")
            return False
        except Exception as e:
            # Deliberately broad: this probe reports failure and never raises.
            print(f"[Connection Failed] {e}")
            return False

        if result.returncode != 0:
            print(f"[Connection Failed] {result.stderr}")
            return False

        self.connected = True
        print(f"[Connected] {self.username}@{self.host}:{self.port}")
        return True

    def check_gpu(self):
        """Run ``nvidia-smi`` on the remote server and print its output.

        Returns:
            The stdout of ``nvidia-smi`` on success, otherwise None (non-zero
            exit status, 30-second timeout, or failure to launch ``ssh``).
        """
        cmd = self._get_ssh_cmd() + ['nvidia-smi']
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        except subprocess.TimeoutExpired:
            # The timeout is set above, so it must be handled here as well.
            print("[GPU Check Failed] Timeout")
            return None
        except OSError as e:
            print(f"[GPU Check Failed] {e}")
            return None

        if result.returncode != 0:
            print(f"[GPU Check Failed] {result.stderr}")
            return None

        print("[Remote GPU Status]")
        print(result.stdout)
        return result.stdout

    def run_command(self, command):
        """Run a command string on the remote host and echo its output.

        The string is passed to ssh as a single argument. No timeout is
        applied, so the call blocks until ssh exits.

        Args:
            command: Command line to execute remotely.

        Returns:
            Tuple ``(success, stdout, stderr)`` where ``success`` is True when
            the exit status is 0.
        """
        cmd = self._get_ssh_cmd() + [command]
        result = subprocess.run(cmd, capture_output=True, text=True)

        print(f"[Remote Command] {command}")
        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print(f"[Error] {result.stderr}")

        return result.returncode == 0, result.stdout, result.stderr

    def upload_file(self, local_path, remote_path):
        """Upload a file to the remote host with scp.

        Args:
            local_path: Source path on the local machine.
            remote_path: Destination path on the remote host.

        Returns:
            True if scp exited with status 0, otherwise False.
        """
        cmd = self._get_scp_cmd()
        cmd.extend([local_path, f'{self.username}@{self.host}:{remote_path}'])

        print(f"[Uploading] {local_path} -> {remote_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"[Upload Failed] {result.stderr}")
            return False

        print("[Upload Complete]")
        return True

    def download_file(self, remote_path, local_path):
        """Download a file from the remote host with scp.

        Args:
            remote_path: Source path on the remote host.
            local_path: Destination path on the local machine.

        Returns:
            True if scp exited with status 0, otherwise False.
        """
        cmd = self._get_scp_cmd()
        cmd.extend([f'{self.username}@{self.host}:{remote_path}', local_path])

        print(f"[Downloading] {remote_path} -> {local_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"[Download Failed] {result.stderr}")
            return False

        print("[Download Complete]")
        return True

    def run_training(self, command, working_dir=None, background=False):
        """Run a training command on the remote host.

        Args:
            command: Command to execute (e.g. 'python train.py').
            working_dir: Remote directory to ``cd`` into before running.
            background: If True, wrap the command in ``nohup ... &`` with
                output redirected to ``training.log``.

        Returns:
            The ``(success, stdout, stderr)`` tuple from ``run_command``.
        """
        # Apply nohup to the training command itself *before* adding the cd
        # prefix. Wrapping the already-prefixed string would yield
        # "nohup cd DIR && ...": nohup cannot exec the cd builtin, so the
        # && chain would stop and the training would never start.
        if background:
            command = f'nohup {command} > training.log 2>&1 &'

        if working_dir:
            command = f'cd {working_dir} && {command}'

        return self.run_command(command)
|
|
|
|
|
|
class SSHGPURunner:
    """Runs GPU training in a remote project directory over SSH.

    Wraps a ``RemoteGPUClient`` and adds project-level helpers: rsync-based
    upload, running commands inside the remote project directory, tailing a
    log, and downloading results.

    Example:
        runner = SSHGPURunner(
            host='gpu-server.example.com',
            username='user',
            remote_project_dir='/home/user/projects/goheung'
        )

        # Sync the project, run training, fetch results
        runner.sync_project('./app')
        runner.run('python -m app.AI_modules.DeepLabV3.train')
        runner.download_results('./models/')
    """

    def __init__(self, host, username, remote_project_dir, port=22, key_path=None):
        """Create the underlying client and remember the remote project directory.

        Args:
            host: IP address or host name of the remote server.
            username: SSH user name.
            remote_project_dir: Project directory on the remote host.
            port: SSH port.
            key_path: Path to the SSH private key file.
        """
        self.client = RemoteGPUClient(host, username, port, key_path)
        self.remote_project_dir = remote_project_dir

    def connect(self):
        """Test the connection; returns the client's ``connect()`` result."""
        return self.client.connect()

    def _build_rsync_cmd(self, local_dir):
        """Return the rsync argv that copies ``local_dir`` into the remote project dir."""
        # Build the "-e" transport string up front instead of patching a list
        # slot by index afterwards.
        ssh_transport = f'ssh -p {self.client.port}'
        if self.client.key_path:
            # Expand "~" locally, as the client's ssh/scp command builders do.
            ssh_transport += f' -i {os.path.expanduser(self.client.key_path)}'

        return [
            'rsync', '-avz', '--progress',
            '-e', ssh_transport,
            # Trailing slashes: copy the directory's contents, not the
            # directory itself.
            f'{local_dir}/',
            f'{self.client.username}@{self.client.host}:{self.remote_project_dir}/',
        ]

    def sync_project(self, local_dir):
        """Synchronise a local project folder to the remote project directory (rsync).

        Args:
            local_dir: Local project directory.

        Returns:
            True if rsync exited with status 0, otherwise False.
        """
        cmd = self._build_rsync_cmd(local_dir)

        print(f"[Syncing] {local_dir} -> {self.remote_project_dir}")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"[Sync Failed] {result.stderr}")
            return False

        print("[Sync Complete]")
        return True

    def run(self, command):
        """Run a training command inside the remote project directory."""
        return self.client.run_training(command, working_dir=self.remote_project_dir)

    def run_background(self, command, log_file='training.log'):
        """Run a training command in the background inside the remote project directory.

        Args:
            command: Command to execute.
            log_file: File that receives the command's stdout and stderr.
        """
        # nohup is applied here, so the client's own background flag stays off
        # and the command is not wrapped twice.
        full_command = f'nohup {command} > {log_file} 2>&1 &'
        return self.client.run_training(full_command, working_dir=self.remote_project_dir)

    def check_training_log(self, log_file='training.log', lines=50):
        """Show the last ``lines`` lines of a log file in the remote project directory."""
        command = f'tail -n {lines} {self.remote_project_dir}/{log_file}'
        return self.client.run_command(command)

    def download_results(self, local_dir, remote_pattern='*.h5'):
        """Download files matching ``remote_pattern`` from the remote project directory.

        Args:
            local_dir: Local destination.
            remote_pattern: Name or glob, relative to the remote project directory.
        """
        remote_path = f'{self.remote_project_dir}/{remote_pattern}'
        return self.client.download_file(remote_path, local_dir)

    def download_model(self, model_name, local_path):
        """Download one model file from ``<remote_project_dir>/models/``."""
        remote_path = f'{self.remote_project_dir}/models/{model_name}'
        return self.client.download_file(remote_path, local_path)
|