goheung/app/AI_modules/gpu_utils/gpu_config.py
2026-02-02 19:07:53 +09:00

206 lines
5.6 KiB
Python

"""
-------------------------------------------------------------------------
File: gpu_config.py
Description: 로컬 GPU 감지 및 설정 유틸리티
Author: 소지안 프로
Created: 2026-02-02
Last Modified: 2026-02-02
-------------------------------------------------------------------------
"""
import os
import subprocess
def check_gpu_status():
"""GPU 상태 확인 및 출력"""
print("=" * 50)
print("GPU Status Check")
print("=" * 50)
# NVIDIA GPU 확인 (nvidia-smi)
try:
result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0:
print("\n[NVIDIA GPU Detected]")
print(result.stdout)
else:
print("\n[NVIDIA GPU] Not found or driver not installed")
except FileNotFoundError:
print("\n[NVIDIA GPU] nvidia-smi not found")
except Exception as e:
print(f"\n[NVIDIA GPU] Error: {e}")
# TensorFlow GPU 확인
try:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print(f"\n[TensorFlow] GPU devices: {len(gpus)}")
for gpu in gpus:
print(f" - {gpu}")
except ImportError:
print("\n[TensorFlow] Not installed")
except Exception as e:
print(f"\n[TensorFlow] Error: {e}")
# PyTorch GPU 확인
try:
import torch
print(f"\n[PyTorch] CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f" - Device count: {torch.cuda.device_count()}")
print(f" - Current device: {torch.cuda.current_device()}")
print(f" - Device name: {torch.cuda.get_device_name(0)}")
except ImportError:
print("\n[PyTorch] Not installed")
except Exception as e:
print(f"\n[PyTorch] Error: {e}")
print("\n" + "=" * 50)
def get_gpu_info():
"""GPU 정보 딕셔너리로 반환"""
info = {
'nvidia_available': False,
'tensorflow_gpus': [],
'pytorch_cuda': False,
'gpu_name': None,
'gpu_memory': None,
}
# NVIDIA 정보
try:
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0:
info['nvidia_available'] = True
parts = result.stdout.strip().split(',')
if len(parts) >= 2:
info['gpu_name'] = parts[0].strip()
info['gpu_memory'] = parts[1].strip()
except:
pass
# TensorFlow
try:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
info['tensorflow_gpus'] = [str(gpu) for gpu in gpus]
except:
pass
# PyTorch
try:
import torch
info['pytorch_cuda'] = torch.cuda.is_available()
except:
pass
return info
def setup_tensorflow_gpu(memory_limit=None, allow_growth=True):
    """Configure TensorFlow's visible GPUs.

    Args:
        memory_limit: per-GPU memory cap in MB; None means no hard cap.
        allow_growth: when no cap is set, enable on-demand memory growth.

    Returns:
        The list of physical GPU devices that were configured, or an empty
        list when no GPU is present or configuration fails.
    """
    import tensorflow as tf

    gpus = tf.config.list_physical_devices('GPU')
    # Guard clause: nothing to configure without a GPU.
    if not gpus:
        print("[Warning] No GPU found for TensorFlow")
        return []

    try:
        for gpu in gpus:
            if memory_limit:
                # Hard cap: carve out a single logical device of fixed size.
                limit_cfg = [tf.config.LogicalDeviceConfiguration(memory_limit=memory_limit)]
                tf.config.set_logical_device_configuration(gpu, limit_cfg)
                print(f"[TensorFlow] GPU memory limited to {memory_limit}MB")
            elif allow_growth:
                # No cap: let TF grab memory incrementally instead of all at once.
                tf.config.experimental.set_memory_growth(gpu, True)
                print("[TensorFlow] GPU memory growth enabled")
        print(f"[TensorFlow] Configured {len(gpus)} GPU(s)")
        return gpus
    except RuntimeError as e:
        # Raised e.g. when GPUs were already initialized before configuration.
        print(f"[TensorFlow] GPU configuration error: {e}")
        return []
def setup_pytorch_gpu(device_id=0):
    """Select the PyTorch compute device.

    Args:
        device_id: index of the CUDA device to use when CUDA is available.

    Returns:
        A ``torch.device`` — ``cuda:<device_id>`` if CUDA is usable,
        otherwise the CPU device.
    """
    import torch

    # Guard clause: fall back to CPU when no CUDA runtime is usable.
    if not torch.cuda.is_available():
        print("[PyTorch] CUDA not available, using CPU")
        return torch.device('cpu')

    selected = torch.device(f'cuda:{device_id}')
    torch.cuda.set_device(device_id)
    print(f"[PyTorch] Using GPU: {torch.cuda.get_device_name(device_id)}")
    return selected
def limit_gpu_memory(fraction=0.5):
    """Enable dynamic GPU memory allocation for TensorFlow.

    NOTE(review): despite its name, this function does NOT enforce a hard
    fractional cap — it enables memory growth (allocate-on-demand) on every
    GPU plus the matching env var. A true fractional cap would require
    ``tf.config.set_logical_device_configuration`` with the device's total
    memory, which is not queried here. The original implementation silently
    ignored ``fraction`` while printing that it had been applied; the value
    is now at least validated.

    Args:
        fraction: intended share of total GPU memory, in (0.0, 1.0].

    Raises:
        ValueError: if ``fraction`` is outside (0.0, 1.0].
    """
    # Validate before importing TF so bad input fails fast and loudly
    # instead of being swallowed by the broad handler below.
    if not 0.0 < fraction <= 1.0:
        raise ValueError(f"fraction must be in (0.0, 1.0], got {fraction}")

    import tensorflow as tf
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            # Env-var equivalent of memory growth for processes spawned later.
            os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
            print(f"[TensorFlow] GPU memory fraction set to {fraction * 100}%")
        except Exception as e:
            print(f"[Error] {e}")
def set_visible_gpus(gpu_ids):
    """Restrict CUDA to a specific set of GPUs via CUDA_VISIBLE_DEVICES.

    Args:
        gpu_ids: iterable of GPU indices to expose (e.g. ``[0, 1]``).
    """
    # CUDA reads this env var at initialization time, so it must be set
    # before any CUDA context is created by TF/PyTorch.
    visible = ','.join(str(gid) for gid in gpu_ids)
    os.environ['CUDA_VISIBLE_DEVICES'] = visible
    print(f"[GPU] Visible devices set to: {gpu_ids}")