o
    ei=M                     @   sh  d dl Z d dlZd dlmZ d dlZd dlmZ de jfddZ	de jfddZ
de jfd	d
ZdeddfddZde jfddZde jfddZde jfddZdee fddZ				d,dedededB dedB dedB dedeeef fddZG dd  d ZG d!d" d"Z	d-d#eeB d$ee dB deeed"f B fd%d&Z	d.d'ed(ed)edefd*d+ZdS )/    N)Any)_get_device_indexreturnc               	   C   s   zdd l } tt| dd }W n# ttfy5   tjdkr.tdt	j
jd  d}ntd}Y nw |j|_|j|_|j|_|j|_|j|_|S )Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r	   lib r    [/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/cuda/_utils.py_get_hip_runtime_library   s   

r"   c                   C   s   t jdkr
tdS tdS )Nr   z
nvcuda.dllzlibcuda.so.1)r   r   r
   r   r    r    r    r!   _get_cuda_library"   s   


r#   c                   C      t jjrt S t S N)r   r   r   r"   r#   r    r    r    r!   _get_gpu_runtime_library*   s   r&   resultc                 C   sR   | dkrd S t  }t }|| t | |jd ur |j nd}td| )Nr   Unknown CUDA errorCUDA error: )r
   c_char_pr&   r   byrefvaluedecodeRuntimeError)r'   err_strlibcudaerror_messager    r    r!   _check_cuda2   s   r2   c               
   C   s   zdd l } tt| dd }W n0 ttfyB   tjdkr;d	dt
jjd dt
jjd g}td| d}ntd}Y nw |j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_ |S )	Nr   hiprtcr    0   r   zlibhiprtc.so)!r	   r
   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetCUBINSizehiprtcGetCodenvrtcGetCUBINhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r	   r   version_strr    r    r!   _get_hiprtc_library>   s.   


rM   c               	   C   sr   t tjjdd } tjdkrd|  dg}nd|  dg}|D ]}zt|W   S  t	y4   Y q!w t	d)	N.r   r   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r
   r   OSError)major_version
nvrtc_libslib_namer    r    r!   _get_nvrtc_library[   s   

rW   c                   C   r$   r%   )r   r   r   rM   rW   r    r    r    r!   _get_gpu_rtc_libraryn   s   rX   c                     s>   ddl m} m} dh  fdd|D }tjjr||  |S )z
    Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

    Returns:
        List of HIPCC/NVCC flags that can be safely used with NVRTC.
    r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexprc                    s   g | ]}| vr|qS r    r    .0flagnvrtc_unsupported_flagsr    r!   
<listcomp>   s    z1_get_gpu_rtc_compatible_flags.<locals>.<listcomp>)torch.utils.cpp_extensionrY   rZ   r   r   r   extend)rY   rZ   compatible_flagsr    r^   r!   _get_gpu_rtc_compatible_flagsw   s   

rd   Fkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc              	      s  ddl }t d dtddf fdd}| d}|du r8|j|j }	|jjr0|	j	 }n|	j
 |	j }g }
|jjrI|
d|   n
|
d	|   dd
lm} |d}|D ]}|
d|   q_|r}|D ]}|
d|   qp|rt|jjdk rtd|jj |du rg }|d |r|D ]
}|
|d qt }|
dd |D  t|
}tj| |
 }t }|t||| d ddd |d}||| |||}| krt }|t| t|j}|| t d|j!  t }|"|t| t|j}|#|| t }|$||t| |jdurO|j! }nd}%t| |j&|fS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC
        auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

    Returns:
        Tuple[bytes, str]: The compiled PTX code and mangled kernel name
    r   Nr'   r   c                    sL   |  kr$t  }| t | |jd ur|j nd}td| d S )Nr(   r)   )r
   r*   r9   r+   r,   r-   r.   )r'   r/   r1   NVRTC_SUCCESSlibnvrtcr    r!   check_nvrtc   s   

z#_nvrtc_compile.<locals>.check_nvrtcutf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrQ   z-Iz12.8zPCH requires CUDA 12.8+, got z--pchc                 S   s   g | ]}| d qS )ro   )encoder[   r    r    r!   r`      s    z"_nvrtc_compile.<locals>.<listcomp>z.cuzKernel compilation failed:
r4   )'
torch.cudarX   rP   rq   rQ   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendra   rp   r   AssertionErrorrd   rb   lenr
   r*   c_void_pr;   r+   rI   r?   c_size_trE   create_string_bufferr,   rG   r.   r-   rA   rC   rK   r=   raw)re   rf   rg   rh   ri   rj   r   rn   source_bytespropsoptionsrp   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsnum_optionsoptions_arrayprogc_kernel_namereslog_sizelogbinary_sizebinaryc_mangled_namemangled_namer    rk   r!   _nvrtc_compile   s   





r   c                   @   s2   e Zd ZdejddfddZdeddfdd	ZdS )
_CudaModulemoduler   Nc                 C   s   || _ i | _d S r%   )_module_kernels)selfr   r    r    r!   __init__"  s   
z_CudaModule.__init__name_CudaKernelc              
   C   s   || j v r
| j | S ddlm} | }t }zt|t|| j|	d t
|| j}|| j |< |W S  tyJ } z	td| d|d }~ww )Nr   )r&   ro   zNo kernel named 'z' in this module)r   torch.cuda._utilsr&   r
   r{   r2   r   r+   r   rq   r   r.   AttributeError)r   r   r&   r0   funckernelerrr    r    r!   __getattr__&  s$   


z_CudaModule.__getattr__)__name__
__module____qualname__r
   r{   r   r   r   r    r    r    r!   r   !  s    r   c                   @   s   e Zd ZdZdejdejddfddZ						dd
eeeef deeeef de	dB dede
dB ddfddZdeddfddZdS )r   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    r   r   r   Nc                 C   s   || _ || _d| _d S )Nr   )r   r   _max_shared_mem_bytes)r   r   r   r    r    r!   r   D  s   
z_CudaKernel.__init__   r   r   r   gridblockargs
shared_memstreamc                 C   s  ddl }|jj }|sg }g }g }	|D ]Y}
t|
|jr?|
js*|
jr&|
 s*t	dt
|
 }|| |	t
| qt|
trRt
|
}|	t
| qt|
tret
|
}|	t
| qtdt|
 t
jt|	  }t|	D ]\}}
t
|
t
j||< qz|du rddl}|j }|dkr| jdks|| jkr| jdkrdnd| j d}td	| d
| dt|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r   r6   ) r   rQ   _utilsr&   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr
   r{   data_ptrrx   r+   rP   c_intfloatc_double	TypeErrortyperz   	enumeratecastrr   current_streamr   r.   r2   r   r   _as_parameter_)r   r   r   r   r   r   r   r0   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgr    r    r!   __call__I  sl   






z_CudaKernel.__call__shared_mem_bytesc                 C   s   |dk r	|| _ d S t }tj }tjjr|jdkrdnd}nt|dd}||kr4t	d| d| dd	}t
|| j|| || _ d S )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r&   r   rQ   rs   r   r   ru   getattrr.   r2   r   r   )r   r   r0   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizer    r    r!   set_shared_memory_config  s4   

z$_CudaKernel.set_shared_memory_config)r   r   Nr   N)r   r   r   __doc__r
   r{   r   tuplerP   listr   r   r   r    r    r    r!   r   ?  s,    
ar   ptxkernel_namesc           	   	   C   s   ddl }t }t| tr| d} t }|j }| t	|
t||  W d   n1 s2w   Y  |s=t|S i }|D ]}t }t	|t|||d t||||< qA|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nro   )rr   r&   r   r   rq   r
   r{   rQ   r   r2   r   r+   r   r   r   )	r   r   r   r0   r   r   kernelsr   r   r    r    r!   _cuda_load_module  s*   


r   deviceoptional	allow_cpuc                 C   s   t | tr| S t | trt| } t | tjr2|r&| jdvr%td|  n| jdkr2td|  tj sAt | tj	jrA| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )rQ   cpuz(Expected a cuda or cpu device, but got: rQ   z!Expected a cuda device, but got: )r   rP   r   r   r   r   r   jitis_scriptingrQ   idx_torch_get_device_index)r   r   r   r    r    r!   r     s   





r   )NNNFr%   )FF)r
   r   typingr   r   torch._utilsr   r   r   r"   r#   r&   rP   r2   rM   rW   rX   r   r   rd   boolr   bytesr   r   r   dictr   r    r    r    r!   <module>   sl    	

  

1