"""UBER PROTOTYPE!!!"""

from __future__ import annotations

import importlib
from dataclasses import dataclass
from functools import cache
from typing import Any, TYPE_CHECKING

import torch
from typing_extensions import TypeVarTuple, Unpack

from . import _registry

if TYPE_CHECKING:
    from types import ModuleType

    from torch.library import Library

__all__ = ["register_flash_attention_fa4"]

# Dotted path of the module providing the FA4 kernels; set on registration.
_FA4_MODULE_PATH: str | None = None


@dataclass
class _FA4Handle:
    library: Library | None

    def remove(self) -> None:
        # Dropping the Library reference tears down the kernel overrides.
        self.library = None


@cache
def _get_device_major(device: torch.device) -> int:
    major, _ = torch.cuda.get_device_capability(device)
    return major


def register_flash_attention_fa4(
    module_path: str = "flash_attn.cute.interface",
) -> _FA4Handle:
    """
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
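
    Example (illustrative sketch; assumes an installed ``flash_attn`` build
    that exposes ``_flash_attn_fwd``/``_flash_attn_bwd`` at ``module_path``
    and an SM90/SM100 GPU):

        >>> handle = register_flash_attention_fa4()
        >>> # aten flash-attention ops now dispatch to FA4 on supported GPUs
        >>> handle.remove()  # drop the registration handle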
    """
    global _FA4_MODULE_PATH

    # Import eagerly so an unusable module path fails here, not at dispatch time.
    _fa4_import_module(module_path)
    _FA4_MODULE_PATH = module_path
    return _FA4Handle(_fa4_register_kernels())


@cache
def _fa4_import_module(module_path: str) -> ModuleType:
    module = importlib.import_module(module_path)
    if not (hasattr(module, "_flash_attn_fwd") and hasattr(module, "_flash_attn_bwd")):
        raise RuntimeError(f"Module '{module_path}' does not expose FA4 kernels")
    return module


def _fa4_register_kernels() -> Library:
    # Override the CUDA flash-attention kernels in the aten namespace.
    lib = torch.library.Library("aten", "IMPL", "CUDA")
    lib.impl("_flash_attention_forward", _fa4_flash_attention_forward_impl)
    lib.impl("_flash_attention_backward", _fa4_flash_attention_backward_impl)
    lib.impl(
        "_scaled_dot_product_flash_attention",
        _fa4_scaled_dot_product_flash_attention_forward_impl,
    )
    lib.impl(
        "_scaled_dot_product_flash_attention_backward",
        _fa4_scaled_dot_product_flash_attention_backward_impl,
    )
    return lib


def _fa4_common_support_error(
    query: torch.Tensor,
    tensors: tuple[torch.Tensor, ...],
    cum_seq_q: torch.Tensor | None,
    require_fp32: tuple[tuple[str, torch.Tensor], ...] = (),
) -> str | None:
    if not all(t.is_cuda for t in tensors):
        return "inputs must be CUDA tensors"
    if len({t.device for t in tensors}) != 1:
        return "inputs must share device"
    if query.dtype not in (torch.float16, torch.bfloat16):
        return "query dtype must be float16 or bfloat16"
    for name, tensor in require_fp32:
        if tensor.dtype != torch.float32:
            return f"{name} dtype must be float32"
    if cum_seq_q is None and query.dim() != 4:
        return "dense query must be 4D"
    if cum_seq_q is not None and query.dim() != 3:
        return "ragged query must be 3D"
    if not torch.cuda.is_available():
        return "CUDA not available"
    if _get_device_major(query.device) not in (9, 10):
        return "FA4 requires compute capability 9.0 or 10.0"
    return None


def _fa4_forward_support_error(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    dropout_p: float,
    return_debug_mask: bool,
    alibi_slopes: torch.Tensor | None,
    seqused_k: torch.Tensor | None,
    cum_seq_q: torch.Tensor | None,
) -> str | None:
    if dropout_p != 0.0:
        return "dropout_p must be 0"
    if return_debug_mask:
        return "return_debug_mask must be False"
    if alibi_slopes is not None:
        return "alibi_slopes not supported"
    if seqused_k is not None:
        if seqused_k.dtype != torch.int32:
            return "seqused_k must be int32"
        if not seqused_k.is_cuda:
            return "seqused_k must be CUDA"
    error = _fa4_common_support_error(query, (query, key, value), cum_seq_q)
    if error is not None:
        if error == "inputs must share device":
            return "query, key, value must be on same device"
        return error
    return None


def _fa4_backward_support_error(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    logsumexp: torch.Tensor,
    dropout_p: float,
    cum_seq_q: torch.Tensor | None,
    window_size_left: int | None,
    window_size_right: int | None,
) -> str | None:
    if dropout_p != 0.0:
        return "dropout_p must be 0"
    if window_size_left is not None or window_size_right is not None:
        return "windowed attention not supported"
    error = _fa4_common_support_error(
        grad_out,
        (grad_out, query, key, value, out, logsumexp),
        cum_seq_q,
        require_fp32=(("logsumexp", logsumexp),),
    )
    if error is not None:
        return error
    return None


Ts = TypeVarTuple("Ts")


def _transpose_dense(*tensors: Unpack[Ts]) -> tuple[Unpack[Ts]]:
    # Swap the head and sequence dims: (B, H, S, D) <-> (B, S, H, D).
    return tuple(t.transpose(1, 2) for t in tensors)


def _fa4_run_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    cu_seq_q: torch.Tensor | None,
    cu_seq_k: torch.Tensor | None,
    scale: float | None,
    is_causal: bool,
    window_size_left: int | None,
    window_size_right: int | None,
    seqused_k: torch.Tensor | None,
    out: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    if _FA4_MODULE_PATH is None:
        raise RuntimeError("FA4 not registered")
    module = _fa4_import_module(_FA4_MODULE_PATH)
    kwargs: dict[str, Any] = {
        "softmax_scale": scale,
        "causal": is_causal,
        "window_size_left": window_size_left,
        "window_size_right": window_size_right,
        "return_lse": True,
        "cu_seqlens_q": cu_seq_q,
        "cu_seqlens_k": cu_seq_k,
        "seqused_k": seqused_k.contiguous() if seqused_k is not None else None,
    }
    if out is not None:
        kwargs["out"] = out
    out, lse = module._flash_attn_fwd(query, key, value, **kwargs)
    return out, lse


def _fa4_run_backward(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    logsumexp: torch.Tensor,
    cu_seq_q: torch.Tensor | None,
    cu_seq_k: torch.Tensor | None,
    scale: float | None,
    is_causal: bool,
    deterministic: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    if _FA4_MODULE_PATH is None:
        raise RuntimeError("FA4 not registered")
    module = _fa4_import_module(_FA4_MODULE_PATH)
    dq, dk, dv = module._flash_attn_bwd(
        grad_out,
        query,
        key,
        value,
        out,
        logsumexp,
        softmax_scale=scale,
        causal=is_causal,
        cu_seqlens_q=cu_seq_q,
        cu_seqlens_k=cu_seq_k,
        deterministic=deterministic,
    )
    return dq, dk, dv


def _fa4_flash_attention_forward_impl(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    cum_seq_q: torch.Tensor | None,
    cum_seq_k: torch.Tensor | None,
    max_q: int,
    max_k: int,
    dropout_p: float,
    is_causal: bool,
    return_debug_mask: bool,
    *,
    scale: float | None = None,
    window_size_left: int | None = None,
    window_size_right: int | None = None,
    seqused_k: torch.Tensor | None = None,
    alibi_slopes: torch.Tensor | None = None,
    out: torch.Tensor | None = None,
):
    error = _fa4_forward_support_error(
        query, key, value, dropout_p, return_debug_mask, alibi_slopes, seqused_k, cum_seq_q
    )
    if error is not None:
        raise RuntimeError(f"FA4 flash_attention forward unsupported: {error}")
    out, lse = _fa4_run_forward(
        query,
        key,
        value,
        cum_seq_q,
        cum_seq_k,
        scale,
        is_causal,
        window_size_left,
        window_size_right,
        seqused_k,
        out,
    )
    # Dropout is rejected above, so the RNG outputs are placeholders.
    rng_state = torch.zeros(2, dtype=torch.uint64, device=query.device)
    philox_offset = torch.zeros(2, dtype=torch.uint64, device=query.device)
    debug_mask = torch.empty(0, dtype=query.dtype, device=query.device)
    return out, lse, rng_state, philox_offset, debug_mask


def _fa4_flash_attention_backward_impl(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    logsumexp: torch.Tensor,
    cum_seq_q: torch.Tensor | None,
    cum_seq_k: torch.Tensor | None,
    max_q: int,
    max_k: int,
    dropout_p: float,
    is_causal: bool,
    philox_seed: torch.Tensor,
    philox_offset: torch.Tensor,
    *unused: Any,
    scale: float | None = None,
    window_size_left: int | None = None,
    window_size_right: int | None = None,
):
    error = _fa4_backward_support_error(
        grad_out,
        query,
        key,
        value,
        out,
        logsumexp,
        dropout_p,
        cum_seq_q,
        window_size_left,
        window_size_right,
    )
    if error is not None:
        raise RuntimeError(f"FA4 flash_attention backward unsupported: {error}")
    deterministic = torch.are_deterministic_algorithms_enabled()
    dq, dk, dv = _fa4_run_backward(
        grad_out,
        query,
        key,
        value,
        out,
        logsumexp,
        cum_seq_q,
        cum_seq_k,
        scale,
        is_causal,
        deterministic,
    )
    return dq, dk, dv


def _fa4_scaled_dot_product_flash_attention_forward_impl(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    return_debug_mask: bool = False,
    *,
    scale: float | None = None,
):
    error = _fa4_forward_support_error(
        query, key, value, dropout_p, return_debug_mask, None, None, None
    )
    if error is not None:
        raise RuntimeError(f"FA4 SDPA forward unsupported: {error}")
    # SDPA supplies (B, H, S, D); the FA4 kernels expect (B, S, H, D).
    q, k, v = _transpose_dense(query, key, value)
    out_bhsd = torch.empty_like(query)
    out_bshd = out_bhsd.transpose(1, 2)
    max_q_flash = q.size(1)
    max_k_flash = k.size(1)
    _, logsumexp, rng_state, philox_offset, debug_mask = _fa4_flash_attention_forward_impl(
        q,
        k,
        v,
        None,
        None,
        max_q_flash,
        max_k_flash,
        dropout_p,
        is_causal,
        return_debug_mask,
        scale=scale,
        out=out_bshd,
    )
    max_q = query.size(2)
    max_k = key.size(2)
    return (
        out_bhsd,
        logsumexp,
        None,
        None,
        max_q,
        max_k,
        rng_state,
        philox_offset,
        debug_mask,
    )


def _fa4_scaled_dot_product_flash_attention_backward_impl(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    logsumexp: torch.Tensor,
    cum_seq_q: torch.Tensor | None,
    cum_seq_k: torch.Tensor | None,
    max_q: int,
    max_k: int,
    dropout_p: float,
    is_causal: bool,
    philox_seed: torch.Tensor,
    philox_offset: torch.Tensor,
    *,
    scale: float | None = None,
):
    error = _fa4_backward_support_error(
        grad_out, query, key, value, out, logsumexp, dropout_p, None, None, None
    )
    if error is not None:
        raise RuntimeError(f"FA4 SDPA backward unsupported: {error}")
    go, q, k, v, o = _transpose_dense(grad_out, query, key, value, out)
    max_q_flash = q.size(1)
    max_k_flash = k.size(1)
    dq, dk, dv = _fa4_flash_attention_backward_impl(
        go,
        q,
        k,
        v,
        o,
        logsumexp,
        None,
        None,
        max_q_flash,
        max_k_flash,
        dropout_p,
        is_causal,
        philox_seed,
        philox_offset,
        scale=scale,
    )
    dq, dk, dv = _transpose_dense(dq, dk, dv)
    return dq, dk, dv


_registry.register_flash_attention_impl(
    "FA4", register_fn=register_flash_attention_fa4
)