"""
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
"""

from __future__ import annotations

import importlib
import warnings
from dataclasses import dataclass
from functools import cache
from typing import TYPE_CHECKING

# TypeVarTuple/Unpack are taken from typing_extensions so the module also runs on
# Python < 3.11; the exact import source is not recoverable from this build.
from typing_extensions import TypeVarTuple, Unpack

import torch
from torch.library import Library

from . import _registry

if TYPE_CHECKING:
    from collections.abc import Callable

__all__ = ["register_flash_attention_fa3"]

# Handles to the FA3 CUDA kernels; populated by _fa3_import_module().
_FA3_CUDA_FWD: Callable | None = None
_FA3_CUDA_BWD: Callable | None = None


@dataclass
class _FA3Handle:
    library: Library | None

    def remove(self) -> None:
        self.library = None
        torch._C._set_sdp_use_fa3(False)


@cache
def _get_device_major(device: torch.device) -> int:
    major, _ = torch.cuda.get_device_capability(device)
    return major


def register_flash_attention_fa3(module_path: str = "flash_attn_interface") -> _FA3Handle:
    """
    Register FA3 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA3 implementation.
    """
    _fa3_import_module(module_path)
    torch._C._set_sdp_use_fa3(True)
    return _FA3Handle(_fa3_register_kernels())


def _fa3_import_module(module_path: str) -> None:
    global _FA3_CUDA_FWD, _FA3_CUDA_BWD
    importlib.import_module(module_path)
    if not hasattr(torch.ops, "flash_attn_3"):
        raise RuntimeError(f"Module '{module_path}' does not expose FA3 kernels")
    if not hasattr(torch.ops.flash_attn_3, "fwd"):
        raise RuntimeError(f"Module '{module_path}' does not expose FA3 forward kernels")
    if not hasattr(torch.ops.flash_attn_3, "bwd"):
        raise RuntimeError(f"Module '{module_path}' does not expose FA3 backward kernels")
    _FA3_CUDA_FWD = torch.ops.flash_attn_3.fwd
    _FA3_CUDA_BWD = torch.ops.flash_attn_3.bwd


def _fa3_register_kernels() -> Library:
    lib = Library("aten", "IMPL", "CUDA")
    lib.impl("_flash_attention_forward.quantized", _fa3_flash_attention_forward_impl)
    lib.impl(
        "_scaled_dot_product_flash_attention.quantized",
        _fa3_scaled_dot_product_flash_attention_forward_impl,
    )
    lib.impl("_flash_attention_forward", _fa3_flash_attention_forward_impl_default)
    lib.impl(
        "_scaled_dot_product_flash_attention",
        _fa3_scaled_dot_product_flash_attention_forward_impl_default,
    )
    lib.impl("_flash_attention_backward", _fa3_flash_attention_backward_impl)
    lib.impl(
        "_scaled_dot_product_flash_attention_backward",
        _fa3_scaled_dot_product_flash_attention_backward_impl,
    )
    return lib


def _fa3_common_support_error(
    query: torch.Tensor, tensors: tuple[torch.Tensor, ...], dropout_p: float,
    cum_seq_q: torch.Tensor | None, q_descale: torch.Tensor | None,
    k_descale: torch.Tensor | None, v_descale: torch.Tensor | None,
) -> str | None:
    if dropout_p != 0.0:
        return "dropout_p must be 0"
    if not all(t.is_cuda for t in tensors):
        return "inputs must be CUDA tensors"
    if len({t.device for t in tensors}) != 1:
        return "inputs must share device"
    if query.dtype == torch.float8_e4m3fn and (
        q_descale is None or k_descale is None or v_descale is None
    ):
        warnings.warn(
            "When using SDPA with fp8, descale tensor should always be used for "
            "accurate dequantization. Please use "
            "_scaled_dot_product_attention_quantized and provide the descale tensors.",
            UserWarning,
        )
    if cum_seq_q is None and query.dim() != 4:
        return "dense query must be 4D"
    if cum_seq_q is not None and query.dim() != 3:
        return "ragged query must be 3D"
    if not torch.cuda.is_available():
        return "CUDA not available"
    if _get_device_major(query.device) != 9:
        return "FA3 requires compute capability 9.0"
    return None


def _fa3_forward_support_error(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, dropout_p: float,
    return_debug_mask: bool, alibi_slopes: torch.Tensor | None,
    seqused_k: torch.Tensor | None, cum_seq_q: torch.Tensor | None,
    q_descale: torch.Tensor | None, k_descale: torch.Tensor | None,
    v_descale: torch.Tensor | None,
) -> str | None:
    if return_debug_mask:
        return "return_debug_mask must be False"
    if alibi_slopes is not None:
        return "alibi_slopes not supported"
    if seqused_k is not None:
        if seqused_k.dtype != torch.int32:
            return "seqused_k must be int32"
        if not seqused_k.is_cuda:
            return "seqused_k must be CUDA"
    supported_dtypes = (torch.float8_e4m3fn, torch.float16, torch.bfloat16)
    if not all(t.dtype in supported_dtypes for t in (query, key, value)):
        return f"inputs must be one of {supported_dtypes}"
    if len({t.dtype for t in (query, key, value)}) != 1:
        return "all inputs must have the same dtype"
    error = _fa3_common_support_error(
        query, (query, key, value), dropout_p, cum_seq_q, q_descale, k_descale, v_descale
    )
    if error is not None:
        # The generic shared-device failure is reported with a more specific
        # message here (reconstructed mapping).
        if error == "inputs must share device":
            return "query, key, value must be on same device"
        return error
    return None


def _fa3_backward_support_error(
    grad_out: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
    value: torch.Tensor, out: torch.Tensor, logsumexp: torch.Tensor,
    dropout_p: float, cum_seq_q: torch.Tensor | None,
    window_size_left: int | None, window_size_right: int | None,
) -> str | None:
    if query.dtype == torch.float8_e4m3fn:
        return "FA3 backward does not support fp8 - use inference only (torch.no_grad())"
    if logsumexp.dtype != torch.float32:
        return "logsumexp dtype must be float32"
    supported_dtypes = (torch.float16, torch.bfloat16)
    if not all(t.dtype in supported_dtypes for t in (grad_out, query, key, value, out)):
        return f"inputs must be one of {supported_dtypes}"
    if len({t.dtype for t in (grad_out, query, key, value, out)}) != 1:
        return "all inputs must have the same dtype"
    error = _fa3_common_support_error(
        query, (grad_out, query, key, value, out, logsumexp), dropout_p, cum_seq_q,
        None, None, None,
    )
    if error is not None:
        return error
    return None


Ts = TypeVarTuple("Ts")


def _transpose_dense(*tensors: Unpack[Ts]) -> tuple[Unpack[Ts]]:
    # SDPA uses (batch, heads, seq, head_dim); the FA3 kernels expect
    # (batch, seq, heads, head_dim).
    return tuple(t.transpose(1, 2) for t in tensors)


def _maybe_contiguous(x: torch.Tensor | None) -> torch.Tensor | None:
    """Ensure tensor is contiguous in the last dimension."""
    if x is not None and x.stride(-1) != 1:
        return x.contiguous()
    return x


def _fa3_run_forward(
    query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, scale, is_causal,
    window_size_left, window_size_right, seqused_k, out=None,
    q_descale=None, k_descale=None, v_descale=None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Run the FA3 forward pass by calling the C++ kernel directly.
    """
    if _FA3_CUDA_FWD is None:
        raise RuntimeError("FA3 not registered")
    q = _maybe_contiguous(query)
    k = _maybe_contiguous(key)
    # fp8 value tensors get a stricter contiguity treatment in the original build.
    if value.dtype == torch.float8_e4m3fn and value.stride(-1) != 1:
        v = value.contiguous()
    else:
        v = _maybe_contiguous(value)
    cu_seqlens_q = _maybe_contiguous(cu_seq_q)
    cu_seqlens_k = _maybe_contiguous(cu_seq_k)
    seqused_k = _maybe_contiguous(seqused_k)
    # NOTE: the exact positional argument list of torch.ops.flash_attn_3.fwd
    # (q/k/v, cu_seqlens/seqused_k, descale tensors, softmax scale, is_causal,
    # window sizes defaulting to -1, torch.are_deterministic_algorithms_enabled()
    # and torch._C._get_sm_carveout_experimental()) is not recoverable from this
    # compiled build, so the call below is elided.
    out, softmax_lse, out_accum, softmax_lse_accum = _FA3_CUDA_FWD(
        q, k, v, cu_seqlens_q, cu_seqlens_k, ...
    )
    return out, softmax_lse


def _fa3_run_backward(
    grad_out, query, key, value, out, logsumexp, cu_seq_q, cu_seq_k,
    max_seqlen_q, max_seqlen_k, scale, is_causal, window_size_left,
    window_size_right, deterministic=False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    if _FA3_CUDA_BWD is None:
        raise RuntimeError("FA3 not registered")
    dout = _maybe_contiguous(grad_out)
    q = _maybe_contiguous(query)
    k = _maybe_contiguous(key)
    v = _maybe_contiguous(value)
    o = _maybe_contiguous(out)
    lse = _maybe_contiguous(logsumexp)
    # NOTE: as with the forward pass, the exact torch.ops.flash_attn_3.bwd
    # argument list (including the deterministic flag) is not recoverable from
    # this compiled build; the call and unpacking below are schematic.
    dq, dk, dv = _FA3_CUDA_BWD(
        dout, q, k, v, o, lse, ...
    )
    return dq, dk, dv


def _fa3_flash_attention_forward_impl(
    query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal,
    return_debug_mask, q_descale=None, k_descale=None, v_descale=None, *,
    scale=None, window_size_left=None, window_size_right=None, seqused_k=None,
    alibi_slopes=None, out=None,
):
    error = _fa3_forward_support_error(
        query, key, value, dropout_p, return_debug_mask, alibi_slopes, seqused_k,
        cum_seq_q, q_descale, k_descale, v_descale,
    )
    if error is not None:
        raise RuntimeError(f"FA3 flash_attention forward unsupported: {error}")
    out, softmax_lse = _fa3_run_forward(
        query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, scale, is_causal,
        window_size_left, window_size_right, seqused_k, out,
        q_descale, k_descale, v_descale,
    )
    # FA3 does not use PyTorch's philox dropout state, so dummy RNG-state and
    # debug-mask tensors are returned to satisfy the op schema.
    rng_state = torch.zeros((2,), dtype=torch.uint64, device=query.device)
    philox_offset = torch.zeros((2,), dtype=torch.uint64, device=query.device)
    debug_mask = torch.empty((0,), dtype=query.dtype, device=query.device)
    return out, softmax_lse, rng_state, philox_offset, debug_mask


def _fa3_flash_attention_forward_impl_default(
    query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal,
    return_debug_mask, *, scale=None, window_size_left=None,
    window_size_right=None, seqused_k=None, alibi_slopes=None, out=None,
):
    return _fa3_flash_attention_forward_impl(
        query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal,
        return_debug_mask, None, None, None, scale=scale,
        window_size_left=window_size_left, window_size_right=window_size_right,
        seqused_k=seqused_k, alibi_slopes=alibi_slopes, out=out,
    )


def _fa3_flash_attention_backward_impl(
    grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q,
    max_k, dropout_p, is_causal, philox_seed, philox_offset, *, scale=None,
    window_size_left=None, window_size_right=None,
):
    """FA3 implementation of _flash_attention_backward."""
    error = _fa3_backward_support_error(
        grad_out, query, key, value, out, logsumexp, dropout_p, cum_seq_q,
        window_size_left, window_size_right,
    )
    if error is not None:
        raise RuntimeError(f"FA3 flash_attention backward unsupported: {error}")
    deterministic = torch.are_deterministic_algorithms_enabled()
    dq, dk, dv = _fa3_run_backward(
        grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k,
        max_q, max_k, scale, is_causal,
        window_size_left if window_size_left is not None else -1,
        window_size_right if window_size_right is not None else -1,
        deterministic,
    )
    return dq, dk, dv


def _fa3_scaled_dot_product_flash_attention_forward_impl(
    query, key, value, q_descale=None, k_descale=None, v_descale=None,
    dropout_p=0.0, is_causal=False, return_debug_mask=False, *, scale=None,
):
    error = _fa3_forward_support_error(
        query, key, value, dropout_p, return_debug_mask, None, None, None,
        q_descale, k_descale, v_descale,
    )
    if error is not None:
        raise RuntimeError(f"FA3 SDPA forward unsupported: {error}")
    # SDPA passes (batch, heads, seq, head_dim); FA3 wants (batch, seq, heads, head_dim).
    q_t, k_t, v_t = _transpose_dense(query, key, value)
    # fp8 inputs produce a bf16 output.
    out_dtype = torch.bfloat16 if query.dtype == torch.float8_e4m3fn else query.dtype
    out_bhsd = torch.empty_like(query, dtype=out_dtype)
    out_bshd = out_bhsd.transpose(1, 2)
    max_q_flash = q_t.size(1)
    max_k_flash = k_t.size(1)
    _, softmax_lse, rng_state, philox_offset, debug_mask = (
        _fa3_flash_attention_forward_impl(
            q_t, k_t, v_t, None, None, max_q_flash, max_k_flash, dropout_p,
            is_causal, return_debug_mask, q_descale, k_descale, v_descale,
            scale=scale, out=out_bshd,
        )
    )
    max_q = query.size(2)
    max_k = key.size(2)
    return (
        out_bhsd, softmax_lse, None, None, max_q, max_k,
        rng_state, philox_offset, debug_mask,
    )


def _fa3_scaled_dot_product_flash_attention_forward_impl_default(
    query, key, value, dropout_p=0.0, is_causal=False, return_debug_mask=False,
    *, scale=None,
):
    return _fa3_scaled_dot_product_flash_attention_forward_impl(
        query, key, value, None, None, None, dropout_p, is_causal,
        return_debug_mask, scale=scale,
    )


def _fa3_scaled_dot_product_flash_attention_backward_impl(
    grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q,
    max_k, dropout_p, is_causal, philox_seed, philox_offset, *, scale=None,
):
    """FA3 implementation of _scaled_dot_product_flash_attention_backward."""
    error = _fa3_backward_support_error(
        grad_out, query, key, value, out, logsumexp, dropout_p, None, None, None
    )
    if error is not None:
        raise RuntimeError(f"FA3 SDPA backward unsupported: {error}")
    grad_out_t, q_t, k_t, v_t, out_t = _transpose_dense(grad_out, query, key, value, out)
    dq, dk, dv = _fa3_flash_attention_backward_impl(
        grad_out_t, q_t, k_t, v_t, out_t, logsumexp, None, None, max_q, max_k,
        dropout_p, is_causal, philox_seed, philox_offset, scale=scale,
    )
    dq_out, dk_out, dv_out = _transpose_dense(dq, dk, dv)
    return dq_out, dk_out, dv_out


_registry.register_flash_attention_impl("FA3", register_fn=register_flash_attention_fa3)