o
    ki\1                  #   @   sF  d Z ddlZddlmZ ddlmZmZ ddlZee	Z
ddgZdee dB dee fd	d
ZedddedefddZG dd deZejjdi d			d9dejdejdejdejdejdededededB dee dB deejejejf fddZej			d9dejdejdejdejdejdededededB dee dB deejejejf fdd Zddd!d"dejdejdejdejdejdeded#edB dedB deeef dejeejejf B fd$dZd%ed&eed'f d(eddfd)d*Zejjd+i d		d:d,ejdejdejdejd-ejd.ejdejdejdededed/ejdedB dee dB deejejejf fd0d1Zej		d:d,ejdejdejdejd-ejd.ejdejdejdededed/ejdedB dee dB deejejejf fd2d3Zd%ed,ejd4ejd5ejdeejdB d'f f
d6d7Zejeed8 dS );z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequestwindow_sizereturnc                 C   s2   | d u rddg} t | dkrtdt |  | S )N   z$window_size must have length 2, got )len
ValueError)r    r   c/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/nn/attention/varlen.py_normalize_window_size   s
   r      )maxsizedevice_indexc                 C   s   dS )z;Cache device capability check to avoid repeated CUDA calls.Fr   )r   r   r   r   _should_use_cudnn   s   r   c                   @   s   e Zd ZU dZdZeed< dS )r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r   #   s   
 ztorch_attn::_varlen_attn)mutates_argsFquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscalec
                 C   s   t |	}	| jot| jj}
|
rGtd |	d dks |	d dkr$tdtj	j
j| ||d||||dd|d	|d
}|d |d |d }}}n"td tj	j
j| ||||||d|d	||	d |	d d\}}}}}tjdtj| jd}|||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnr   r	      TcuDNN backend does not support window attention. Please use Flash Attention backend.NT        Fr$      -Using Flash Attention backend for varlen_attn)return_debug_maskr$   window_size_leftwindow_size_rightr
   dtypedevice)r   is_cudar   r2   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r   r    r!   r"   r#   r$   r   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_r   r   r   _varlen_attn-   sX   



rF   c
                 C   s   t |	}	t| }
| d}| d}tjjr,|dd }tj|||ftj| jd}ntj||ftj| jd}tjdtj	| jd}|
||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r&   r0   r/   )
r   r8   
empty_likesizeversionhipemptyfloatr2   r>   )r   r   r   r   r    r!   r"   r#   r$   r   rA   total_q	num_heads
batch_size	logsumexprC   r   r   r   _varlen_attn_fakes   s   



rQ   )r	   r	   )
return_auxr$   r   rR   c                C   sL   |	dk}
t jj| |||||||
|t|	
\}}}|dur$|jr$||fS |S )au  
    Compute variable-length attention using Flash Attention.
    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.

    Args:
        query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
        key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
        value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
        cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
        cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
        max_q (int): Maximum query sequence length in the batch.
        max_k (int): Maximum key/value sequence length in the batch.
        return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
        scale (float, optional): Scaling factor for attention scores
        window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
            Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
            or (W, 0) for causal attention with sliding window of size W.

    Returns:
        output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H, D)`.

        If ``return_aux`` is not None and ``return_aux.lse`` is True:
            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H)`.

    Shape legend:
        - :math:`N`: Batch size
        - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
        - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
        - :math:`H`: Number of attention heads
        - :math:`D`: Head dimension

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len
        ... )
    )r	   r   N)r8   r9   
torch_attnrF   listr   )r   r   r   r   r    r!   r"   rR   r$   r   r#   outr   rD   r   r   r   r      s    N
ctxinputs.rA   c              
   C   s\   |\
}}}}}}}	}
}}|\}}}|  |||||||| || _|	| _|
| _|| _|| _d S N)save_for_backwardr!   r"   r#   r$   r   )rV   rW   rA   r   r   r   r   r    r!   r"   r#   r$   r   rU   r   rC   r   r   r   _setup_context   s&   

rZ   z!torch_attn::_varlen_attn_backwardgrad_outrU   r   rC   c                 C   s   t |}tjd|jd}|jot|jj}|rFtd |d dks(|d dkr,t	dtj
jj| |||||||||	d|
|||d\}}}n$td	 tj
jj| |||||||||	d|
||||d |d d
\}}}|||fS )Nr   )r2   r%   r	   r&   r'   r(   r)   r+   )r$   r-   r.   )r   r8   rK   r2   r3   r   r4   r5   r6   r7   r9   r:   _cudnn_attention_backward_flash_attention_backward)r[   r   r   r   rU   r   r   r    r!   r"   r#   rC   r$   r   unusedr?   dqdkdvr   r   r   _varlen_attn_backward  s^   


rb   c                 C   s0   t |}t|}t|}t|}|||fS )zF
    Fake implementation for meta tensor computation and tracing.
    )r   r8   rG   )r[   r   r   r   rU   r   r   r    r!   r"   r#   rC   r$   r   
grad_querygrad_key
grad_valuer   r   r   _varlen_attn_backward_fake\  s
   



rf   grad_lsegrad_rngc                 C   sz   | j \}}}}}}	}
}| j}| j}| j}| j}| j}tjj	|||||	|
||||||||\}}}|||d d d d d d d f
S rX   )
saved_tensorsr!   r"   r#   r$   r   r8   r9   rS   rb   )rV   r[   rg   rh   r   r   r   r   r    rU   r   rC   r!   r"   r#   r$   r   r_   r`   ra   r   r   r   	_backwardy  s.   
rj   )setup_context)FNN)NN) r   logging	functoolsr   typingr   r   r8   	getLoggerr   r5   __all__rT   intr   r   r   r   library	custom_opTensorrL   tuplerF   register_fakerQ   r   rZ   rb   rf   rj   register_autogradr   r   r   r   <module>   sX   
	
		

E		

4	


"`	

D	


