o
    ki                     @   s  U d dl mZ d dlZd dlZd dlmZmZmZ ddlm	Z	 d dl
mZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlZddgZedZedZeeZ zd dl!m"Z# W n e$y   e%dd dD re &d eZ#Y nw ej'j(Z(dd Z)i Z*e+eef e,d< dd Z-d\deeeef geeef f fddZ.e.e(j/ddde0fd d!Z1e.e(j2d]de0fd"d#Z3e.e(j4d]de0fd$d%Z5e.e(j6d]de0fd&d'Z7e.e(j8					d^de0fd(d)Z9	d\d*e:e0 d+e:e0 d,e:e0 d-e;de0f
d.d/Z<e.e(j=e(j>e(j?e(j@e(jAgddde0fd0d1ZBe.e(jCde0fd2d3ZDd4d5 ZEe.e(jFe(jGe(jHgddde0fd6d7ZId8d9 ZJdd:deeKeKe0d;f eKe0d;f eKe0d;f eKe0d;f dB f  fd<d=ZLdd:deeKeKe0d;f eKe0d;f eKe0d;f eKe0d;f dB f  fd>d?ZMe.e(jNd@dAddde0fdBdCZOe.e(jPd@dAde0fdDdEZQdFdG ZRe.e(jSe(jTe(jUgddde0fdHdIZVe.e(jWd@dAde0fdJdKZXe.e(jYd@dAde0fdLdMZZi e(j/e1e(j2e3e(j4e5e(j6e7e(j8e9e(j=eBe(j>eBe(j?eBe(jAeBe(j@eBe(jCeDe(jFeIe(jGeIe(jHeIe(jSeVe(jTeVe(jUeVe(jNeOe(jPeQe(jWeXe(jYeZiZ*dNdO Z[g dPZ\dQdR Z]dSdT Z^de_fdUdVZ`dWdX ZaG dYd dZbG dZd[ d[eZcdS )_    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_PJITFunctionc                 c   s"    | ]}t tj|d d uV  qd S N)getattrtorchversion).0attr r   b/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/utils/flop_counter.py	<genexpr>   s     r!   )cudahipxpuz@triton not found; flop counting will not work for triton kernelsc                 C   s   t | tjr	| jS | S r   )
isinstancer   Tensorshape)ir   r   r    	get_shape#   s   r)   flop_registryc                    s   t  d d fdd
}|S )N)out_valc                    s(   t t||| f\}}} |d|i|S )N	out_shape)r   r)   )r+   argskwargsr,   fr   r    nf+   s   zshape_wrapper.<locals>.nfr   r0   r1   r   r/   r    shape_wrapper*   s   r3   Freturnc                    s,   dt ttf dt ttf f fdd}|S )Nflop_formular4   c                    s.   st   d fdd}tjj|  S )Nr4   c                    sL   t | tjjtfstd|  dt|  | tv r td|   t| < d S )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )	r%   r   _opsOpOverloadPacket_JITFunction
ValueErrortyper*   RuntimeError)targetr5   r   r    register7   s   z=register_flop_formula.<locals>.register_fun.<locals>.register)r4   N)r3   r   utils_pytree	tree_map_)r5   r>   get_rawtargetsr=   r    register_fun3   s
   z+register_flop_formula.<locals>.register_fun)r
   r   r   )rD   rC   rE   r   rB   r    r   1   s   ()r,   c          	      O   s<   | \}}|\}}||krt d| d| || d | S )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper,   r-   r.   mkk2nr   r   r    mm_flopH   s
   rP   c                 K   
   t ||S )zCount flops for addmm.rP   
self_shaperJ   rK   r,   r.   r   r   r    
addmm_flopT   s   
rU   c                 K   sd   | \}}}|\}}}	||krt d| d| ||kr&t d| d| || |	 d | }
|
S )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rF   z0bmm: inner dimensions must match (k == k2), got rG   rH   )rJ   rK   r,   r.   brL   rM   b2rN   rO   flopr   r   r    bmm_flopY   s   

rY   c                 K   rQ   )z&Count flops for the baddbmm operation.)rY   rS   r   r   r    baddbmm_floph   s   
rZ   c	           
      K   s
   t | |S )zCount flops for _scaled_mm.rR   )
rJ   rK   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr,   r.   r   r   r    _scaled_mm_flopo   s   
ra   x_shapew_shaper,   
transposedc           
      C   sL   | d }|r| n|dd }|^}}}	 t |t | | | | d }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   rG   Nr   )
rb   rc   r,   rd   
batch_size
conv_shapec_outc_infilter_sizerX   r   r   r    conv_flop_count   s   
 rj   c          
      O   s   t | |||dS )zCount flops for convolution.rd   )rj   )
rb   rc   _bias_stride_padding	_dilationrd   r,   r-   r.   r   r   r    	conv_flop   s   rp   c                 C   s   dd }d}	 |
d rt |d }|t| ||| 7 }|
d rIt |d }|r9|t|| ||||dd7 }|S |t|||| ||dd7 }|S )Nc                 S   s    | d | d gt | dd   S )Nr   r   rG   )list)r'   r   r   r    t   s    zconv_backward_flop.<locals>.tr   r   Frk   )r)   rj   )grad_out_shaperb   rc   rl   rm   rn   ro   rd   _output_padding_groupsoutput_maskr,   rr   
flop_countgrad_input_shapegrad_weight_shaper   r   r    conv_backward_flop   s   F  rz   c                 C   s   | \}}}}|\}}}	}
|\}}}}||  kr|kr<n t d||  kr+|kr<n t d||
kr<|	|kr<||
ks@t dd}|t|| ||f|| ||	f7 }|t|| ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    z8sdpa_flop_count: query/key/value shapes are incompatibler   rI   rY   )query_shape	key_shapevalue_shaperV   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r    sdpa_flop_count  s   ""r   c                O   s   t | ||S )Count flops for self-attention.r   )r|   r}   r~   r,   r-   r.   r   r   r    	sdpa_flop,  s   r   c                 C   sR   ddl m} ddlm} t| ||fs| jjdkr|   S |g| 	dd  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r%   devicer:   difftolistsize)offsetsmax_lenr   r   r   r   r    _offsets_to_lengths5  s
   r   )grad_out.c                 c   sB   |durt |jdkrtdt |jdkrtd|dur)|j| jkr)td| j\}}	}
|j\}}}|j\}}}|du rCtd|du rKtd|j|jkrUtdt||}t||}t||d	d
D ]%\}}d|	||
f}d|||f}d|||f}|dur|nd}||||fV  qfdS | j|j|j|dur|jndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr'   rI   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shaper   r   r    %_unpack_flash_attention_nested_shapesA  s6   

&r   c                 c   sH   |durt |jdkrtdt |jdkrtd|dur)|j| jkr)td| j\}}}	}
|j\}}}}|j\}}}}|du rFtd|du rNtd|j|jkrXtdt||}t||}t||d	d
D ]%\}}d|	||
f}d|||f}d|||f}|dur|nd}||||fV  qidS | j|j|j|dur|jndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   r   r   r    )_unpack_efficient_attention_nested_shapesu  s6   

&r   T)rC   c             	   O   s(   t | ||||||d}
tdd |
D S )r   )r   r   r   r   r   r   r   c                 s   $    | ]\}}}}t |||V  qd S r   r   r   r|   r}   r~   r   r   r   r    r!     
    


z0_flash_attention_forward_flop.<locals>.<genexpr>r   sum)r   r   r   r   r   r   r   r,   r-   r.   sizesr   r   r    _flash_attention_forward_flop     	r   c              	   O   s(   t | ||||||d}
tdd |
D S )r   )r   r   r   r   r   r   r   c                 s   r   r   r   r   r   r   r    r!     r   z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )r   r   r   biasr   r   r   r   r-   r.   r   r   r   r    !_efficient_attention_forward_flop  r   r   c                 C   sf  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  kr)|  kr)|krFn t d||
  kr=|  kr=|krFn t d||ksJt d||krV||krV||ksZt dd}|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|S )Nr   zFsdpa_backward_flop_count: batch/heads/dimension mismatch among tensorszJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler{   )rs   r|   r}   r~   r   rV   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4r   r   r    sdpa_backward_flop_count  s(     """""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )rs   r|   r}   r~   r,   r-   r.   r   r   r    sdpa_backward_flop	  s   r   c
              
   O   *   t |||| ||||	d}tdd |D S )N)r   r   r   r   r   r   r   r   c                 s   &    | ]\}}}}t ||||V  qd S r   r   r   r|   r}   r~   rs   r   r   r    r!   +  
    

z1_flash_attention_backward_flop.<locals>.<genexpr>r   )r   r   r   r   out	logsumexpr   r   r   r   r-   r.   shapesr   r   r    _flash_attention_backward_flop     
r   c
              
   O   r   )N)r   r   r   r   r   r   r   r   c                 s   r   r   r   r   r   r   r    r!   L  r   z5_efficient_attention_backward_flop.<locals>.<genexpr>r   )r   r   r   r   r   r   r   r   r   r   r-   r.   r   r   r   r    "_efficient_attention_backward_flop1  r   r   c                 C   s   t | ts| fS | S r   )r%   tuple)xr   r   r    normalize_tuplej  s   
r   ) KMBTc                 C   s0   t dtttd tt| d d }t| S )Nr   r   rG   r   )maxminr   suffixesstr)numberindexr   r   r    get_suffix_strs  s   (r   c                 C   s&   t |}| d|  d}|t |  S )Ni  z.3f)r   r   )r   suffixr   r   r   r   r    convert_num_with_suffixz  s   
r   c                 C   s   |dkrdS | | dS )Nr   0%z.2%r   )numdenomr   r   r    convert_to_percent_str  s   r   c                    s   t   fdd}|S )Nc                    s   t | \}} | }t||S r   )r   r   )r-   	flat_argsspecr   r/   r   r    r1     s   
z)_pytreeify_preserve_structure.<locals>.nfr   r2   r   r/   r    _pytreeify_preserve_structure  s   r   c                       s   e Zd ZdZ				ddejjeejj B dB dede	de
eef dB d	df
 fd
dZd	efddZd	e
ee
eef f fddZdddZdd Zdd Zdd Z  ZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    NrG   Tmodsdepthdisplaycustom_mappingr4   c                    st   t    tdd | _|| _|| _d | _|d u ri }|d ur&tjddd i t	dd |
 D | _	t | _d S )Nc                   S   s   t tS r   )r   intr   r   r   r    <lambda>  s    z*FlopCounterMode.__init__.<locals>.<lambda>z<mods argument is not needed anymore, you can stop passing itrG   )
stacklevelc                 S   s*   i | ]\}}|t |d dr|nt|qS )_get_rawF)r   r3   r   rM   vr   r   r    
<dictcomp>  s   * z,FlopCounterMode.__init__.<locals>.<dictcomp>)super__init__r   flop_countsr   r   modewarningswarnr*   itemsr   mod_tracker)selfr   r   r   r   	__class__r   r    r    s   
zFlopCounterMode.__init__c                 C   s   t | jd  S )NGlobal)r   r  valuesr  r   r   r    get_total_flops  s   zFlopCounterMode.get_total_flopsc                 C   s   dd | j  D S )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        c                 S   s   i | ]	\}}|t |qS r   )dictr   r   r   r    r     s    z3FlopCounterMode.get_flop_counts.<locals>.<dictcomp>)r  r  r  r   r   r    get_flop_counts  s   
zFlopCounterMode.get_flop_countsc           
         s  |d u rj }|d u rd}dd l}d|_g d}g }  t d fdd}tj D ]}|dkr;q4|d	d
 }||krGq4|||d
 }|	| q4djv roso|D ]
}	d|	d  |	d< q]|dd| }t
|dkrzg dg}|j||ddS )Ni?B r   T)ModuleFLOPz% TotalFc                    s   t j|   }| kO d| }g }|||  t|t| g j|   D ]\}}||d t| t|t| g q,|S )N z - )r   r  r  appendr   r   r  r   )mod_namer   r   paddingr  rM   r   global_flopsglobal_suffixis_global_subsumedr  r   r    process_mod  s    z.FlopCounterMode.get_table.<locals>.process_modr  .r   r  )r  0r   )leftrightr  )headerscolalign)r   tabulatePRESERVE_WHITESPACEr  r   sortedr  keyscountextendr   )
r  r   r"  headerr  r  mod	mod_depth
cur_valuesr   r   r  r    	get_table  s6   
zFlopCounterMode.get_tablec                 C   s,   | j   | j  t| | _| j  | S r   )r  clearr  	__enter___FlopCounterModer  r  r   r   r    r.    s
   



zFlopCounterMode.__enter__c                 G   sH   | j d u r	td| j j| }d | _ | j  | jr"t| | j |S )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r  rI   __exit__r  r   printr,  r   )r  r-   rV   r   r   r    r0    s   

zFlopCounterMode.__exit__c                 C   sV   || j v r)| j | }||i |d|i}t| jjD ]}| j| |  |7  < q|S )Nr+   )r*   setr  parentsr  )r  func_packetr   r-   r.   flop_count_funcrw   parr   r   r    _count_flops  s   

zFlopCounterMode._count_flops)NrG   TNr   )__name__
__module____qualname____doc__r   nnr  rq   r   boolr  r   r  r  r   r  r,  r.  r0  r7  __classcell__r   r   r	  r    r     s.    
?
c                   @   s<   e Zd ZdZdeddfddZdd Zd	d
 ZdddZdS )r/  Tcounterr4   Nc                 C   s
   || _ d S r   )r?  )r  r?  r   r   r    r  #  s   
z_FlopCounterMode.__init__c                 C   s`   ddl }| | jj}|  || }W d   n1 sw   Y  | | jj}|| j_||fS )a  Execute a branch function and capture its FLOP counts without
        affecting self.counter.flop_counts

        Args:
            branch_fn: The branch function to execute
            operands: Arguments to pass to the branch function

        Returns:
            Tuple of (result, flop_counts) where result is the branch output
            and flop_counts is a copy of the FLOP counts after execution
        r   N)copyr?  r  )r  	branch_fnoperandsr@  checkpointed_flop_countsresultr  r   r   r    $_execute_with_isolated_flop_counting&  s   
z5_FlopCounterMode._execute_with_isolated_flop_countingc                 C   s  |t jjjt jjjhv }|r=ddlm} ddlm} ||d }t	||s4t
|dr.|j}nnt	||r%| j|d ||S |t jjju r|\}	}
}}| |
|\}}|tu rXtS | ||\}}|tu rftS t| t| B }i }|D ]4}|| }|| }i }t| t| B }|D ]}||d}||d}t||||< q|||< qv| D ]\}}| jj| | q|S tS )Nr   )
get_kernelr   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wraprF  triton.runtime.jitr   r%   hasattrrH  r?  r7  condrE  NotImplementedr2  r%  getr   r  r  update)r  functypesr-   r.   	is_tritonrF  r   kernel_namepredtrue_branchfalse_branchrB  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dictr   r   r    _handle_higher_order_ops:  sR   




z)_FlopCounterMode._handle_higher_order_opsr   c                 C   sX  |r|ni }|t jjjjt jjjjt jjjjt jjjjt jjjjt jjj	jt jjj
jt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjhv rWtS t|t jjrf| ||||S || jjvr|t jjjjur|  |j|i |}|tur|W  d    S W d    n1 sw   Y  ||i |}| j|j|||S r   )r   rI  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutrQ  r%   r6   HigherOrderOperatorrj  r?  r*   r   	decomposer7  _overloadpacket)r  rT  rU  r-   r.   rr   r   r   r    __torch_dispatch__w  s<   













z#_FlopCounterMode.__torch_dispatch__)r   N)	r8  r9  r:  supports_higher_order_operatorsr   r  rE  rj  r  r   r   r   r    r/     s    =r/  )Fr   )NNNFN)drU  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr   r	   collections.abcr
   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r  __all__r   r   	getLoggerr8  logrN  r   r8   ImportErroranywarningrI  rk  r)   r*   r  __annotations__r3   r   mmr   rP   addmmrU   bmmrY   baddbmmrZ   
_scaled_mmra   rq   r=  rj   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideablerp   convolution_backwardrz   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r   r   r   r   r   r   r/  r   r   r   r    <module>   s:  

*
&g6

96

7
  	

 