o
    eib3                    @  sh  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZ
d dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlZd dlZd dlZd dlmZ d dlm  m Z! d dl"m#Z#m$Z$ d dl%m&Z& d d	l'm(Z( d d
l)m*Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@mAZA d dlBmCZCmDZDmEZEmFZFmGZGmHZH d dlImJZJ d dlKmLZL d dlMmNZN dd
lOm*Z* ddlPmQZQ ddlRmSZSmTZTmUZUmVZV ddlWmXZX ddlYmZZZm[Z[m\Z\ ddl]m^Z^ ddl_m`Z` ddlambZbmcZcmdZdmeZemfZf dd lgmhZhmiZimjZj erAd dlkZld dlmZme*jnZod!epd"< eqerZsd#epd$< ejtjuZuejtjvZveG d%d& d&ZweG d'd( d(ZxeG d)d* d*Zyd!d.d/Zzd"d2d3Z{d"d4d5Z|d#d7d8Z}G d9d: d:Z~d!d;d<Ze~ Zd$d?d@Zd%dAdBZd&dEdFZ		Gd'd(dRdSZd!dTdUZd!dVdWZd!dXdYZd!dZd[Zd!d\d]Zd!d^d_Zd!d`daZd!dbdcZd)dgdhZd*dmdnZd+dqdrZ	s	t	 d,d-d|d}Zd.ddZd/ddZd0ddZd1ddZd2ddZd3ddZd4ddZd4ddZd5ddZ	d6d7ddZddd8ddZdddd9ddZedZd:ddZd#ddZd;ddZejd<ddZd=ddZd>ddZd?ddτZd@dd҄ZdAddԄZdAddքZdAdd؄Zd!ddڄZdBdd݄Z	d6dCddZdDddZdEddZdFddZdGddZdHddZdIddZd dlmZ dJdd ZdKddZ	dLdMddZdNddZdOddZdPddZ	dQdddRddZ				G	dSdTdd ZdS (U      )annotationsN)defaultdictdeque)Callable)	dataclassreplace)AnyTYPE_CHECKING)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)is_with_effects)config)CustomKnapsackSolverCustomRuntimeEstimator)FakeScriptObject)
is_builtin)
LazyStringtrace_structured)	trace_log)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolsis_symbol_binding_fx_node	size_hintstatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackdp_knapsack_sliding_hirschberggreedy_knapsackilp_knapsack)KnapsackEvaluator)	AOTOutputSavedForBackwardsAOTOutput#SavedForBackwardsNoVcCheckAOTOutput)_is_functional_graph)get_aot_graph_name)_is_bwd_seed_offset_is_fwd_seed_offset
_is_primal_is_tangentget_cuda_generator_meta_val)fx_graph_cseget_aten_targetraise_getitemsboolAOT_PARTITIONER_DEBUGzlogging.Loggerlogc                   @  sl   e Zd ZU dZded< ded< ded< ded< ded< dddZdddZdddZdddZdddZ	dS )OpTypesz8Class for keeping track of different operator categorieszOrderedSet[Callable[..., Any]]fusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodefx.Nodereturnr9   c                 C     t || jv S N)r7   r=   selfrB    rI   g/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/_functorch/partitioners.py
is_fusibleg      zOpTypes.is_fusiblec                 C  rE   rF   )r7   r>   rG   rI   rI   rJ   is_compute_intensivej   rL   zOpTypes.is_compute_intensivec                 C  rE   rF   )r7   r?   rG   rI   rI   rJ   	is_randomm   rL   zOpTypes.is_randomc                 C  rE   rF   )r7   r@   rG   rI   rI   rJ   is_viewp   rL   zOpTypes.is_viewc                 C  rE   rF   )r7   rA   rG   rI   rI   rJ   is_recomputables   rL   zOpTypes.is_recomputableNrB   rC   rD   r9   )
__name__
__module____qualname____doc____annotations__rK   rM   rN   rO   rP   rI   rI   rI   rJ   r<   ]   s   
 



r<   c                   @  s~   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< ejdddZdddZdddZdddZ	dddZ
dS )NodeInfolist[fx.Node]inputsOrderedSet[fx.Node]_required_fw_nodesrequired_bw_nodestangents_closureunclaimed_nodesdict[fx.Node, int]fw_orderstatic_lifetime_input_nodesrD   c                   s    t dd  jD  fdddS )Nc                 s  s    | ]}|V  qd S rF   rI   .0nrI   rI   rJ   	<genexpr>   s    z-NodeInfo.required_fw_nodes.<locals>.<genexpr>c                   s
    j |  S rF   )r`   rd   rH   rI   rJ   <lambda>      
 z,NodeInfo.required_fw_nodes.<locals>.<lambda>key)sortedr[   rg   rI   rg   rJ   required_fw_nodes   s   zNodeInfo.required_fw_nodesrd   rC   r9   c                 C  
   || j v S rF   )r[   rH   rd   rI   rI   rJ   is_required_fw      
zNodeInfo.is_required_fwc                 C  rn   rF   )r\   ro   rI   rI   rJ   is_required_bw   rq   zNodeInfo.is_required_bwc                 C  rn   rF   )r^   ro   rI   rI   rJ   is_unclaimed   rq   zNodeInfo.is_unclaimedintc                 C  s$   || j vrtd| d| j| S )NNode z not in fw nodes!)r[   AssertionErrorr`   ro   rI   rI   rJ   get_fw_order   s   

zNodeInfo.get_fw_orderN)rD   rX   )rd   rC   rD   r9   )rd   rC   rD   rt   )rR   rS   rT   rV   	functoolscached_propertyrm   rp   rr   rs   rw   rI   rI   rI   rJ   rW   w   s   
 


rW   c                   @  s6   e Zd ZU ded< ded< ded< ded< ded< dS )MinCutOptionsr9   ban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)rR   rS   rT   rV   rI   rI   rI   rJ   rz      s   
 rz   rB   rC   rD   c                 C  s   | j dd tjtjfv S )N	recompute)metagetr$   MUST_RECOMPUTEPREFER_RECOMPUTErB   rI   rI   rJ   must_recompute   s   r   fx_gfx.GraphModulec                 C  s    | j jD ]	}t|r dS qdS )NTF)graphnodesr   r   rB   rI   rI   rJ   has_recomputable_ops   s
   r   c                 C  s<   | j jD ]}t|rt|jdrtjj|jjv r dS qdS )NtagsTF)	r   r   r   hasattrtargettorchTagnondeterministic_seededr   r   rI   rI   rJ   has_recomputable_rng_ops   s   
r   rt   c                 C  sJ   t | jd tjtjfrdS t | jd tjs#tdt| jd  dS )Nvalr%   z.expected node.meta['val'] to be SymFloat, got    )
isinstancer   r   SymIntSymBoolSymFloatrv   typer   rI   rI   rJ   sym_node_size   s   r   c                   @  s   e Zd ZdddZdS )InvalidNodeBaserD   strc                 C  s   dS )NzInvalid NoderI   rg   rI   rI   rJ   __repr__   s   zInvalidNodeBase.__repr__N)rD   r   )rR   rS   rT   r   rI   rI   rI   rJ   r      s    r   c                 C  s   t | jdd dkS )N	namespace_c10d_functional)getattrr   r   rI   rI   rJ   is_not_collective      r   getitem_nodefx.Node | Nonec                 C  s~   | j tjkrdS | jd }| jd }t|tjr|jdkrdS d|jvr&dS |jd }||vr1dS || }t|tjr=|S dS )zGiven a getitem node, check if it extracts from a higher-order op
    that has kwargs mapping the key back to an original input.

    Returns the original input node if found, None otherwise.
    Nr   r%   call_functionkwargs)	r   operatorgetitemargsr   fxNodeopr   )r   	ho_resultrk   r   original_inputrI   rI   rJ   _get_ho_op_original_input   s   



r   c                 C  sD   | j tjjjjtjjjjfvrdS | jd }t|t	j
sdS t|S )zCheck if node is a view/reshape of a higher-order op output that aliases an input.

    Returns the original input node from the higher-order op's kwargs if the pattern
    matches, None otherwise.
    Nr   )r   r   opsatenviewdefaultreshaper   r   r   r   r   )rB   sourcerI   rI   rJ   _is_copy_node_bw_only   s   
r   envdict[fx.Node, Any]c                 C  s`   t | }|dur||v rt|| ts|| S t| }|dur.||v r.t|| ts.|| S dS )a  Try to find a valid input replacement for an invalid forward output.

    This handles cases where a forward output depends on backward nodes but
    semantically aliases an input. For example, a view of a getitem from a
    triton kernel that mutates a buffer in backward, or a direct getitem from
    such a higher-order op. The original input may be a primal or a valid
    intermediate node already present in the forward graph.
    N)r   r   r   r   )rB   r   r   rI   rI   rJ   _find_input_for_invalid_output   s   r   Fjoint_graphfx.GraphrY   rX   outputsoutputs_descslist[AOTOutput]subgraph
str | Noneignore_must_be_in_fw_bwc                   sj  t  }i  |D ]}||j}|j|_| |< q| jD ]{}|sBt|r1|dkr1||vr1t |< qt|rB|dkrB||vrBt |< q| v rGq|j	dkrQt |< q|j	dkrt
j|ji |j}	 fdd|	D }	t|	rrt |< q|| fdd |< q|j	d	kr|| fd
d |< q|j	dkr	 qg }
t||D ]\}}t|t jr| vrtd| dt | trd}|jtjjjju rt|rt|jdkrt|jd t jr|jd  v rt |jd  ts |jd  }|du rt| }|dur	|
| qtd| d|
 |  q|
| q|t |
}||jd< |!  |"  |S )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardforwardplaceholderr   c                   s&   g | ]}t |tjrt  | tqS rI   )r   r   r   r   rc   xr   rI   rJ   
<listcomp>P  s    
z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>c                       |  S rF   rI   r   r   rI   rJ   rh   Y      z4_extract_graph_with_inputs_outputs.<locals>.<lambda>get_attrc                   r   rF   rI   r   r   rI   rJ   rh   \  r   outputru   z couldn't be found in envNr%   r   z was invalid, but is outputdesc)#r   Graphr   namer   r   _must_be_in_backwardInvalidNode_must_be_in_forwardr   pytreearg_tree_leavesr   r   any	node_copyzipr   r   RuntimeErrorr   r   r   r   r   copy_r   lenr   appendrv   r   tupleeliminate_dead_codelint)r   rY   r   r   r   r   	new_graphrB   new_nodeall_argsoutput_valuesr   x_descreplacementoutrI   r   rJ   "_extract_graph_with_inputs_outputs  s   











r   c                 C  s2   t jot| jtjjrt| j p| jtjj	j
kS rF   )r   is_non_builtin_to_includer   r   r   _ops
OpOverloadr   r   higher_order triton_kernel_wrapper_functionalr   rI   rI   rJ   r     s   r   c                 C  s   | j dkot| jdtS )Nr   r   )r   r   r   r   r   r   rI   rI   rJ   _is_backward_state  s   r   c                 C     | j dd dkS )Npartitioner_tagis_backwardr   r   r   rI   rI   rJ   _has_tag_is_backward  r   r   c                 C  r   )Nr   
is_forwardr   r   rI   rI   rJ   _has_tag_is_forward  r   r   c                 C  r   )Nr   must_be_in_forwardr   r   rI   rI   rJ   _has_tag_must_be_in_forward  r   r   c                 C  r   )Nr   must_be_in_backwardr   r   rI   rI   rJ   _has_tag_must_be_in_backward  r   r   c                 C  s>   t | rdS t| jtjjo| jjj}t|  ot	|  o|S NT)
r   r   r   r   r   r   _schema
is_mutabler   r   rB   r   rI   rI   rJ   r     s   
r   c                 C  s2   t | rdS t| jtjjo| jjj}t| o|S r   )	r   r   r   r   r   r   r   r   r   r   rI   rI   rJ   r     s   r   joint_modulenum_fwd_outputsEtuple[list[fx.Node], list[fx.Node], list[AOTOutput], list[AOTOutput]]c                C  s   t jdd | jjddD  }t tt| jjddjdd gt| }|d | }||d  }|d | }||d  }||||fS )Nc                 s      | ]}|j V  qd S rF   r   rc   rB   rI   rI   rJ   re         z+_extract_fwd_bwd_outputs.<locals>.<genexpr>r   r   r   )	r   r   r   
find_nodesnextiterr   r   r   )r   r   r   r   fwd_outputsbwd_outputsfwd_outputs_descsbwd_outputs_descsrI   rI   rJ   _extract_fwd_bwd_outputs  s   r  saved_valuesr   r   Nonec                 C  s(   | D ]}|j |kr| |  d S qd S rF   )r   remove)r  r   saved_valuerI   rI   rJ   _remove_by_name  s   

r  fwd_module_outputs#list[fx.Node] | tuple[fx.Node, ...]c                 C  s@   t | }tt | d ddD ]}t| | s|d } |S q|S )Nr%   )r   ranger   )r  idxirI   rI   rJ   find_first_sym_node  s   r        @-q=r   torch.fx.Graphtorch.fx.Nodemaxfloatminpositionc                 C  s^  |  |, | jtjjjj|fd}tjjj|jd |jd< t|jd |jd< W d    n1 s4w   Y  |  |2 | jtjjj	j|dgdfd}tjjj	|jd dgd|jd< t|jd |jd< W d    n1 ssw   Y  |  |0 | jtjj
jj|tjfd}tjj
j|jd tj|jd< t|jd |jd< W d    n1 sw   Y  |  |. | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n1 sw   Y  |  |, | jtjjjj|fd}	tjjj|jd |	jd< t|	jd |	jd< W d    n	1 s%w   Y  |  |	. | jtjjjj|	|fd}
tjjj|	jd ||
jd< t|
jd |
jd< W d    n	1 saw   Y  |  |
9 | jtjj
jj|
tjfd| d|j d}tjj
j|
jd tj|jd< t|jd |jd< W d    |S 1 sw   Y  |S )	Nr  r   tensor_metar  Tfp8_scale_pos__r   r   )inserting_afterr   r   r   r   absr   r   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   )r   rB   r  r   r!  abs_node	amax_nodeamax_64_nodeclamp_min_nodereciprocal_nodemul_node
scale_noderI   rI   rJ   calculate_quantization_scaling  s   




	













	





r8  r7  
quant_typetorch.dtyper,  	clamp_maxc                 C  sp  |  |0 | jtjjjj|tjfd}tjjj|jd tj|jd< t	|jd |jd< W d    n1 s8w   Y  |  |1 | jtjj
jj||fd}tjj
j|jd |jd |jd< t	|jd |jd< W d    n1 svw   Y  |  |. | jtjj
jj||fd}	tjj
j|jd ||	jd< t	|	jd |	jd< W d    n1 sw   Y  |  |	. | jtjj
jj|	|fd}
tjj
j|	jd ||
jd< t	|
jd |
jd< W d    n1 sw   Y  |  |
7 | jtjjjj|
|fd| d|j d}tjjj|
jd ||jd< t	|jd |jd< W d    |S 1 s1w   Y  |S )Nr  r   r"  fp8_quant_pos_r$  r%  )r&  r   r   r   r)  r*  r   r0  r   r   r   r.  r/  r,  r;  r   )r   rB   r7  r9  r,  r;  r!  target_node_32scaled_target_nodeclamp_min_scaled_nodeclamp_max_scaled_nodequant_activation_noderI   rI   rJ   perform_quantization-  s   	

















rB  tensortorch.Tensorc                 C  s   |   }|  }|| d S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)rC  num_elementsrF  rI   rI   rJ   calculate_tensor_sizes  s   rH  list[torch.dtype]c                  C  s.   t jjjd dd} dd | dD } | S )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16c                 S  s    g | ]}t t|d d qS ).r  )r   r   split)rc   dtyperI   rI   rJ   r     s    z&get_allowed_dtypes.<locals>.<listcomp>;)r   	_inductorr   post_grad_fusion_optionsr   rM  )rK  rI   rI   rJ   get_allowed_dtypes  s   rR  c                 C  s   t  }t| r| jd j|vrdS tjjjd dd}t	| jd }tjjjd dds2||kS tjjjd ddrJt
||kpIt||k S t
||kS )Nr   FrJ  
size_in_mbd   skip_dynamo_guardsquantize_dynamic_shape)rR  r   r   rN  r   rP  r   rQ  r   rH  r!   r    )rB   rK  size_thresholdrS  rI   rI   rJ   should_quantize  s4   rX  c                  C  s*   t jjjd dd} tt | dd S )NrJ  r9  ztorch.float8_e5m2rL  r  )r   rP  r   rQ  r   r   rM  )r9  rI   rI   rJ   get_quant_type  s   rY  rN  tuple[float, float]c                 C  s   t | }|j|jfS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )r   finfor   r  )rN  inforI   rI   rJ   calculate_range  s   
r]  c              	     s  | j ddd }|jd }t }t|\}}t  g }g }t|D ]\}}	|	jddrtj	j
jd ddrYt| |	|d	|}
t| |	|
||||}t|
sS||
 nI||
 nC| |	6 | jtjjjj|	|fd
| d|	j d}tjjj|	jd ||jd< t|jd |jd< W d    n1 sw   Y  | |< q! fddt|D }t|}|| }|r|d | | ||d   }|dt| td d  d7  < d S )Nr   r  r   saved_for_quantizationFrJ  use_scalingTr  r<  r$  r%  r   r"  c                   s   g | ]
\}}  ||qS rI   )r   )rc   r  rB   position_to_quantrI   rJ   r     s    z*quantize_activation_fw.<locals>.<listcomp>inductor%activation_quantization_fwd_aten_passr%   )r  r   rY  r]  dict	enumerater   r   r   rP  r   rQ  r8  rB  r   r   r&  r   r   r)  r*  r   r   r   r  
update_argr   r
   )r   r   r	  r9  r,  r;  tensor_scale_nodessym_scale_nodesr!  rB   r7  
quant_nodeoutput_updated_argsr  scale_nodesrI   r`  rJ   quantize_activation_fw  sb   





rl  c           	   	     s  dd | j D }d }|D ]^}|jddrk|jd |jd}tjjjd ddr| | d|j	
d	d
  t fdd|D }W d    n1 sSw   Y  | |. | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n1 sw   Y  | |1 | jtjjjj||fd}tjjj|jd |jd |jd< t|jd |jd< W d    n1 sw   Y  | |. | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n	1 sw   Y  nB| |4 | jtjjjj||fdt|j	 d}tjjj|jd ||jd< t|jd |jd< W d    n	1 sKw   Y  t|j D ]}||kri||kri||| qWqtd d  d7  < d S )Nc                 S  s   g | ]	}|j d kr|qS )r   r  r  rI   rI   rJ   r         z*quantize_activation_bw.<locals>.<listcomp>r^  Fdequant_typerJ  r_  
fp8_scale_
fp8_quant_ c                 3  s    | ]
}|j  kr|V  qd S rF   r   )rc   	bwd_input
scale_namerI   rJ   re     s    
z)quantize_activation_bw.<locals>.<genexpr>r  r   r"  dequant_r%  rb  %activation_quantization_bwd_aten_passr%   )r   r   r   popr   rP  r   rQ  r&  r   r   r  r   r   r)  r*  r   r   r   divr/  r   listuserskeysreplace_input_withr
   )	r   	bw_inputsactivation_noderB   rn  r7  divided_target_node_32dequant_nodeuserrI   rt  rJ   quantize_activation_bw  s   














r  
fwd_module
bwd_modulebwd_module_inputsdict[str, fx.Node]c              	     s  t ddd fddd tj t ddd fddd t ddd  fd	dd jjd
dd jd }|D ]P}d|jv r|tdd|j } j|  jj	|jd}W d    n1 sdw   Y  |j
d }|j
|j
 d|j
d< ||j
d< ||  j| q:tjjjd ddrt jjdd}|d }	t|D ]
}
t|
s|
}	 nqjjd
dd jd }|D ].}d|jv r j|	  jj	|jd}W d    n1 sw   Y  |j
|j
 |}	qt j t ddd  fddd d S )Nartifactc                   S  
   dddS )N,before_activation_quantization_fwd_aten_passstringr   encodingrI   rI   rI   rI   rJ   rh   T     z5perform_fp8_activation_quantization.<locals>.<lambda>c                         j ddddS NFTprint_outputinclude_strideinclude_deviceprint_readablerI   r  rI   rJ   rh   X      metadata_fn
payload_fnc                   S  r  )N+after_activation_quantization_fwd_aten_passr  r  rI   rI   rI   rI   rJ   rh   a  r  c                     r  r  r  rI   r  rI   rJ   rh   e  r  c                   S  r  )N,before_activation_quantization_bwd_aten_passr  r  rI   rI   rI   rI   rJ   rh   l  r  c                     r  r  r  rI   r  rI   rJ   rh   p  r  r   r  r   rp  z^fp8_quant_pos_\d+_rq  rr  rn  Tr^  rJ  r_  r   r  ro  c                   S  r  )N+after_activation_quantization_bwd_aten_passr  r  rI   rI   rI   rI   rJ   rh     r  c                     r  r  r  rI   r  rI   rJ   rh     r  )r   rl  r   r  r   r   resubr&  r   r   updatereplace_all_uses_with
erase_noder   rP  r   rQ  r   rz  reversedr4   r  )r  r  r  quant_fwd_module_outputsfwd_noders  quant_bwd_inputrn  quant_bwd_module_inputsbwd_input_locbw_inputscaled_fwd_module_outputsscale_bwd_inputrI   )r  r  rJ   #perform_fp8_activation_quantizationM  sv   












r  ra   OrderedSet[fx.Node] | Nonec           
      C  s(  t jdd d u rd S |rdd |D ng }dd | D }tjjjd ddr/dd | D }|jjd	d
d jd }dd |jjdd
D }d}|D ]<}	|	j	|v rt
|	r|	j	|v rctd|	j	 qKd|	jd< |	jd j|	jd< d||	j	 jd< |	jd j||	j	 jd< d}qK|rt||| d S d S )NrJ  c                 S     g | ]}|j qS rI   rr  r  rI   rI   rJ   r         z2enable_activation_quantization.<locals>.<listcomp>c                 S     i | ]}|j |qS rI   rr  r  rI   rI   rJ   
<dictcomp>      z2enable_activation_quantization.<locals>.<dictcomp>exclude_primalsFc                 S  s   i | ]}d |j vr|j |qS )primalsrr  r  rI   rI   rJ   r    s    r   r  r   c                 S  r  rI   rr  r  rI   rI   rJ   r        r   z*Skipping quantization of static input %s: Tr^  r   rn  )inductor_configrQ  r   r   rP  r   r   r  r   r   rX  r;   debugr   rN  r  )
r  r  r  ra   static_input_namessaved_values_namesr  r  should_perform_fp8_quantrB   rI   rI   rJ   enable_activation_quantization  sL   

r  )ra   saved_sym_nodes%tuple[fx.GraphModule, fx.GraphModule]c              
     sL  t | |d\}}}}| jjdd}	g tt|	}
g tt|	}g tt|	}g tt|	}g tt|	}t	| j| | | ||d}t
j }|jddD ];}|js`t|j t||j qN|rytdd |jD ryt|j t||j qNt|rt|j |stdqNt }g }g }|D ]}t|}|r|| || q|| qt| j}t||D ],}d|jvrqt|jd | }t|d	d
 dD ]}||vrq|||  q||O }q|  |||  g }g }g }D ]%}t|jdt r
|| q|jddr|| q|| q  ||  t!| t"D ]!\}}| krQ|jddsQtd| d  dt! q1t	| j|
| | | | | fddt#t!t!| t!| D  d}t	| j| | | | | ||d}t$j%&| |}t$j%&| |}t'||| ||fS )Nr   r   r  r   c                 s  s0    | ]}|j tjjjju ot|jd kV  qdS r   N)r   r   r   r   wait_tensorr   r   r{  rb   rI   rI   rJ   re     s    
z+_extract_fwd_bwd_modules.<locals>.<genexpr>z'backward_state_inputs must not be emptyr   c                 S  s   | j S rF   rr  )srI   rI   rJ   rh   "  s    z*_extract_fwd_bwd_modules.<locals>.<lambda>rj   saved_tensor_with_no_vc_checkFzi=z, no_vc_check_start_idx=z, len(saved_values)=c                   s0   g | ]}| kr|t k rt|nt|qS rI   )r   r.   r-   rc   r  no_vc_check_start_idxr  rI   rJ   r   Z  s    z,_extract_fwd_bwd_modules.<locals>.<listcomp>r   )(r  r   r  filterr3   r4   r2   r1   r   r   r   distributedis_availabler{  r  r   allrv   r#   r   addr   r   	itertoolschainr   r   rl   clearextendr   r   r   r   re  r  r   _lazy_graph_module_make_graph_moduler  ) r   r  r  r   ra   r	  r
  r  r  placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphdistributed_enabledrB   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr  saved_values_with_vc_checksaved_values_no_vc_checksaved_opaque_objectsr  	fwd_graphr  r  rI   r  rJ   _extract_fwd_bwd_modules  s   
	






	r  )static_lifetime_input_indicesra   _joint_inputsr   r  list[int] | Nonec                  s  g }d}| j jD ]}t|st|st|r|}q|du r!td| j jD ]}t|s0|| ||u r6 nq%tdd |D t	| }t
| }	|rft| j d dur`td t| |||dS t| dd	} tjsmt|  t|  t|  |du r{g }t| ||}
g }g }tj  d%dd}d%dd}d% fdd}| j jD ]}|jvrq|jdkr|jdd |  D v rq|jtjjjj tjj!j"j tjj!j#j tjj!j$j tjj!j$j%fv rqt&|r|| q||rq|j'(dt)j*kr|| q||r|r	td| d|j || q||s"|jdkr"td| dfdd|j+D }t,dd |D r<|-| qt.|sF|| qt/t01|2 }t/t01|2 }tj3rct3| j |}|du rk|
j4}t5| ||||d\}}|j j6t7d  |j j6t7d  |r|	rt8| ||t9|\}}t:|}tj;rd!d"l<m;} ||||| t=|}t=|}t>|d#d$}t9|
j?dkrt>|dd$}||fS )&a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Nzlast_node must not be Nonec                 s  s     | ]}|j d kr|jV  qdS r   Nr   r   r  rI   rI   rJ   re         z$default_partition.<locals>.<genexpr>r   zxTrying to unsafely apply AC to a non-functional graph with the default partitioner. Falling back to min-cut partitioner.)r   r  Tis_default_partitionrB   rC   rD   r9   c                 S  s    d| j v pt| j dtjjS )Nr"  r   )r   r   r   r   _subclasses
FakeTensorr   rI   rI   rJ   	is_tensor  s   z$default_partition.<locals>.is_tensorc                 S  s"   t dd | jD ot| jdkS )Nc                 s  s    | ]	}|j tjkV  qd S rF   )r   r   r   rc   r  rI   rI   rJ   re         z=default_partition.<locals>.is_multi_output.<locals>.<genexpr>r   )r  r{  r   r   rI   rI   rJ   is_multi_output  s   z*default_partition.<locals>.is_multi_outputc                   s.   | j ddo| jdvo  p| jtjjjjuS )NF)impure_random)r   r   )	is_impurer   r   r   r   r   r  r   r   )r  rI   rJ   r    s   z$default_partition.<locals>.is_impurer   c                 s  s    | ]\}}|V  qd S rF   rI   )rc   kvrI   rI   rJ   re         
r   z.Trying to apply AC on a graph with impure op: z, r   z	Expected z to be a tensorc                   s   g | ]	}|j  vr|qS rI   rr  rb   )forward_node_namesrI   rJ   r     rm  z%default_partition.<locals>.<listcomp>c                 s      | ]}t |V  qd S rF   r   rb   rI   rI   rJ   re         r  r   ra   )is_impure_noder%   enable_activation_offloadingFr   rQ   )@r   r   r   r3   r2   rv   r4   r   r#   r   r   r/   warningswarn#min_cut_rematerialization_partitioncleanup_recompute_tagsr   (unsafe_allow_optimization_of_collectivesforce_save_collectivesforce_save_effectful_opsforce_save_bw_mutation_srcclassify_nodesr   r  r  r   r   named_modulesr   r   r   _assert_scalarr   profiler_record_function_enter_new_record_function_enter_record_function_exit_RecordFunctionr   r   r   r$   	MUST_SAVEr{  r  r  r   rz  rd  fromkeysr|  _sync_decision_cross_ranksra   r  r   r   functionalize_rng_opsr   #reordering_to_mimic_autograd_enginer   ,_activation_offloading.activation_offloadingr8   thread_graphsafe_rng_from_hopsr\   )r   r  r   r  ra   forward_nodes	last_noderB   graph_has_recomputable_opsgraph_has_recomputable_rng_ops	node_infor  r  r  r  r  backward_usages	fw_module	bw_moduler   rI   )r  r  rJ   default_partitiony  s    















	



		r!  g    .ArE  c                 C  s
   | |j  S rF   )itemsize)rE  rN  rI   rI   rJ   _tensor_nbytesU  rq   r#  c                   s   ddd d| j v rP| j d }t|trdS t|ttfr(t fd	d
|D S t|tr:t fdd
| D S t|tj	rD |S t
dt| d|  | jdks^| jtjjjju r`dS t
d|  d)Nr   objectrD   rt   c                 S  s(   t | tjsdS tt|  dd| jS )Nr      fallback)r   r   r/  r#  r   rE  rN  r   rI   rI   rJ   object_nbytesZ  s   z_size_of.<locals>.object_nbytesr   r%   c                 3      | ]} |V  qd S rF   rI   rb   r(  rI   rJ   re   g  r  z_size_of.<locals>.<genexpr>c                 3  s    | ]	\}} |V  qd S rF   rI   )rc   r$  rd   r*  rI   rJ   re   i  r  zUnknown metadata type z	 on node r   r   ru   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)r   r$  rD   rt   )r   r   r   rz  r   sumrd  itemsr   r/  r   r   r   r   r   r   r  r   )rB   r   rI   r*  rJ   _size_ofY  s"   





r-  c                 C  sb   ddl m} |t}| jD ]}|jdkr||jj  d7  < qtdt	|
 tddd d S )Nr   )r   r   r%   %sTrk   reverse)collectionsr   rt   r   r   r   rR   r;   r\  rl   r,  r   
itemgetter)r   r   cntrB   rI   rI   rJ   
_count_opsv  s   

$r4  !list[torch._ops.OpOverloadPacket]c                  C  sl   g } t tjjD ]+}ttjj|}t|tjjsq| D ]}t||}tj	j
|jv r2| |  nqq| S rF   )dirr   r   r   r   r   r   OpOverloadPacket	overloadsr   	pointwiser   r   )r   	attr_nameopoverloadpacketoverloadop_overloadrI   rI   rJ   pointwise_ops  s   

r>  r   tuple[Any, ...]	depth_mapr_   list[tuple[fx.Node, int]]c                   s*    fdd| D }t | tdddS )Nc                   s&   i | ]}t |tjjjr| | qS rI   )r   r   r   rB   r   )rc   argr@  rI   rJ   r    s
    zsort_depths.<locals>.<dictcomp>r%   Tr/  )rl   r,  r   r2  )r   r@  
arg_depthsrI   rC  rJ   sort_depths  s   
rE  gmc           	        s0  t  i  | jjddD ]}| fdd |< qdd t| jjD d fdd}ttt	| jj}d}t
j}|D ]}|jD ]}| |k rS| }|}qEq@|du r[| S t| jjd|  D ]}|jdkr{|jtjjjju r{|| qgt| jj| d D ]}|| qtj | }|S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traversal, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r  c                   r   rF   rI   r   r   rI   rJ   rh     r   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>c                 S     i | ]\}}||qS rI   rI   rc   r  rB   rI   rI   rJ   r        z7reordering_to_mimic_autograd_engine.<locals>.<dictcomp>rB   rC   rD   r  c                   s   | g}t  }t|dkr)| } | |v s|  v rq||  || j7 }t|dkst|fddd}|D ]} |  fdd | < q5d S )Nr   c                   r   rF   rI   rf   )orderrI   rJ   rh     r   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>rj   c                   r   rF   rI   r   r   rI   rJ   rh     r   )r#   r   rx  r  all_input_nodesrl   r   )rB   	cur_nodesinsertable_nodesr   r   rJ  rI   rJ   insert_node_in_graph  s   

zAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graphNr   rB   rC   rD   r  )r   r   r   r  r   re  r   rz  r  r4   mathinfr{  r   r   r   r   r   r   r   GraphModule)	rF  rB   rO  r  first_node_in_bwdminimum_ordertangentr  new_gmrI   rN  rJ   r    s4   

r  r  torch.fx.GraphModuler   fw_nodebw_nodedevicetorch.device	rng_countlast_fwd_inputlast_bwd_input#tuple[torch.fx.Node, torch.fx.Node]c                 C  s  |j }|du rtd| j}	|j}
tjjj}| j| | jd| }t	||j
d< |}W d   n1 s9w   Y  |j| |jd| }t	||j
d< |}W d   n1 saw   Y  t|j}||d< | j| |	jd||jg|jR |d}W d   n1 sw   Y  || |	| t|j}||d< |
|$ |
jd||jg|jR |d}|| |
| W d   ||fS 1 sw   Y  ||fS )	a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nzdevice_idx must not be Nonefwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexrv   r   r   _prims	rng_primsgraphsafe_run_with_rng_stater&  r   r5   r   rd  r   create_noder   r   r  r  inserting_before)r  r   rY  rZ  r[  r]  r^  r_  
device_idxfw_graphbw_graphrh  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputrI   rI   rJ   %apply_graphsafe_rng_functionalization  sV   )








rt  num_sym_nodesc           '   
     s  t  }d#dd}d$d
d d%dd}|| }||}||}	i }
| jjD ]5}t|rZt|jdrZtjj	|jj
v rZ|j|vsC|j|	vrDq%||j }||j }|	|j }||d|
|< q%tjjj}tjjj}d }|jjddD ]}d|jv ry|} nqn|d u rtdg }tt|jjdd}tt|jjdd}t fdd|
 D }|td t|dk}tjj}tjo| o|j p|jj}t|
 D ]\}}|d }|d } |}|j}|j}|r|d ur|j dkrt!||||||||\}}q|"|M |j#d||jg|j$R |j%d}|j#dt&j'|dfi d}|||j(d < |j#dt&j'|dfi d} t))|j(| _(|*|  |+| |,| W d    n	1 sRw   Y  |"| d!t| }!|-|!}"|||"j(d < W d    n	1 s{w   Y  |"|# |j#d||"|jg|j$R |j%d} |*|  |+| W d    n	1 sw   Y  q|rtt.|jjd"d}#|#j$d }$t|$| }%|$d |% t/| |$|%d   }&|j0|& |j+|# |1  |1  ||fS )&Ngmodr   rD   r  c                 S  sF   i }| j jD ]}|jdkr t|jdr tjj|jjv r |||j	< q|S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )rv  random_nodesrB   rI   rI   rJ   get_rng_opsb  s   


z*functionalize_rng_ops.<locals>.get_rng_opsrB   rC   torch.device | Nonec                 S  s^   d| j vrdS | j d }t|ts|f}|D ]}t|tjr)|jjdkr)|j  S qtdS )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)r   r   r   r   r/  r[  r   )rB   
candidates	candidaterI   rI   rJ   
get_devicem  s   




z)functionalize_rng_ops.<locals>.get_devicer[  rD  c                 S  s   ddl m} | }|d u rtd|' | d ur,| jdkr,|tj W  d    S |t W  d    S 1 s=w   Y  d S )Nr   )detect_fake_modezfake_mode must not be Nonerz  )torch._guardsr  rv   r   from_tensorr   rz  get_rng_state)r[  r  	fake_moderI   rI   rJ   get_sample_rng_state  s   $z3functionalize_rng_ops.<locals>.get_sample_rng_stater   )fwdbwdr   r  rV  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc                 3  s    | ]	} |d  V  qdS )r  NrI   )rc   	node_pairr~  rI   rJ   re     s    
z(functionalize_rng_ops.<locals>.<genexpr>r{  r%   r  r  rz  r   rd  r   r   rng_state_output_r   )rv  r   rD   r  )rB   rC   rD   ry  )r[  ry  rD   rD  )2r  countr   r   r   r   r   r   r   r   r   r   rf  rg  run_and_save_rng_staterun_with_rng_stater  r   r  r  r#   valuesdiscardr[  r   rP  r   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomre  r   rt  rj  ri  r   r   r   r   r   copyr  r  r   r   r  r   r   	recompile)'r   r  r   ru  uidrx  r  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_maprB   	base_noderY  rZ  run_and_save_rngr  bw_tangent_start_nodefw_rng_state_outputsr^  r_  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationr]  r  r[  rl  rm  rq  staters  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   rI   r  rJ   r  G  s
  









	





$




r  c                 C  sB   | j jD ]}t|jtjjr|jjdkrt|st	j
|jd< qdS )z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r   r   N)r   r   r   r   r   r   r   r   r   r$   r  r   r   rB   rI   rI   rJ   r  +  s   r  c                   s@   d fdd | j jD ]}t|rt|st|s | qdS )	a\  
    Force save outputs from with_effects nodes wrapping effectful ops.

    Effectful ops (registered via _register_effectful_op) should not be recomputed
    because they may have arbitrary global side effects (I/O, RNG state, collectives,
    etc.). We mark the tensor outputs of with_effects as MUST_SAVE to prevent
    recomputation of the effectful op.

    The with_effects node returns a tuple (token, result). We recursively find all
    leaf outputs extracted via getitem and mark them as MUST_SAVE. Since these are
    saved, the with_effects op doesn't need to be recomputed in backward.
    rB   rC   rD   r  c                   sF   | j D ]}|jtju r  | t|jdttfs t	j
|jd< qd S )Nr   r   )r{  r   r   r   r   r   r   r   rz  r$   r  )rB   r  mark_getitem_outputsrI   rJ   r  H  s   
z6force_save_effectful_ops.<locals>.mark_getitem_outputsNrP  )r   r   r   r   r   r  rI   r  rJ   r  :  s   r  c                 C  s   t  }t| jjD ]6}|jdkrq	|jtjjj	j
u }|r=t|r(||jd  t|r<|jd |v r<tj|jd jd< q	 d S d S )Nr   r   r%   r   )r#   r  r   r   r   r   r   r   r   r   r   r   r  r   r   r$   r  r   )r   has_mutation_in_bwrB   is_copy_rI   rI   rJ   r	  X  s   
r	  c                 C  sN   | j tjkrdS | jd }t|tjurtdt| d|jvo&| j	dkS )NFr   z#expected parent to be fx.Node, got r"  r   )
r   r   r   r   r   r   r   rv   r   r   )rB   parentrI   rI   rJ   is_getitem_of_multi_outputo  s   
r  r  c                C  s   | j jD ]e}t|rF|jD ] }t|r-d|jv r-d|jv r-|jd |jd kr-tj|jd< q|jddrEtdd |jD sEtj|jd< qd|jvritdd |jD rit	|rad|j
d jv si|ritj|jd< q| S )	a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idr   has_backward_hookFc                 s  r  rF   r   r  rI   rI   rJ   re     r  z)cleanup_recompute_tags.<locals>.<genexpr>c                 s  r  rF   r  r  rI   rI   rJ   re     r  r   )r   r   r   r{  r   r$   r  r   r   r  r   )r   r  rB   r  rI   rI   rJ   r  x  s4   	



r  r  min_cut_optionsdont_ban)tuple[list[fx.Node], OrderedSet[fx.Node]]c           <        sB	  d u rt  t tr(t dd | jD }|t dd jD  }td| dpd
ddpdddpfddzdd lW n tyR } zt	d|d }~ww dqfdddr	fdd}dqfdddsfdd}
 t   dtdu fd"d#}	| jD ]}
|
jd$krq|
jv r|
jvrj|
jd% d&tjd'd( nj|
jd) d&tjd*d( qt|
rɈj|
jd) d&tjd+d( qt|
r|	|
d, n	t|
r|	|
d- ||
}|
r|r|	|
| d.|
jvod/|
jvpd.|
jv ot|
jd. tj }t|
rtt|
}d }n"|r/t|
jd.ttfr)d0}d }ntj}d1}n||
j \}}|rX|tjksE|t!krXj|
jd) |
jd% |d2| d( nj|
jd) |
jd% |d3 |
j"D ]}j|
jd% |jd) tjd4d( qiqdvfd9d:}	j#rj$D ]]}fd;d<|j"D }fd=d<|j"D }t%|dkr||t&|}t'|j"D ]2}|r(||kr||r| v rӐqtd>|(|||(| |	| qq	j)rwt  }| jD ]}|sq(||fg}(|}t%|dkrut*+|\}}||v r!q|,| (||d? krKt%|dkrKtd@||(|(| |	| n*|j"D ]}|rl||rl| vrlt*-|(||f qNt%|dksqz.dAd&\}}W n j/y } zut0t1j2}d }d | j3} z>| r| j4dBdCdCdDnt5| t6dEdFdG fdHdGdI t7dJdK}t8|dL}!|!9 W d    n	1 sw   Y  W n t:y } zdM| dN}W Y d }~nd }~ww dO;j<j=>t6dEdPdG fdQdGdI t?}"|"ri }#dAg}$dwdSdT}%|"D ]_\}&}'}(|$@|' |&dAkr:|%|'})|#A|)g @|( q|'d&krN|%|&})|#A|)g @|( q|%|&|%|'krf|%|&})|#A|)g @|( q|%|&}*|%|'}+|#A|+g @dU|*  qg },|#B D ]\}-}.|,@dV|- dW |.D ]}/|,@dX|/  qqdO;|,}0dY;|$}1tC\}2rt6dEdZdG fd[dGdI |rd\| dOnd}3|2r|3d]|2 dO7 }3d}4|rd^}4t	d_|0 d`|1 dO|3 |4 |tda tdbtDfdcdG tC  d }~w t:y%   tda tdbtDfdddG tC  w |\}5t  }6fded|5D D ]\}7|6Efdfd|7D  q6t  }8|6D ]/\}9}:|9d dg |:d dh krqtFdi|9d dg  dj|:d dh  |9d dg }-|8,|- qNtG| 
dkdl tH| jD tI
fdmd|8D fdndGdo};|; fS )xNc                 s  s2    | ]}|j d krt|jdrt|jjV  qdS )r   _overloadpacketN)r   r   r   r   r  r  rI   rI   rJ   re     s    
z solve_min_cut.<locals>.<genexpr>c                 s  r  rF   )r   r  rI   rI   rJ   re     r  z&Ops banned from re-materialization: %sarC   brD   r9   c                 S  sn   |j tjjjkr
dS |jd }tjj|\}}|D ]}|j	| }| |u r( dS t
|tr4| |v r4 dS qdS NFr   T)r   r   r   r   auto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   rz  )r  r  
mutable_opmutable_arg_namesr$  r   rB  rI   rI   rJ   !can_fuse_into_auto_functionalized  s$   


z8solve_min_cut.<locals>.can_fuse_into_auto_functionalizedc                 S  s\   |j tjjjkr
dS |jd }|D ]}|jd }|d u r td|| }| |u r+ dS qdS )NFtensors_to_cloner   zkwargs must not be NoneT)r   r   r   r   r   r   rv   )r  r  r  r   r   rB  rI   rI   rJ   .can_fuse_into_triton_kernel_wrapper_functional  s   

zEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functionalc                   sh   t |tjkr	dS  | |rdS | |rdS | jtju r*| jd jtjj	j
u r*dS | o3|S )NTr   F)r7   r   catr   r   r   r   r   r   r   r   rK   )r  r  )r  r  op_typesrI   rJ   rK     s   


z!solve_min_cut.<locals>.is_fusibler   zANeed networkx installed to perform smart recomputation heuristicsrB   c                   sv    | rdS t| g}t|dkr9| }|jD ]}|s( ||s( dS  |r2|| qt|dksdS r  )rO   r#   r   rx  r{  rp   r  )rB   rL  curr  )rK   r  r  rI   rJ   is_materialized_backwards   s   




z0solve_min_cut.<locals>.is_materialized_backwardsr   c                   s  | j dkrdS | jtju rdS | jddtjkrdS tj	r%
| r%dS | jtjjtjjfv r2dS jr=| s<dS n| rDdS | rKdS t| rQdS jrd | rdtd	| t| j d
S | jdk rq| jtjkrqdS jrtdd | jD }t| }|d |k rdS dS )zRReturns reason string if node should be banned from recomputation, None otherwise.r   Nr   zmarked MUST_SAVEznot in recomputable allowlistz	random opzcompute intensive opznon-builtin opzmaterialized backwards: %s %szmaterialized in backwardi  ztoo far from backwardc                 s  s$    | ]}t |tjrt|V  qd S rF   )r   r   r   r-  r  rI   rI   rJ   re   A  s    
zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>r   zreduction op)r   r   r   r   r   r   r$   r  r   recompute_viewsrO   r   lift_fresh_copyr   
lift_freshr~   rP   rN   rM   r   r}   r;   r  r   r{  dist_from_bwmax_dist_from_bwr   r+  r   r-  )rB   input_tensors_sizeoutput_size)r  r  r  rI   rJ   should_ban_recomputation  sF   



z/solve_min_cut.<locals>.should_ban_recomputationc                   s*    j dkrdS t fdd jD  S )Nr   Tc                 3  s    | ]} |V  qd S rF   rI   r  )rK   rB   rI   rJ   re   M  s    z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>)r   r  r{  r   )rK   r   rJ   is_materializedI  s   
z&solve_min_cut.<locals>.is_materializedra   rZ   tuple[float, str | None]c                   s   t jr	| |v r	dS t| }t jr| rtjdfS t| jd t	r/t| jd t
js/tdfS t|dtt| jdd  } | rF|dfS |d	 dfS )
zReturns (weight, cannot_save_reason).

        cannot_save_reason is None for finite weights, or a string explaining
        why the node cannot be saved for infinite weights.
        r  zview op (recompute_views=True)r   z$SymFloat (non-SymInt symbolic value)g?rT  r%   N   )r    treat_parameters_as_free_to_saver-  r  rO   rQ  rR  r   r   r   r   r   INT_INFrt   r  r   r  )rB   ra   mem_sz)r  r  rI   rJ   get_node_weightO  s    	
z&solve_min_cut.<locals>.get_node_weightrq  reasonr   c                   s    | rdS | v r t| jtjjo| jjdk}tjs|s dS t	| r&dS d| j
v r6t| j
d tjr6dS  |  jd| jd tj|rKd| ndd d	S )
NFr   r   r   _inzcannot recompute: zcannot recomputecapacityr  T)rO   r   r   r   r   r   r   r   r  r   r   r   r  add_edger   rQ  rR  )rB   r  is_collective)banned_nodesr  nx_graphr  rI   rJ   ban_recomputation_if_allowedz  s(   



z3solve_min_cut.<locals>.ban_recomputation_if_allowedr   _outsinkz;must be available for backward: input required for gradientr  r  z3must be computed in backward: required for gradientz+must recompute: marked by checkpoint policyzprimal inputzforward RNG seedr   r"          znon-tensor outputzcannot save: )r  zdata dependencystart_nodesrX   	max_rangert   c           	        s   g }| D ]}t |||df qt|dkrVt |\}}}|s(|S |jD ]$}|rO||kr:q+|| ||f}||vrOt || q+t|dks|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushrw   r   heappopr{  rp   )	r  r  sorted_nodesrd   r$  rB   node_is_fusibler  r   )rK   r  rI   rJ   find_first_unfusible	  s(   


z+solve_min_cut.<locals>.find_first_unfusiblec                   s    g | ]}  |r |qS rI   )rp   rw   r  r  rI   rJ   r   	  s    z!solve_min_cut.<locals>.<listcomp>c                   s   g | ]	}  |r|qS rI   )rp   r  r  rI   rJ   r   $	  s
    
z1used above/below fusible %s:(%s) -> %s -> %s:(%s)rT  ztoo long %s %s %s %sr   FTr  r  c                   S  r  )Nmin_cut_failed_fx_graphr  r  rI   rI   rI   rI   rJ   rh   	  r  zsolve_min_cut.<locals>.<lambda>c                         S rF   rI   rI   )fx_graph_strrI   rJ   rh   	      r  min_cut_failed_graphz.txtwz(failed to write: )
c                   S  r  )Nmin_cut_failed_edge_listr  r  rI   rI   rI   rI   rJ   rh   	  r  c                     r  rF   rI   rI   )edge_list_strrI   rJ   rh   	  r  	node_namec                 S  s.   dD ]}|  |r| d t|    S q| S )N)r  r  )endswithr   )r  suffixrI   rI   rJ   get_base_name	  s
   
z$solve_min_cut.<locals>.get_base_namezdepends on z  :z    - z -> c                   S  r  )Nmin_cut_failed_svgr  r  rI   rI   rI   rI   rJ   rh   	  r  c                     r  rF   rI   rI   )svg_contentrI   rJ   rh   	  r  zFX graph dump: zMin-cut graph visualization: z[Production debugging: Use tlparse to extract debug artifacts (min_cut_failed_fx_graph, min_cut_failed_edge_list, min_cut_failed_svg)]
a  AOT Autograd failed to partition the joint forward-backward graph.

The partitioner determines which intermediate values to save from the forward pass vs recompute in the backward pass. This error means a value is required for backward, but cannot be saved AND cannot be recomputed.

This is a bug in PyTorch. Please file an issue at https://github.com/pytorch/pytorch/issues

Nodes involved in the conflict:
z

[For PyTorch developers: one of the above constraints is wrong. Either the node should be recomputable, saveable, or not required for backward.]

[Debug: min-cut path] z-Failed to compute min-cut on following graph:r.  c                        d  jjS Nr  join	readwriteedgelistgenerate_edgelistrI   nxr  rI   rJ   rh   	  r  c                     r  r  r  rI   r  rI   rJ   rh   	
  r  c                 3  s    | ]	}| | fV  qd S rF   rI   rb   )r  rI   rJ   re   
  r  c                 3  s     | ]}| v r|fV  qd S rF   rI   )rc   r  )non_reachableurI   rJ   re   
  s    znode_in[:-3]=z != node_out[:-4]=c                 S  rG  rI   rI   rH  rI   rI   rJ   r  
  rI  z!solve_min_cut.<locals>.<dictcomp>c                 3  s    | ]} | V  qd S rF   rI   r  name_to_noderI   rJ   re   !
  r  c                   r   rF   rI   r   )node_idxrI   rJ   rh   !
  r   rj   )r  rC   r  rC   rD   r9   rQ   )rB   rC   rD   r   )rB   rC   ra   rZ   rD   r  )rq  )rB   rC   r  r   rD   r9   )r  rX   r  rt   rD   rt   )r  r   rD   r   )Jr#   get_default_op_listr:   r   rA   r;   r\  networkxImportErrorr   DiGraphr   r\   r]   r  r   rQ  rR  r   r3   r2   rp   r   r   r   r/  r   r  r   r   r   r   ra   r  r{  r{   rm   r   r  r   rw   r|   r  r  r  r  minimum_cutNetworkXUnboundedr9   r   handlersowning_moduler  r   r   _get_unique_pathopenwrite	Exceptionr  r  r   r  _find_infinite_capacity_pathr   
setdefaultr,  visualize_min_cut_graphr   r  rv   get_name_to_nodere  rl   )<r   r  r  r  joint_module_opsops_ignoreder  r  r  rB   
ban_reasonis_non_tensor_nodeweightcannot_save_reasonr  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr$  r  	cut_value	partitionunbounded_excstructured_tracing_enabledfx_graph_filer   finf_pathnode_constraintsraw_path_nodesr  	from_nodeto_noder  base	from_baseto_baseconstraint_linesr  constraintscconstraints_strraw_path_strsvg_pathlocal_files_msgtlparse_msg	reachablecutsetnbrs	cut_nodesnode_innode_outr  rI   )r  r  r  r  r  r  rK   r  r  r  r	  r
  r  r  r  r  r  r  r  rJ   solve_min_cut  sB  



;(
 




















	











rF  r  nx.DiGraph[str, dict[str, Any]]!list[tuple[str, str, str]] | Nonec                 C  s   t dg}tdg fg}|r\| \}}| |D ]@}||v r q| | | }|dd}|tjks5|tkrY|dd}|||f}	||	g }
|dkrM|
  S || |	||
f q|sdS )zBFS from source to sink following only infinite-capacity edges.

    Returns a list of (from_node, to_node, reason) tuples representing the path,
    or None if no such path exists.
    r   r  r   r  unknownr  N)
r#   r   popleft
successorsr   rQ  rR  r  r  r   )r  r&  queuerB   	edge_pathneighbor	edge_datar  r  new_edgenew_pathrI   rI   rJ   r  &
  s(   
	


r  	base_name	extensionc                 C  sn   |  | }t j|s|S d}t j|  d| | r.|d7 }t j|  d| | s|  d| | S )zGet a unique file path, appending a counter if the file already exists.

    For example, if "min_cut_failed.svg" exists, returns "min_cut_failed_1.svg".
    r%   r$  )ospathexists)rR  rS  rU  counterrI   rI   rJ   r  G
  s   r  tuple[str | None, str | None]c           
      C  s   ddl }zddl}W n ty   tjddd Y dS w |j|  }||d }|	 D ] }| |
  |  d }|t| |tdkrO|d	 q/| d
}tdd}t|d}	|	| W d   ||fS 1 stw   Y  ||fS )zVisualize the min-cut graph to an SVG file.

    Returns (path_to_svg, svg_content) tuple. Both are None if pydot is unavailable.
    r   NzMInstall pydot to visualize the min-cut graph for debugging: pip install pydotT)exc_info)NNr  rR  redutf-8min_cut_failed.svgr  )r  pydotr  r;   r\  nx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_color
create_svgdecoder  r  r  )
r  r  r^  
dot_format	dot_graphedger   r  r=  r/  rI   rI   rJ   r  V
  s4   


r  c                  C  s  g t jt jt jt jt jt jt jt jt j	t j
t jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt j t j!t j"t j#t j$t j%t j&t j't j(t j)t j*t j+t j,t j-t j.t j/t j0t j1t j2t j3t j4t j5t j6t j7t j8t j9t j:t j;t j<t j=t j>t j?t j@t jAt jBt jCt jDt jEt jFtGjHt jIt jJt jKt jL} t jIt jJt jMg}|t jNt jOt jPtQjRt jSt jTt jUt jVt jWg	7 }|}| g tQjtQjXt jYt jLt jZtQj[tQj@t j[t j\tQjRt jVt j]t jNt jSt jOt j^t j_t j`t jat jbt jct jdt jet jft jgt jht jit jTt jjt jkt jlt jmt jntQjotQjp7 } | t jqt jrg7 } | |7 } | ts 7 } | t jtg7 } | dd tuD 7 } tv| }tvtwdtxf  t jyt jzt j{g}t j|t j}t j~t jt jt jt jt jt jt jt jg}||B }t|tv||tv||S )Nc                 S     g | ]}t |qS rI   )r   )rc   mrI   rI   rJ   r     r  z'get_default_op_list.<locals>.<listcomp>.)r   r  r  ry  atan2r.  r  r   pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltr'  bitwise_notceilfloorfracnegreluroundsilutruncr;   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrtr-  sigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr+  mean_grad_sum_to_sizesum_to_sizer(  totype_asr   r   squeeze	unsqueezersub_to_copyaliasr   slicetr)  broadcast_in_dimexpand
as_stridedpermuteselectrM  r*  clone	full_likevarstd_unsafe_viewr   broadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota'_low_memory_max_pool_offsets_to_indicesre  gatherr>  
zeros_liker   r#   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr<   )default_recomputable_opsrecomputable_view_opsr@   rA   r?   r>   r=   rI   rI   rJ   r  }
  s  	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKM	
 !"#&
r  c                 C  s   i }| j D ]}|||j< q|S rF   )r   r   )r   r	  rB   rI   rI   rJ   r  %  s   
r  memorylist[float]runtimes
max_memoryall_recomputable_banned_nodes"tuple[float, list[int], list[int]]c           
      C  s   t j}|dkrt|||S |dkrt|||S |dkr!t|||S |dkr+t|||S |dkrKtd tj	| |||d}t||t
|djt|d	S t|tr_||| |||\}}	d
||	fS td| )Ngreedyilpdpr(   dynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   r   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr)   r*   r'   r(   r;   warningr&   inialize_from_graphr+   get_knee_point_memory_budgetr   r   r   )
r   r  r  r  r  r  SOLVERr  saved_node_idxrecomp_node_idxrI   rI   rJ   #_optimize_runtime_with_given_memory,  sH   



r  no_dispatchr   r'  c                   sN   t | j}d fddfdd|D }fd	d|  D }| j||d
S )Ndtorch.SymInt | intrD   rt   c                   s   t |  dS )Nr&  )r   )r  r&  rI   rJ   realize_symbolc  s   z8_remove_symbols_without_guarding.<locals>.realize_symbolc                      g | ]} |qS rI   rI   rc   r  r  rI   rJ   r   f  r  z4_remove_symbols_without_guarding.<locals>.<listcomp>c                   r  rI   rI   r  r  rI   rJ   r   g  r  )stride)r  r  rD   rt   )rz  shaper  new_empty_strided)r   r'  r  r  rI   )r'  r  rJ    _remove_symbols_without_guarding`  s
   
r  c                   s$  t j}ddd}|dkrdS |dkrFt ' d	d
lm} t|jjf\ |	 fdd}|W  d    S 1 s?w   Y  d S |dkrd	dl
m} t|jjf\ |dd}j i  W d    n1 stw   Y  | }t|dS t|tr|S td| )Nr   r   rD   c                 S  s   t | tjrt | jd tjrt| jd ddS t | tjr0t | jd tjr0t| jd ddS t | tjrAt | jd tj	rAdS t | tjrRt | jd tj
rRdS | S )Nr   r%  r&        ?T)r   r   r   r   r   r/  r  r   r   r   r   r   rI   rI   rJ   materialize_argn  s   z)estimate_runtime.<locals>.materialize_argtestingr%   profiler   )benchmarkerc                     s   j  i S rF   )r   rI   r   r   rB   rI   rJ   rh         z"estimate_runtime.<locals>.<lambda>flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   r   rD   r   )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsr  r   r   r   )rB   RUNTIME_MODEr  r  msr  modecounted_flopsrI   r  rJ   estimate_runtimek  s,   
$

r  memory_budgetc                   sF  |dks|dk rt d| ttjtjtjtjtjd}tjr)t	|ddddd}|dkr0j
S t|\}}|dkr>|S dJddj
|		krR|S dK	fdddL	fddt	|dddd}t|\}}||k r{|S t	|dd t \}}	||k r|S ddlm tfddj
D dMfdd}
|
|	}dd |D fdd|D }t|tdd tdkrшj
 S fd!dD 
d"d D dd#lm dN 
fd*d+tjrdOfd.d/}|d0|d1g}|d dd  |d dd  krz|d |d fg}|rz| \}}|d |d  d2k rC|| || q#||d |d  d3 }|dd  |dd  krc|||f |dd  |dd  krw|||f |s&|  dd lm} d4d |D }d5d |D }|jd6d7 |j||d8d9 t|D ]\}}|j|d:||| fd;d<d=d> q|d? |d@ | dA |!d |" }|#  t$% }tj&d urtj&}t$j'|ddB dC}t(j)* r t(j)+ r dDt(j),  }t$j-.|dE| dFt/  dG}|0| t12dH| |dId S )PNr%   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )r{   r|   r}   r~   r   F)r{   r|   r}   r~   r  rX   rD   r  c                 S  s   t tt| d S N    eA)r+  mapr-  )r  rI   rI   rJ   estimate_activations_size  r   z:choose_saved_values_set.<locals>.estimate_activations_sizeszc                   s   | d    S r  rI   )r  )max_act_sizemin_act_sizerI   rJ   get_normalized_size  s   z4choose_saved_values_set.<locals>.get_normalized_sizeactivationsc                   s    |    S rF   rI   )r  )r  r  r  rI   rJ   get_mem_ratio  s   
z.choose_saved_values_set.<locals>.get_mem_ratio)r{   r|   r}   )r~   get_node_storagec                 3  r)  rF   rI   r  r   rI   rJ   re     r  z*choose_saved_values_set.<locals>.<genexpr>r  rZ   c                   s    fdd| D S )Nc                   s2   g | ]}|j td k r |vst|r|qS )r  )r  rt   r   r  r!  input_storagesrI   rJ   r     s    zRchoose_saved_values_set.<locals>.get_recomputable_banned_nodes.<locals>.<listcomp>rI   )r  r"  rI   rJ   get_recomputable_banned_nodes  s   z>choose_saved_values_set.<locals>.get_recomputable_banned_nodesc                 S  s$   g | ]}|j d dtjkr|qS )r   F)r   r   r$   r  r  rI   rI   rJ   r     s
    z+choose_saved_values_set.<locals>.<listcomp>c                   s   g | ]}| vr|qS rI   rI   r  )must_save_nodesrI   rJ   r     s    Tr/  c                   s   g | ]} t |qS rI   r-  r  )r  rI   rJ   r     s    c                 S  rm  rI   )r  r  rI   rI   rJ   r     r  r  r  r  rW   r   r   tuple[list[fx.Node], float]c           
        s     t |t| d|\}}}W d    n1 sw   Y  t }|D ]}z	||  W q' ty;   Y q'w |sEtdt|| |\}}	trbt	||||dd D |d	 ||fS )Nr   z:dont_ban must be a subset of all_recomputable_banned_nodesc                 S  rm  rI   r&  r  rI   rI   rJ   r   >  r  zNchoose_saved_values_set.<locals>.get_saved_values_knapsack.<locals>.<listcomp>)	r   r  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodes normalized_memories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)
r  r  r#   r  BaseExceptionissubsetrv   rF  r:   r   )
r  r  r   r*  r(  r)  r  r  r  r$  )aggressive_optionsr  r+  r  r-  rI   rJ   get_saved_values_knapsack  s\   
z:choose_saved_values_set.<locals>.get_saved_values_knapsackr  tuple[float, float, float]c                   s(   | d\}}| t |  |fS )N)r  r   )r+  )r  r  r*  )r  r2  r   r  r-  rI   rJ   estimate_for_budgetI  s   

z4choose_saved_values_set.<locals>.estimate_for_budgetr  r   gMbP?r  c                 S     g | ]}|d  qS )r  rI   rc   itemrI   rI   rJ   r   f  r  c                 S  r5  r%   rI   r6  rI   rI   rJ   r   g  r  )
      )figsizeo)markerz.4fzoffset points)r   r9  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okrq  _rank_memory_budget_paretor$  r]  z%Generated Pareto frontier curve at %s)r  r  r   )r  rX   rD   r  )r  r  rD   r  )r  rX   rD   r  )r  rZ   rD   rX   )r  r  r  rW   r   r   rD   r'  )r  r  rD   r3  )3r   rz   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rY   rF  torch._inductor.fx_utilsr!  r#   rl   r-  r   torch.utils._mode_utilsr  visualize_memory_budget_paretorx  r   sortmatplotlib.pyplotpyplotfigureplotre  annotatexlabelylabeltitlegridgcfshowrT  getcwdmemory_budget_pareto_dirmakedirsr   r  r  is_initializedget_rankrU  r  r0   savefigr;   r  )r   r  r  r  runtime_optimized_saved_valuesr$  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r$  recomputable_banned_nodesr4  optionsbisectslhsrhsmidpltx_valuesy_valuesr  txtfigfig_dirrank_suffixfig_namerI   )r1  r  r  r  r!  r  r2  r#  r   r  r+  r  r%  r  r  r-  rJ   choose_saved_values_set  s  




3
"








rr  list[torch.fx.Node]c              	     s  ddl m ddd}dfd	d
}tj rtj rtj dkr|| r|| rt    dd |D g}dd ttj D }tj	||d  t
|  g }i }t|D ]1\}}	 fdd|	D }
d}|
D ]}t|}||7 }|tj kr|||j< qm||d< || q\tj|tjj d}tjj|tjjjjd tt| }d| d| tddd fddd  fdd|| D }W d    n1 sw   Y  W d    |S W d    |S 1 sw   Y  |S )Nr   unset_fake_temporarilyr   r  rD   r9   c                 S  s2   | j D ]}t|jtjjr|jjdv r dS qdS )N>   c10d_functionalr   TF)r   r   r   r   r   r   r   )r   rB   rI   rI   rJ   has_collectives  s   

z3_sync_decision_cross_ranks.<locals>.has_collectivesc              	     s   d dd | jD }t|d }dd ttj	 D  t
 "   tj | W d    n1 s:w   Y  W d    n1 sIw   Y  t fdd D S )N/c                 s  r  rF   rr  r   rI   rI   rJ   re     r  zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>r[  c                 S  s   g | ]}d qS rF   rI   rc   r$  rI   rI   rJ   r     r  zF_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<listcomp>c                 3  s    | ]	} d  |kV  qdS r  rI   r   
all_inputsrI   rJ   re     r  )r  r   hashlibsha256encode	hexdigestr  r   r  get_world_sizer  all_gather_objectr  )r   node_strrY   rt  rz  rJ   has_same_nodes  s    z2_sync_decision_cross_ranks.<locals>.has_same_nodesr%   c                 S  r  rI   rr  r   rI   rI   rJ   r     r  z._sync_decision_cross_ranks.<locals>.<listcomp>c                 S  s   g | ]}g qS rI   rI   ry  rI   rI   rJ   r     s    c                      g | ]} | qS rI   rI   )rc   op_namer  rI   rJ   r     r  z
total size)r[  r  zpicked_rank_idx=z, saved_nodes of current rank=r  c                   S  r  )N)aot_joint_graph_sync_decision_cross_ranksr  r  rI   rI   rI   rI   rJ   rh     r  z,_sync_decision_cross_ranks.<locals>.<lambda>c                     r  rF   rI   rI   )sync_decision_cross_ranks_strrI   rJ   rh     r  r  c                   r  rI   rI   rb   r  rI   rJ   r     r  )r   r  rD   r9   )torch._subclasses.fake_tensorru  r   r  r  r]  r  r  r  r  r  re  r-  r^  r   r   rC  distributed_c10d_get_object_coll_device
all_reduceReduceOpMAXrt   argminr7  r   )r   r  rw  r  objectssaved_ops_names_all_rankssaved_sizessaved_ops_with_sizesr  saved_ops_namessaved_nodes
saved_sizerB   size_of_nodesaved_sizes_tensorpicked_rank_idxrI   )r	  r  ru  rJ   r    sn   




	(,,,r  moduler   c              
   C  s  d}|rdnd}t t| jjdd}| jjdtjjjdD ]}t| |j	d j
}t|tjrg }|jjddD ];}||jv rs| j|& | j| d| }	|d	7 }|jd
 |	jd
< |	}||	 W d   n1 snw   Y  q8|r| j|! | jdtjjjg |j	|R i }
|j|
dd W d   n1 sw   Y  |jd}|r|\}}g |dd |D R }||f|
jd< | j| q| S )u  
    Graph-safe RNG lets torch.compile use CUDA Graphs for graphs with RNG ops.
    For graphs without HOPs, the partitioner adds placeholder nodes
    fwd_rng_state_* and bw_rng_state_* to the forward and backward graphs. At
    runtime, the AOTDispatcher retrieves these RNG states and passes them to the
    compiled graphs.

    This works well for no-HOP graphs. With HOPs, the partitioner runs
    recursively: it first partitions the HOP (producing forward/backward HOP
    subgraphs) and then stitches them back into the outer joint graph. For HOPs
    that contain RNG ops, the outer joint graph now includes HOP subgraph
    modules with extra RNG placeholders. We must thread these placeholders
    through the outer module partitioned forward and backward graphs—this
    function does exactly that. It collects the RNG placeholder nodes from the
    HOPs and creates corresponding placeholders in the outer forward and
    backward graphs.

    There is a catch: for a short period, the joint graph is in a “bad” state.
    The HOP subgraphs expect additional inputs (because of the new
    placeholders), but the outer graph call sites don't yet provide them. We
    can't fix this in the joint graph because the joint graph's input signature
    is fixed (primals, tangents). As a compromise, we keep the joint graph in
    somewhat of a bad state for some time and, once the outer forward and
    backward graphs are partitioned, insert the corresponding RNG placeholders
    and wire up the calls.
    r   ro  rn  r   r  r   )r   r   r$  r%   r   NT)propagate_metaeager_input_valsc                 S  s   g | ]}|j d  qS )r   )r   )rc   inprI   rI   rJ   r   -  rI  z2thread_graphsafe_rng_from_hops.<locals>.<listcomp>)r  r  r   r  r   r   r   invoke_subgraphr   r   r   r   r   rS  r   r&  r   r   r   ri  r  r   r  )r  r   r]  
rng_string
last_inputhop_noder   new_rng_inputsplaceholder_noderc  new_hop_node_with_fixed_args
eager_vals
eager_argseager_kwargsnew_eager_argsrI   rI   rJ   r    s^   


	
r  	list[int]c                   sb  t | j t | jjD ]%}|jdkrd|jv r| n	t|r'| |v r1|j	 qt
tt| jj}t
tt| jj}|| }t| |d\}}}	}
 }dd |D  t| j|||	d}t fdd|jD tfdd| jjD }tfd	dt|D }d
}i }| jjD ]}|v r|||< |d7 }qt|||||S )Nr   tangentsr  c                 s  s&    | ]}|d ur|j dkr|V  qd S )Nr   r  )rc   r<  rI   rI   rJ   re   U  s    z!classify_nodes.<locals>.<genexpr>r   c                 3  s$    | ]}|j d kr |j V  qdS r  r  r  r  rI   rJ   re   [  s    
c                 3  s$    | ]}|vr| vr|V  qd S rF   rI   r  )r\   rm   rI   rJ   re   `  s    c                 3  s     | ]\}}| v r|V  qd S rF   rI   )rc   r  pr  rI   rJ   re   e  r  r   r%   )r  r   r#   r   r   r   r  r   r  r{  rz  r  r3   r2   r  r  r   re  rW   )r   r  r   rB   r  r  rY   r	  r
  r  r  r]   forward_only_graphr^   ra   fw_cntr`   rI   )r	  r\   rm   r  rJ   r
  8  s^   



	
r
  rb  r  compilerc                C  s  | j   |   | j }tjrt|}|| _ | j }t| }t| }	|r)t| dd} tj	s0t
|  t|  t|  |du r>g }t| ||}
t|
jdkrUt| ||||
jdS t| j jD ]+}|jdkrhtd|_q[|
|sqd|_q[td|_|jD ]}t|j|jd |_qyq[tj}|jD ]}t|jd	dtr|jd	 } nqt||
|d
}tj rt ||}t!t"t#|}t!t"dd |}t$| ||||
jd\}}|r|	rt%| ||t|\}}t&|}tj'rddl(m'} |||||
j t)|}t)|}t*|dd}t*|dd}t+rt,dd |D }t-dd |D d }t./d| t./d| t0dd |j jD }t0dd |j jD }||@ }t1t}|j jD ]}|j2|v rdt3|j4drd|t5|j4j6  d7  < qIt./dt|t|t| t,|7 t89ddd}t./d| ||fS )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Fr  Nr   )r   r  ra   r   r  r%   r  )r  c                 S  s
   t |  S rF   r  rf   rI   rI   rJ   rh     ri   z5min_cut_rematerialization_partition.<locals>.<lambda>r  r  r  Tc                 S  s   g | ]
}t |t|fqS rI   )r-  r   r  rI   rI   rJ   r     s    z7min_cut_rematerialization_partition.<locals>.<listcomp>c                 s  r  rF   r&  r  rI   rI   rJ   re     r  z6min_cut_rematerialization_partition.<locals>.<genexpr>z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc                 s       | ]}|j d kr|jV  qdS r   Nr  r  rI   rI   rJ   re     r  c                 s  r  r  r  r  rI   rI   rJ   re     r  r  z# remat/fw/bw: %d/%d/%dr/  zCount of Ops Rematerialized: %s):r   r   r  r   cser6   r   r   r  r  r  r  r	  r
  r   r\   r!  ra   r  r   r   rt   r  rp   r{  r   activation_memory_budgetr   r   r   r  rr  r  rz  r  r   r  r  r  r   r  r8   r  r:   rl   r+  r;   r\  r#   r   r   r   r   r   r  r,  r   r2  )r   r  r  r   r  r   	cse_graphr   r  r  r  rB   r  r  r  r  r  r   r   sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opsrI   rI   rJ   r  y  s   
"







	r  fx_graphTtracedfnamefigname
clear_metaprogstr | list[str] | Noneparse_stack_tracedot_graph_shapec                 C  s   |rt | j}t| |} | jjD ]}i |_qtj	|\}	}
|
s'dt
j }
td|	|
 tj| |||d}| }t|d|
d }|	 |
 }|d u rU|| d S |||d d S )NrL  zWriting FX graph to file: %s%s)r  r  write_)r  )r  deepcopyr   r   rS  r   r   rT  rU  splitextr   torch_compile_graph_formatr;   r\  r"   FxGraphDrawerget_main_dot_graphr   lstrip)r  r  r  r  r  r  r  r   rB   r5  extgr   write_methodrI   rI   rJ   
draw_graph%  s*   	
r  rQ   )r   r   rD   r9   )rB   rC   rD   rt   )r   rC   rD   r   )rB   rC   rD   r   )rB   rC   r   r   rD   r   )NF)r   r   rY   rX   r   rX   r   r   r   r   r   r9   rD   r   )r   r   r   rt   rD   r   )r  rX   r   r   rD   r  )r  r  rD   rt   )r  r  r   )r   r  rB   r  r  r  r   r  r!  rt   rD   r  )r   r  rB   r  r7  r  r9  r:  r,  r  r;  r  r!  rt   rD   r  )rC  rD  rD   r  )rD   rI  )rB   r  rD   r9   )rD   r:  )rN  r:  rD   rZ  )r   r  rD   r  )r  r   r  r   r  r  rD   r  rF   )
r  rX   r  r   r  r   ra   r  rD   r  )r   r   r  rX   r  rX   r   rt   ra   r  rD   r  )r   r   r  r   r   rt   r  r  ra   r  rD   r  )rE  rt   rN  r:  rD   rt   )r   r   rD   r  )rD   r5  )r   r?  r@  r_   rD   rA  )rF  r   rD   r   )r  rX  r   rX  rY  r  rZ  r  r[  r\  r]  rt   r^  r  r_  r  rD   r`  )
r   r   r  r   r   r   ru  rt   rD   r  )r   r   rD   r  )r   r   r  r9   rD   r   )
r   r   r  rW   r  rz   r  r  rD   r  )r  rG  rD   rH  )rR  r   rS  r   rD   r   )r  rG  rD   rX  )rD   r<   )r   r   rD   r  )r   r   r  r  r  r  r  r  r  rW   r  rX   rD   r  )r   rD  r'  rt   rD   rD  )rB   rC   rD   r  r8  )r   r   r  rW   r  r  rD   rX   )r   r  r  rs  rD   rs  )r  r   r   r9   rD   r   )r   r   r  r  r   rt   rD   rW   )rb  )r   r   r  r   r  r   r   rt   r  r  rD   r  )r  TNFN)r  rX  r  r   r  r   r  r9   r  r  r  r9   r  r   rD   r  )
__future__r   r  rx   r|  r  r  loggingrQ  r   rT  os.pathr  r  r1  r   r   collections.abcr   dataclassesr   r   typingr   r	   r   torch._inductor.inductor_primstorch.distributedtorch.fxr   torch.utils._pytreeutils_pytreer   torch._dynamo.utilsr
   r   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   $torch._functorch._aot_autograd.utilsr   torch._inductorr   r  !torch._inductor.custom_graph_passr   r   "torch._library.fake_class_registryr   torch._library.utilsr   torch._loggingr   r   torch._logging._internalr   r  r   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   r    r!   torch.fx.passesr"   torch.utils._ordered_setr#   torch.utils.checkpointr$   rq  -_activation_checkpointing.graph_info_providerr&   "_activation_checkpointing.knapsackr'   r(   r)   r*   ,_activation_checkpointing.knapsack_evaluatorr+   _aot_autograd.descriptorsr,   r-   r.   _aot_autograd.functional_utilsr/   _aot_autograd.logging_utilsr0   _aot_autograd.utilsr1   r2   r3   r4   r5   compile_utilsr6   r7   r8   r  r  sympydebug_partitionerr:   rV   	getLoggerrR   r;   r   r   r)  r<   rW   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r8  rB  rH  rR  rX  rY  r]  rl  r  r  r  r  r!  rt   r  r#  r-  r4  cacher>  rE  r  rt  r  r  r  r	  r  r  rF  r  r  r  r  r  r  rL  r  r  r  rr  r  r  r
  r  r  rI   rI   rI   rJ   <module>   s8    !
$o
NF
AP^5 ) Z
	N^ e	<    z!' )00 ~QRD 0