o
    ei(I                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZm Z  d dl!m!Z! d dlm"Z" d dl	m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5 d d	lm6Z6 d dl7Z7d dl8Z8d dl9m:  m;Z< d d
l=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dl9mGZGmHZH ddgZId dlJmKZKmLZLmMZMmNZN e-rgd dlmOZOmPZPmQZQ d dlRmSZS d dl8mTZTmUZUmVZV d dlWmXZX d dlYmZZZ d dl[m\Z\ d dl]m^Z^ ddl_m`Z` ddlambZb ddlcmdZd ddlemfZf ddlgmhZhmiZimjZjmkZkmlZlmmZm dd lnmoZo dd!lpmqZqmrZr g d"Zse0d#Ztejud7d&d'Zvd d(lwmxZx d d)lymzZz d d*l{m|Z| d d+l}m~Z~ d d,lmZ d d-lmZ d d.lmZmZmZmZmZ d d/lmZmZ d d0lmZmZ dd1lmZ dd2lmZ ejd3kZeeZe0d4Zee7je7jf Ze+e1e8jee8jVf  Zerd5ned6d7Zd8d9d:e d;Zd<Zd<Zd<Zd=ZeFe8je8je8je8je8je8je8je8je8je8je8je8je8je8je8jgZd>ed?< d@Zeed @ d kr9edAks=J dBd8dEdFZd9dJdKZG dLdM dMe7jZejdNdOG dPdQ dQZd:d;dYdZZ	R	S	[d<d=d]d^Z	R	S	[d<d=d_d`Zejud>dadbZd?dfdgZd@djdkZÐdAdodpZĐdBdsdtZŐdCdxdyZdDd|d}ZƐdEddZǐdFddZȐdGddZɐdHddZʐdIddZdd fdJddZ̐dKddZ͐dLdMddZ		dNdOddZ					dPdQddZАdRddZѐdSddZҐdTddZӐdUddƄZԐdVddʄZe4d˃Ze0ddNd͍Zee'e%ef ef ZG ddτ de,e(eef ZِdWddӄZڐdWddՄZېdXddلZܐdYddބZݐdZddZސd[ddZ	d\d]ddZd^ddZd_ddZd`ddZdaddZdbddZdcddZddddZdeddZdfddZdgddZeg dZdhddZdiddZdjddZd dlZdkddZg Zded< dld d!Zdkd"d#Zejdmd&d'Zej			Ndndod-d.ZeZeZeZd[d/dpd3d4Zd[d/dqd8d9Ze"dAdrd<d=ZG d>d? d?e*ZejG d@dA dAZG dBdC dCZ G dDdE dEe ZejdsdFdGZG dHdI dIZG dJdK dKeZejudtdudNdOZej"dvdPdQZej"d>dRdSZdvdTdUZ	d\dwdZd[Z	dxd`daZ
dydcddZdydedfZd[d[dNdgdzdkdlZdd[dmd{dsdtZd[dud|dvdwZd[dud|dxdyZd}ddZej"ddd>ddZej"ddd>ddZej"ddd>ddZd~ddZdddZ		ddddZdddZe1ee7jf Zded< eju	ddddZejudddZejudddZejudddZejudddZdddZdddZ dddZ!dddZ"dddZ#dddZ$	[	N	[	ddddZ%d>ddÄZ&G dĐdń dŃZ'ddʐd˄Z(dd̐d̈́Z)ddϐdЄZ*ddѐd҄Z+ddӐdԄZ,ddՐdքZ-ddؐdلZ.ejddܐd݄Z/	d\dddZ0dddZ1dddZ2dddZ3dddZ4dddZ5dddZ6ejdddZ7dvddZ8ejudvddZ9ejudddZ:ejudvddZ;dvdd Z<dvddZ=dddZ>dddZ?d>dd	Z@d>d
dZAdddZBdgddZCG dd dejDZEdddZFdddZGdddZH	d\dd"d#ZIdd%d&ZJ	d\dd+d,ZKdd-d.ZLdd1d2ZMdd6d7ZNd8d fdd?d@ZOdAd fddBdCZPddFdGZQddIdJZRejG dKdL dLZSejddNdOZTddPdQZUddRdSZVd>dTdUZWddVdWZXddXdYZYdd_d`ZZddadbZ[ddedfZ\ddhdiZ]ddkdlZ^ddndoZ_ddrdsZ`ddxdyZaddzd{Zb	d\dddZcdddZddddZedddZfd>ddZgdddZhddddddddZidd eij D ZkeldZmdddZndddZodddZpdddZqejudddZrejG dd dZsi Ztded< dddZueF Zvded< dddZwd\dddZxdddZye0dZze0dZ{G dd deeze{f Z|e3dNdd\dNdOdÐddÄZ}dĐdŐdƄZ~G dǐdȄ dejDZejudŐdɐdʄZd>dːd̄ZdƐdΐdτZdǐdҐdӄZdǐdԐdՄZdȐdאd؄Zd7dِdڄZdɐdېd܄Zd>dݐdބZdʐdߐdZdZdːddZdːddZd̐ddZ		dd͐ddZdΐddZdϐddZd>ddZdАddZdѐddZejdNdOG dd dZede%f Zeeegef ZG d	d
 d
Ze ZdҐddZdӐddZdԐddZdՐddZd֐ddZeFg dZdddZe"dאddZdؐd!d"Zdِd-d.Zdڐd1d2Z	[dېdܐd5d6ZdS (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xrN   rN   _/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>l   s    z get_gpu_type.<locals>.<listcomp>r6   r   rG   )	GPU_TYPESlenpop)
avail_gpusgpu_typerN   rN   rU   get_gpu_typej   s   r\   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rG   hiprI         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2nbytesintc                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr6   )ALIGN_BYTES)r{   rN   rN   rU   _align   s   r~   v
sympy.Exprboolc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdr}   )r   rN   rN   rU   r      s   r   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr6   Tvaluer   rL   Optional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)r   r|   r   Integerr~   r   )clsr   rN   rN   rU   eval   s
   z
align.evalN)r   r   rL   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rN   rN   rN   rU   r      s    r   Tfrozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r|   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rN   rN   rN   rU   r      s   
 r      d   fnCallable[[], Any]warmuprepfloatc              
   C  s   |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|	  |   q)|  t j  |
|d }tdt|| }tdt|| }	t|D ]}|   qYdd	 t|	D }d
d	 t|	D }t jjt jjjgdP}
t j  t|	D ],}|	  ||   t jjd |   W d   n1 sw   Y  ||   qt j  t dd	 t||D }W d   n1 sw   Y  t | }td t|
 jddd tdd	 |
 D }|r|tdd |D d 8 }td| |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArG   dtypedeviceTenable_timing   r6   c                 S     g | ]	}t jjd dqS Tr   rP   rG   EventrS   _rN   rN   rU   rV         zfp8_bench.<locals>.<listcomp>c                 S  r   r   r   r   rN   rN   rU   rV     r   
activitiesRunCudaModuleNc                 S  s   g | ]	\}}| |qS rN   )elapsed_time)rS   serN   rN   rU   rV     r   
raw eventsself_device_time_totalsort_by	row_limitc                 S  s.   g | ]}|j tjkrtd |jdur|qS )zfused_abs_max_\dN)device_typer_   CUDArematchnamerS   eventrN   rN   rU   rV     s    c                 s      | ]}|j V  qd S r   device_time_totalr   rN   rN   rU   	<genexpr>(      zfp8_bench.<locals>.<genexpr>     @@profiling results: %s ms)rP   rG   synchronizeemptyr|   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestabler`   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventsrN   rN   rU   	fp8_bench   sh   	





r   Fis_vetted_benchmarkingc                 C  s   ddl m} |t| |||S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   r   r   rN   rN   rU   do_bench_using_profiling0  s   r   c                   s"  |sddl m} |  t }|  t|}|   |  tjtdtj|d}|j	dd}|j	dd}	|
  tdD ]	}
|  |   q<|	
  |  ||	d }tdt|| }tdt|| }t|D ]}
|   qk|  tjjttjj gd	}t|D ]	}
|  |   q|  W d
   n1 sw   Y  td t| jddd t fdd| D }t|| dkrtd|t||t|| tfddt|D }|  | }td t|jdd tdd |D d | }td| |S )r   r   )may_ban_benchmarkingr   r   Tr   r   r6   r   Nr   r   r   r   c                   s*   g | ]}|j tt kr|jd kr|qS )zContext Sync)r   rO   r_   r   r   )device_type_upperrN   rU   rV     s    
z-_do_bench_using_profiling.<locals>.<listcomp>zWFailed to divide all profiling events into #repeat groups. #%s events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   rN   )rS   r   r   )num_event_per_grouprN   rU   rV     s
    zprofiling time breakdown)r   c                 s  r   r   r   r   rN   rN   rU   r     r   z,_do_bench_using_profiling.<locals>.<genexpr>r   r   )r   r   r\   upperr]   r   rP   r   r|   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r`   r   rX   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   device_interfacer   r   r   r   r   r   r   r   r   actual_eventsr   rN   )r   r   rU   r   H  sx   




r   c               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr   rM   )r  r   rN   rN   rU   has_torchvision_roi_align  s   
r  r   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
rP   r   r   r   rM   typer  r]   Workercurrent_devicer   r  rN   rN   rU   decode_device  s   


r  itIterable[sympy.Expr]c                 C  s   t tj| tjjS r   )	functoolsreduceoperatormulr   SOner  rN   rN   rU   sympy_product     r"  seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S r   rN   )rS   abrN   rN   rU   r     s    zsympy_dot.<locals>.<genexpr>)rX   r   expandr  r   )r$  r&  rN   rN   rU   	sympy_dot  s   r*  Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS rN   )r   rR   rN   rN   rU   
<dictcomp>      zunique.<locals>.<dictcomp>)valuesr!  rN   rN   rU   unique     r0  numberUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )r   r   Exprrc   sympifyr|   r  runtime_ceildiv)r2  r4  rN   rN   rU   rn     s    
rn   keyOptional[torch.dtype]c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|d'd( t| D  t| t r`| S d)||  S )*Nz*i8rt   r   r   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                 S  s   i | ]}||qS rN   rN   )rS   r   rN   rN   rU   r-    s    z_type_of.<locals>.<dictcomp>*)rM   splitupdatelistr/  r   )r9  	dtype_strtysrN   rN   rU   _type_of  sZ   
re  lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                 C     dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS rN   )r   r7  rS   r   rN   rN   rU   rV     r.  z-convert_shape_to_inductor.<locals>.<listcomp>rN   rf  rN   rN   rU   convert_shape_to_inductor  s   rl  valUnion[int, torch.SymInt]c                 C  s   t | tjr
| jjS | S )z
    Convert SymInt to sympy.Expr, leave int as is.

    Unlike sympy.sympify() which converts int to sympy.Integer,
    this function preserves int as int and only converts SymInt to Expr.
    r   rP   r1   nodeexprrm  rN   rN   rU   convert_symint_to_expr  s   rs  r   c                 C  sB   ddl m} t| tr| S t| tjrt| S |jjjj	| ddS )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r6   VN)hint)
virtualizedru  r   r|   r   r   graphsizevars	shape_envcreate_symintnode)r   ru  rN   rN   rU   convert_to_symint  s   
r|   Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                 C  ri  )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                 S     g | ]}t |qS rN   )r|  rj  rN   rN   rU   rV   5      z+convert_shape_to_symint.<locals>.<listcomp>rN   rk  rN   rN   rU   convert_shape_to_symint.  s   r  optorch._ops.OpOverloadc                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S r   )
alias_inforS   r'  rN   rN   rU   r   <      zis_view.<locals>.<genexpr>)any_schema	argumentsr  rN   rN   rU   is_view8  s   r  c                 C     dS NFrN   )r   rN   rN   rU   <lambda>A      r  user4   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   s~   | j dkrdS t| jtjjs| jtju sdS ttjj| j}|tju s(t	|r4t
 fdd| jD S tjj|jv p> |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S r   )is_pointwise_use)rS   ur  rN   rU   r   R  r  z#is_pointwise_use.<locals>.<genexpr>)r  r   targetrP   _ops
OpOverloadr  getitemr   r  r   usersTag	pointwisetags)r  r  r  rN   r  rU   r  ?  s   

r  r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c                   s   t j  g d
 fdd} j| gtt j|||fR  }t| jjdkr5t	| jjd j
d	kr5|f} | t ji  }|fS )Nargtorch.TensorrL   r4   c                   s    |   dt S )Nr  )appendplaceholderrX   )r  g
graph_argsrN   rU   add_tensor_arg]  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr6   r   Tensor)r  r  rL   r4   )rP   fxGraphr  r$   r  rX   r  returnsrM   r  outputr3   )r  r   r  r  rp  gmrN   r  rU   gen_gm_and_inputsW  s   

r  rG   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr  )r]   rQ   r   r  rN   rN   rU   r   o  s   r   modelCallable[..., Any]example_inputsSequence[Any]r   c                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r   rP   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1rN   rN   rU   timedw  s   

r  rN   
         ?repeatbaselinec                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]	}t  qS rN   )r  r   r   r  r  r   rN   rU   rV     r   z%print_performance.<locals>.<listcomp>z.6f)rP   r   r   medianprintr   )r  r  r   r  r  r   timingstookrN   r  rU   print_performance  s   r  objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S r   rN   rN   r  rN   rU   r    r  z#precompute_method.<locals>.<lambda>N)rO   setattr)r  r  rN   r  rU   precompute_method  s   r  methodsr   c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  rN   rN   rU   precompute_methods  s   r  r'  r(  c                 C  s   t | |kt | |k  S r   )r|   r'  r(  rN   rN   rU   cmp     r  rT   Union[int, Sequence[int]]sizeSequence[int]c                 C  s:   t | tr
| g| S t| dkrt| | d g| S | S )Nr6   r   )r   r|   rX   r  )rT   r  rN   rN   rU   pad_listlike  s
   

r  tuple[_T, ...]list[_T]c                 C  s&   t | dkrg S d	dd}t| |dS )
Nr   elemrp   rL   rM   c                 S  s0   t | tr| S ddlm} t | |sJ |  S )Nr6   )rE   )r   rM   	schedulerrE   get_name)r  rE   rN   rN   rU   	sort_func  s
   
ztuple_sorted.<locals>.sort_funcr9  )r  rp   rL   rM   )rX   sorted)rT   r  rN   rN   rU   tuple_sorted  s   
	r  PRV)	covariantc                   @  s$   e Zd ZedddZdddZdS )CachedMethodr   r   rL   r  c                 C     d S r   rN   )r   rN   rN   rU   clear_cache     zCachedMethod.clear_cacher   P.argsr  P.kwargsr  c                 O  r  r   rN   selfr   r  rN   rN   rU   __call__  r  zCachedMethod.__call__N)r   r   rL   r  )r   r  r  r  rL   r  )r   r   r   staticmethodr  r  rN   rN   rN   rU   r    s    r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sl   | j }d| d d| i}td| d  d  d | t| || d }d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rL   r  c                      t |  rt|   d S d S r   r  delattrr  r  rN   rU   r       
z"cache_on_self.<locals>.clear_cacher  r   rL   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rN   r  rU   cache_on_self  s$   	r  c                 C  s   t | S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  r   rN   rN   rU   cache_property_on_self  s   r   
class_name*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]]c                   s   d fdd}|S )Nr   FN_TYPE[P, RV]rL   c                   sh   d d| j  d d| i}td  d  d  d | t| |d	 }d fdd}||_|S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerr  r   rL   r  c                   r  r   r  r  r  rN   rU   r    r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cacher  r  )r   r  r  r  r  r  rU   r    s"   z'cache_on_self_and_args.<locals>.wrapper)r   r  rL   r  rN   )r  r  rN   r  rU   cache_on_self_and_args  s   &r  node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr6   irc                 S  s$   g | ]}t |d r|jr|jjqS )rp  )r  rp  originsrS   rp  rN   rN   rU   rV   ,  s    z%aggregate_origins.<locals>.<listcomp>) r  r   rb  r  r  r  or_r"   r>   r  )r  r  rN   rN   rU   aggregate_origins$  s   

r  Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                   s  t | }|dkrdd   fdd|D }tt|}nb|dkrpg }|D ]D}|jdkrhd }d}d	|jv r<|jd	 d
 }nd|jv rJ|jd d
 }d}|sMq$t|d tr^||d |  q$||d j|  q$tt|}n|dkr|dd |D }nt	d
dg| S )Noriginal_atenc                 S  sF   | j d }d}t|tjjr|jj}|S t|tjjr!t|	 }|S )Nr  r  )
r  r   rP   r  r  _overloadpacketr   HigherOrderOperatorrM   r   )originr  r9  rN   rN   rU   get_origin_meta_strA  s   
z2get_fused_kernel_name.<locals>.get_origin_meta_strc                   s6   g | ]}|j d krd|jv r|jd dur |qS )r  r  N)r  r  rS   r  r  rN   rU   rV   K  s    

z)get_fused_kernel_name.<locals>.<listcomp>rP   r  r  source_fn_stackr   fwd_source_fn_stackbackwardr6   inductor_nodec                 S  s   g | ]
}|j d kr|jqS r  )r  r   r  rN   rN   rU   rV   i  s    r   fused)r  r  r"   r  r  r   rM   r  r   NotImplementedErrorjoin)r  r  all_originssourcesr  	source_fnsuffixrN   r  rU   get_fused_kernel_name:  s>   




r'  r  r9   tuple[str, str]c                   s  t | }dd |D }tt}tt}d|rItdd |D }t|dkrI|d jtds?d	d
 tj	D }|_
|jfddd |D ]`}d|jv r|jd dur|jd }	d}
t|	tjjrmt|	j}
nt|	tjjrzt|	 }
|
r||
 |j d|jv r|jd d j}
||
 |j qK|jddkr||j |j qKdurdnd}|j d| dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  qڈdurddlm  ||j d t }g }t|  jsddlm } d; fd%d&}d<d)d*d=fd-d.}| D ]}t|d/r;|j!du r=q-t|j!d0r|j!j"dur|j!j"D ];}|j|v rYqO|#|j |j$|j}|du rmqO|||j\}}||j d1| d2|| d3| d qOt|j!d4r|j!j%dur|j!j%D ] }|j$|j}|du rq|||j\}}|d5|  qq-|D ]}||j d|j&d6d7  q||j d8d9|  |d:|fS )>aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                 S  s   g | ]	}|j d kr|qS r  r  r  rN   rN   rU   rV     r   z'get_kernel_metadata.<locals>.<listcomp>Nc                 s  r   r   )rx  )rS   nrN   rN   rU   r     r   z&get_kernel_metadata.<locals>.<genexpr>r6   r   )_inductor_kernel_metadata_node_to_idx_mapc                 S     i | ]\}}||qS rN   rN   )rS   idxr)  rN   rN   rU   r-    r.  z'get_kernel_metadata.<locals>.<dictcomp>c                   s
    j |  S r   )r*  r)  )single_graphrN   rU   r    s   
 z%get_kernel_metadata.<locals>.<lambda>r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r5  z], Original ATen: []z" Source node to ATen node mapping:z   z => r
  z Graph fragment:rt  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerM   rL   tuple[str, ir.Layout | None]c                   sp   t |  jrt | j jr| jjj}n| j}|d u r|}n|j}z	|  }W ||fS  ty7   d }Y ||fS w r   )r   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr!  )r5  r7  r<  r   layoutr
  rN   rU   get_buffer_info  s   
z,get_kernel_metadata.<locals>.get_buffer_infoshapeIterable[int]c                 S  s   dd dd | D  dS )N[r5  c                 S  r  rN   )rM   rR   rN   rN   rU   rV     r  z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>r4  )r"  )r@  rN   rN   rU   stringify_shape  s   z,get_kernel_metadata.<locals>.stringify_shaper>  ir.Layout | Nonec                   sJ   | d u rdS  | j  } | j }| j }dt| j  | | | dS )Nr  ")r  strider   r!   r   )r>  shape_annotationstride_annotationdevice_annotation)rC  rN   rU   stringfy_layout  s   z,get_kernel_metadata.<locals>.stringfy_layoutread_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r5  r6  r7  rM   rL   r8  )r@  rA  rL   rM   )r>  rD  rL   rM   )'r  collectionsdefaultdictrb  r"   rX   rx  r  r   nodesr*  sortr  r   rP   r  r  rM   r  r  r   r  getcommentr"  keysr  itemsr  r  r>   rw  ru  rK  rL  addtry_get_bufferrM  format_node)r  r  r#  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_maprp  r  r9  sort_strmetadatadetailed_metadataoriginal_noderT  	all_reads
all_writesru  r?  rJ  r)  rr5  
input_namer>  woutput_namer   rN   )r  r.  rC  rU   get_kernel_metadataq  s   











rl  initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)rb  r"   rY   r  rZ  r  )rm  ro  dominated_setrp  userrN   rN   rU   dominated_nodes  s   


	rt  Sequence[IRNode]dict[str, IRNode]c                   sp   ddl m  d fddt|\}}fd	d
|D }t| \}}fdd
|D }ttjg ||R  S )Nr6   r
  r)  r?   rL   r   c                   sT   t |  jr| jS t |  jr| jS t |  jo)t |  j j j jf S r   )	r   r9  r:  r;  r?   ComputedBufferInputsKernelInputBufferTemplateBufferr-  r  is_unrealized_noderN   rU   r|  $  s   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS rN   r  rS   rm  r|  rN   rU   rV   6  r   z"gather_origins.<locals>.<listcomp>c                   r}  rN   r~  r  r  rN   rU   rV   8  r   )r)  r?   rL   r   )r  r  r#   r"   	itertoolschain)r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsrN   r{  rU   gather_origins  s   r  rq  c                   s@   ddd d fdd	d fd
ddfdd| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    rq  r   rL   r   c                 S  s(   t | tjot| jdko| jd dkS )N   r   r   )r   r   MulrX   r   rq  rN   rN   rU   is_neg_leadC  s   &zsympy_str.<locals>.is_neg_leadrM   c                   sj   t | tjr1t| jdkr( | jd r(| jd  d| jd jd  S dt| jS | S )Nr  r6   r   z - z + )r   r   r   rX   r   r"  r   r  )r  sympy_str_mulrN   rU   sympy_str_addH  s
   (z sympy_str.<locals>.sympy_str_addc                   sB   t | tjr | rd| jd  S dt| jS | S )N-r6   z * )r   r   r  r   r"  r   r  )r  sympy_str_atomrN   rU   r  S  s
   z sympy_str.<locals>.sympy_str_mulc                   sp   t | tjr	| jS t | tjtjfrd |  dS t | tttt	fr4| j
j ddtt| j dS t| S )N()r5  )r   r   Symbolr   r   r  rg   rd   re   rf   funcr   r"  r   	sympy_strr   rM   r  )r  rN   rU   r  ^  s   "z!sympy_str.<locals>.sympy_str_atomN)rq  r   rL   r   rq  r   rL   rM   rN   r  rN   )r  r  r  r  rU   r  <  s
   

r  r  ValueRanges[Any]c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr6   rt  current_node
index_expr)
rw  ru  rm   compute_all_boundsrO   interpreterr  rj   rk   unknown)r  ru  fx_noderN   rN   rU   get_bounds_index_exprk  s   
r  prefixc                 C  s   | d dkS )Nr   rh  rN   )r  rN   rN   rU   prefix_is_reductiony     r  ri   r,  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)ri   SIZErh   )r  r,  rN   rN   rU   sympy_index_symbol_with_prefix}  s   r  checkc                 C  s   | st jot jS r   )rm   debug_index_assertsassert_indirect_indexing)r  rN   rN   rU   generate_assert     r  r   c                 C  s    | d dksJ t j| dddS )r  r   r   Tr  )r   r  r   rN   rN   rU   sympy_index_symbol  s   r  replacementsdict[sympy.Expr, Any]c                   s,   ddd t |  fd	d
| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedr   replacementUnion[sympy.Expr, str]rL   r  c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nr  )r   r   r6  rM   r  r   is_nonnegative)r  r  rN   rN   rU   	to_symbol  s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS rN   rN   rS   kr   r  rN   rU   r-        zsympy_subs.<locals>.<dictcomp>N)r  r   r  r  rL   r  )r   r7  xreplacerY  )rq  r  rN   r  rU   
sympy_subs  s   

r  ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s   t | tjpt | tjo| jS r   )r   rP   r1   r  _has_symbolic_sizes_strides)r'  rN   rN   rU   is_symbolic  s   r  c                  G     t dd | D S )Nc                 s      | ]}t |V  qd S r   )r  r  rN   rN   rU   r         z"any_is_symbolic.<locals>.<genexpr>r  )r   rN   rN   rU   any_is_symbolic  r1  r  )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarr  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  sR   ddl m} | jjD ]}t|r|  S |jd }d ur&||r&|  S q
d S )Nr   )r(   rm  )%torch.fx.experimental.symbolic_shapesr(   rx  rT  is_cudagraph_unsafe_fx_noder  rV  )r  r(   rp  rm  rN   rN   rU   %get_first_incompatible_cudagraph_node  s   r  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphr  )nextiterreversedrx  rT  r  )r  	last_noderN   rN   rU   output_node  s   r  OrderedSet[torch.device]c                 C  s\   | j jdd}tdd |D }t| jd }t|tr|n|f}tdd |D }||B S )Nr  r  c                 s  s0    | ]}t |jd tjr|jd  jV  qdS rm  N)r   r  rV  rP   r  r   r  rN   rN   rU   r     s    

z"get_all_devices.<locals>.<genexpr>r   c                 s  s>    | ]}t |tjjrt |jd tjr|jd  jV  qdS r  )r   rP   r  r4   r  rV  r  r   )rS   r  rN   rN   rU   r     s    

)rx  
find_nodesr"   r  r   r   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicesrN   rN   rU   get_all_devices  s   r  c                  C  s   t tj D ]@} | dsqtj|  }|jD ]+}|drBt||}t|tj	j
jjrB|jD ]}t|tj	j
jjrA|jjj  q/qtj| = qdtjv r]tjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rb  sysmodulesrX  
startswith__dict__rO   r   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  rN   rN   rU   unload_xpu_triton_pyds  s.   









r  _registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  rN   rN   rU   clear_on_fresh_cache  s   
r  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r  r  r  rN   rN   rU   clear_caches)  s   
r  r   Iterator[None]c              
   c  sn    t j| }z|t j| < dV  W |du rt j| d dS |t j| < dS |du r1t j| d w |t j| < w )a  Thread-safe env var set/restore using atomic C-level lookups.

    We avoid mock.patch.dict(os.environ, ...) because it internally calls
    os.environ.copy(), which iterates all env var keys then fetches values in
    separate steps. That approach is not atomic and can race with background threads
    (e.g. Triton async compilation) modifying the environment, causing KeyError,
    so we use os.environ.get() for individual keys which is an atomic C-level lookup.
    N)osenvironrV  rY   )r9  r   oldrN   rN   rU   _set_env1  s   

r  cache_entriesOptional[dict[str, Any]]dirOptional[str]deletec              	   #  sT   t   ddlm} |tj|d zz~td U td  |tj	
 dtd1 dV  t| trXt| dksAJ d	tj	rXt}| fd
d|D  W d   n1 sbw   Y  W d   n1 sqw   Y  |rt rtj rt  tj t  fddd W n ty   td   w W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)r  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)r  pathgetsizer"  )rS   f)triton_cache_dirrN   rU   r-  d  s
    zfresh_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  r  r  )inductor_cache_dirrN   rU   r  t  s
    zfresh_cache.<locals>.<lambda>)ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr  r   r   r  r  r"  r   dictrX   existslistdirra  
is_windowsrP   rI   rQ   r  shutilrmtree	Exceptionr  )r  r  r  r  filesrN   )r  r  rU   fresh_cacheF  sP   



r  )reverseseqr  	list[int]c                C  s8   | j }tt| }tt||dd}|stt|S |S )NTr9  r  )__getitem__r   rX   rb  r  r  )r  r  gettera_rsort_idxrN   rN   rU   argsort  s   r%  rz  r*   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                  sF   d fdd}dd	 t |D }t|t||d
}dd	 |D }|S )Nr'  tuple[int, sympy.Expr]r(  rL   r|   c                   sZ   | \}}|\}}d
 fdd}|||k rdS |||krdS ||k r%dS ||kr+dS d	S )Nrq  %Union[bool, torch.SymInt, sympy.Expr]rL   r   c                   s   t | tr| S  j| ddS )NT)size_oblivious)r   r   evaluate_exprr  rz  rN   rU   evaluate  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r6   r   )rq  r(  rL   r   rN   )r'  r(  a_idxa_valb_idxb_valr,  r+  rN   rU   r    s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS rN   ro  )rS   r,  r   rN   rN   rU   rV     s    zargsort_sym.<locals>.<listcomp>r   c                 S  s   g | ]\}}|qS rN   rN   )rS   r,  r   rN   rN   rU   rV     r  )r'  r'  r(  r'  rL   r|   )r   r  r  
cmp_to_key)rz  r  r  r  exprsr  rN   r+  rU   argsort_sym  s   r3  r   torch.dtypec                 C  s    | t jkrdS t jd| d S )Nrz   rN   r   )rP   r^  r   element_sizer5  rN   rN   rU   get_dtype_size  s   
r7  c                   @  s   e Zd ZU ded< dS )LineContextr   contextNr   r   r   r   rN   rN   rN   rU   r8    s   
 r8  c                   @     e Zd ZU ded< ded< dS )ValueWithLineMaprM   r   zlist[tuple[int, LineContext]]line_mapNr:  rN   rN   rN   rU   r<       
 r<  c                   @  s   e Zd ZdZdDdEddZejdFddZdGddZdHddZ	dHddZ
dIddZdJddZdHddZdIddZdKd d!ZdLd$d%ZdMdNd)d*ZdMdOd+d,ZdMdOd-d.Z	/dPdQd3d4ZdRd7d8ZdHd9d:ZdSd=d>ZdTdAdBZdCS )UIndentedBuffer   r   initial_indentr|   rL   r  c                 C  s   g | _ || _d S r   )_lines_indent)r  rA  rN   rN   rU   __init__     
zIndentedBuffer.__init__tabwidthr  c                 c  s*    | j }z|| _ d V  W || _ d S || _ w r   )rF  )r  rF  prevrN   rN   rU   set_tabwidth  s   zIndentedBuffer.set_tabwidthr<  c                 C  s   t  }d}g }| jD ]:}t|tr| }|d u rq
nt|tr(|||jf q
|}t|ts1J || |d |d|	d 7 }q
t
| |S )Nr6   rQ  )r   rB  r   DeferredLineBaser8  r  r9  rM   writecountr<  getvalue)r  bufr   linemaplilinerN   rN   rU   getvaluewithlinemap  s$   




z"IndentedBuffer.getvaluewithlinemaprM   c                 C  s
   |   jS r   )rQ  r   r  rN   rN   rU   rL       
zIndentedBuffer.getvaluec                 C  s   t  }| jD ]8}t|tr| }|d u rqnt|trq|}t|ts%J |dr4||d d  q|| |d q| S )N\r   rQ  )	r   rB  r   rI  r8  rM   endswithrJ  rL  )r  rM  rO  rP  rN   rN   rU   getrawvalue  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S r   )rB  clearr  rN   rN   rU   rV       zIndentedBuffer.clearr   c                 C  
   t | jS r   )r   rB  r  rN   rN   rU   __bool__  rR  zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr3  )rC  rF  r  rN   rN   rU   r    r  zIndentedBuffer.prefixc                 C  s   |  d d S )NrQ  	writeliner  rN   rN   rU   newline  rW  zIndentedBuffer.newlinerP  )Union[LineContext, DeferredLineBase, str]c                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )r   r8  rB  r  rI  with_prefixr  stripr  rP  rN   rN   rU   r[    s   

zIndentedBuffer.writelinelines3Sequence[Union[LineContext, DeferredLineBase, str]]c                 C  s   |D ]}|  | qd S r   rZ  )r  rb  rP  rN   rN   rU   
writelines"  s   zIndentedBuffer.writelinesr6   offset'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )NrL   r  c                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w r   rC  rN   re  r  rN   rU   r  )  
   "z"IndentedBuffer.indent.<locals>.ctxrL   r  )
contextlibcontextmanager)r  re  r  rN   rh  rU   indent(  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S r   rg  r  re  rN   rN   rU   	do_indent3  r1  zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S r   rg  rn  rN   rN   rU   do_unindent6  r1  zIndentedBuffer.do_unindentF
other_codeUnion[IndentedBuffer, str]r`  c                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   rQ  )r   r?  r   rB  r8  minrX   r  mathisinfr  r[  r|   textwrapdedentrstripr`  )r  rq  r`  rx  rP  r   rN   rN   rU   splice9  s,   





zIndentedBuffer.splicer  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )NrA  c                      g | ]} |qS rN   rN   )rS   rP  r  rN   rU   rV   U  r  z&IndentedBuffer.map.<locals>.<listcomp>)r?  rC  rB  )r  r  r   rN   r~  rU   r   S  s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr  r  )r  rL  r  rN   rN   rU   __repr__X  r  zIndentedBuffer.__repr__otherr   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nr|  )rC  r?  rd  rB  )r  r  r   rN   rN   rU   __add__[  s
   zIndentedBuffer.__add__new_line)Union[DeferredLineBase, LineContext, str]c                 C  s
   || j v S r   )rB  )r  r  rN   rN   rU   containsc  rR  zIndentedBuffer.containsNr   )rA  r|   rL   r  )rF  r|   rL   r  )rL   r<  rL   rM   rL   r  rL   r   )rP  r]  rL   r  )rb  rc  rL   r  r   )re  r|   rL   rf  )re  r|   rL   r  F)rq  rr  r`  r   rL   r  )r  r{  rL   r?  )r  r   rL   r?  )r  r  rL   r   )r   r   r   rF  rD  rk  rl  rH  rQ  rL  rU  rV  rY  r  r\  r[  rd  rm  ro  rp  rz  r   r  r  r  rN   rN   rN   rU   r?    s.    












r?  c                      s(   e Zd Zd
 fddZddd	Z  ZS )FakeIndentedBufferrL   r  c                   s   t    d S r   )superrD  r  	__class__rN   rU   rD  h  rW  zFakeIndentedBuffer.__init__r   rM   r   c                 C  s$   |dkr
t | |S td| d)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   rN   rN   rU   r  k  s
   
z#FakeIndentedBuffer.__getattribute__r  )r   rM   rL   r   )r   r   r   rD  r  __classcell__rN   rN   r  rU   r  g  s    r  c               	   c  s<    t jt j} }zd V  W | |t _t _d S | |t _t _w r   )r  stdoutstderr)initial_stdoutinitial_stderrrN   rN   rU   restore_stdout_stderrv  ri  r  c                   @  s`   e Zd ZdZdddZddd	ZdddZd ddZd!ddZd"ddZ	d#ddZ
d$ddZdS )%rI  z.A line that can be 'unwritten' at a later timerP  rM   c                 C  s   |  sd}|| _d S r^  )r`  rP  ra  rN   rN   rU   rD    s   
zDeferredLineBase.__init__rL   Union[str, None]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r!  r  rN   rN   rU   r       zDeferredLineBase.__call__r   c                 C  r  )z3Returns a new deferred line with the same conditionr  ra  rN   rN   rU   	_new_line  r  zDeferredLineBase._new_liner  c                 C  s   |  | | j S r   r  rP  )r  r  rN   rN   rU   r_    r#  zDeferredLineBase.with_prefixc                 C  s   |  | j S r   )r  rP  r  r  rN   rN   rU   r    r  zDeferredLineBase.lstripr  Union[int, slice]c                 C  s   |  | j| S r   r  )r  r  rN   rN   rU   r!    r  zDeferredLineBase.__getitem__r   c                 C  rX  r   )r   rP  r  rN   rN   rU   rY    rR  zDeferredLineBase.__bool__r|   c                 C  rX  r   )rX   rP  r  rN   rN   rU   __len__  rR  zDeferredLineBase.__len__N)rP  rM   )rL   r  )rP  rM   rL   r   )r  rM   rL   r   )rL   r   )r  r  rL   r   r  rL   r|   )r   r   r   r   rD  r  r  r_  r  r!  rY  r  rN   rN   rN   rU   rI    s    






rI  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r9  rM   value_fnCallable[[], str]rP  c                   s   t  | || _|| _d S r   )r  rD  r9  r  )r  r9  r  rP  r  rN   rU   rD    s   
zDelayReplaceLine.__init__rL   c                 C  s   | j | j|  S r   )rP  replacer9  r  r  rN   rN   rU   r    r#  zDelayReplaceLine.__call__c                 C  s   t | j| j|S r   )r  r9  r  ra  rN   rN   rU   r    r  zDelayReplaceLine._new_line)r9  rM   r  r  rP  rM   r  )rP  rM   rL   r  )r   r   r   r   rD  r  r  r  rN   rN   r  rU   r    s
    
r  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntt | }t|}tjjr3|jd us J |jdk s*|jdkr1t	
d dS dS |jdkr:dnd}|j}||k rOt	j
d	||d
d dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrI   rv   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r   rP   r   r\   r   createversionru   majorr   r  r  multi_processor_count)r  r   propr  r  rN   rN   rU   
is_big_gpu  s&   

r  c                   C  s$   t j rt j jS t jdjS )NrG   )rP   rI   rQ   get_device_propertiesgpu_subslice_countrG   r  rN   rN   rN   rU   get_max_num_sms  s   
r  c                  C  s*   t j sdS t jt j } | jdkS )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rG   rQ   r  r  r  )device_propertiesrN   rN   rU   
using_b200  s   

r  c                  C  s2   t j rt S t j } t | dur|  S d S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rP   rI   rQ   r  r	  _get_sm_carveout_experimental)carveoutrN   rN   rU   get_num_sms  s   

r  num_tma_descriptorsnum_programsOptional[int]r7   c                 C  sH   ddl m}m} |du rt }|d}||  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r6   )r7   WorkspaceZeroModeNF)rK  	zero_moder   
outer_name)codegen.commonr7   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r  r   r  r7   r  r  r  rN   rN   rU   get_tma_workspace_arg  s   
r  r>  r@   allowed_layout_dtypeslist[torch.dtype]c                 C  s:   | j |vrtd| j | t| jjo| j |v ot| jS )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r>  r  rN   rN   rU   _use_template_for_gpu  s   
r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS rN   r`  rR   rN   rN   rU   rV   
      z)_use_autotune_backend.<locals>.<listcomp>rP  )r   rm   max_autotune_gemm_backendsr`  r  rN   rN   rU   _use_autotune_backend	     r  c                 C  r  )Nc                 S  r  rN   r  rR   rN   rN   rU   rV     r  z._use_conv_autotune_backend.<locals>.<listcomp>rP  )r   rm   max_autotune_conv_backendsr`  r  rN   rN   rU   _use_conv_autotune_backend  r  r  )enable_int32enable_float8check_max_autotuner  r  r  c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g t| jjo1t| |p<| jjdko<| j|v oPtjpEtjpE| oPtdoP|| j|jS )Nr6   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rP   r   rJ  rL  rT  extendrD  rE  r  r   r  r  r   rm   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r>  r  r  r  r  r  layout_dtypesrN   rN   rU   use_triton_template  s"   	
r  output_layout
add_guardsmatricesr?   r  Optional[Layout]r  c                   s   ddl m} ddlm  d  fd	d
d!fdd}d" fddd# fddd# fdd| oEtfdd|D oE|| S )$u^  
    Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
      * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
      * Base pointer 16-byte aligned
      * Exactly one contiguous ("inner") dim with stride 1
      * All "outer" dims have 16-byte aligned strides
      * Inner dim size × itemsize is a multiple of 16
      * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
    r   )has_triton_tma_devicer6   rt  
expr_bytesr3  rL   r   c                   s    j j| tS r   )rx  ry  statically_known_multiple_ofTMA_ALIGNMENT)r  rt  rN   rU   _alignedG  r  zcan_use_tma.<locals>._alignedr>  r  c                   s8   | d u rdS | j }| j}| j} | jsdS |||S )NTF)r  rF  r   re  )r>  sizesstridesr   )r  _is_tma_compatiblerN   rU   _is_tma_compatible_layoutJ  s   
z.can_use_tma.<locals>._is_tma_compatible_layoutr  r?   c                   s^   |   }|  }|  }|   jjv rdS |   }d ur)|jdkr)|||S |||S )NFrI   )get_size
get_stride	get_dtyper  rx  unaligned_buffers
get_devicer  )r  r  r  r   m_device)ru  r  _is_tma_compatible_xpurN   rU   _is_tma_compatible_matrixW  s   z.can_use_tma.<locals>._is_tma_compatible_matrixr  r%  r  Sequence[_IntLike]r   r4  c                   s  t | }|j}|dk s|dkrdS |tvrdS r( jj| } jj|}n fdd| D } fdd|D } fddt|D }t |dkrMdS |d }t|D ]\}	}
|	|kr^qU|
| sg dS qU|| }|| stdS |dkr jj|d	sdS d
S )Nr6   r   Fc                      g | ]	} j j|qS rN   rx  ry  symbolic_hint)rS   r   rt  rN   rU   rV   w  r   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>c                   r  rN   r  )rS   strt  rN   rU   rV   x  r   c                   s$   g | ]\}} j j|d r|qS r   rx  ry  statically_known_equals)rS   r   r  rt  rN   rU   rV   {  s    r       T)rX   itemsizerx   rx  ry  guard_int_seqr   statically_known_geq)r  r  r   rankr  sizes_i	strides_ir  	inner_idxr   r  	inner_dim)ru  r  r  rN   rU   r  e  s:   
z'can_use_tma.<locals>._is_tma_compatiblec                   s`   |d } j j|} j j|dsdS d}| D ]} j j|} j j||r- dS qdS )Nr   r6   Fl    T)rx  ry  r  r  statically_known_gt)r  r  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintrt  rN   rU   r    s   z+can_use_tma.<locals>._is_tma_compatible_xpuc                 3      | ]} |V  qd S r   rN   rS   r  )r  rN   rU   r     r  zcan_use_tma.<locals>.<genexpr>N)r  r3  rL   r   )r>  r  rL   r   )r  r?   rL   r   )r  r%  r  r  r   r4  rL   r   )torch.utils._tritonr  rw  ru  r   )r  r  r  r  r  rN   )ru  r  r  r  r  r  rU   can_use_tma2  s   1r  )r  c                 G  s:   t jjr| nd }tdd |D ot|||dot jjS )Nc                 s  s     | ]}t | d kV  qdS )r  N)rX   r  r	  rN   rN   rU   r         z*use_triton_tma_template.<locals>.<genexpr>r  )rm   r  enable_template_tma_storer   r  enable_persistent_tma_matmul)r  r  r  r>  rN   rN   rU   use_triton_tma_template  s   r  c                 G  s8   t || |ds
dS ddlm} ddlm} | o| S )Nr  Fr   )%has_triton_tensor_descriptor_host_tmar6   is_datacenter_blackwell_arch)r  r
  r  codegen.cuda.cuda_envr  )r  r  r  r  r  rN   rN   rU   !use_triton_blackwell_tma_template  s   r  scale_option_ar5   scale_option_bscaling_typeslist[ScalingType]c                 C  s   | |v o||v S r   rN   )r  r  r  rN   rN   rU   use_triton_scaling_template  s   r  )maxsizec                   C  (   z	t jdduW S  ty   Y dS w )zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    cutlassNF	importlibutil	find_specr  rN   rN   rN   rU   ensure_cute_available  
   r!  c                   C  r  )zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

    Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
    in the same interpreter to retry the import.
    cutlass_apiNFr  rN   rN   rN   rU   "ensure_nv_universal_gemm_available  r"  r$  c                   C  r  )aG  Check if nvMatmulHeuristics is importable; cache the result for reuse.

    nvMatmulHeuristics provides performance model-based kernel selection
    for NVIDIA GEMM operations.

    Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
    nvMatmulHeuristics in the same interpreter to retry the import.
    nvMatmulHeuristicsNFr  rN   rN   rN   rU   $ensure_nvmatmul_heuristics_available  s
   
r&  mat_amat_ba_is_2db_is_2doffsOptional[Any]biasscale_resultc           
      C  s   t  sdS tdsdS ddlm} t|jjsdS | sdS tjg}	t	||	s)dS t
js1t
js1dS t| ||ds:dS tdd | |fD rGdS |rK|rMdS |du rSdS |dus[|dur]dS d	S )
a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr6   r  )r  c                 s  r  r   )
is_dynamicrR   rN   rN   rU   r   0  r  z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>NT)r!  r  r  r  r  r   r  rP   rJ  r  rm   r  r  r  r  )
r'  r(  r>  r)  r*  r+  r-  r.  r  r  rN   rN   rU    use_blackwell_cutedsl_grouped_mm  s2   
r1  r  r)  r  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jg}t| |oAtjp=tjoAtd}|rQ| sQtd	tjj dS |S )
Nr6   rt  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rw  ru  rx  ry  optimization_hintrm   r  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsr4  rP   r  ru   r   rJ  rT  r  r  r  r  r   r  cutlass_dir)	r>  r  r)  r  ru  	gemm_sizer4  r  r   rN   rN   rU   use_cutlass_template?  s*   

r;  _IntLikeOptional[IRNode]r  Optional[_IntLike]c           
        s   ddl m t sdS t sdS tdsdS ddlm   jr!dS | jj	dks+t
jjr-dS tjs5tjs5dS |||g}|durC|| tfd	d
|D rPdS ||g}	|dur]|	| t fdd
|	D rjdS dS )a3  
    Return True if we can use the NVIDIA Universal GEMM Template.

    Required conditions:
        1. NVGEMM backend is enabled
        2. cutlass_api is available
        3. We are on a NVIDIA GPU
        4. Max autotune or max autotune gemm is enabled
        5. Not in AOT Inductor mode (requires runtime JIT compilation)
        6. Base pointers are 16-byte aligned
        7. Shape dimensions are not unbacked symbols

    Note:
        - Shape and stride constraints are handled internally by
          cutlass_api.get_kernels() which filters incompatible kernels.
        - GroupedGemm currently only supports TN layout (column-major B).
          Any other layout will act as a noop and fall back to ATen.
        - Dynamic shapes are supported as long as they have hints
          (from example inputs).
    r   has_free_unbacked_symbolsFNVGEMMr6   rt  rG   Nc                 3  r  r   rN   )rS   dimr?  rN   rU   r     r  z1use_nv_universal_gemm_template.<locals>.<genexpr>c                 3  s     | ]}|   jjv V  qd S r   )r  rx  r  )rS   trt  rN   rU   r     r  T)r  r@  r!  r$  r  rw  ru  aot_compilationr   r  rP   r  ru   rm   r  r  r  r  )
r>  r  r)  r  r'  r(  r+  r  dims_to_checktensors_to_checkrN   )ru  r@  rU   use_nv_universal_gemm_template`  s2   


rG  op_namec                 C  s4   t jj }|dkrdS |  dd |dD v S )z8Check if CUTLASS should be used for the given operation.ALLTc                 S  r  rN   r  rR   rN   rN   rU   rV     r  z'_use_cutlass_for_op.<locals>.<listcomp>rP  )rm   r  cutlass_enabled_opsr   r`  )rH  enabled_opsrN   rN   rU   _use_cutlass_for_op  s   rL  r   threshold_multiplec              
   C  sf   ddl m} tjj| }|jjt	t
|||  t
||| o2|jj o2|jj o2tjjdkS )Nr   rt  )torch._inductor.virtualizedru  rm   r  decompose_k_thresholdrx  ry  statically_known_truer   AndGeaot_modecpp_wrappernum_decompose_k_splits)r  r)  r  rM  ru  rO  rN   rN   rU   use_decompose_k_choice  s   
rV  c              
   C  sb   t jj}ddlm} ttjjo0|j	j
tt|||  t||| o0|j	j o0|j	j S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rt  )rm   rocmcontiguous_thresholdrN  ru  r   rP   r  ru   rx  ry  rP  r   rQ  rR  rS  rT  )r  r)  r  rX  ru  rN   rN   rU   use_contiguous  s   rY  c                   s0  t jj}g d}t|tjr|js|S |dkrg S t| tjr"| jr+t|tjr.|js.d n	t||  ||  dt|} fdd|D }g g g }}}|D ].}	||	 }
|
dk r]qR|
|
d @ dkro|
dkro|	|	 qR|
d	 dkr{|	|	 qR|	|	 qRt j
d
kr|| | S || | }|d | S )N)rv   r  ry   rw      r   rZ  r  c                   s    g | ]}| kr|kr|qS rN   rN   )rS   divisormax_k_splitmin_k_splitrN   rU   rV     s
    z get_k_splits.<locals>.<listcomp>rw   r6   r  
EXHAUSTIVE)rm   r  rU  r   r   r6  	is_numberrt  divisorsr  max_autotune_gemm_search_space)r  r)  r  k_splits_limitdefault_k_splitsra  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsrN   r\  rU   get_k_splits  s@   


rk  c                 C  s   t j| jS r   )rP   rG   r  gcnArchNamer   rN   rN   rU   _rocm_native_device_arch_name	  s   rn  Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s|   zdd l } ddlm}m} ddlm} tj| j	}W n t
y7   ddd}ddd	}G d
d d}d }Y nw ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrL   r  c                   S     g S r   rN   rN   rN   rN   rU   rp  /	  r  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  rs  r   rN   rN   rN   rN   rU   rq  2	  r  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rN   rN   rN   rU   rr  5	  s    rr  )rL   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesrp  rq  ck4inductor.universal_gemm.oprr  r  r  dirname__file__r  )rt  rp  rq  rr  package_dirnamerN   rN   rU   try_import_ck_lib	  s   

rz  c                   s   t jst jsdS tjjsdS | jjdkrdS t| j}dd t j	j
D p,|dd |i  fdd  t j	j@ D }|s@dS | jtjtjtjfvrMdS t \}}}}|s]td	 dS |t j	_d
S )NFrG   c                 S  s   i | ]
}| d d |qS ):r   )r`  rS   r  rN   rN   rU   r-  I	  r  z#use_ck_template.<locals>.<dictcomp>r{  r   c                   s   g | ]} | qS rN   rN   r|  requested_archsrN   rU   rV   L	  s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packageT)rm   r  r  rP   r  ru   r   r  rn  rW  archr`  rX  ck_supported_archr   r   rJ  rL  rz  r   r  ck_dir)r>  native_archrequested_supported_archsck_package_dirnamer   rN   r}  rU   use_ck_template<	  s.   


r  c                 C  :   ddl m} tdot| o|jjj|| | dddkS )Nr6   rt  CKr   r2  r   rw  ru  r  r  rx  ry  r6  r>  r  r)  r  ru  rN   rN   rU   use_ck_gemm_templatea	     r  c                 C  r  )Nr6   rt  CKTILEr   r2  r   r  r  rN   rN   rU   use_ck_tile_gemm_templatek	  r  r  c                 C  s   t dot| S )Nr  )r  r  r>  rN   rN   rU   use_ck_conv_templateu	  r  r  c                 C  s   t jpt jo| jjdkS r  )rm   r  r  r   r  r  rN   rN   rU   _use_template_for_cpuy	  s   

r  mat1Union[ReinterpretView, Buffer]mat2c                 C  s   ddl m} t|j|sJ |jj}|jj}t| o:| tj	ko:t
|dko:t
|dko:|d |d ko:|d dk}t| ||ddoI|j pI|S )Nr6   )r@      r  F)require_constant_mat2)r  r@   r   r>  r  rF  r  r  rP   rL  rX   use_cpp_gemm_templateis_contiguous)r>  r  r  r@   	mat1_sizemat1_stridemat1_each_batch_is_contiguousrN   rN   rU   use_cpp_bmm_template	  s$   


r  mat2_transposedr  is_woq_int4q_group_sizec                 C  s>  ddl m} ddlm} ddlm}	 ddlm}
 t| r t	ds"dS t
jjs(dS | tjtjfv }tjtjtjtjtjg}|
|||rF| jnd ||d\}}}} }}t||frZdS t||jrd| }|	| \}}|d	|||| | |t | |d

}ddd}| j|v o|d uo||ot||jo| p| S )Nr6   r
  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refr  rT   r?   rL   r   c                 S  s   |    |  d dkS )Nr   r6   )freeze_layoutr  rT   rN   rN   rU   is_last_dim_stride1	  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1)rT   r?   rL   r   )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  rm   cppweight_prepackr  rP   r[  rP  rL  rJ  halfr   has_free_symbolsr   BaseViewunwrap_viewparallel_num_threadsr;  is_module_buffer)r>  r  r  r  r  r  r  r  r  r  r  	int8_gemmr  r  r)  r  r  r   r  r  rN   rN   rU   r  	  sX   		


r  c                   C  s   t jpt j p
tdS )NATEN)rm   r  r  r  rN   rN   rN   rU   use_aten_gemm_kernels	  s   
r  c                   @  s>   e Zd ZU edZded< dddZddd	ZdddZ	dS )DebugDirManagerr   rM   prev_debug_namerL   r  c                 C  s   t tj| _d S r   )r  r  counterr   r  rN   rN   rU   rD  	  r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )N_tmp_)rP   _dynamorm   debug_dir_rootr  r   new_namer  rN   rN   rU   	__enter__	  s   zDebugDirManager.__enter__r   r   c                 G  s   t | j | jtjj_d S r   )r  r  r  r  rP   r  rm   r  )r  r   rN   rN   rU   __exit__	  s   zDebugDirManager.__exit__Nr  )r   r   rL   r  )
r   r   r   r  rK  r  r   rD  r  r  rN   rN   rN   rU   r  	  s   
 


r  Callable[P, _T]r  r  tuple[_T, list[str]]c                   st   ddl m} t  d
 fdd}tj|d	| tj  | |i |}W d    n1 s/w   Y  |t	 fS )Nr6   r;   coderM   rL   r  c                        |  d S r   )rZ  r  source_codesrN   rU   save_output_code	  rW  z*run_and_get_code.<locals>.save_output_coder  r  rM   rL   r  )
rx  r<   r"   r   patchr  rP   r  resetrb  )r   r   r  r<   r  r  rN   r  rU   run_and_get_code	  s   
r  c                 O  sd   | dd}t| g|R i |\}}g }|D ]}|td|tj |r-dd |D }q||fS )Nremove_quoteFz	'''.*?'''c                 S  s   g | ]}|d d qS )r  rN   )rS   r  rN   rN   rU   rV   
      z'run_and_get_kernels.<locals>.<listcomp>)rY   r  r  r   findallDOTALL)r   r   r  r  r  r  kernelsr  rN   rN   rU   run_and_get_kernels
  s   r  tuple[Any, list[str]]c                   s   d fdd}t |S )NrL   r   c                    s     } |     | S r   )r  r  r  r  rN   rU   run_with_backward
  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)rL   r   )r  )r   r  rN   r  rU   run_fw_bw_and_get_code
  s   r  c              	     s   ddl m} g dfdd d fdd}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r6   r;   r  rM   rL   r  c                   r  r   r  r  r  rN   rU   r  
  rW  z"get_code.<locals>.save_output_coder  r<   r   c                   sF   G dd d}| j r|  n|  \}} |j |r  |j | S )Nc                   @  s$   e Zd ZdZdddZdd	d
ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerL   r  c                 S  r  r   rN   r  rN   rN   rU   rD  %
  r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__r   r   r  c                 _  r  r   rN   r  rN   rN   rU   call(
  r  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNr  r   r   r  r   rL   r  )r   r   r   r   rD  r  rN   rN   rN   rU   DummyModule"
  s    
r  )rT  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_code)r  rN   rU   patched_compile_to_module!
  s   

z+get_code.<locals>.patched_compile_to_modulecompile_to_moduler  Nr  )r  r<   rL   r   )rx  r<   r   r  r  rP   r  r  )r   r   r  r<   r  r   rN   )r  r  rU   get_code
  s$   
(


r  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr6   r  z%expected one or two code outputs got r   )r  rX   )r   r   r  r  rN   rN   rU   get_triton_codeC
  s
   r  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r  )r  rX   )r   r   r  r   r  rN   rN   rU   run_and_get_triton_codeM
  s
   r  tuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg d fd	d
}tj|d| | |i |}W d    |fS 1 s7w   Y  |fS )Nr   r;   rC   r   r   r  rL   r  c                    s2   | i | | d }t | sJ | d S )Nr  )r   r  )r   r  rx  r<   graph_lowerings	real_initrN   rU   	fake_initb
  s   z-run_and_get_graph_lowering.<locals>.fake_initrD  r  )torch._inductor.graphr<   torch._inductor.output_coderD   rD  r   r  r  )r   r   r  rD   r  r  rN   r  rU   run_and_get_graph_loweringY
  s   
r  aten_opoverride_fnc              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)r  r  r  orig_fnrN   rN   rU   override_loweringn
  s   
r  pre_fnpost_fnOptional[Callable[..., Any]]c                   s6   ddl m} |j d fdd}tjj|d	|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   rT  rL   c                   s&   | |  | |}r| | |S r   rN   )r  rT  outr  r  r  rN   rU   r  
  s
   


z(add_scheduler_init_hook.<locals>.wrapperrD  N)r  r   rT  r   rL   r   )torch._inductor.schedulerr  rD  unittestr   r  r  )r  r  r  r  rN   r  rU   add_scheduler_init_hook
  s   r  msgc                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rm   developer_warningsr   r  info)r  rN   rN   rU   developer_warning
  s   r  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr6   r   r  z--only=N)r  argvr  rX   
ValueErrorr  )r,  r  rN   rN   rU   get_benchmark_name
  s   

r  rY  c                 C  r  )Nc                 s      | ]}|d kV  qdS r6   NrN   rR   rN   rN   rU   r   
  r  zis_ones.<locals>.<genexpr>r   rY  rN   rN   rU   is_ones
  r1  r  c                 C  r  )Nc                 s  r	  )r   NrN   rR   rN   rN   rU   r   
  r  zis_zeros.<locals>.<genexpr>r  r  rN   rN   rU   is_zeros
  r1  r  inputsSequence[torch.Tensor]c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r  N)r   rP   r  r   )rS   r   rN   rN   rU   r   
  s    

z is_cpu_device.<locals>.<genexpr>r  )r  rN   rN   rU   is_cpu_device
  s   r  c                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r   r   r6  r   rP   rV  rN  rr  rN   rN   rU   get_sympy_Expr_dtype
  s   r  should_profileIterator[Any]c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S r   )rP   r   r   )r  r   r  r   rN   rN   rU   maybe_profile
  s   "
r  c                  C  s   t jj} | dk rt } | S Nr6   )rm   r  threadsrP   get_num_threads)r  rN   rN   rU   r  
  s   r  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr6   )get_backend_options
num_stagesr  r  )runtime.triton_helpersr  rV  rP   r  ru   )r  optionsrN   rN   rU   get_backend_num_stages
  s   r  c                 C  s  t | tjjjjdkd}|dur|S ddlm}m} tj	 o%tj
 dk}| tjtjtjfv s2J t|jdrgddlm} | }| tjtjfv rS|rS|| |S tjjjjdkra|tj|S |tj|S | tjtjfv ru|ru|| S tjjjjdkr|tjS |tjS )	z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    tf32)is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)rz   r   
clock_rate)max_clock_rate)r   rP   backendsrG   matmulfp32_precisiontriton.testingr   r!  rQ   get_device_capabilityr   rJ  rL  inspect	signature
parametersrV  torch._utils_internalr#  )r   ds_topsr   r!  SM80OrLaterr#  sm_clockrN   rN   rU   get_device_tflops
  s*   


r0  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r'  r2  r1  rN   rN   rU   get_gpu_dram_gbps  s   r3  c                  C  s"   ddl m}  | jjdddS )Nr   r  max_shared_mem)triton.runtimer  r  r  r  rV  r4  rN   rN   rU   get_gpu_shared_memory&  s   r7  c                  C  s:   t j rt j j} t j j}||  S d} d}||  S )Nr  i   )rP   rG   rQ   r  	warp_sizemax_threads_per_block)r8  r9  rN   rN   rU   get_max_numwarps,  s   
r:  reduction_typec                 C  s
   |  dS )Nwelford)r  r;  rN   rN   rU   is_welford_reduction8  rR  r>  c                 C  s   t | rdS | dkrdS dS )Nr  online_softmax_reducer  r6   )r>  r=  rN   rN   rU   reduction_num_outputs<  s
   r@  c                   C  s   t  dkS )NLinux)platformsystemrN   rN   rN   rU   is_linuxE  r  rD  c                   C  s
   t jdkS )Nro   )r  rB  rN   rN   rN   rU   r  I  rR  r  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S r   )r   r   r6  r`  rR   rN   rN   rU   r   N     " z#has_free_symbols.<locals>.<genexpr>r  )rE  rN   rN   rU   r  M  r1  r  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr6   r
  rN   Tzunexpected type for is_dynamic F)r  r  r   r9  r;  r  rw  r=   r  maybe_get_sizemaybe_get_strider?   	TypeErrorr  )r   r  rC  rN   rN   rU   r0  Q  s   
r0  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   rL  rM  rN   rN   rN   rU   rK  e  s    rK  r  r3   inpc              	   C  s2  ddl m} tjddd}t }t }t|t|dj|  t	d|j
 |d t	|j
|d t }t|| | |j
 W d    n1 sKw   Y  t | }	||j
 |j
  |  t	d	|j
 |d t	|j
|d | | k}
td
||j|
|	 W d    d S 1 sw   Y  d S )Nr6   )stable_topological_sortrj  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherrO  r  NamedTemporaryFileior   rb   r^   	propagater  rx  r
   nowra   lint	recompilerL  r   r  r   )r  r  rN  r  rO  r
  	before_ioafter_io
start_timetime_elapsedrC  rN   rN   rU   pass_execution_and_saveo  s<   

"r_  	input_buf"Optional[Union[Buffer, Operation]]c                 C  s&   ddl m} t| |jot| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r6   r
  )r  r  r   CppTemplateBufferr>  MultiOutputLayoutr`  r  rN   rN   rU   is_multi_outputs_template  s   re  c                 C  s4   ddl m} t| |jot| jdkot| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r6   r
  r   )r  r  r   MultiOutputrX   r  re  rd  rN   rN   rU   #is_output_of_multi_outputs_template  s   rg  rp   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                 C  s   | d u rdS ddl m} t| |jo!t| |j o!|d u p!| j|u pXt| |ju oXtt	j
jdo8| jt	j
jjjkpXtt	j
jdoH| jt	j
jjjkpXtt	j
jdoX| jt	j
jjjkS )NFr6   r
  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r  torchrecrj  defaultrk  rl  rp  r  r  rN   rN   rU   is_collective  s(   

rt  "Optional[Union[IRNode, Operation]]c                 C  s   ddl m} t| |ju S Nr6   r
  )r  r  r  rn  )rp  r  rN   rN   rU   is_wait  s   rw  snoderE   	filter_fn-Optional[Callable[[BaseSchedulerNode], bool]]c                 C  sD   ddl m} t| |rtdd | jD S t| jo!|d u p!|| S )Nr   GroupedSchedulerNodec                 s  r  r   )contains_collectiverR   rN   rN   rU   r     r  z&contains_collective.<locals>.<genexpr>)r  r|  r   r  snodesrt  rp  )rx  ry  r|  rN   rN   rU   r}    s   
r}  c                 C  s4   ddl m} t| |rtdd | jD S t| jS )Nr   r{  c                 s  r  r   )contains_waitrR   rN   rN   rU   r     r  z contains_wait.<locals>.<genexpr>)r  r|  r   r  r~  rw  rp  )rx  r|  rN   rN   rU   r    s   

r  Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr|g}t| |jo| j|v S rv  )r  r  r   rP   r  r  rp  ro  rs  rN   rN   rU   is_fallback_op  s   r  buf_namename_to_bufname_to_fused_nodec                 C  s   |||  j   S r   )defining_opr  )r  r  r  rN   rN   rU   buf_name_to_fused_snode  s   r  c                 C  r  r  rN   rx  rN   rN   rU   r    r  collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                 C  sP   || rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Nr  )rZ  unmet_dependenciesr  r   find_recursive_deps_of_node)rx  r  r  r  r  depdefining_op_for_deprN   rN   rU   r    s"   

r  c                 C  r  r  rN   r  rN   rN   rU   r    r  c              	   C  s   || rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr%q|j |vr-q||j  }||v r9qt|||||d qqd S )NOUTPUTr  )rZ  get_outputsr  rp  r  find_recursive_users_of_node)rx  r  r  r  r  ors  user_oprN   rN   rU   r    s,   

r  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rP   
_functorchrm   functionalize_rng_ops)r  r  num_rng_seed_offset_inputsrN   rN   rU   num_fw_fixed_arguments4  s   r  fx_gc                 C  sd   ddd}d}g }| j jD ]}|jdkr!||r|| |d	7 }q|ttt|ks.J t|S )z>
    Infers which inputs are static for a backwards graph
    rT   r4   rL   r   c                 S  s(   d| j vod| j vod| j vod| j vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  rN   rN   rU   is_saved_tensorD  s   
z'count_tangents.<locals>.is_saved_tensorr   r  r6   N)rT   r4   rL   r   )rx  rT  r  r  rb  r   rX   )r  r  	arg_countstatic_arg_idxsr)  rN   rN   rU   count_tangents?  s   


r  c                   @  s.   e Zd ZU ded< dddZedd	d
ZdS )	BoxedBoolr   r   rL   c                 C  s   | j S r   )r   r  rN   rN   rU   rY  \  s   zBoxedBool.__bool__r  r   Union[BoxedBool, bool]c                 C  s   t | tr
d| _| S dS r  )r   r  r   r  rN   rN   rU   disable_  s   
zBoxedBool.disableNr  )r  r   rL   r  )r   r   r   r   rY  r  r  rN   rN   rN   rU   r  X  s
   
 
r  kernel_listc                 #  sh    ddl m} |j	 		 dd fdd}tj|d| d V  W d    d S 1 s-w   Y  d S )Nr6   r8   Tr  r9   kernel_namerM   r  rc  r  gpur   cpp_definitionrL   r   c                   s     | | |||||S r   r  )r  r  r  rc  r  r  r  orig_define_kernelrN   rU   define_kernelm  s   
z.collect_defined_kernels.<locals>.define_kernelr  )NTN)r  r9   r  rM   r  rM   rc  r  r  r   r  r  rL   r   )codegen.wrapperr9   r  r   r  r  )r  r9   r  rN   r  rU   collect_defined_kernelsg  s   "r  c                 C  s   | d S )N__original__rN   r  rN   rN   rU    get_cloned_parameter_buffer_name~     r  c                 C     | t v S r   )rW   rm  rN   rN   rU   r    r  r  c                   C  s   t jjduS )z,Check if we're running on ROCm/HIP platform.N)rP   r  ru   rN   rN   rN   rU   is_rocm  s   r  c                 C  s   | dkot | S )NrH   )r  rm  rN   rN   rU   device_need_guard  r  r  c                 C  sJ   | t jkrt j rt j dk S | t jkrt j rdS | t jt jfv S )N)r  r   T)rP   rJ  rG   rQ   r(  rI   rV  r   r5  rN   rN   rU   ,needs_fallback_due_to_atomic_add_limitations  s
   r  ro  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |fvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjfv p]t S )NFrZ  r  r  r6   )overloadpacketrP   r  atenscatter_reduce_scatter_reducescatter_r  r  rm   r  fallback_scatter_reduce_sumdynamic_threadsr  r   rV  $are_deterministic_algorithms_enabled)ro  r;  r  r  r  r  	reduce_tyrN   rN   rU   use_scatter_fallback  s8   	r  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr3  3r{  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rX   r   r   is_reductionrp  r:  reduction_hintrK  rL  rM  r   r  )r  r  r  r  r,  rp  is_redr  rN   rN   rU   dump_node_schedule  s0   




r  r   r  c                 C  s*   ddl m} ||  t| j t dkS )Nr   )rP  )r  rP  storage_offsetr7  r   GPU_ALIGN_BYTES)r   rP  rN   rN   rU   tensor_is_aligned  s   r  example_inputc                 C  s   t | jjsdS tjpt| S r  )r  r   r  rm   assume_aligned_inputsr  )r  rN   rN   rU   should_assume_input_aligned  s   r  rf  c                  C  s>   t jj } | st S | jr| jjst S | jj}| S r   )	rP   _guardsTracingContexttry_getrk  nullcontextrR  rz  suppress_guards)tracing_contextrz  rN   rN   rU   #maybe_get_suppress_shape_guards_ctx  s   r  tuple[_T, str]c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)r   r   r  r  rm   rP   r  r  rV  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGrL  removeHandler)r   r   r  rV  r  log_capture_stringchr  
prev_levelr  r   rN   rN   rU   run_and_get_cpp_code  s$   




r  Sequence[InputType]Optional[ShapeEnv]c                 C  s   t | }|d ur|jS | D ]>}t|tjr|jj  S t|tjrK| D ]}t|tjr5|jj    S q%| D ]}t|tjrJ|jj    S q:qd S r   )	r^   rz  r   rP   r1   rp  r  r  rF  )r  rR  inputr  rF  rN   rN   rU   shape_env_from_inputs   s"   r  Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                   s&   t  dkrS d fdd}|S )	Nr   
new_inputslist[InputType]rL   r   c                   s0   t |  \}}| }t|rt|| |S r   )copy_misaligned_inputsrX   rP   _foreach_copy_)r  old_tensorsnew_tensorsr  r  r  r  rN   rU   r  E  s   z)align_inputs_from_check_idxs.<locals>.run)r  r  rL   r   )rX   )r  r  r  r  rN   r  rU   align_inputs_from_check_idxs=  s   r  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS r
  rN   )rS   r@  rF  rN   rN   rU   r   [  r  z)clone_preserve_strides.<locals>.<genexpr>r6   r   )r  r  r   rF  rP   
as_stridedclone)rT   needed_sizer5  rN   rN   rU   clone_preserve_stridesU  s   "r  r  r  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                 C  s   g }g }|du}|D ]3}| | }t |tjsJ dt| | t r=t|| |< |r=||v r=|| || |  q
||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r   rP   r  r  data_ptr	ALIGNMENTr  r  )r  r  r  r  r  ret_pair_definedr   _inprN   rN   rU   r  a  s   

r  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r   rP   r  r  r  r  rX   )r  r  aligned_static_input_idxsr,  r  rN   rN   rU   remove_unaligned_input_idxs  s   
r  r   c                 C  s   ddl m} ttjj}|jjj}|jjj	j
}tjr%|jj| | dS |jj| |kr0dS |jr>|jj| dk r>dS || oG|| |kS )Nr6   rt  Tg@xDF)rw  ru  rP   iinforT  r   rx  ry  r  rz  has_hintrm   assume_32bit_indexing	check_leqrP  rD  )r   ru  int_maxr  r  rN   rN   rU   expr_fits_within_32bit  s   
r  compiled_graphrD   c                   s   t jj }|d urX|jd urZt|jdksJ t| |jd us#J |jD ]5}|d u r3|jd  q&d t jj  }r@|j d fdd|jt	fd	d
|D  q&d S d S d S )Nr   Fr   r   rL   ,Union[float, int, SymInt, SymFloat, SymBool]c                   s(   d u rt | S  r| S | S r   )r|   deserialize_symexprevaluate_symexpr)r   )fakify_first_callrz  rN   rU   map_expr  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  r  r   rN   rS   r   )r  rN   rU   r     r  z5set_tracing_context_output_strides.<locals>.<genexpr>)r   r   rL   r  )
rP   r  r  r  output_stridesrX   r  r  r  r  )r  r  r9  r2  r  rN   )r  r  rz  rU   "set_tracing_context_output_strides  s"   
r  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rm   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  rN   rN   rU    should_use_remote_fx_graph_cache  s   

r&  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )r   subr  rN   rN   rU   normalize_name  rW  r(  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                 C  r+  rN   rN   r  rN   rN   rU   r-    r.  r-  z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer'  rM   _triton_type_mappingrV  )r   triton_type_namerN   rN   rU   triton_type  s   r-  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nr)  r  )_torch_triton_mappingrV  r  rO   rP   r   r   )r   adjusted_type	type_namer  rN   rN   rU   triton_type_to_torch  s
   
r1  r:  c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS r   )	is_mkldnnr  rF  r   r   untyped_storager  r  r:  r   rN   rN   rU   is_same_tensor  s   

r5  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS r   )r2  r  r   r   rP   r  mkldnnr  r4  rN   rN   rU   is_same_mkldnn_tensor  s   

r7  tuple[str, ...]c                   C  r  )N)rv  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorrN   rN   rN   rN   rU   boolean_ops"  r  rE  c                   @  r;  )OpDtypeRuler2   type_promotion_kindr:  override_return_dtypeNr:  rN   rN   rN   rU   rF  6  r>  rF  zdict[str, OpDtypeRule]op_dtype_propagation_rulesrG  r2   rH  c                 C  s   t ||t| < d S r   )rF  rI  )r   rG  rH  rN   rN   rU   #register_op_dtype_propagation_rules?  s   rJ  zOrderedSet[str]op_requires_libdevice_fp64c                 C  s   t |  d S r   )rK  rZ  r  rN   rN   rU   #register_op_requires_libdevice_fp64L  rW  rL  r   c                 C  sJ   ddl m} | s|j j} | dkrtjS | dkrdS | dkr"tjS tjS )Nr   rt  r  rH   rI   )	rN  ru  rx  get_current_device_or_throwr  rm   cpu_backendxpu_backendcuda_backend)r   ru  rN   rN   rU   get_current_backendP  s   rQ  c                 C  s,   | t jt jfv rtjjrt dkrt jS | S )z"Maybe upcast [b]float16 to float32r  )rP   r   rJ  rm   r  codegen_upcast_to_fp32rQ  rL  r5  rN   rN   rU   upcast_compute_type_  s   
rS  KeyTypeValTypec                   @  sl   e Zd ZdZd#ddZd$d
dZd%ddZd&ddZd'd(ddZd)ddZ	d*ddZ
d+dd Zd,d!d"ZdS )-
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                 C  s   || _ i | _d S r   rW  	new_items)r  rW  rN   rN   rU   rD  v  rE  zScopedDict.__init__r9  rT  rL   rU  c                 C  s   || j v r
| j | S | j| S r   rZ  rW  r  r9  rN   rN   rU   r!  z  s   


zScopedDict.__getitem__r   r  c                 C  s   || j |< d S r   )rZ  )r  r9  r   rN   rN   rU   __setitem__  rW  zScopedDict.__setitem__r  r   c                 C  s   || j v p	|| jv S r   r[  r\  rN   rN   rU   __contains__  r#  zScopedDict.__contains__Nrr  Optional[ValType]c                 C  s"   || j v r
| j | S | j||S r   )rZ  rW  rV  )r  r9  rr  rN   rN   rU   rV    s   

zScopedDict.getr|   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S r  )rX   rW  rZ  )r  r)  r  rN   rN   rU   r    s   


zScopedDict.__len__Iterator[KeyType]c                 c  s.    | j E d H  | jD ]
}|| j vr|V  q
d S r   rY  )r  r  rN   rN   rU   __iter__  s   

zScopedDict.__iter__c                 C  s   t | jp| jS r   )r   rW  rZ  r  rN   rN   rU   rY    r  zScopedDict.__bool__c                 C  r  r   r  r\  rN   rN   rU   __delitem__  r  zScopedDict.__delitem__)rW  rX  )r9  rT  rL   rU  )r9  rT  r   rU  rL   r  )r9  r  rL   r   r   )r9  rT  rr  r_  rL   r_  r  )rL   r`  r  )r9  rT  rL   r  )r   r   r   r   rD  r!  r]  r^  rV  r  ra  rY  rb  rN   rN   rN   rU   rV  n  s    






rV  )frozen_defaultr   Optional[type[Any]]r   c                 s"   d fdd}| d u r|S || S )Nr   rp   rL   c                   s   t j| d dS )NT)kw_onlyr   )dataclasses	dataclass)r   r   rN   rU   wrap  r  zir_dataclass.<locals>.wrap)r   rp   rL   rp   rN   )r   r   rh  rN   r   rU   ir_dataclass  s   ri  Optional[list[int]]c                  C  s&   t jj } | d ur| jr| jjS d S r   )rP   r  r  r  fw_metadatabw_donated_idxs)r  rN   rN   rU   get_donated_idxs  s   rm  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r6   r  r  r@  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrN   rN   rN   rU   rn    s    rn  c                  C  sT   t jdd u rtjS dd l} dd l} t| jj	drtj
S t| j	j	dr'tjS tjS )Nr  r   AttrsDescriptor)r  r  r   rn  ro  triton.backends.compilertriton.compiler.compilerr  r$  compilerrq  rp  rs  )r  rN   rN   rU   #get_triton_attrs_descriptor_version  s   rx  c                   C  s   t  tjkS r   )rx  rn  rs  rN   rN   rN   rU   triton_version_uses_attrs_dict  r  ry  torch._ops.OperatorBasec                 C  s2   |   }t| tjjr| d| j n|}||fS )Nrt   )r   r   rP   r  r  _overloadname)r  op_overload_packet_nameop_overload_namerN   rN   rU   get_op_names  s   r~  r  torch.fx.Nodec                 C  s   ddl m} | j}t|tjjsdS |tjjj	j
tjjjj
tjjjj
fv rS||| j| jdd}|durS|\}}|d }|D ]}|durR|jd jtjtjfv rR dS q=dS )	a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsNindicesrm  )torch.fx.operator_schemasr  r  r   rP   r  r  r  r  	index_putrr  
index_put__unsafe_index_putr   r  r  r   r   r[  )r  r  r  
normalizedr   r  r  r,  rN   rN   rU   ,_fx_node_is_input_dependent_cudagraph_unsafe  s.   


r  c                 C  s   | j }t|tv rdS t|tjjrtjjj	|j
v rdS t| r"dS | jd }durIt|ttfs6|gn|}|D ]}t|tjrH|jrH dS q:dS )a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    Trm  NF)r  rM   FORBIDDEN_CUDAGRAPH_OPSr   rP   r  r  r	  r  cudagraph_unsafer  r  r  rV  rb  r  r  	is_sparse)r  r  rm  valsr   rN   rN   rU   r    s    
r  rA   c                 C  s\   ddl m} t| |j|jfrdS t| |j|jfsdS t| dd}|dur,t|r,dS dS )ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r6   r
  TFr  N)	r  r  r   Conditional	WhileLooprp  r>   rO   r  )rp  r  r  rN   rN   rU   is_cudagraph_unsafe_op*  s   
r  c                  C  sX   t jdd} t r*ddlm} | }|r*t j|dd}| r(t j	|| gn|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  rV  rm   r   libfb.py.parutilr  r  r"  pathsep)r  r  runtime_pathlib_pathrN   rN   rU   get_ld_library_pathD  s   r  c                 C  s    ddl m} t| |o| jd uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  r   partition_signatures)r  r  rN   rN   rU   #is_codegen_graph_partition_subgraphQ  s   
r  c                   C  s    t jjjjp
tjd uot jjjS r   )rP   r  rm   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionrN   rN   rN   rU   is_using_cudagraph_partitionZ  s
   r  c                 C  s8   ddl m} |jj| dr|jj| drtjS tjS )Nr6   rt  l        i   )	rw  ru  rx  ry  statically_known_ltr  rP   rT  rV  )r  ru  rN   rN   rU   dtype_from_sizea  s   r  )r  rI   c                 C  $   | dkr
t jj S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r  rI   TF)rP   r  r6  _is_mkldnn_bf16_supportedr   rN   rN   rU   is_mkldnn_bf16_supportedo  
   r  c                 C  r  )z;
    Returns True if the device supports MKL-DNN FP16.
    r  rI   TF)rP   r  r6  _is_mkldnn_fp16_supportedr  rN   rN   rU   is_mkldnn_fp16_supported{  r  r  elementsSequence[Sequence[T]]headersSequence[T]c              	   C  s   dd |D }| D ]"}t |t |ksJ t|D ]\}}t|| t t|||< qq	g }|ddd t||D  t|t |d  t |d  }|d|  | D ]}|dd	d t||D  qWd
|S )Nc                 S  s   g | ]}t t|qS rN   )rX   rM   r  rN   rN   rU   rV     r  ztabulate_2d.<locals>.<listcomp>|c                 s  $    | ]\}}d || d V  qdS r3  NrN   )rS   hrj  rN   rN   rU   r     rG  ztabulate_2d.<locals>.<genexpr>r  r6   r  c                 s  r  r  rN   )rS   r   rj  rN   rN   rU   r     rG  rQ  )rX   r   r   rM   r  r"  r   r  )r  r  widthsrowr   r   rb  total_widthrN   rN   rU   tabulate_2d  s     "
r  dict1rX  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c                 c  s`    t |  t | B }|D ]}| |}||}||dur"|n||dur)|n|fV  qdS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r"   rX  rV  )r  r  r  r  all_keysr9  value1value2rN   rN   rU   	zip_dicts  s   

r  config_patchesc                 C  s   dd	d
}ddd}|  dtjj}|  } |rA|| dd || dd || dtjj  || dd || dtjj	 || dd |  dtj
j}|  dtj
j}|dkr[|r[td| S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    r  r  config_namerM   config_valuer   rL   r  c                 S  sP   |  |tt|}|d u r|| |< d S |s$||kr&td| d| dd S d S )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)rV  rO   rm   r   r  r  r  r   rN   rN   rU   patch_config  s   z2maybe_aoti_standalone_config.<locals>.patch_configc                 S  s4   |  |tt|}||krtd|| || |< d S )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)rV  rO   rm   r   r  r  rN   rN   rU   force_patch_config  s   z8maybe_aoti_standalone_config.<locals>.force_patch_configz$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.N)r  r  r  rM   r  r   rL   r  )rV  rm   aot_inductor_modecompile_standalonecopyrP   r  ru   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )r  r  r  r  r  r  rN   rN   rU   maybe_aoti_standalone_config  sF   

r  consts_sizetuple[bool, bool]c                 C  s   t jjrt jjdkrtdt jjr$t jjdkrtdd}d}||fS t jjdkr2d}d}||fS | dkr8dS d}t   }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)rm   r  force_mmap_weights package_constants_on_disk_formatr   r  r   )r  use_mmap_weightsuse_external_weightsrN   rN   rU   determine_aoti_mmap_flags  s.   

r  c                  C  sV   ddl m}  | jj}|du rdS t|tstd|dkrdS td|s)tddS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rl   NTz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rm   r  model_name_for_generated_filesr   rM   r  r   r   )rm   
model_namerN   rN   rU   is_valid_aoti_model_name8  s   
r  r)   unbacked_onlyOrderedSet[sympy.Symbol]c                 C  s   |rt | S t| S r   )r(   r'   )rT   r  rN   rN   rU   get_free_symbolsS  s   r  dict[str, str]c                  C  s@   i t jdt jdt jtji} t rt	
d| d< | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr:  
PYTHONHOME)r  r  rV  r  r"  r  r  rm   r   	sysconfigget_path)envrN   rN   rU   python_subprocess_envZ  s   r  c                   @  s"   e Zd ZU dZded< ded< dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r|   num_partitionspartition_indexNr   rN   rN   rN   rU   r  u  s   
 r  .c                   @  s   e Zd ZU dZded< dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   rN   rN   rN   rU   r    s   
 r  CUDAGraphWrapperTypec                 C  s
   | t _d S r   )r  r  )r  rN   rN   rU   !set_customized_partition_wrappers  rR  r   tuple[list[Any], dict[str, Any]]c                   s   | j j}| j g || j j| j j}| j j}t||f\}}ddd  fdd|D }ddd	dfddfdd|D }t||\}}||fS )NrL   r   c                 S  s"   t | tjjjot | tjjj S r   )r   rP   r  r  r?   GeneratorStater  rN   rN   rU   _is_tensor_ir  s   
z(snode_args_kwargs.<locals>._is_tensor_irc                   s*   g | ]} |rt jjj|d dn|qS )F)guard_shape)rP   r  r  ir_node_to_tensorr  )r  rN   rU   rV     s    z%snode_args_kwargs.<locals>.<listcomp>r  c                 S  s   t j| ||dS )Nr   )rP   r   )r  r   r   rN   rN   rU   _tensor  r  z"snode_args_kwargs.<locals>._tensorr   r   c                   s(   t | tjs| S  |  | j| j}|S r   )r   rP   r  r  r   r   )r   r  )r  rN   rU   to_real_tensor  s   z)snode_args_kwargs.<locals>.to_real_tensorc                   r}  rN   rN   r  )r  rN   rU   rV     r  r  )rL   r  )r   r   rL   r   )rp  r  fill_non_provided_argsconstant_argsr  pytreer#   tree_unflatten)rx  r   r  	flat_argsflat_args_pytree_specrN   )r  r  r  rU   snode_args_kwargs  s    


r  r  r:   c                 C  s6   ddl m} | j}|jjr||jjd }|dS )Nr6   rt  r   )primals_r  fwd_rng_stater  r  )rw  ru  r   rx  removeprefixr  )r  ru  dep_namerN   rN   rU   is_nonfreeable_buffers  s   r  template_dirr.   c                 C  s>   t ||  d }| W  d   S 1 sw   Y  dS )z,Load a template file and return its content.z	.py.jinjaN)openread)r   r   r
  rN   rN   rU   load_template  s   $r  c                 C  s   | j }t|tjjtjjfsJ dt| tjsdS t	tj
jjjtj
jjjg}||v r0dS t	tj
jjg}t|tjjrC||v S t|  S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r   rP   r  r  r  r  rm   fallback_by_defaultr"   r  r  _assert_scalarrr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr    )rp  r  "skip_fallback_due_to_dynamic_shapefallback_hopsrN   rN   rU   should_fallback_by_default  s*   



r  )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                 C  r  )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)rH  rN   rN   rU   is_collective_op  s   r  c                  C  s6   t  rz	ddlm}  | W S  ty   g  Y S w g S )Nr   tlx_only_cuda_options)rm   r   )torch._inductor.fb.tlx_templates.registryr  r  r  rN   rN   rU   r    s   r  yc                 C  s   | | d | | S )z(Round x up to the nearest multiple of y.r6   rN   )rT   r  rN   rN   rU   	_round_up"  s   r  mat_sizetuple[Any, Any]
scale_sizetuple[Any, ...]scale_numel	mat_dtypescale_dtypeeq_fnCallable[[Any, Any], bool]#tuple[Optional[Any], Optional[Any]]c                 C  s  ddl m}m} ||dr|j|jfS t|dkr||d | d r)||d ds9||d dr?||d | d r?|j|jfS ||d | d rT||d t| d dsi||d | d ro||d t| d dro|j|jfS ||d t| d dr||d t| d dr|j	|jfS |t
jkrdnd}|t
jkr|t
jkrt| d dtt|| d  dd }	t| d dtt|| d  dd }
|||	s|||
r|j|jfS |t
jkrGt
jjst| d dtt|| d  dd }	t| d dtt|| d  dd }
|||	s|||
r|j|jfS d	S t| d d| | d  }	t|| d  d| d  }
|||	sA|||
rG|j|jfS d	S )
z:
    Core implementation for scale/swizzle inference.
    r   )r5   SwizzleTyper6   r  rw   rv   r@  r  NN)torch.nn.functionalr5   r  
TensorWise
NO_SWIZZLErX   RowWisern   BlockWise1x128BlockWise128x128rP   rH  rD  r  BlockWise1x16SWIZZLE_32_4_4rF  r  ru   BlockWise1x32)r  r  r  r  r  r  r5   r  K_multiplierexpected_numel_aexpected_numel_brN   rN   rU   _infer_scale_swizzle_impl'  sj   
 

r+  matscalec                 C  s6   t | jd | jd ft|j| | j|jdd dS )a  
    Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

    This function determines how scale factors are laid out relative to the matrix:
    - TensorWise: Single scale for entire tensor
    - RowWise: One scale per row
    - BlockWise1x128/128x128: Block-scaled with float32 scales
    - BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
    - BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

    Args:
        mat: The matrix tensor (FP8 or FP4)
        scale: The scale factor tensor

    Returns:
        Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
    r   r6   c                 S  s   | |kS r   rN   r  rN   rN   rU   r    s    z%infer_scale_swizzle.<locals>.<lambda>r  r  r  r  r  r  )r+  r@  r  numelr   )r,  r-  rN   rN   rU   infer_scale_swizzlew  s   r0  r=   	transposec                   s   ddl m  |  }| }|r|d |d f}|r"ttj|dnd}d fd	d
}tt|dkr:|d |d fn|d dft	||| j
|j
|dS )z
    Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

    This is the IR-compatible version of infer_scale_swizzle, using symbolic
    size comparisons via V.graph.sizevars.statically_known_equals.
    r   rt  r6   r'  r   r(  rL   r   c                   s    j j| |S )z5Compare values using symbolic equality when possible.r  r  rt  rN   rU   symbolic_eq  s   z+infer_scale_swizzle_ir.<locals>.symbolic_eqr  r.  N)r'  r   r(  r   rL   r   )rN  ru  r  r  r  r  r  r+  rX   r  r   )r,  r-  r1  r  r  r  r2  rN   rt  rU   infer_scale_swizzle_ir  s   &r3  r  )r{   r|   rL   r|   )r   r   rL   r   )r   r   )r   r   r   r|   r   r|   rL   r   )r   r   F)
r   r   r   r|   r   r|   r   r   rL   r   r  )r   r  rL   r  )r  r  rL   r   )r$  r%  r&  r%  rL   r   )r  r+  rL   r,  )r2  r3  r4  r3  rL   r3  )r9  r:  rL   rM   )rf  rg  rL   rh  )rm  rn  rL   r3  )r   r3  rL   rn  )rf  r}  rL   r~  )r  r  rL   r   )r  r4   r  r  rL   r   )r  r   r   r  r  r  rL   r  )rG   )r   rM   rL   r  )r6   rG   )
r  r  r  r  r   r|   r   rM   rL   r   )rN   r  r  r  rG   )r  r  r  r  r   r|   r  r|   r  r   r   rM   rL   r   )r  r   r  rM   rL   r  )r  r   r  r   rL   r  )r'  r|   r(  r|   rL   r|   )rT   r  r  r|   rL   r  )rT   r  rL   r  )r   r  rL   r  )r  rM   rL   r  )r  r  rL   r	  )r  r  r  r  rL   rM   )r  r  r  r9   rL   r(  r   )rm  rn  ro  rp  rL   rq  )r   ru  r  rv  rL   rq  r  )r  r   rL   r  )r  rM   rL   r   )r  ri   r,  r|   rL   r  )r  r   rL   r   )r   rM   rL   r  )rq  r   r  r  rL   r   )r'  r   rL   r  )r   r   rL   r   )r  r  rL   r  )r  r  rL   r4   )r  r  rL   r  r  )r  r   rL   r   )r9  rM   r   rM   rL   r  )NNT)r  r   r  r  r  r   rL   r  )r  r  r  r   rL   r  )rz  r*   r  r&  r  r   rL   r  )r   r4  rL   r|   rj  r   )r  r  rL   r   r  )r  r|   r   r  r  r  rL   r7   )r>  r@   r  r  rL   r   )r  rM   rL   r   )
r>  r@   r  r   r  r   r  r   rL   r   )r  r?   r  r  r  r   rL   r   )r  r?   r  r@   r  r   rL   r   )r  r5   r  r5   r  r  rL   r   )r'  r   r(  r   r>  r@   r)  r   r*  r   r+  r,  r-  r,  r.  r,  rL   r   )
r>  r@   r  r|   r)  r|   r  r|   rL   r   r  )r>  r@   r  r<  r)  r<  r  r<  r'  r?   r(  r?   r+  r=  r  r>  rL   r   )rH  rM   rL   r   r   )
r  r<  r)  r<  r  r<  rM  r|   rL   r   )r  r<  r)  r<  r  r<  rL   r   )r  r<  r)  r<  r  r<  rL   r  )r   rM   rL   rM   )rL   ro  )r>  r@   rL   r   )r>  r@   r  r  r  r?   rL   r   )FTFN)r>  r@   r  r?   r  r?   r  r   r  r   r  r   r  r  rL   r   )r   r  r   r  r  r  rL   r  )r   r  rL   r  )r   r  r   r  r  r  rL   r   )r   r  r   r  r  r  rL   rM   )r   r  r   r  r  r  rL   r  )r  r  r  r  rL   r  )r  r  r  r  rL   r   )r  rM   rL   r  )rL   r  )rY  r  rL   r   )r  r  rL   r   )rm  r   rL   r4  )r  r   r   r   r  r   rL   r  )r   r4  rL   r   )r;  rM   rL   r   )r;  rM   rL   r|   )rE  rF  rL   r   )
r  r  r  r3   rN  r  r  rM   rL   r  )r`  ra  rL   r   )rp  rh  r  ri  rL   r   )rp  ru  rL   r   )rx  rE   ry  rz  rL   r   )rx  rE   rL   r   )rp  r  r  r  rL   r   )r  rM   r  r  r  r  rL   r   )rx  rE   r  r  r  r  r  r  r  r  rL   r  )r  r|   r  r|   rL   r|   )r  r  rL   r|   )r  r   rL   r  )r   rM   rL   rM   )r   r  rL   r   )r   rM   rL   r   )r   r4  rL   r   )ro  r  r;  r  r  r4  r  r4  r  rM   r  r   rL   r   )r  r  rL   r  )r   r  rL   r   )r  r  rL   r   )rL   rf  )r   r  r   r  r  r  rL   r  )r  r  rL   r  )r  r  r  r  r  r  rL   r  )rT   r  rL   r  )r  r  r  r  r  r  rL   r  )r  r  r  r  rL   r  )r   r   rL   r   )r  r  r  rD   rL   r  )r   r4  rL   rM   )r   rM   rL   r4  )r:  r  r   r  rL   r   )rL   r8  )r   rM   rG  r2   rH  r:  rL   r  )r   rM   rL   r  )r   r  rL   rM   )r   r4  rL   r4  )r   rd  r   r   rL   r   )rL   rj  )rL   rn  )r  rz  rL   r(  )r  r  rL   r   )rp  rA   rL   r   )r  r9   rL   r   )r  r|   rL   r4  )r   rM   rL   r   )r  r  r  r  rL   rM   )
r  rX  r  rX  r  r  r  r  rL   r  )r  r  rL   r  )r  r|   rL   r  )rT   r)   r  r   rL   r  )rL   r  )r  r  rL   r  )rx  rE   rL   r  )r  r:   rL   r   )r   rM   r   r.   rL   rM   )rp  r  rL   r   )rL   r   )rT   r|   r  r|   rL   r|   )r  r  r  r  r  r   r  r4  r  r4  r  r  rL   r  )r,  r  r-  r  rL   r  r  )r,  r=   r-  r=   r1  r   rL   r  (  
__future__r   rR  rk  rf  enumr  r  r)  rV  r  r  ru  r  r  rB  r   r  r   r  r  r  rw  r  r   collections.abcr   r   r   r   r   r   r	   r
   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   rP   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   !torch.fx.passes.regional_inductorr    torch.utils._dtype_abbrsr!   torch.utils._ordered_setr"   r#   r$   OPTIMUS_EXCLUDE_POST_GRADr  r'   r(   r)   r*   r+   r,   r-   pathlibr.   r/   r0   r1   torch._prims_commonr2   torch.fxr3   torch.fx.noder4   r  r5   r  r7   r  r9   dependenciesr:   rx  r<   r  r=   r>   r?   r@   rA   rB   output_coderD   r  rE   rF   rW   rK   r   r\   torch._dynamo.device_interfacer]   torch._dynamo.utilsr^   torch.autogradr_   torch.autograd.profiler_utilr`   (torch.fx.passes.graph_transform_observerra   torch.fx.passes.shape_proprb   torch.utils._sympy.functionsrc   rd   re   rf   rg   torch.utils._sympy.symbolrh   ri   torch.utils._sympy.value_rangesrj   rk   r  rm   runtime.runtime_utilsrn   r8  _IS_WINDOWS	getLoggerr   r   rp   r  r6  	VarRangesr  r|   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr  r  r  r  r[  rP  r\  rR  r]  rT  rV  r   rJ  rL  rN  rD  rE  float8_e4m3fnuzfloat8_e5m2fnuzrx   r   r}   r~   r   Functionr   rg  r   r   r   r   r  r  r"  r*  r0  re  rl  rs  r|  r  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r   r  r  r'  rl  rt  r  r  r  r  r  r  r  r  r  r  	frozensetr  r  r  r  r  r  r  r  r  rl  r  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher%  r3  r7  r8  r<  r?  r  r  rI  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r$  r&  r1  r;  rG  rL  r<  rV  rY  rk  rn  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r0  r3  r7  r:  r>  r@  rD  r  r  r0  EnumrK  r_  re  rg  rt  rw  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r(  r+  rY  r.  compiler*  r-  r1  r5  r7  rE  rF  rI  rJ  rK  rL  rQ  rS  rT  rU  rV  ri  rm  rn  rx  ry  r~  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyper  r  r  r  r  r  r  r  r  r  r  r  r+  r0  r3  rN   rN   rN   rU   <module>   sZ   $	< 

$
Lb&
		
.7 /;'  		 A(H8%

@	+
!
*	
#&		'!	'


$
0	

$%	#U)

#&P!