o
    einI                    @  sN  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ dd	l m!Z! erd d
l"m#Z#m$Z$m%Z% d dl&m'Z' d dl(Z(d dl)Z)d dl*Z)d dl+m,  m-Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC ddlDmEZEmFZFmGZGmHZHm Z mIZI ddlJmKZK ddlLmMZMmNZNmOZO ddlPmQZQmRZR ddlHmSZSmTZTmUZUmVZV ddlWmXZXmYZY ddlZm[Z[ ddl m\Z\m]Z]m^Z^m_Z_m`Z`maZa ddlbmcZc ddldmeZemfZf ddlgmhZh ddlimjZjmkZk dd llmmZm dd!l,mnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZ dd"lmZ eeZe)jed#Ze)jed$Ze)jed%Ze)jed&Zed' Zd(ed)< ed*Zed+ZejG d,d- d-ZejG d.d/ d/ZG d0d1 d1ZejG d2d3 d3ZejG d4d5 d5eZG d6d' d'Zejdd9d:Zdd=d>Zdd@dAZddCdDZejdEdFG dGdH dHZddKdLZG dMdN dNZddUdVZG dWdX dXeZG dYdZ dZeZG d[d\ d\eZdd_d`ZddedfZG dgdh dheZG didj djeZG dkdl dleZG dmdn dneZ	odddwdxZdd}d~ZdddZejG dd dZe ZdddZdddZdddZdddZdddZdddZdddZG ddb dbZG dd dZdS )    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                   @  sZ   e Zd ZU dZded< dZded< dZded< dd	 ZedddZ	e	ddddZ
dS )FusionResultNzOptional[bool]should_fusezOptional[Callable[[], bool]]callable_fnOptional[LambdaFuture]futurec                 C  s    | j d u| jd uA sJ dd S )NzLFusion result should contain either fusion decision or callable_fn, not both)r`   ra   self rf   c/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/_inductor/scheduler.py__post_init__n   s   zFusionResult.__post_init__boolc                 C  s
   t |dS )N)r`   r_   )clsr`   rf   rf   rg   fuses   s   
zFusionResult.fuseCallable[[], bool]c                 C  s   t ||dS )Nra   rc   rj   )rk   ra   rc   rf   rf   rg   from_callablew      zFusionResult.from_callable)r`   ri   N)ra   rm   rc   rb   )__name__
__module____qualname__r`   __annotations__ra   rc   rh   classmethodrl   ro   rf   rf   rf   rg   r_   h   s   
 r_   c                   @  s<   e Zd ZU ded< ded< ded< dZded< dddZdS )PendingFusionrm   ra   r[   node1node2Nrb   rc   return+tuple[BaseSchedulerNode, BaseSchedulerNode]c                 C  s   | j | jfS rq   rx   ry   rd   rf   rf   rg   get_fusion_nodes      zPendingFusion.get_fusion_nodes)rz   r{   )rr   rs   rt   ru   rc   r}   rf   rf   rf   rg   rw   ~   s   
 rw   c                   @  s   e Zd ZdZed'ddZed(d	d
Zed)ddZed*ddZ	ed+ddZ
ed)ddZed,ddZed-ddZed)ddZed)dd Zed'd!d"Zed.d$d%Zd&S )/MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    noder[   rz   ri   c                 C  s   |   otdd |  D S )Nc                 s  s:    | ]}t |tr| rt |jtr|jjd uV  qd S rq   )
isinstanceSchedulerNodeis_reductionr   r   _split_size.0subnoderf   rf   rg   	<genexpr>   s    


z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>)r   all	get_nodesr   rf   rf   rg   is_split_reduction   s   z$MixOrderReduction.is_split_reductiontuple[sympy.Expr, sympy.Expr]c                 C  s   |  |r{d }d }| D ]c}t|tr| rt|jtsq|jjd us'J tj	j
t|jj}|jjd us:J tj	j
t|jj}|d u rN|}|}qtj	j
||s_J | d| tj	j
||spJ | d| q|d uswJ ||fS |jd S )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrV   graphsizevarssimplifyrU   _original_reduction_rangesstatically_known_equalsgroup)rk   r   xnumelrnumelr   	curxnumel	currnumelrf   rf   rg   get_numel_rnumel   sF   




z"MixOrderReduction.get_numel_rnumelrx   ry   c                 C  sL   |  |}|  |}t|dkst|dks||krdS t|tt|kS )N   F)r   lentuplereversed)rk   rx   ry   g1g2rf   rf   rg   has_mix_reduction_orders   s
   

 z*MixOrderReduction.has_mix_reduction_ordersbufstrc                 C  s   d}|j jD ]}t|tr|j|kr|} nq|sdS |j}|j j}|s7t|ts0J t| |j	d j j}|s;J t
|t
|j sFdS tjjt|jt| rWdS dS )z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsr   r1   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r   rV   r   r   r   rU   sizevalues)rk   r   r   	found_depdepr   r   rf   rf   rg   _is_full_access   s*   z!MixOrderReduction._is_full_access	list[str]c                 C  sD   g }|  |  @ }|D ]}| ||r| ||r|| q|S rq   )used_buffer_namesr   append)rk   rx   ry   outcommon_readsr   rf   rf   rg   get_common_read   s   
z!MixOrderReduction.get_common_readc                 C  s   t | ||dkS Nr   )r   r   rk   rx   ry   rf   rf   rg   has_common_read   s   z!MixOrderReduction.has_common_readintc                 C  s(   |  |}tjjj|d |d  ddS )Nr   r   fallback)r   rV   r   r   optimization_hint)rk   r   r   rf   rf   rg   	get_numel   s   
zMixOrderReduction.get_numelc                 C  s
   |  |S rq   )r   r   rf   rf   rg   get_fusion_score  s   
z"MixOrderReduction.get_fusion_scorec                 C  s  t jjsdS tjjrdS | r| sdS | j}|dvs%t	|dkr'dS |
 r/|
 s1dS |j| @ s?|j| @ rAdS | ||sIdS t||}t|dkrWdS | |rb||}}n| |rm||}}ndS | |}|\}}	t jjsd}
tjjt||	 |
sdS tjjt||	d sdS tjjt|dsdS tdd	 | D rdS tjj|	d
sdS tdd	 | D }|S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   i  P r   i   c                 s  s.    | ]}|  r|jjjtjtjfvV  qd S rq   )r   r   datareduction_hintr@   INNERDEFAULTr   rf   rf   rg   r   V  s    
z-MixOrderReduction.can_fuse.<locals>.<genexpr>i @  c                 s  s&    | ]}|  r|j d v V  qdS )>   sumprodN)r   r   get_reduction_typer   rf   rf   rg   r   i  s    
)r%   r   mix_order_reductionrV   r   cpp_wrapperrQ   
get_devicer   rH   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   )rk   rx   ry   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   rf   rf   rg   can_fuse  s\   



		zMixOrderReduction.can_fusec                 C  s   |  ||S rq   )r   r   rf   rf   rg   are_mix_order_reductionst  rp   z*MixOrderReduction.are_mix_order_reductionsc                   s$   t  fddjjD sdS dS )Nc                 3  s    | ]
}  |jV  qd S rq   )is_contiguous_loadr   r   r   rk   r   rf   rg   r   |  s    
z7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>FT)r   r   r   r   rf   r   rg   r   z  s
   z$MixOrderReduction.is_contiguous_nodeparent_nodec                   s   ddl m} | D ]N}t|tsJ |j}|j|j } fdd|D }t|dkr,q
|D ])}|j	| }	|j
}
t|
 }tjj|	||}|d dksW|d dksW  dS q.q
dS )	Nr   )MemoryUsageTypec                   s   g | ]
}|j  kr|jqS rf   )buffer_name
index_name)r   er   rf   rg   
<listcomp>      z8MixOrderReduction.is_contiguous_load.<locals>.<listcomp>r   FT)torch._inductor.loop_bodyr   r   r   r   _bodymemory_usageLOADr   indexing_exprsr   listkeysrV   r   r   stride_vars)rk   r   r   r   r   	loop_bodyentriesindex_namesr   
index_exprr   var_symbolsr   rf   r   rg   r     s,   
z$MixOrderReduction.is_contiguous_loadNr   r[   rz   ri   )r   r[   rz   r   rx   r[   ry   r[   rz   ri   )r   r   r   r[   rz   ri   )rx   r[   ry   r[   rz   r   )r   r[   rz   r   rx   r[   ry   r[   rz   r   )r   r   r   r[   rz   ri   )rr   rs   rt   __doc__staticmethodr   rv   r   r   r   r   r   r   r   r   r   r   r   rf   rf   rf   rg   r      s4    	%!gr   c                   @  s   e Zd ZU ded< ded< ded< ejedZded	< ejedZ	d
ed< d(ddZ
d)ddZd(ddZd(ddZd*ddZd+ddZd,ddZd-d d!Zd-d"d#Zd.d%d&Zd'S )/SchedulerBuffer	Scheduler	schedulerz	ir.Bufferr   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr>   
mpi_bufferrz   r   c                 C  s   | j }|d us	J | S rq   )r   get_name)re   oprf   rf   rg   defining_op_name  s   z SchedulerBuffer.defining_op_namer   c                 C  s   t | jjS rq   )hashr   r   rd   rf   rf   rg   __hash__  r~   zSchedulerBuffer.__hash__c                 C  s  t  }|  }|| dt| jj  || d| jj  |  r3|| dt|    | 	 rE|| dt| 	   t
| jdkr[|| d| j  | S || d |d | jD ]
}|| d qlW d    n1 sw   Y  |d	 | S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rN   r  	writeliner   r   rr   layoutget_aliasespformatget_mutationsr   r  indentgetrawvalue)re   resultr   userrf   rf   rg   	debug_str  s&   

zSchedulerBuffer.debug_strc                 C  
   | j  S rq   r   r  rd   rf   rf   rg   r       
zSchedulerBuffer.get_nameNonec                 C  s   | j d usJ | j  sd S | j  s!| j  s!t| j  tjr+tj	j
| j  d S ttjdra|  tjjv ratjj|   }|| jjv rO| jj| j }n| jj| j }tj	j
|| j  d S tj	j
| j  d S )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr(   CommBufferLayoutrV   r   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr   name_to_donated_buffername_to_bufcodegen_inplace_reuse)re   input_buffer_nameinput_bufferrf   rf   rg   allocate  s6   

zSchedulerBuffer.allocateri   c                 C  sN   | j d usJ t| j jtjst| j rdS | jD ]}t|j tr$ dS qdS NFT)r   r   r  r(   r<   rR   r  
OutputNode)re   userf   rf   rg   can_free  s   
zSchedulerBuffer.can_freec                 C  s\   i }|D ] }t |j|v r||t |j |t |j< q||t |j< qt| | _d S rq   )idr   merger   r   r  )re   r  r  r.  rf   rf   rg   	set_users  s    zSchedulerBuffer.set_usersSequence[str]c                 C     | j d usJ | j  S rq   )r   r  rd   rf   rf   rg   r       
zSchedulerBuffer.get_aliasesc                 C  r4  rq   )r   r  rd   rf   rf   rg   r    r5  zSchedulerBuffer.get_mutationsOptional[torch.device]c                 C  s   | j   S rq   )r   r  r   rd   rf   rf   rg   r   
     zSchedulerBuffer.get_deviceNrz   r   rz   r   rz   r  rz   ri   )r  r  rz   r  rz   r3  rz   r6  )rr   rs   rt   ru   dataclassesfieldr   r  r>   r  r  r	  r  r  r+  r/  r2  r  r  r   rf   rf   rf   rg   r     s$   
 





!



r   c                   @  s   e Zd ZU dZded< dS )SchedulerDonatedBufferNr   r   )rr   rs   rt   r   ru   rf   rf   rf   rg   r@    s   
 r@  c                   @  s  e Zd ZU ded< ded< ded< ded< ded< d	ed
< ded< dZded< ded< ded< dZded< ded< ded< dZded< dd!d"Zdd$d%Zdd'd(Z	dd)d*Z
dd+d,Zdd.d/Zdd0d1Zdd2d3Zdd7d8Zdd:d;Zdd>d?Zdd@dAZddCdDZddGdHZddIdJZddKdLZddMdNZddOdPZddQdRZddUdVZddWdXZddYdZZedd[d\Zedd]d^Zedd_d`Z eddadbZ!ddddeZ"ddgdhZ#ddkdlZ$ddndoZ%ddpdqZ&ddrdsZ'ddtduZ(ddvdwZ)ddxdyZ*ddzd{Z+dd|d}Z,dd~dZ-dddZ.dddZ/dddZ0	ddddZ1edddZ2edddZ3edddZ4dddZ5dddZ6edddZ7dddZ8edddZ9dddZ:dddZ;e<dddZ=dS )r[   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager   	min_order	max_orderr?   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]r   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFri   writtenr   r   rz   r  c                 C  s   || _ dd | _d S )Nc                  _  s   g S rq   rf   )r  kwargsrf   rf   rg   <lambda>+  s    z,BaseSchedulerNode.__init__.<locals>.<lambda>)r   debug_device_strre   r   rf   rf   rg   __init__(  s   zBaseSchedulerNode.__init__ir.Operationc                   sT   | _ t  _tt   _d _ fdd| D  _dd  jD  _i  _	d S )NFc                   s   g | ]
}t  j| d qS ))r   r   r   )r   r   )r   outputrd   rf   rg   r   5  s    z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>c                 S     i | ]}|  |qS rf   r  r   r   rf   rf   rg   
<dictcomp>=      z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>)
r   r   r   r   rB  rQ  get_outputsrI  rK  rG  re   r   rf   rd   rg   _init_from_node.  s   

z!BaseSchedulerNode._init_from_noder   c                 C  s   t | j d|  dS )Nz(name=)r   rr   r  rd   rf   rf   rg   __repr__F     zBaseSchedulerNode.__repr__c                 C  s  |   }t }|| dt| j dtt| ddj d| dt| jj d| dt| j	 d| d	t| jj
| j	  d| d
 |  |  D ]	}||  qKW d   n1 s_w   Y  |d z	||   W n ty   tjddd Y nw |  S )#Longer form printout for trace logsr
  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r  Ignoring error in debug_str()Texc_info)r  rN   splicer   rr   getattrr  r   writesrP  r   r  r^  r  r  debug_str_extra	Exceptionlogwarningr  rstrip)re   r   r   r   rf   rf   rg   r  I  sH   

	
zBaseSchedulerNode.debug_strc                 C     dS )N rf   rd   rf   rf   rg   rr  b     z!BaseSchedulerNode.debug_str_extrar   c                 C  s
   |  | S rq   )rT  rd   rf   rf   rg   _debug_str_for_devicee  r  z'BaseSchedulerNode._debug_str_for_devicec                 C  sz   t | jdd }d}t|tjjjrd|j| gddd }nt|tjjj	r7d|j|
 | gddd }|  | S )Nr   rx  , F)shorten	multiline)rp  r   r   torch	_inductorr(   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )re   
maybe_datadata_strrf   rf   rg   debug_str_shorth  s   
z!BaseSchedulerNode.debug_str_shortc                 C  s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)rt  inforP  r   rq  rd   rf   rf   rg   log_detailsw  s   zBaseSchedulerNode.log_detailsself_depr1   	other_depc                 C  rw  NFrf   )re   r  r  rf   rf   rg   reorder_loops_by_dep_pair     z+BaseSchedulerNode.reorder_loops_by_dep_pairrenamesc                   s<    fdddd | j  D D | _| | j | j d S )Nc                      i | ]}| v r| | qS rf   rf   r   r   r  rf   rg   r\    
    z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>c                 s      | ]}|j V  qd S rq   r   r   rf   rf   rg   r         z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>)r   reads_and_writesrG  set_read_writesrenamere   r  rf   r  rg   update_mutated_names  s   
z&BaseSchedulerNode.update_mutated_namesr   r0   c                 C  s   |  | j| d S rq   )r  r   	with_readre   r   rf   rf   rg   add_fake_dep     zBaseSchedulerNode.add_fake_depc                 C     t dd |  D S )Nc                 s  s     | ]}|  p| V  qd S rq   )r  r  r[  rf   rf   rg   r     s    
z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>)r   r^  rd   rf   rf   rg   has_aliasing_or_mutation  s   z*BaseSchedulerNode.has_aliasing_or_mutationrwc                 C  s   || _ | j j| _|   d S rq   )r   r   rP  
prune_deps)re   r  rf   rf   rg   r    s   
z!BaseSchedulerNode.set_read_writesfuture_used_buffersmutation_real_namec                   s,   |   }t fdd|D }|| | _d S )Nc                 3  s    | ]	}  ||V  qd S rq   )get)r   kr  rf   rg   r         z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>)used_or_aliased_buffer_namesr   rB  )re   r  r  used_buffersrf   r  rg   set_last_usage  s   z BaseSchedulerNode.set_last_usagec                 C  s   | j D ]}|  qd S rq   )rI  r+  )re   r   rf   rf   rg   mark_run  s   

zBaseSchedulerNode.mark_runc                 C  s"   t dd t| jj| jjD S )Nc                 s  r  rq   r  r   rf   rf   rg   r     
    
z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>)r   	itertoolschainr   r   rq  rd   rf   rf   rg   r     s   z#BaseSchedulerNode.used_buffer_namesc                   s   t   dd t| jj| jjD }t|dkr@| } | t	j
j|r:| fddt	j
j|  D  t|dks S )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        c                 S  s"   g | ]}t |tr|js|jqS rf   )r   r3   is_faker   r   rf   rf   rg   r     s    zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r   c                 3  s    | ]	}| vr|V  qd S rq   rf   )r   alias
used_namesrf   rg   r     s    zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>)r   r  r  r   r   rq  r   popaddrV   r   name_to_bufferr  extendr  )re   depsr   rf   r  rg   r    s    
z.BaseSchedulerNode.used_or_aliased_buffer_namesc                   s   t  fdd jD  _d S )Nc                 3  s"    | ]}|j  jjvr|V  qd S rq   )r   r   available_buffer_namesr   rd   rf   rg   r         z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>r   rP  rd   rf   rd   rg   r    s   zBaseSchedulerNode.prune_depsc                   s>   d	 fddt fdd jjD }  j| d S )
Nr   r0   rz   ri   c                   s>   t | tsdS | j jjvrdS  jj| j  }|tjjv S r  )	r   r3   r   r   r'  r  rV   r   removed_operations)r   op_namerd   rf   rg   should_prune  s   
z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                 3      | ]	} |r|V  qd S rq   rf   r   r  rf   rg   r         
z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>r   r0   rz   ri   )r   r   r   r  remove_reads)re   	to_removerf   )re   r  rg   prune_weak_deps  s
   	z!BaseSchedulerNode.prune_weak_depsname_to_fused_nodedict[str, BaseSchedulerNode]c                 C  s   t | || jj d S rq   )_prune_redundant_depsr   r'  )re   r  rf   rf   rg   prune_redundant_deps  s   z&BaseSchedulerNode.prune_redundant_depsc                 C  r4  rq   )r   get_operation_namerd   rf   rf   rg   r    r5  zBaseSchedulerNode.get_namec                 C  s   |   S rq   rZ  rd   rf   rf   rg   get_first_name  s   z BaseSchedulerNode.get_first_namec                 C  r  )Nc                 s      | ]}|  V  qd S rq   rZ  r   r   rf   rf   rg   r         z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>)r   r   rd   rf   rf   rg   r        z%BaseSchedulerNode.get_operation_namesc                 C     t dd | jD S )Nc                 s  r  rq   rZ  r   r   rf   rf   rg   r     r  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>)r   rI  rd   rf   rf   rg   get_buffer_names     z"BaseSchedulerNode.get_buffer_namesc                 C  r  )Nc                 s  s&    | ]}t |tot|d dV  qdS )T)disallow_fp32_opsNr   r   r*   r   nrf   rf   rg   r     s    


zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>r   r   rd   rf   rf   rg   can_codegen_in_low_precision  s   z.BaseSchedulerNode.can_codegen_in_low_precisionc                 C  r  )Nc                 s  s"    | ]}t |tot|V  qd S rq   r  r  rf   rf   rg   r     s
    
z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>r  rd   rf   rf   rg   r*     s   z-BaseSchedulerNode.can_codegen_without_upcastsSequence[BaseSchedulerNode]c                 C  s   | gS rq   rf   rd   rf   rf   rg   r        zBaseSchedulerNode.get_nodesSequence[SchedulerBuffer]c                 C     | j S rq   )rI  rd   rf   rf   rg   r^    r  zBaseSchedulerNode.get_outputsbuf_namer   c                 C  s
   | j | S rq   )rK  )re   r  rf   rf   rg   
get_output  r  zBaseSchedulerNode.get_outputr6  c                 C  r4  rq   )r   r   rd   rf   rf   rg   r     r5  zBaseSchedulerNode.get_devicec                 C  s   |   }|d uo|jdkS Ncpu)r   r   re   devicerf   rf   rg   is_cpu     zBaseSchedulerNode.is_cpuc                 C  s   |   }|d uot|jS rq   )r   rQ   r   r  rf   rf   rg   rQ     r  zBaseSchedulerNode.is_gpuc                 C  rw  r  rf   rd   rf   rf   rg   r     ry  zBaseSchedulerNode.is_reductionc                 C  rw  r  rf   rd   rf   rf   rg   is_native_matmul  ry  z"BaseSchedulerNode.is_native_matmulc                 C  rw  r  rf   rd   rf   rf   rg   is_split_scan  ry  zBaseSchedulerNode.is_split_scanc                 C  rw  r  rf   rd   rf   rf   rg   is_template  ry  zBaseSchedulerNode.is_templatec                 C  rw  r  rf   rd   rf   rf   rg   	is_extern   ry  zBaseSchedulerNode.is_externc                 C  rw  r  rf   rd   rf   rf   rg   
is_foreach#  ry  zBaseSchedulerNode.is_foreachread_depdependencies.Depc                 C  rw  r  rf   re   r  rf   rf   rg   can_inplace&  ry  zBaseSchedulerNode.can_inplacec                 C  rw  r  rf   rd   rf   rf   rg   has_side_effects)  ry  z"BaseSchedulerNode.has_side_effectsc                   sd  ddl m} ttr1tjr1tj	 t
jr1ttjtjjjjr+ttjdddur1ttjds3dS jtjjB jjB  dfd
d} D ]}|j}|dusTJ | rh| sh| sh| tjjv riqIjj D ]}|j!jj"v r~jj"|j! }njj#$|j!}|r.tjj%&|r.t|j't(s.|j)dusJ  fdd|j)D }t*|dkr.|d j+r.|d ju r.|jdur.t|j, t-j.t-j/t-j0fs.|j'rt|j'jt-j1t-j2frt*|j dks.||j|jr.||r.tjj34| |  ttjtjjjjr"tjj56|  tjj56|  | tjj7| <  nqmqIdS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr  buf_to_be_inplacedr   rz   ri   c                   s   | j }|   t }| jD ]3}|j}t|tsq| | j j	vs+| j ||ur,q| fdd|j
 D O }t|dkrC dS qdS )Nc                 3  s    | ]
}|j  kr|V  qd S rq   r  )r   or  rf   rg   r   _  s    
z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>r   FT)r   get_fused_noder  r   r  r   r   r[   r  r  r   r  r   )r  
fused_noder  r  	user_noderd   r  rg   single_index_in_fused_nodeG  s*   


zKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodec                   s   g | ]}|j   vr|qS rf   r  r   x)inconsequential_nodesrf   rg   r     s
    z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   )r  r   rz   ri   )8codegen.wrapperr  r   r   r%   inplace_buffersrV   r   has_featurer   r+   INPLACE_BUFFERSr$  r~  r  codegensimd
SIMDKernelrp  r#  r   r  r   completed_operationsr^  r   r  r  r  r  removed_buffersr   r   r   r&  r'  r  r!  	can_reuser   NopKernelSchedulerNoder  r   r  r  r(   r<   r;   MutationLayoutSHOULDREMOVEFallbackKernelr:   r  make_inplacer  r  r%  )re   r  r  r   buf_noderead	input_bufremaining_usesrf   )r  re   rg   decide_inplace_update,  s   
"



z'BaseSchedulerNode.decide_inplace_updateTbufferrN   	only_oncec           	      C  s(  t jsd S |r| jrd S | jd usJ | j }g }|D ]e}|jdkr$q|d |d d|j d|j }d|jv rG|d|jd   }|| d|jv r|jd  }|j	d	d
dd }|d|
dd
dd
dd
dd  |d |d qt|dkrd S || d| _d S )NrX  rx  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}ri  \z\\z#pragma CMT END ORIGINr   T)r%   comment_originrQ  r   get_originsr  r   targetmetarsplitreplacer   
writelines)	re   r  r  origins	out_linesr  op_info_strr  stack_trace_last_linerf   rf   rg   codegen_originating_info  sH   









	


z*BaseSchedulerNode.codegen_originating_infoc                 C  s   | j dddS )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrd   rf   rf   rg   get_read_write_buffers_sizes     z.BaseSchedulerNode.get_read_write_buffers_sizesc                 C     | j dddS )NTFr&  r)  rd   rf   rf   rg   get_read_buffer_sizes  r,  z'BaseSchedulerNode.get_read_buffer_sizesc                 C  r-  )NFTr&  r)  rd   rf   rf   rg   get_write_buffer_sizes  r,  z(BaseSchedulerNode.get_write_buffer_sizesr'  r(  c                 C  s   t | j||d ddS )Nr&  r   )start)r   get_read_write_buffer_accessesr   )re   r'  r(  rf   rf   rg   r*    s   z3BaseSchedulerNode.get_read_write_buffers_sizes_impldict[str, int]c                   s
  t tri S t trt jtri S t tr+t jtjr+jjtj	j
ju r+i S dddt trHt d t d  ntd	tt}|rbjjD ]
}||j | qW|rsjjD ]
}||j | qh|rtd
d jjD nt }|rtdd jjD nt }dfddt trtfdd|D }|| }|| }i }||B D ]I}	tfdd||	 D  |	tjjv rtjj|	 }
n|	tjjv rtjj|	 }
nqd fdd|
}|	|vr|||	< q||	  |7  < q|S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprrz   r   c                 S  s   t jjj| ddS )Nr   r   )rV   r   r   r   )r3  rf   rf   rg   try_size_hint#     zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hintr   r       eAc                 s  r  rq   r  r   rf   rf   rg   r   8  r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>c                 s  r  rq   r  r   rf   rf   rg   r   =  r  r   r   r   r  ri   c                   s4    j j|  j}tdd |D }t|t| dkS )Nc                 s  r  rq   r   r   r  rf   rf   rg   r   D  r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>r   )r   r'  r  r   r   )r   r   r  buf_usesrd   rf   rg   is_materializedB  s   zIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedc                 3  s     | ]} |j s|V  qd S rq   r   r   )r:  re   rf   rg   r   H  s    
c                 3  s    | ]} V  qd S rq   rf   r   )
node_numelrf   rg   r   Q      <Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]c                   s   | sdS t | tjr|  S t | jtrNjj|   j	}d}|D ]*}t |j
tr*q!t |j
ts2J t |j
j
trI|j
 D ]	}||j
7 }q>q! dS |S t | jtjrbtfdd|  D S t|  }t|  t | S )Nr   c                 3  s     | ]} t j|V  qd S rq   )rV   r   
get_buffer)r   mut_name)get_buf_bytesrf   rg   r   u  s
    
zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>)r   r(   TorchBindObjectrA  r  r;   r   r'  r  r  r   r-  r[   r:   r^  r<   r   r  rU   r  rJ   	get_dtypemin)r   r  totr  	sched_buf	buf_elems)buf_accessed_elemsrA  re   r5  rf   rg   rA  Z  s2   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesN)r3  r4  rz   r   )r   r   r   r  rz   ri   )r   r>  rz   r   )r   r  ExternKernelSchedulerNoder   r:   r(   r	  op_overloadr~  _prims	rng_primsgraphsafe_run_with_rng_stater   rU   
get_rangesr   collectionsr   r   r   r   r   r   rq  r   r   r   rV   r   r  graph_inputs)re   r'  r(  buf_accessesr   r   rq  r  buf_byte_accessesr  r   	buf_bytesrf   )rH  rA  r:  r<  re   r5  rg   r1    st   




%
z0BaseSchedulerNode.get_read_write_buffer_accesses
int | Nonec                 C  sv   | j d u rd S | j  }|d u rd S t|}|d u rd S t|tjr&|j j}tjj	j
|dd}td d  |7  < |S )Nr   r   inductor
flop_count)r   get_origin_noder6   r   r~  SymIntexprrV   r   r   r   r   )re   fx_nodeflopsresolved_flopsrf   rf   rg   estimate_flops  s   

z BaseSchedulerNode.estimate_flopsfloatc                 C  s   | j d ur| j S |  S rq   )rM  _get_estimated_runtimerd   rf   rf   rg   get_estimated_runtime  s   
z'BaseSchedulerNode.get_estimated_runtimec              
   C  s  |   d  d }|j }tt|sdS t| jrt| jtj	s%J z:t
jrZt| }t }||}|durCt|ts@J |W S t| }|du rPt| j}|j||d |W S t| jW S  tyw } zt| W Y d}~dS d}~w ty } zt| W Y d}~dS d}~ww t| jrdS t| }|dur|S |j }	z!t }
t|	d }|
dkrtd|
 |dkrtd| W n
 ty   Y dS w |  }|dks|du r|  |
 }|d }|S d}|  }|du rdn|}|| | d	 }||
 }t ||}|d }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r7  )!r   r^  r   r  rQ   r8   rO   r   r(   IRNoder&   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupr^  r/   r.   	set_value
ValueErrorrt  r  	TypeErrorrT    maybe_estimate_runtime_benchmarkmaybe_get_dtyperK   rI   AssertionErrorrs  r]  r+  max)re   r   r  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_timerf   rf   rg   r_    sz   








z(BaseSchedulerNode._get_estimated_runtimeOptional[ir.TemplateBuffer]c                 C  s   d S rq   rf   rd   rf   rf   rg   get_template_node  ry  z#BaseSchedulerNode.get_template_nodeir.TemplateBufferc                 C  s   |   }|d us
J |S rq   r~  )re   templaterf   rf   rg   get_template_node_or_throw  s   z,BaseSchedulerNode.get_template_node_or_thrownodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                 C  sD   t dd t| D }| d| }| | }| |d d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c                 s  s     | ]\}}|  r|V  qd S rq   r  r   ir  rf   rf   rg   r     s    zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>Nr   )next	enumerate)r  template_indexprologuetemplate_nodeepiloguerf   rf   rg   get_prologue_template_epilogue   s
   
z0BaseSchedulerNode.get_prologue_template_epilogue)r   r   rz   r  )r   rW  rz   r  r8  )rz   r   r:  r  r1   r  r1   rz   ri   r  rF  rz   r  )r   r0   rz   r  r;  )r  rN  rz   r  r  rA  r  rF  rz   r  rz   rA  r  r  rz   r  rz   r  )rz   r  )r  r   rz   r   r=  r  r  rz   ri   T)r  rN   r  ri   rz   r  r9  )r'  ri   r(  ri   rz   r   )r'  ri   r(  ri   rz   r2  rz   rT  rz   r^  rz   r}  )rz   r  )r  r  rz   r  )>rr   rs   rt   ru   r   rM  rQ  rV  r`  rc  r  rr  rz  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rE   r   r  r  r*   r   r^  r  r   r  rQ   r   r  r  r  r  r  r  r  r  r%  r+  r.  r/  r*  r1  r]  r`  r_  r~  r  r   r  rf   rf   rf   rg   r[     s   
 




































 /


 

W
rz   $torch._inductor.codecache.LocalCachec                   C  s   t jj S rq   )r~  r  	codecache
LocalCacherf   rf   rf   rg   rf       rf  snoder   c                   s|   t | jdd}| jj}| jg || jj| jj}| jj}t||f\}}d	dd t|ft	 fdd|D  }|S )
Npython_kernel_namerx  rz   ri   c                 S  s   t | tjot | tj S rq   )r   r(   rc  GeneratorStater  rf   rf   rg   _is_tensor_ir  rd  z@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_irc                 3  s(    | ]} |rt | nd V  qd S rq   )r   r  r   ar  rf   rg   r   #  s   & z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>r;  )
rp  r   inputsfill_non_provided_argsconstant_argsrR  pytreetree_flattenr   r   )r  r  r  rR  	flat_argsflat_args_pytree_specro  rf   r  rg   re    s   
re  Optional[Callable[[Any], Any]]c                 C  s`   t | tsd S tjjjtjjjtjjjd}t| j	dd}||vr#d S t | j	t
js,d S || S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  rx  )r   rI  r~  opsatenmmbmmaddmmrp  r   r(   ExternKernel)r  mms_fnsr  rf   rf   rg   _get_mm_like_fn(  s   
r  rL  c                   s   d }d }t jrt }|d u rd S |} fdd}nd S t }t }||}|d ur6t|ts4J |S ddlm	 | \}}ddl
m}	 |	j|||dddd	}
|j||
d
 |
S )Nc                     s    S rq   rf   rf   r  snode_args_kwargsrf   rg   rS  A      z2maybe_estimate_runtime_benchmark.<locals>.<lambda>r   )r  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationra  )r%   !runtime_estimations_mms_benchmarkr  re  rf  rg  r   r^  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkrh  )r  bench_fnargs_kwargs_fnmm_fnro  rp  rq  r  rR  r  rr  rf   r  rg   rk  8  s8   

	rk  T)slotsc                   @  sL   e Zd ZU ded< ded< ded< ded< dddZdddZdddZdS )	WhyNoFuser   name1name2reasonztuple[Any, ...]r  rx   r[   ry   rz   r  c                 C  s   |  | _|  | _d S rq   )r  r  r  re   rx   ry   rf   rf   rg   rV  e  s   
zWhyNoFuse.__init__r   c                 G  s   || _ || _t|  d S rq   )r  r  
fusion_logdebug)re   r  r  rf   rf   rg   __call__i  s   zWhyNoFuse.__call__c                 C  s"   d| j  d| j d| j| j  S )Nzcannot fuse z with r
  )r  r  r  r  rd   rf   rf   rg   __str__n  s   
zWhyNoFuse.__str__Nrx   r[   ry   r[   rz   r  )r  r   r  r   rz   r  r8  )rr   rs   rt   ru   rV  r  r  rf   rf   rf   rg   r  ^  s   
 

r  objr   c                 C  sF   t | ttfrt| td} tj| dd}d|v r!dt|d S |S )Nkey   )r  ri      )	r   r   setsortedr   pprintr  textwrapr  )r  r  rf   rf   rg   r  t  s   r  c                   @  s8   e Zd ZdddZddd	ZdddZdddZeZdS )r-  r   r2   rz   r  c                 C  s   t |g| _d S rq   r  r  rf   rf   rg   rV       zOutputNode.__init__ri   c                 C  rw  r  rf   rd   rf   rf   rg   r     ry  zOutputNode.is_reductionr3  c                 C  rw  )Nrf   rf   rd   rf   rf   rg   r    ry  z'OutputNode.get_inputs_that_alias_outputr   c                 C  rw  )NOUTPUTrf   rd   rf   rf   rg   r    ry  zOutputNode.get_nameN)r   r2   rz   r  r;  r<  r8  )rr   rs   rt   rV  r   r  r  rc  rf   rf   rf   rg   r-  ~  s    



r-  r   r  r  r'  rJ  r  c                   s   t  jD ]}t|ts! |j  }|    d7  < qd fddtfdd	jD }|rKj| _	j
| d
S d
S )am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r0   rz   ri   c                   sX   t | tr* | j  }|   dkoj| | }| k}|p)|S dS )Nr   F)r   r3   r   r  r  r   fusable_weak_dep)r   r  is_redundantis_self_dep)r'  name_to_dep_countr  r   rf   rg   r    s   


z+_prune_redundant_deps.<locals>.should_prunec                 3  r  rq   rf   r   r  rf   rg   r     r  z(_prune_redundant_deps.<locals>.<genexpr>Nr  )rO  r   rP  r   r3   r   r  r  r   r  r   r  )r   r  r'  r   r  deps_to_prunerf   )r'  r  r  r   r  rg   r    s   

r  c                      s<   e Zd Zd fddZdd
dZdddZdddZ  ZS )rI  r   r   r   rW  rz   r  c                   (   t  | | | | |  d S rq   superrV  r`  r  get_read_writesre   r   r   	__class__rf   rg   rV       
z"ExternKernelSchedulerNode.__init__r   c                 C  s   |    dt| jdd  S )Nz.node.kernel = r  )r  rp  r   rd   rf   rf   rg   rr    s   z)ExternKernelSchedulerNode.debug_str_extrari   c                 C  rw  NTrf   rd   rf   rf   rg   r    ry  z#ExternKernelSchedulerNode.is_externc                 C  s$   | j d usJ t| j do| j  S )Nr  )r   r#  r  rd   rf   rf   rg   r    s   z*ExternKernelSchedulerNode.has_side_effectsr   r   r   rW  rz   r  r8  r;  )rr   rs   rt   rV  rr  r  r  __classcell__rf   rf   r  rg   rI    s
    

rI  c                      s   e Zd Zd	 fddZ  ZS )
r  r   r   r   rW  rz   r  c                   r  rq   r  r  r  rf   rg   rV    r  zNopKernelSchedulerNode.__init__r  )rr   rs   rt   rV  r  rf   rf   r  rg   r    s    r  c                      s\  e Zd ZU dZded< ded< d` fddZ		dadbddZ		dadcddZddddZdedd Z	dfd!d"Z
dgd$d%Zdfd&d'Zdhd+d,Zdfd-d.Zdid2d3Zdjd5d6Zdkd8d9Zdld:d;Zdld<d=Zdld>d?Zdld@dAZdmdCdDZdndGdHZdodJdKZdpdLdMZ	NdqdrdQdRZedsdSdTZedsdUdVZdtdYdZZedud\d]Zedl fd^d_Z   Z!S )vr   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr=   r   r   r   r   +Union[ir.ComputedBuffer, ir.TemplateBuffer]rz   r  c                   s"   t  | | | |   d S rq   )r  rV  r`  _compute_attrsr  r  rf   rg   rV    s   
zSchedulerNode.__init__Nextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[_P, _T]]c                 C  s   t | jtjtjfsJ | jj||d\| _}|| _| j }| j	
|j}||| jf| _tj p7t|j }t | jtjrK| | jj|d d S | tj| jg| jR d|i d S )Nr  r  )	normalizer  )r   r   r(   r   TemplateBuffersimplify_and_reorderr  r   get_device_or_errorr   get_backendgroup_fnr   r%   loop_ordering_after_fusionrQ   r   r  extract_read_writesr'   )re   r  r  bodyr  r  should_normalizerf   rf   rg   r    s2   

zSchedulerNode._compute_attrsOptional[Callable[..., Any]]c                 C  s   | j ||d d S )Nr  )r  )re   r  r  rf   rf   rg   recompute_size_and_body  s   
z%SchedulerNode.recompute_size_and_bodyr  ri   need_clear_tiling_cachec                 C  st   t dd | jjD }| tj| jg| jR d|i|	| j
 | j|  |r8ddlm} |j  d S d S )Nc                 s  s"    | ]}t |ttfr|V  qd S rq   )r   r3   r2   r   rf   rf   rg   r     s    
z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>r  r   SIMDScheduling)r   r   r   r  r'   r  r   r  r  r  rG  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)re   r  r  	fake_depsr  rf   rf   rg   refresh_dependencies  s&   z"SchedulerNode.refresh_dependencies	new_orderSequence[int]c                 C  s*   | j || _ | j j| _| jddd d S )NFTr  r  )r   reorder_iter_loopssizesr  r
  )re   r  rf   rf   rg   apply_new_loop_order.  s
   
z"SchedulerNode.apply_new_loop_orderc                 C  s   | j  }t| j j| }tt|}tt||| }| ||  t| jd dks.J | jd | jd d | jd d ff| _d S )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   ranger  r   )re   	num_rdims
num_pwdimspwdimsrdimsrf   rf   rg   swap_pw_red_dimension6  s   
,z#SchedulerNode.swap_pw_red_dimensionr[   c                 C  s   | j  | _ | S rq   )r   extract_pw_from_reductionrd   rf   rf   rg   r  @  s   z'SchedulerNode.extract_pw_from_reductionc                 C  sX   t | sd S t| jtjsJ | j  |   W d    d S 1 s%w   Y  d S rq   )r   r   r   r   r(   r   with_original_inner_fnr  rd   rf   rf   rg   cancel_reduction_splitD  s   

"z$SchedulerNode.cancel_reduction_split	dimensionr   	new_rangec                 C  sl   t | jtjtjfsJ | j||| _| jj| _| j	 }| j
|j}||| jf| _| jddd d S )NTr  )r   r   r(   r   r  r   #expand_dimension_for_pointwise_noder  r  r  r   r  r  r   r
  )re   r  r  r  r  rf   rf   rg   r  K  s   

z1SchedulerNode.expand_dimension_for_pointwise_nodec                 C  s(   | j  | _ | j j| _| jddd d S )NTFr  )r   merge_loopsr  r  r
  rd   rf   rf   rg   r  \  s   
zSchedulerNode.merge_loopsr  r1   r  c                 C  s~   d }| j d }t||j  kr|jkrn n||}|r5t jd7  _td|  | | 	| dS td|   dS )Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr)   num_loop_reorderingloop_ordering_logr  r  r  )re   r  r  r  
self_sizesrf   rf   rg   r  h  s    
 


z'SchedulerNode.reorder_loops_by_dep_pairr   c                 C  s   |   }| d| jd  | d| jd  | d| j g}| j D ]#}t|tsG|j}tj	
|}t|tjsG|| dt|j  q$t| jtrc|d| d |t| j d	 | jd usjJ ||   d
|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  ri  )r  r   r  r   r  r   r3   r   rV   r   r?  r(   rB  r   r  r  r   r=   r  r  r  r   r  rz  join)re   r   linesr   r  r   rf   rf   rg   rr    s$   

zSchedulerNode.debug_str_extraSequence[Sequence[sympy.Expr]]c                 C  r  rq   )r  rd   rf   rf   rg   rN    r  zSchedulerNode.get_rangesc                 C  sJ   t | jtjtjfsJ dt| jt| j o$| jd u p$| jj	 S Ntype(self.node)=)
r   r   r(   r   r  r   ri   r   r   has_partial_accumulaterd   rf   rf   rg   r     s   zSchedulerNode.is_reductionc                 C  s0   t | jtjsJ dt| j| j dkS )Nr)  dot)r   r   r(   r   r   r   rd   rf   rf   rg   r    s   "zSchedulerNode.is_native_matmulc                 C  sF   t | jtjtjfsJ dt| jt | jtjo"t | jjtjS r(  )r   r   r(   r   r  r   r   	SplitScanrd   rf   rf   rg   r    s   
zSchedulerNode.is_split_scanc                 C  s   t | jtjS rq   r   r   r(   r  rd   rf   rf   rg   r    r7  zSchedulerNode.is_templater}  c                 C  s   t | jtjr
| jS d S rq   r-  rd   rf   rf   rg   r~       zSchedulerNode.get_template_node
index_varsSequence[sympy.Expr]c                 G  s   |    |   | | d S rq   )r  r  r  )re   r/  rf   rf   rg   run  s   zSchedulerNode.rundict[sympy.Expr, sympy.Expr]c                 C  sH   | j }ttt|ttt|ksJ tttj|tj|}|S rq   )	r  r   mapr   dictzipr  r  from_iterable)re   r/  r  r   rf   rf   rg   ranges_from_index_vars  s    

z$SchedulerNode.ranges_from_index_varsc              	   C  s   |  |}zCttt |. tj|  | j|  W d   n1 s'w   Y  W d   W dS W d   W dS 1 sAw   Y  W dS  tyW   t	
d| j  w )a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r7  rV   set_ops_handlerrC   get_ops_handlerr$  set_current_noder   rs  rt  fatalr   )re   r/  r   rf   rf   rg   r    s   

VzSchedulerNode.codegenT	pointwiserN  c                 C  s:   |r| j nt| j \}}tj| j|tjjgt| gdS )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	r  r   r'   r  r   r   SZeror   )re   r<  
keep_sizesignore_sizesrf   rf   rg   "pointwise_or_reduction_read_writes  s   z0SchedulerNode.pointwise_or_reduction_read_writesc                 C     | j ddS )zH
        Get the memory dependencies in the non-reduction axes.
        Tr<  rB  rd   rf   rf   rg   r       z#SchedulerNode.pointwise_read_writesc                 C  rC  )zD
        Get the memory dependencies in the reduction axes.
        FrD  rE  rd   rf   rf   rg   reduction_read_writes  rF  z#SchedulerNode.reduction_read_writesr  r  c                 C  s   |   rdS tdd |  D rdS t| jjdkrDt|tjrDt	t
| jj}t|tjs8J dt||j|jkoC|j|jkS dS )NFc                 s  r  rq   )r  r  rf   rf   rg   r     r  z,SchedulerNode.can_inplace.<locals>.<genexpr>r   ztype(write_dep)=)r  r   r^  r   r   rq  r   r'   r1   r  iterr   r   r   )re   r  	write_deprf   rf   rg   r    s   zSchedulerNode.can_inplacerA  c                 C  s   t  }t| jtrP| j D ]A}|jdkrO|jdkrOd|jv r&|jd dks4t|j	dkrO|j	d dkrO|
d|jv r@|jd nt|j	dkrL|j	d	 nd
 q|S )Ncall_methodstoremode
atomic_addr  r  r   r   r   rx  )r   r   r   r=   r   r  r  rR  r   r  r  )re   buffers_store_as_atomic_addr   rf   rf   rg   _get_atomic_add_buffers  s   



z%SchedulerNode._get_atomic_add_buffersc                   s$   | j d ur| j drdS t  S )Ndevice_assert_asyncT)r   has_opr  r  rd   r  rf   rg   r    s   
zSchedulerNode.has_side_effects)r   r   r   r  rz   r  NN)r  r  r  r  rz   r  )r  r  r  r  rz   r  )r  ri   r  ri   rz   r  )r  r  rz   r  r:  rz   r[   )r  r   r  r   rz   r  r  r8  )rz   r'  r;  r  )r/  r0  rz   r  )r/  r'  rz   r2  )r/  r'  rz   r  r  )r<  ri   rz   rN  )rz   rN  r  r  )"rr   rs   rt   r   ru   rV  r  r   r
  r  r  r  r  r  r  r  rr  rN  r   r  r  r  r~  r1  r7  r  rB  rE   r  rG  r  rO  r  r  rf   rf   r  rg   r     sP   
 #




















r   group_snode/Union[FusedSchedulerNode, GroupedSchedulerNode]c                   sV    j } tjdd |D  t fddtjdd |D  D  jj  _	d S )Nc                 S     g | ]}|j qS rf   r   r  rf   rf   rg   r   '      z3refresh_group_node_dependencies.<locals>.<listcomp>c                 3  "    | ]}|j   vr|V  qd S rq   r   r  r   rT  rf   rg   r   +  r  z2refresh_group_node_dependencies.<locals>.<genexpr>c                 S  rV  rf   )rP  r  rf   rf   rg   r   -  rX  )
r   r  r'   
ReadWrites
merge_listr   unionr   rq  rP  )rT  r   rf   r[  rg   refresh_group_node_dependencies"  s   r_  r   r   r   r  c                 C  s   t | ttfs	J || _|| _d | _tjdd |D  | _t	|  t
dd | jD | _tdd | jD | _dd |  D | _d S )Nc                 S  s   g | ]
}|j d ur|j qS rq   r   r  rf   rf   rg   r   >  r   z#init_group_node.<locals>.<listcomp>c                 s  r  rq   rC  r  rf   rf   rg   r   C  r  z"init_group_node.<locals>.<genexpr>c                 s  r  rq   )rD  r  rf   rf   rg   r   D  r  c                 S  rY  rf   rZ  r[  rf   rf   rg   r\  E      
z#init_group_node.<locals>.<dictcomp>)r   r   GroupedSchedulerNoder   r   r   r   r^  r   r_  rD  rC  rn  rD  r^  rK  )rT  r   r   rf   rf   rg   init_group_node4  s   rd  c                      s  e Zd ZU dZded< edXdd	ZdYd
dZdZddZe	d[ddZ
d\ddZd] fddZe	d^ddZd^dd Ze	d_d"d#Zd`d%d&Zd^d'd(Zd^d)d*Zda fd.d/Ze	d_d0d1Ze	d_d2d3Zdbd5d6Zd^d7d8Ze	dcd9d:Ze	dcd;d<Ze	dcd=d>Ze	dcd?d@Ze	dddBdCZdedEdFZe	dcdGdHZdfdJdKZdgdNdOZ dhdRdSZ!d^dTdUZ"e	dc fdVdWZ#  Z$S )ir   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   rx   r[   ry   rz   c                 C  s:  |j |j u sJ t|ttfsJ | rt|trt|jts"J t|j	j
dks,J ttt|j	j
ts9J tt|j	j
j}dd | D }t|dksSJ |d }t|j	j
dksaJ tt|j	j
}t|tspJ tt||j|j|j|jg|j	_
n	t|ttfsJ tt| | }| |j |S )Nr   c                 S     g | ]}|  r|qS rf   r  r  rf   rf   rg   r   c      z+FusedSchedulerNode.fuse.<locals>.<listcomp>r   )r   r   r   r   r  rI  r   r:   r   r   rq  r  rH  r2   r   r   r1   r   r   	var_namesr   rL  r   r  r  )rk   rx   ry   r   template_nodesr  writer  rf   rf   rg   rl   S  s,   
zFusedSchedulerNode.fusec                 C  s2   | j D ]}t|tsJ | sJ |  q| S rq   )r   r   r   r   r  re   r   rf   rf   rg   r  u  s
   

z,FusedSchedulerNode.extract_pw_from_reductionr  c                 C  s&   | j D ]}t|tsJ |  qd S rq   )r   r   r   r  rj  rf   rf   rg   r  |  s   

z(FusedSchedulerNode.swap_pw_red_dimensionrT  c                 C  8   t td dd |  D }t|dkrd S t|}|S )Nc                 s  (    | ]}|  s| r| V  qd S rq   r  r  r]  r  rf   rf   rg   r         
z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>r   r   filterr   r   r   re   fpsrs  rf   rf   rg   r]       
z!FusedSchedulerNode.estimate_flopsr  r1   r  ri   c                 C  s  |   rdS d}| jD ]%}t|tsJ |dur+t|t|jd kr+td  dS |jd }qd}|dus9J t||j	  krG|j	krNn n|
|}|sZtd|   dS t jd7  _td|  | | jD ]}t|tsvJ || qmt|  dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r  r   r   r   r   r  r#  r  r   r   r!  r  r)   r"  r  r_  )re   r  r  r$  r  r  rf   rf   rg   r    s<   
 


z,FusedSchedulerNode.reorder_loops_by_dep_pairr   r   c                   s6   t  | t| || g | _t|dd dj| _d S )Nc                 S  s   t |  S rq   )r   r   r  rf   rf   rg   rS        z-FusedSchedulerNode.__init__.<locals>.<lambda>r  )r  rV  rd  r  rn  r   )re   r   r   r  rf   rg   rV    s   zFusedSchedulerNode.__init__r   c                 C     d dd | jD S )N_c                 S     g | ]}|  qS rf   rZ  r  rf   rf   rg   r         z/FusedSchedulerNode.get_name.<locals>.<listcomp>r%  r   rd   rf   rf   rg   r    r  zFusedSchedulerNode.get_namec                 C     | j d  S r   r   r  rd   rf   rf   rg   r    r7  z!FusedSchedulerNode.get_first_namerA  c                 C     t jdd | jD  S )Nc                 S  rw  rf   r  r  rf   rf   rg   r     rx  z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>r   r^  r   rd   rf   rf   rg   r    r  z#FusedSchedulerNode.get_buffer_namesrH  c                 C  "   g }| j D ]	}||  q|S rq   r   r  r^  re   r  r   rf   rf   rg   r^       
zFusedSchedulerNode.get_outputsc                   sP    fddt  jD } jd j}|d ur|   td| dS )Nc                   s,   g | ]\}}    d | d|  qS )z.snodes[z] =
)r  r  )r   r  r   rd   rf   rg   r     s    z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   ri  r  )	r  r   r   r  rz  r  r  r%  rv  )re   r&  r   rf   rd   rg   rr    s   
z"FusedSchedulerNode.debug_str_extrac                 C  s   dd | j D }|  d| S )Nc                 S  rw  rf   )r  r  rf   rf   rg   r     rx  z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>z
, snodes: r;  )re   
snodes_strrf   rf   rg   r    s   z"FusedSchedulerNode.debug_str_shortr  r  rF  c                   s@   t  || t }t| jD ]}||| ||j qd S rq   )r  r  r   r   r   updaterB  )re   r  r  r   r  rf   rg   r    s   z!FusedSchedulerNode.set_last_usagec                 C  r|  )Nc                 S  rw  rf   )r   r  rf   rf   rg   r     rx  z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>r~  rd   rf   rf   rg   r     r  z$FusedSchedulerNode.used_buffer_namesc                 C  r|  )Nc                 S  rw  rf   )r  r  rf   rf   rg   r     rx  zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r~  rd   rf   rf   rg   r    s   z/FusedSchedulerNode.used_or_aliased_buffer_namesr  c                 C  r  rq   r;  rd   rf   rf   rg   r     r  zFusedSchedulerNode.get_nodesc                 C  s   t | j d|   dS )Nz(nodes=ra  rb  rd   rf   rf   rg   rc    rd  zFusedSchedulerNode.__repr__c                 C  r  )Nc                 s  r  rq   r   r  rf   rf   rg   r     r  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>r   r   rd   rf   rf   rg   r      r  zFusedSchedulerNode.is_reductionc                 C  r  )Nc                 s  r  rq   )r  r  rf   rf   rg   r     r  z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>r  rd   rf   rf   rg   r    r  z#FusedSchedulerNode.is_native_matmulc                 C  r  )Nc                 s  r  rq   )r  r  rf   rf   rg   r   
  r  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  rd   rf   rf   rg   r    r  z FusedSchedulerNode.is_split_scanc                 C  r  )Nc                 s  r  rq   r  r  rf   rf   rg   r     r  z1FusedSchedulerNode.is_template.<locals>.<genexpr>r  rd   rf   rf   rg   r    r  zFusedSchedulerNode.is_templater}  c                 C  s$   | j D ]}| r|   S qd S rq   )r   r  r~  r_  rf   rf   rg   r~    s
   
z$FusedSchedulerNode.get_template_nodetorch.devicec                 C  s
   | j d S r   )r   rd   rf   rf   rg   r     r  zFusedSchedulerNode.get_devicec                 C  r  )Nc                 s  r  rq   )r  r  rf   rf   rg   r     r  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  rd   rf   rf   rg   r    r  z+FusedSchedulerNode.has_aliasing_or_mutationr  c                 C     t rq   NotImplementedErrorr  rf   rf   rg   r     ry  z'FusedSchedulerNode.update_mutated_namesr   r0   c                 C  r  rq   r  )re   r   rf   rf   rg   r  #  ry  zFusedSchedulerNode.add_fake_depr  r  c                 C  r  rq   r  r  rf   rf   rg   r  &  ry  zFusedSchedulerNode.can_inplacec                 C  s  |   }ddd | jD }t }|| dt| j d| d| dt| jj	 d| d	t| j
 d| d
t| jj| j
  d| d |  |  D ]	}||  qOW d   n1 scw   Y  |d z	||   W n ty   tjddd Y nw |  S )re  r  c                 s  s    | ]}t |jV  qd S rq   )r   rr   r  rf   rf   rg   r   ,      z/FusedSchedulerNode.debug_str.<locals>.<genexpr>r
  rf  rg  rh  ri  rj  rk  z.outputs = [
            Nr  rl  Trm  )r  r%  r   rN   ro  r   rr   r  r   rq  rP  r   r  r^  r  r  rr  rs  rt  ru  r  rv  )re   r   node_typestrr   r   rf   rf   rg   r  )  sJ   

	
zFusedSchedulerNode.debug_strc                   s(   | j d urtdd | j D S t  S )Nc                 s  r  rq   )r  r  rf   rf   rg   r   F  r  z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>)r   r   r  r  rd   r  rf   rg   r  C  s   

z#FusedSchedulerNode.has_side_effectsrx   r[   ry   r[   rz   r   rS  r:  r  r  )r   r   r   r  rz   r  r8  r  rz   rH  r  r  r;  r  )rz   r  r  )r   r0   rz   r  r  )%rr   rs   rt   r   ru   rv   rl   r  r  rE   r]  r  rV  r  r  r  r^  rr  r  r  r   r  r   rc  r   r  r  r  r~  r   r  r  r  r  r  r  r  rf   rf   r  rg   r   J  sZ   
 
!

*










r   c                      s<   e Zd Zd fddZdd
dZdddZdddZ  ZS )FusedMixOrderReductionsrx   r[   ry   rz   r  c                   sd   t |st |sJ ||}}|| _|| _t |jt| t|   t 	| j| _
d S rq   )r   r   rx   ry   r  rV  r   r   r   r   numelr  r  rf   rg   rV  K  s   

z FusedMixOrderReductions.__init__other_nodestuple[BaseSchedulerNode, ...]c                 C  s   t |trJ t |trJ | jj||ddsdS t|r%t|s%dS ddd}dd	d
}|rG|||f||@ sE|||||f@ rGdS |  p[tt	| jj
||dd| jkS )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionr  r  rz   rA  c                 S     t  }|jdd | D  S )Nc                 s  r  rq   r`  r  rf   rf   rg   r   u  r  zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>r   r^  r  r   rf   rf   rg   _get_ancestorss  s   zAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorsc                 S  r  )Nc                 s  r  rq   )r   r  rf   rf   rg   r   {  r  zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>r  r  rf   rf   rg   _get_operation_namesw  s   zGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names)count_bytesN)r  r  rz   rA  )r   r  r   r   r   r   r   typingcastr   score_fusion_memoryr  )re   rx   ry   r  r  r  rf   rf   rg   sub_node_can_fuseW  s0   


z)FusedMixOrderReductions.sub_node_can_fuseotherc                 C  s`   t |ts| | j|| jfp| | j|| jfS | | j|j| j|jfo/| | j|jt S rq   )r   r  r  rx   ry   r   re   r  rf   rf   rg   can_fuse_with  s   
z%FusedMixOrderReductions.can_fuse_withc                 C  s   | j  }| j|}t|tr%|| j |j }|| j|j}t||S | | j || jfr<|| j |}t|| jS || j|}t| j |S rq   )	rx   r   r   r  r   r  rl   ry   r  )re   r  r  backendfused_node1fused_node2r  rf   rf   rg   	fuse_with  s   


z!FusedMixOrderReductions.fuse_withr  )rx   r[   ry   r[   r  r  )r  r[   )rr   rs   rt   rV  r  r  r  r  rf   rf   r  rg   r  J  s
    

4r  c                      s   e Zd ZU dZd<ddZd=d	d
Zed>ddZed?ddZ			d@dA fddZ	edBddZ
edCd!d"ZeZd#ed$< edDd&d'ZedCd(d)ZdEd*d+ZdEd,d-ZdFd.d/ZdGd0d1ZdHd3d4ZdId6d7ZdJd:d;Z  ZS )KForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerr[   rz   r   c                 C  s2   |  D ]}| | jv r| j|    S qd S rq   )r^  r  read_to_node)re   r  r   rf   rf   rg   get_consumer_subnode_for  s
   z3ForeachKernelSchedulerNode.get_consumer_subnode_forconsumerc                 C  sp   t t  }|jjD ] }|j| jjvrq	| jj|j  }|| jv r)|	| j|  q	t
|dkr6tt|S d S Nr   )r   r[   r   r   r   r   r'  r  name_to_noder  r   r  rH  )re   r  	producersrd	node_namerf   rf   rg   get_producer_subnode_for  s   

z3ForeachKernelSchedulerNode.get_producer_subnode_forri   c                   s&  t  |}  r;| r;tt  tt|}t jt|jk}|s)|d |o:t fddt j|jD S | re 	 rI|d dS tt|}|
 }|d ur_|j |S |d dS   r|	 rs|d dS tt   |}|d ur j||S |d dS td	)
Nzforeach do not have same lengthc                 3  s"    | ]\}} j ||V  qd S rq   )r   r   r   lrr  rf   rg   r     s
    
z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  r  r  r  r   r   r   r5  r   r  r   r   r  rm  )rk   r  r  whyforeach_matchconsumer_subnodeproducer_subnoderf   r  rg   r     sJ   


z#ForeachKernelSchedulerNode.can_fusec                 C  s  |  s
|  s
J |  rtt|}|j}|j}ntt|}|j}|j}d }d }|  rL|  rLtt|}tt|}dd t|j|jD }nj|  rtt|}||}g }|}d }|jD ]}	|	|u rxt	
|	|}
|
}||
 qd||	 qdn7|  rtt|}||}g }|}d }|jD ]}	|	|u rt	
||	}
|
}||
 q||	 qntd| |j|||||dS )Nc                 S  s   g | ]
\}}t ||qS rf   )r   rl   r  rf   rf   rg   r   	  s    
z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  r5  r   r  r   rl   r   r  rm  r   )rk   r  r  r  r  r  r  fused_nodesr  r   new_noder  rf   rf   rg   rl     sj   



zForeachKernelSchedulerNode.fuseNFr   r   r   r  r  r  r  r  r  c                   s  i  _ i  _|d u s|d u r4t || |D ]}|jjD ]}| j |j< q| D ]}	| j|	< q*qn| _| _	d  _
g  _ tj|j|jg t fddt|j|jD  jj  _t|j|jg _t|j|jg _| rt|tsJ ||}
}nt|tsJ ||}
}|
j _ j|j |
j _| D ]}	| j|	< qdd  j	D  _| _|d  }|sJ |t !dfff _"tt#j$j%   _&| _'d S )Nc                 3  rY  rq   rZ  r   rd   rf   rg   r   \	  s    z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>c                 S  s&   i | ]}|j  D ]\}}||q	qS rf   )rK  items)r   r  r  vrf   rf   rg   r\  w	  s
    

z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>r   combo_kernel)(r  r  r  rV  r   r   r   r   r   r   r   r  r  r'   r\  r]  r   r^  rP  rq  rD  rC  rn  rD  r  r   r  r   r  rK  r  r   r   Exprr   r~  fxNoder!  r  )re   r   r   r  r  r  r  r   r  r   foreach_noder   r  r  rd   rg   rV  :	  sb   	


z#ForeachKernelSchedulerNode.__init__r  c                   s   dd |D }|rt dt|dd |D  dd |D }|r(t dt| dd |D }|r9t dt| d	d |D }d
d |D }|rQt dt| dd |D }dd |D   rjt dt    fdd|D }tjrdd |D }|rt dt| dd |D }|S )Nc                 S     g | ]	}t |tr|qS rf   )r   rI  r  rf   rf   rg   r   	      z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>z/ComboKernels: %d external nodes are filtered %sc                 S  s    g | ]}|j d ur|j  qS rq   r   r  r  rf   rf   rg   r   	  s     c                 S  r  rf   )r   rc  r  rf   rf   rg   r   	  r  z+ComboKernels: %d grouped nodes are filteredc                 S  r  rf   )r   r  r  rf   rf   rg   r   	  r  z;ComboKernels: %d FusedMixOrderReductions nodes are filteredc                 S  s"   g | ]}t |ttttfs|qS rf   )r   r  rI  rc  r  r  rf   rf   rg   r   	  s    c                 S  r  rf   r   r  r  rf   rf   rg   r   	  
    
z+ComboKernels: %d foreach nodes are filteredc                 S  s   g | ]	}t |ts|qS rf   r  r  rf   rf   rg   r   	  r  c                 S  re  rf   r  r  rf   rf   rg   r   	  rf  z0ComboKernels: %d template nodes are filtered: %sc                   s   g | ]}| vr|qS rf   rf   r  rh  rf   rg   r   	  rf  c                 S  re  rf   r  r  rf   rf   rg   r   	  rf  zCComboKernels: %d reduction nodes are filtered (pointwise_only mode)c                 S  s   g | ]}|  s|qS rf   r  r  rf   rf   rg   r   	  rf  )rt  r  r   r%   combo_kernels_pointwise_only)rk   r  externgrouped	mix_orderfiltered_nodesforeach_nodesreduction_nodesrf   r  rg   combinable_nodes	  s^   z+ForeachKernelSchedulerNode.combinable_nodeslist[list[BaseSchedulerNode]]c              	     s   |   }g }dtdd |D }|D ]D}tt}|D ]!}| }|r.|jdks-|jdkr.q| |@ r5q|| | q| D ] |	 fddt
dt D  qAq|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                 S  s2   g | ]}|D ]}t |tr| D ]}|qqqS rf   )r   r  r  )r   r   r   r  rf   rf   rg   r   	  s    
zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>mpsr  c                   s   g | ]
} ||  qS rf   rf   )r   r  device_nodesmax_num_nodesrf   rg   r   	  s    r   )_topological_sort_nodesr   r   r   r   r   r   r   r   r  r  r   )r   sorted_nodesgrouped_nodesexcluded_buffer_namesr  device_groupsr   r  rf   r  rg   &_default_group_nodes_for_combo_kernels	  s4   	zAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                 C  s
   | t _d S rq   r  r  )r  rf   rf   rg   %set_group_algorithm_for_combo_kernels	  s   z@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsc                 C  s
   t | S rq   r  r   rf   rf   rg   group_nodes_for_combo_kernels	  s   
z8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsc                 C  r  rq   r  rd   rf   rf   rg   r  
  ry  z#ForeachKernelSchedulerNode.mark_runc                 C  r  rq   r  rd   rf   rf   rg   r  
  ry  z"ForeachKernelSchedulerNode.codegenc                 C  rw  r  rf   rd   rf   rf   rg   r  	
  ry  z%ForeachKernelSchedulerNode.is_foreachc                 C  s
   t | jS )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r   rd   rf   rf   rg   get_subkernel_nodes
  s   
z.ForeachKernelSchedulerNode.get_subkernel_nodesr  c                 C  s   t tjdd | jD S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c                 s  r  rq   )r   r  rf   rf   rg   r   
  r  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>)r   r  r  r6  r   rd   rf   rf   rg   r   
  s   z$ForeachKernelSchedulerNode.get_nodesr   c                 C  rz  r   )r   r  rd   rf   rf   rg   r  
  r7  z)ForeachKernelSchedulerNode.get_first_namer  r  c                 C  s*   t | || jj | jD ]}|| qd S rq   )r  r   r'  r   r  )re   r  r   rf   rf   rg   r  
  s   
z/ForeachKernelSchedulerNode.prune_redundant_deps)r  r[   rz   r   )r  r[   rz   r   r  r[   r  r[   rz   ri   )r  r[   r  r[   rz   r  )NNF)r   r   r   r  r  ri   r  r   r  r   r  ri   rz   r  r  r  rz   r  )r   r   rz   r  )r  r  rz   r  r:  r;  rz   r  r  r8  r  )rr   rs   rt   r   r  r  rv   r   rl   rV  r  r   r  r  ru   r  r  r  r  r  r  r   r  r  r  rf   rf   r  rg   r    s:   
 

	.EHA.






r  c                      s   e Zd ZU dZded< ed.ddZ	d/d0 fddZd1ddZd2ddZ	e
d3ddZd3ddZe
d4ddZd5ddZe
d6d!d"Zd7d$d%Zd8d'd(Zed9d,d-Z  ZS ):rc  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   rz   c                   sX   |d j  t fdd|D sJ |  |}|D ]	}| j| < q| j| < |S )Nr   c                 3  s    | ]}|j  u V  qd S rq   r  r  r  rf   rg   r   1
  r  z.GroupedSchedulerNode.create.<locals>.<genexpr>)r   r   r  r  )rk   r   grouped_snoder  rf   r  rg   create.
  s   

zGroupedSchedulerNode.createFr   r   temp_groupingri   r  c                   s"   t  | t| || || _d S rq   )r  rV  rd  r  )re   r   r   r  r  rf   rg   rV  8
  s   
zGroupedSchedulerNode.__init__c                 C  sD   | j r| jS | jD ]
}|| jj| < q	| jj|  = | j| jS )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r   r   r  r  
fuse_nodes)re   r  rf   rf   rg   unpackG
  s   
zGroupedSchedulerNode.unpackfake_depr0   c                 C  s"   |  | j| | j| d S rq   )r  r   r  rP  r  )re   r  rf   rf   rg   r  T
  s   z!GroupedSchedulerNode.add_fake_depr   c                 C  ru  )Nrv  c                 S  rw  rf   rZ  r  rf   rf   rg   r   Z
  rx  z1GroupedSchedulerNode.get_name.<locals>.<listcomp>ry  rd   rf   rf   rg   r  X
  r  zGroupedSchedulerNode.get_namec                 C  rz  r   r{  rd   rf   rf   rg   r  \
  r7  z#GroupedSchedulerNode.get_first_namerA  c                 C  r|  )Nc                 S  rw  rf   r}  r  rf   rf   rg   r   a
  rx  z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>r~  rd   rf   rf   rg   r  _
  r  z%GroupedSchedulerNode.get_buffer_namesrH  c                 C  r  rq   r  r  rf   rf   rg   r^  c
  r  z GroupedSchedulerNode.get_outputsrT  c                 C  rk  )Nc                 s  rl  rq   rm  r  rf   rf   rg   r   o
  rn  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>r   ro  rq  rf   rf   rg   r]  i
  rs  z#GroupedSchedulerNode.estimate_flopsr  c                 C  r  rq   r;  rd   rf   rf   rg   r   {
  r  zGroupedSchedulerNode.get_nodesr6  c                 C  s   | j r
| j d  S d S r   )r   r   rd   rf   rf   rg   r   ~
  r.  zGroupedSchedulerNode.get_devicer  r[   r  c                 C  rw  r  rf   )rk   r  r  rf   rf   rg   r   
  r  zGroupedSchedulerNode.can_fuse)r   r  rz   rc  )F)r   r   r   r  r  ri   rz   r  r  )r  r0   rz   r  r8  r  r  r  r  r=  r  )rr   rs   rt   r   ru   rv   r  rV  r  r  rE   r  r  r  r^  r]  r   r   r   r  rf   rf   r  rg   rc  "
  s*   
 	





rc  rf   stride_lengthslist[list[int]]r  r0  priority_idxr  	list[int]c                   sb   t jd fdd}ttttd }t|dkr&fdd	|D tjr/|j|d
 |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    r  r   brz   c                   s     dks dkrt   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krIdS ||krOdS t  S )	Nr   c                      g | ]}t |  qS rf   absr   sl)r  rf   rg   r   
  rf  z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                   r  rf   r  r  )r  rf   rg   r   
  rf  c                 s  s$    | ]\}}|d kp||k V  qdS r   Nrf   r   sl_asl_brf   rf   rg   r   
      
z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s  s$    | ]\}}|d kp||k V  qdS r  rf   r  rf   rf   rg   r   
  r  r   )rF   r   r5  )r  r  stride_len_astride_len_ba_firstb_firstr  r  )r  r  rg   	index_cmp
  s   
z"pick_loop_order.<locals>.index_cmpr   c                      g | ]} | qS rf   rf   )r   pi)r  rf   rg   r   
  rx  z#pick_loop_order.<locals>.<listcomp>r  N)r  r   r  r   rz   r   )		functools
cmp_to_keyr   r   r  r   r%   pick_loop_orderssort)r  r  r  r  orderrf   r  rg   pick_loop_order
  s   
r  	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferc                 C  s   |  }|   }t|trt|tsJ | }|  }t|tr&t|ts(J tjj|= ||_tjj|= ||_	tjj
| }tjj
| |tjj
|< |tjj|< tjj| }tjj| |tjj|< |tjj|< d S rq   )r  r   r   r  rV   r   r  r   
name_to_opoperation_namebuffersr   remove
operations)r  r  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigrf   rf   rg   _replace_operation_buffer
  s$   

r  r^  c                 C  s4   |  }|  }|| }|| }|d|  }|| S r  )r.  r/  )rx   ry   epilogue_runtimetotal_read_bytestemplate_write_bytesextra_bytesextra_bytes_ratioextra_memory_ratiorf   rf   rg    _estimate_fused_epilogue_runtime
  s   r"  c                   @  sV   e Zd ZU ded< dZded< dZded< dd	d
ZdddZdddZdddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]r   Fri   r  is_weakrz   r   c                 C  s   t | j | j| jfS rq   )r  r   r  r  r%  rd   rf   rf   rg   r	  
  r.  zNodeUser.__hash__r  objectc                 C  s2   t |to|  | ko| j|jko| j|jkS rq   )r   r#  r  r  r%  r  rf   rf   rg   __eq__
  s   


zNodeUser.__eq__r   c                 C  r  rq   r  rd   rf   rf   rg   r  
  r  zNodeUser.get_namec                 C  s.   | j |j u sJ t| j | jo|j| jo|jS rq   )r   r#  r  r%  r  rf   rf   rg   r1  
  s   

zNodeUser.mergeNr9  )r  r&  rz   ri   r8  )r  r#  rz   r#  )
rr   rs   rt   ru   r  r%  r	  r'  r  r1  rf   rf   rf   rg   r#  
  s   
 


r#  ri   c                   C  s   t jS rq   )r%   r  rf   rf   rf   rg   *used_non_deterministic_runtime_estimations  r  r(  	ir.IRNodeOrderedSet[sympy.Symbol]c                 C  sx   t  }|  }t|tjr/|t|jt|jB t|j	B  t|tj
r-|t|j |S |du s:J d| |S )z=Get free symbols from a node's layout (size, stride, offset).Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r(   Layoutr  r   r   strideoffsetr  get_layout_symintsr  )r   free_symbol_usesr  rf   rf   rg   r/    s   r/  c                 C  sX   t | trt jdd | jD  S | jdusJ | j }|jdd | j D   |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c                 s      | ]}t |V  qd S rq   get_scheduler_node_symbol_uses)r   r  rf   rf   rg   r      r  z1get_scheduler_node_symbol_uses.<locals>.<genexpr>Nc                 s  r1  rq   )r/  )r   ir_noderf   rf   rg   r   %  r  )	r   r   r   r^  r   r   get_free_symbol_usesr  r^  )r   r0  rf   rf   rg   r3    s   

r3  rx   ry   c                 C  s   |   otjo|   S rq   )r  r%   epilogue_fusionr|   rf   rf   rg   is_epilogue_fusion*  r.  r7  c                 C  s   |  otjo|    S rq   )r  r%   prologue_fusionr|   rf   rf   rg   is_prologue_fusion.  r.  r9  c                 C  s   t | |p	t| |S rq   )r7  r9  r|   rf   rf   rg   is_template_fusion2  s   r:  c                 C  s   t | |r|S | S rq   r7  r|   rf   rf   rg   template_fusion_pw_node6  r6  r<  c                      s  e Zd ZdZdddZd fdd	ZdddZedddZej	dddZdddZ
d ddZd!ddZdddZddd Zdd!d"Zdd#d$Zd"d(d)Zd#d+d,Zd$d.d/Zd%d1d2Zdd3d4Zdd5d6Zd#d7d8Zdd9d:Zd&d=d>Z	?d'd(dCdDZd)dHdIZd*dLdMZddNdOZd+dUdVZd,dXdYZ	?d'd-d[d\Z d.d`daZ!d/dbdcZ"d0dfdgZ#d1djdkZ$d2dndoZ%d3dvdwZ&d4dxdyZ'd5d|d}Z(d6d~dZ)d'd7ddZ*d8ddZ+d9ddZ,d:ddZ-d:ddZ.d;ddZ/d:ddZ0d<ddZ1d=ddZ2d=ddZ3d>ddZ4d?ddZ5d@ddZ6		dAdBddZ7d:ddZ8dCddZ9dDddZ:dEdFddZ;			dGdHddZ<dIddZ=dJddZ>dddĄZ?dddƄZ@dddȄZAdKdd̄ZBdLddτZCdMddфZDdNddӄZEdOdd؄ZFdPddڄZGeHdQdd݄ZIdPdd߄ZJdRddZKdSddZLdTddZMdUddZNdVddZOd#ddZPd#ddZQd#ddZRdWddZSdXdd ZTdNddZUdddZVdYddZWdZdd	ZXdXd
dZYdddZZd8ddZ[d[ddZ\d\ddZ]d]ddZ^dddZ_  Z`S (^  r   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    r  list[ir.Operation]rz   r  c                 C  s8   t d | | W d    d S 1 sw   Y  d S )NScheduler.__init__)r   _initre   r  rf   rf   rg   rV  @  s   
"r>  c                   sV  t     tj_i  _tt _t	
  _t  _tg tjj tjj tjj  _ fdd|D  _d  _d  _    jtjj   jD ]}|  qRd  _   _dd  jD  _dd  jD  _ j  _i  _ i  _!t  _"t#$ j j j _ %   & j _ '  dd  jD  _ (  t) j*t+ j7  _*ddl,m-}m.} | j t+ j _/ 0   & j _tt1t2t2f    _3t4j5d urt45 j _t4j6rd	d
l7m8} |9   (   : j _t4j;d urt4; j _ <   =  t4j>st4j?r)t@ r)tAjBjCjDE  t4jFrKtGdddd  jHd d W d    n	1 sFw   Y  t4jIrld	dlJmI} | j j jttjj ttjK  _t4jLst4jMrt4jIsd	dlJmN} | j j tO rtPjQrt4jRstPjSrd} jD ]}tT|jUrd} nq|rd	dl#mV}	 |	 j tPjWrddlXmY}
 |
ddd  fddd t#Z j _ [  t4j\rt4j]j^rt4j]j_r ` j _ a j _ b  tAjBj4jcjdr e  | j tjfg j  h  t  _ii  _jtkdl fdd t  _md S )Nc                   s   g | ]}  |qS rf   )create_scheduler_noder  rd   rf   rg   r   S  r]  z#Scheduler._init.<locals>.<listcomp>c                 S  rY  rf   rZ  r  rf   rf   rg   r\  b  rb  z#Scheduler._init.<locals>.<dictcomp>c                 S  s$   i | ]}|  D ]}| |qqS rf   )r^  r  )r   r   r   rf   rf   rg   r\  f  s
    
c                 S  rY  rf   rZ  r  rf   rf   rg   r\    r]  r   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotune#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                   S  s
   dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingrf   rf   rf   rf   rg   rS    s   z!Scheduler._init.<locals>.<lambda>c                     s   d dd t jD S )Nz

c                 S  s2   g | ]\}}d | d|   d|   qS )zsnode[r  z buffer_names:)r  r  r  rf   rf   rg   r     s    
z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>)r%  r  r  rf   rd   rf   rg   rS    s
    )metadata_fn
payload_fngraph_statsc                     s    j  jt jdS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  rf   rd   rf   rg   rS    s   )nr  rV  rV   r   r   backendsr  _post_grad_graph_counterrX  r  count_graph_partition_counterr   r  rP  r   	constantstorchbind_constantsr  r  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr&  r  r'  copyr  r  rG  seen_template_fusionsr$   decide_global_ordering_of_commsrY   topological_sort_scheduledead_node_eliminationcompute_ancestorsr)   ir_nodes_pre_fusionr   torch._inductor.debugrB  rC  rY  create_foreach_nodesr   r   logged_slow_fusionr%   _pre_fusion_custom_passdistributed_max_autotune_gemmrx  rD  scheduler  _post_fusion_custom_passr  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r~  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodesrJ  memoryget_output_namesdeterministic reorder_for_compute_comm_overlaprK  r(  r&   6runtime_estimations_align_across_all_distributed_ranksr  rd  rO   r   rL  reorder_sink_verbose_loggingtorch._loggingrM  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   rZ   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)re   r  r   rB  rC  rD  rJ  rK  has_collectivesrL  rM  r  rd   rg   r?  D  s  











	






zScheduler._init!dict[str, SchedulerDonatedBuffer]c                 C  sD   i }t jjD ]}tt jj| tjrt| t jj| d d||< q|S )N)r   )rV   r   graph_inputs_originalr   r(   DonatedBufferr@  )re   name_to_donated_bufr   rf   rf   rg   rd    s   

zScheduler.get_donated_buffersr6  c                 C  s   t jjS rq   rV   r   current_devicerd   rf   rf   rg   r  &     zScheduler.current_devicer  c                 C  s   |t j_d S rq   r  r  rf   rf   rg   r  *  r  c                 C  s4   t jdddkrddlm} || jdd dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  r  r  )re   r  rf   rf   rg   r  .  s   zScheduler.debug_draw_graphlabelr   c                 C  s4   t tjrt d| | jD ]}|  qd S d S )Nz%s:)rt  isEnabledForloggingINFOr  r  r  )re   r  r   rf   rf   rg   debug_print_nodes5  s   

zScheduler.debug_print_nodesr   rW  r[   c                 C  s`   |  d us
J d| rt| |S t|tjtjfr!t| |S t|tjr,t	| |S t
|)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r(   r   r  r   r  rI  r  r_  rf   rf   rg   rA  ;  s   


zScheduler.create_scheduler_nodec                   s   t  g }j  tjj D ]9} fdd|D }|sq| fdd|D }tj	dk}t
|d|d}|| |D ]}|j|< qAqfddjD t| _d S )Nc                   s(   g | ]}| v rt j| ts|qS rf   )r   r  r  r  )kept_node_namesre   rf   rg   r   N  s    z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                   s   g | ]} j | qS rf   r  r  rd   rf   rg   r   Y  r]  r   Fr  r  c                   s   g | ]
}|   vr|qS rf   rZ  r  )removed_node_namesrf   rg   r   h      )r   r  r   rV   r   listsr   r  r%   combo_kernels_autotuner  r   r  r   )re   fe_nodesnamesr   r  fe_noder   rf   )r  r  re   rg   rm  H  s6   





zScheduler.create_foreach_nodesc           $        s  G  fdddt t  t jD ]`}| D ]Y}| }t|jj	t
jr1t| dkr1q| D ]=}|v ra|v ra| }| }|| }D ]}| |u s[| |u r_||< qMq5|v rl| |< q5| |< q5qqd/fdd				d0d1fdd}	i }
tjj D ]1}t|tjr|jD ]}d|
|< qqt|t
jrdd | D }|D ]}|jD ]}d|
|< qqqd	}jD ]-}|jdusJ t|j dd d}|D ]}t|tjsJ d}||
vr| |
|< qqǈjD ]j}td|j |rK|jdusJ t|jjdddd d}|D ].}||
v s,J | d|
 |
|  }durIj|  D ]}|t|  q<qt|j j!dkrit"t#|j j! }rit|t$ri|j%}nd}| D ]p}t|& dks|J |& D ]]}|}|	|| |t||d | j'D ]B}| | krqt|jt(sJ |j D ]%}| }|}|| v }|t)|| | d |	||dd  qqqqotjj*|  D ]}|	||dd  |t)|| dd! qtjj+|  D ]}|	||d	d  |t| q|j j,D ]}t|t)s/|	|j-||.| q|/j0 | D ]'}|& D ]}| j0|< | j0|< j12||j1| < qAq;qtj3 D ]}td"| |	|t4t| qi|rtjj5D ]?}|jddD ]5}||
v sJ | d|
6  |
|  }rj| 7 D ]}td#|| |	|t4t| qqqj0D ],}|tjjv r|	|t4t| tjj89| q|tjj:v r|	|t4t| qd$d% t;tjj6 D fd&dtjj8D tj_<jD ]}| D ]}|=|  j' qqj>D ]}j>| =| j' q)t? } | @d' ' D ].\}}!| A  d(d |!j'D }"| @d)| d*|" d+ W d   n	1 slw   Y  qD| @d, | B C }#tDd- tDd.|# dS )2zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                      s8   e Zd ZdZ		ddd	d
ZdddZd fddZdS )z1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr  Optional[list[_T]]
membershipOptional[OrderedSet[_T]]rz   r  c                 S  s   |pg | _ |p	t | _d S rq   )r  r   r  )re   r  r  rf   rf   rg   rV  |  s   
z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	node_userr]   c                 S  s*   || j v rd S | j| | j | d S rq   )r  r  r   r  )re   r  rf   rf   rg   r     s   
z8Scheduler.compute_dependencies.<locals>.DedupList.appendr  DedupList[_T]c                   s4   t  j|j} j fdd|jD  }||S )Nc                   s   g | ]	}| j vr|qS rf   )r  r  rd   rf   rg   r     s    zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>)r   r^  r  r  )re   r  new_membership	new_items	DedupListrd   rg   __add__  s
   
z9Scheduler.compute_dependencies.<locals>.DedupList.__add__rR  )r  r  r  r  rz   r  )r  r]   rz   r  )r  r  rz   r  )rr   rs   rt   r   rV  r   r  rf   r  rf   rg   r  r  s    
r  r   r  r   rz   c                   s   | j v r j |  S | S rq   rG  r  )r  re   rf   rg   r    s   
z.Scheduler.compute_dependencies.<locals>.renameFused_by_namer  r$  r  ri   r%  r  c                   s    |   t||| d S rq   )r   r#  )r  r  r  r%  )name_to_usersr  rf   rg   add_user  s   
z0Scheduler.compute_dependencies.<locals>.add_userNc                 S  s   g | ]
}t |tjr|qS rf   )r   r   r  r   r3  rf   rf   rg   r     r   z2Scheduler.compute_dependencies.<locals>.<listcomp>c                 S  r  rq   r  r  rf   rf   rg   rS        z0Scheduler.compute_dependencies.<locals>.<lambda>r  Tzscheduling %s)unbacked_onlyc                 S  r  rq   r  r  rf   rf   rg   rS    r  z not in )rL  )mutating_bufr  )r%  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S     i | ]\}}||qS rf   rf   )r   r   r   rf   rf   rg   r\  `  rb  z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                   r  rf   rf   r  )	inp_namesrf   rg   r   c      r  c                 S  rw  rf   rZ  )r   r  rf   rf   rg   r   t  rx  'z': r  r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   rz   r   )FF)
r  r   r  r$  r  ri   r%  ri   rz   r  )Er   r]   rO  r   r  r^  r  r   r   r  r(   r<   r   r  rV   r   rP  r   r   r  r   	TensorBoxr  r  get_unbacked_symbol_defsSymbolrt  r  r5  r  r  r2   r   rq  r  rH  r1   rL  r  r  r[   r3   additional_buffer_depsadditional_star_depsr   r   r  r  rG  r  r  r|  r-  graph_outputsr   r  mutated_inputsr  r^  r  mutated_input_idxsr2  r&  rN   ro  r  r  rv  compute_dependencies_log)$re   r   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_nodevalfssym_sizer3  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namer  out_buf
other_nameis_aliasadd_depr  r  r   r   logbufrb  r  r   rf   )r  r  r  r  re   rg   rY   l  s\   
	




	
	








zScheduler.compute_dependenciesc                   sF  ddl m}m}m}m} ttjj	 }| j
|}tjjjs&| j
 j ttj }| j
||\}}	}	dd tt j
D |D ]&}
|
jdkrR|
jdkrRqE|
j }|
j d | |
j d | qEddlm} |  d fdd}g }t j
D ]\}}|| ||||t j
d kd q| _
d S )Nr   )rK  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                 S  s   g | ]}g g fqS rf   rf   )r   rv  rf   rf   rg   r     r  z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>r   )register_check_mem_opstep_idxr   is_final_stepri   rz   rI  c                   sn   |  d }|  d }|||g}t jttddtjjjjg |dd d}d j	|  
  |_t |S )	Nr   r   r  r  c                 S  s   | |d |d |d dfS )Nr   r   r   )alivedeadr  rf   )tensor_argsr  rf   rf   rg   rS    s   zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>)r  r$  r  nontensor_argsunflatten_args
mem_check_)r(   MemoryCheckKernelr<   r~  r  r  _inductor_debugcheck_memory_stepdefaultr  r  r  rI  )r  r  expected_newly_aliveexpected_newly_deadr  r   re   step_allocs_deallocsrf   rg   construct_mem_check_node  s   


zEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node)r  )r  r   r  ri   rz   rI  )r{  rK  r  r  r  r   rV   r   rP  r   r  r~  r  r%   rJ  r'  r|  r  r   
size_alloc	size_freer  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr  r  )re   rK  r  r  r  rP  name_to_freeable_input_bufr  buf_info_listrv  buf_infor  r  r  	new_nodesr  r   rf   r  rg   r  {  sB   





z#Scheduler.insert_memory_check_nodesc                   s*  t jsdS g }t| jD ]uddd d} D ]$}t fd	d
|jD }|r;td|	  t
jj|	  qd}q  oE| }|sN| qtd	  t
jj	  jjD ]}|j| jv r| j|j j}fdd|D | j|j _qcqtt|| _| jD ]  qdS )z0
        Remove any nodes without users
        Nr  r#  rz   ri   c                 S  s   | j p
|  tjjv S rq   )r%  r  rV   r   r  )r  rf   rf   rg   can_eliminate_user  r  z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userFc                 3      | ]} |V  qd S rq   rf   r   u)r  rf   rg   r     r  z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead buffer: %sTzremoved dead operation: %sc                   s"   g | ]}|j    kr|qS rf   r  r  r   rf   rg   r     s    z3Scheduler.dead_node_elimination.<locals>.<listcomp>)r  r#  rz   ri   )r%   use_dcer   r  r^  r   r  rt  r  r  rV   r   r  r  r  r   r  r   r   r   r'  r   r  )re   updated_nodesactive_buffersr   can_eliminater  r  rf   )r  r   rg   ri    s:   



zScheduler.dead_node_eliminationrL  Optional[str]ri   c                 C  s   |duS )z:Check if store mode requires cross-thread synchronization.Nrf   )re   rL  rf   rf   rg   mode_requires_synchronization  r  z'Scheduler.mode_requires_synchronizationr  c                   s^   t t  t  g d fdd|D ]}| D ]}| |< qq|D ]}| q&S )	z?
        Ensure nodes is in topologically sorted order
        r  r[   rz   r  c                   sV   | vr) |  t| jdd dD ]}|j vrq |j  q|  d S d S )Nc                 S  r  rq   r  )drf   rf   rg   rS    r  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>r  )r  r  rP  r   r   )r  r   r  r  seenvisitrf   rg   r    s   

z2Scheduler.topological_sort_schedule.<locals>.visitN)r  r[   rz   r  )r   r[   r4  r  )re   r  r   r   rf   r  rg   rh    s   



z#Scheduler.topological_sort_scheduler  c                   st   t  }t|tttttfr|jD ]}||j	 qn
t
dt| d fdd|D }tt  fdd|D S )Nz+get_unmet_dep_nodes is not implemented for .c                 3  s    | ]
} j |  V  qd S rq   )r'  r  r   rd   rf   rg   r   &      z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>c                 3  s    | ]} j | V  qd S rq   r  r  rd   rf   rg   r   '  r  )r   r   r   rI  r  r   rc  rP  r  r   RuntimeErrorr   r   )re   r  
unmet_depsr   unmet_dep_opsrf   rd   rg   _get_unmet_dep_nodes  s$   

zScheduler._get_unmet_dep_nodesr  c                 C  s   g }t | jd}i }| jD ]!}| |}t|||< |D ]}||g }|| |||< qqdd | D }|rf|| |D ]}	||	g D ]
}
||
  d8  < qJ||	 qBdd | D }|s;|rlJ d|S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                 S     g | ]
\}}|d kr|qS r   rf   r   r  r  rf   rf   rg   r   8  r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>r   c                 S  r  r  rf   r  rf   rf   rg   r   ?  r   zTopological sort failed!)	r4  fromkeysr  r  r   r  r   r  r  )re   r  r  childrenr   r  r   czero_deg_nodesr  r  rf   rf   rg   r  )  s,   




z!Scheduler._topological_sort_nodesc                 C  s~   i }| j D ]'}t }|jD ]}| j|j  }|| ||| O }q||| < ||_qt	| j D ]
\}}||_
||_q2dS )z.
        Populate each node.ancestors
        N)r  r   rP  r'  r   r  r  r  r   r  rC  rD  )re   name_to_ancestorsr   r   r   dep_node_namer  rf   rf   rg   rj  C  s   


zScheduler.compute_ancestorsc                 C  sf   t jsd S | jD ](}t|ttfr| st jdkrq| D ]}t|tr*|	 r+q|
  qqd S )Nhalide)r%   r  r  r   r   r   rQ   cpu_backendr   r  r  )re   r   r  rf   rf   rg   r  V  s   


zScheduler.merge_loopsc                 C  s   t ddddR tdD ]6}t|}td|d | | j|dd}t|}td	|d || ||ks8|dkrBtd
|d   nqtjsItjrP| j|dd}|W  d   S 1 s\w   Y  dS )zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTrF  r  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   r  r  fuse_nodes_oncer%   r  loop_index_inversion_in_fusion)re   r  r  old_lennew_lenrf   rf   rg   r  q  s>   $zScheduler.fuse_nodesc                 C  s8   g }| j D ]}|t|tr| n|g q|| _ dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  r   rc  r  )re   r  r   rf   rf   rg   r    s   

zScheduler.process_grouped_nodesr  tuple[float, str]c                 C  sh   t |dksJ |d  }|| _| |}tdddd ||W  d   S 1 s-w   Y  dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)rG  dynamo_compile_column_usN)r   r   r  r  r   r*  )re   r  r  r  rf   rf   rg   r*    s   
$zScheduler.benchmark_fused_nodesNbenchmark_kernelhint_overrideOptional[int]c                 C  sh   t |dksJ |d  }|| _| |}td |j|||dW  d   S 1 s-w   Y  dS )r)  r   generate_kernel_code_from_nodesr.  N)r   r   r  r  r   r0  )re   r  r-  r.  r  r  rf   rf   rg   r0    s   


$z)Scheduler.generate_kernel_code_from_nodesmoduler   r  c                 C  sF   || _ | |}td ||W  d   S 1 sw   Y  dS )r)  benchmark_codegened_moduleN)r  r  r   r3  )re   r2  r  r  rf   rf   rg   r3    s
   

$z$Scheduler.benchmark_codegened_module
multi_noder  c                 C  s   t jj}|sdS td|| |jD ]<}| }t|ddr"||vr#q|j}|| }t	|t
jr9||j |j}t	|t
jrN||krNtd|||  dS qdS )z
        Check if selecting a Triton template would cause layout conflicts.
        Returns True if there's a conflict and we should fall back to ATen.
        FzNode %s has constraints %sr  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rV   r   buffer_layout_constraintsrt  r  r  r  rp  r  r   r(   FlexibleLayout freeze_layout_with_exact_stridesr-  FixedLayoutru  )re   r4  constraintsinpinp_namer  expected_layoutrf   rf   rg   !_has_layout_conflict_for_template  s.   
z+Scheduler._has_layout_conflict_for_templatec              	   C  s  t | jD ]\}}t|trt|jtjr|j}tjj	s#|
 \}}ntdd | D }t|tjjjrZ| |rZ| D ]}t|tjjjrM|} nq?t|tjjjsZJ dt|tjjjrtjri }||d< tjD ]!}|j|d}	dd |	 D }
t|
 dd	 d
d }|||< qn|j| n|j| qtj|j | }W d   n1 sw   Y  |j}t|tjsJ |j}t|tjsJ |jrt ||j |j!|_!| "|||| qdS )a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c                 s  s$    | ]}t |tjjjr|V  qd S rq   )r   r~  r  rv  ExternKernelCaller)r   timingrf   rf   rg   r   	  s    
z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>zZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr1  c                 S  s    i | ]\}}t |tr||qS rf   r   r   )r   r  r  rf   rf   rg   r\  4  s    z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>c                 S     | d S r  rf   r  rf   rf   rg   rS  9  r  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>r  r   )#r  r  r   r   r   r(   MultiTemplateBufferr%   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr~  r  r   r=  rv  r>  multi_kernel_hintsr  rD  finalize_as_triton_callersfinalize_as_triton_callerrc  current_originsr!  output_noder   
StorageBoxOperationBufferorigin_noder7   r  _replace_node)re   r  r   r4  min_node_unfusedrv  choicecallershinttimingstriton_timingsout_tensorboxout_storage
out_bufferrf   rf   rg   rs    sx   





z)Scheduler.finalize_multi_template_buffersrW  r  r  r   r   c                   s   t || | |}|| j|< || j| < || j| < i  t|jj	|j
D ]}| j|jd  }r9|j |< q(d fdd}||j
|_
||jj	|j_	t| | D ]\}	}
|	| j|
 < |
j|	_qX|j|_|j|_|j|_|j|_d S )Nr  rO  rz   c                   s   t  fdd| D S )Nc                 3  s    | ]}|  V  qd S rq   )r  r   r  rf   rg   r   c  r  z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>r   )r  r  rf   rg   rename_depsb  r  z,Scheduler._replace_node.<locals>.rename_deps)r  rO  rz   rO  )r  rA  r  r  r  r  r  r  r   r   rP  r  r  r   r5  r^  r'  r  rC  rD  r   rB  )re   rW  r4  r  r   new_scheduler_noder   	real_namerX  new_outold_outrf   r  rg   rN  N  s4   




zScheduler._replace_node	node_listc                 C  s   t dd |D S )Nc                 s  sB    | ]}t |jd o|jduot |jjdo|jjjdkV  qdS )r   Nscatter_moderM  )r#  r   r   r^  r  rf   rf   rg   r   x  s    
z,Scheduler._any_atomic_add.<locals>.<genexpr>)r   )re   r]  rf   rf   rg   _any_atomic_addw  s   zScheduler._any_atomic_add)tuple[Optional[LambdaFuture], ModuleType]c                 C  s^   | j |d|d}t|}tjj }| sd }||fS |jd|d}t	|t
s+J ||fS )NT)r-  r.  triton_)kernel_namesource_code)r0  r   loadr~  r  async_compileAsyncCompileuse_process_poolr   r   r   )re   r  r.  src_codemodre  futrf   rf   rg   compile_kernel  s   
zScheduler.compile_kernelrx   ry   r_   c                   s2  t dd fD }tjs|stdS  r!t tj	r)
 s)
 r.tdS  }|d  s<J jdkrKtjdkrKtdS  }tt||}|ratdS ddlm  t|d  dusxJ d%fdd|rst dd fD rs dur n ttjsJ rtdS i 
g tjD ]}|t dd dD ]1\}}	t|tjjjsqɈ | !|gj"||j#dR  W d   n1 sw   Y  qt$d}
d}i }D ]f\}}}z|dur|%  W n) t&y> } zt'(t)j*r3t'+ds-dndt,| W Y d}~qd}~ww  | -|\}}|||< ||
k rZ|}
|}W d   n	1 sew   Y  q|j.|< t|t/syJ |
|< qtj0t1dd j2D }t3 o o|tj4k	t$dt$dd	s 5 \t t67dd}ndd j2D }r׈rψ8|n8|\}nstdS 9 t:g d}|D ]Y\}}t|t/sqst;|dr|j<j<krqr| kr n0|d7 }|tj4kr% n$ | !|g"|R  W d   n	1 sBw   Y  qt=dkrUtdS d&	
fd!d"}t>|d d S "|"|"|d& fd#d"}tj>|d d$S )'
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c                 s  s(    | ]}|  ot| tjV  qd S rq   )r  r   r~  r(   rB  r  rf   rf   rg   r     s    
z.Scheduler.speedup_by_fusion.<locals>.<genexpr>Tr   r  r   CompilationErrorNms_fusedr^  ms1ms2rz   r  c              	     st   t tjr8| || k r"t d   t|| |  d d S t d   t| ||  d d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r  rA   rB   )ro  rp  rq  r|   rf   rg   
log_fusion  s   z/Scheduler.speedup_by_fusion.<locals>.log_fusionc                 s  s    | ]	}|  d uV  qd S rq   r  r  rf   rf   rg   r     s    
Fc                 S  rA  r  rf   r  rf   rf   rg   rS    r  z-Scheduler.speedup_by_fusion.<locals>.<lambda>r  r1  infException in compiling %s: %sr  r  c                 s      | ]}t |tV  qd S rq   r@  r   r  rf   rf   rg   r     s    

r   c                 S  s   g | ]}|d fqS r  rf   rx  rf   rf   rg   r   '  rx  z/Scheduler.speedup_by_fusion.<locals>.<listcomp>allowed_prologue_inpsri   c                    s  t d} d }i }r(rttjsJ   \	tfdddD ]\}}}z|d ur9| }n sC|j}|	  nd }W n% t
yk } zttjratds[dndt| W Y d }~q*d }~ww  r| |\}}	|||< || k r|} |}W d    n1 sw   Y  q*|kp
	 |  k}
|rt|jdkr|jd	 jd
kr|
r|} nq* rƈ| 	
  r| 	
 k r|d urtjr|d <  n| |jd < dS dS )Nru  c                   s    | d  S r   rf   r  )rE  rf   rg   rS  f  rt  KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>r  rv  r  r  r   r   r  TF)r^  r   r(   rB  rE  rD  r  r  ra  
precompilers  r  r  r  rs  r  r   swap_as_triton_callerr3  r   	launchersn_spillsr%   rF  rG  rH  _choice_timings)min_ms_fusedms_fused_choicenew_timingsrP  rc   	mod_fusedresr   ro  pathfusible_choice)bench_epiloguerE  r  r6  future_choicesget_choice_timings_async hint_override_best_fusion_choicert  
min_choicerp  rq  	ms2_fusedr4  re   rf   rg   benchmark_when_readyY  s   



	

z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyc               
     sp  ddl m}  zd 
d 	d fD ]
}|d ur|  qd \ t r3d W dS 
d \trId W dS 	d \tr_d W dS   tdr  krfjvrjf t	d
 fd	d
   k W S  | y   Y dS  y } zdt|v rW Y d }~dS  d }~ww )Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc                	     s       dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiorf   rf   rp  rq  ro  path1path2
path_fusedrf   rg   rS    s   
rz  Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r  r3  mathisinfr   rn  r  r   r  r   )r  rj  r   )rn  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rt  re   r  r  rg   r    s`   


rn   )ro  r^  rp  r^  rq  r^  rz   r  r;  )?r   r%   benchmark_fusionr_   rl   r  r   r~  r(   TritonTemplateBufferr  r   r   r   r"  r   r  r  r_  triton.compiler.errorsrn  r  rB  r=  rF  rE  r  r  r~  r  rv  TritonTemplateCallerr|  r   rk  r.  r^  r  rs  r  r  r  rs  r  r   r3  r  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesrD  operator
itemgetterr*  r_  r"  r#  ry  r   ro   )re   rx   ry   is_multi_templatenode_list_1node_list_2node_list_fusedr.  rP  rv  r  r  r  rc   r  r   ro  r  num_triton_callerschoice_timings_iterr  triton_choicesunfused_timer  rf   )rn  r  rE  r  r6  r  r  r  r  r  r  rt  r  rp  rq  r  r4  rx   ry   re   r  rg   speedup_by_fusion  s4  




















(X


HzScheduler.speedup_by_fusionc                 C  s   | j |  S )z0Look up the node in Scheduler name_to_fused_node)r  r  r_  rf   rf   rg   r    s   zScheduler.get_fused_noder  OrderedSet[BaseSchedulerNode]c                   s   t d| |  | }| |ksJ | ||| || || |  | j	 fdd 
 D   S )Nzfusing %s with %sc                      i | ]}|   qS rf   rZ  r  node3rf   rg   r\    r]  z,Scheduler.fuse_two_nodes.<locals>.<dictcomp>)r  r  r  r   r  rl   r  r  r  r  r   )re   rx   ry   r  r  rf   r  rg   fuse_two_nodes  s   


zScheduler.fuse_two_nodes
speedup_fnrm   c                 C  s4   |  ||r| ||s| r| ||| dS dS NTF)r   will_fusion_create_cycler  )re   rx   ry   r  r  rf   rf   rg   fuse_if_speedup  s   

zScheduler.fuse_if_speeduptemplate_fusion_candidates,dict[BaseSchedulerNode, list[PendingFusion]]c                 C  sj  |rg }i }t  }|D ]v}||v rt|| dksJ || d}t|| dkr/|| | \}}	|	|krCt||	s@J |}
n||ksIJ t||	sPJ |	}
| |
|
urZq|jrs|jj}|dusgJ |	| ||f||< q| 
||	|j|r|| qt|D ]}|| \}}| 
| |j| |j|j|r|| q|D ]}|| q|sdS dS )z
        Evaluate pending template fusions for a set of fusion candidate nodes.
        The fusion candidate nodes are pointwise nodes as potential epilogue
        or prologue fusions
        r   r   N)r   r   r  r  r}   r7  r9  r  rc   r   r  ra   r   rx   ry   )re   r  r  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionrx   ry   r  fcandrf   rf   rg   "_evaluate_pending_template_fusions-  sV   






z,Scheduler._evaluate_pending_template_fusionspossible_fusion_pairs1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]pending_fusions&dict[BaseSchedulerNode, PendingFusion]template_fusion_nodesr#  c                   s  d	 fdd}|D ]\}}|||  |} |}t||r+||fjv r+q|||r||s||}	|	jd urt|	j|||	jd}
t||rx||fjvs[J j	||f t
||}||vrpg ||< || |
 n|
|< |
|< q|	jsq||  qd S )
Nrx   r[   ry   rz   r  c                   s    | v s |v rk |  |}|d us#J | \}}|j}|d  |d   ||u sAJ  ||u sJJ | rS| |rTq ||   | v s |v sd S d S rq   )r  r  r}   ra   r  r  r  )rx   ry   r  	node_key1	node_key2
is_speedupr  r  re   rf   rg   resolve_pending_fusions{  s$   z<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions)ra   rx   ry   rc   r  )r  r:  rf  r   r  r  ra   rw   rc   r  r<  r   r`   r  )re   r  r  r  r  r#  r  rx   ry   
fusion_resr  template_pw_noderf   r  rg   _try_fusion_pairss  sJ   






zScheduler._try_fusion_pairsc                 C  s|   t  }| D ]4}| \}}|j}||v st||rq|| | ||u s*J | ||u s3J | |||| qd S rq   )r   r   r}   ra   r:  r  r  r  )re   r  r  seen_pair_speedup_fnr  r  r  is_speedup_fnrf   rf   rg   _finish_pending_fusions  s   

z!Scheduler._finish_pending_fusionspossible_fusionsdeferred_prologue_fusionsc                 C  s\   t dd |D }g }|D ]\}}t||r"||v r"|||f q|||f q|}d S )Nc                 S  s   g | ]\}}t ||r|qS rf   r;  )r   n1n2rf   rf   rg   r     s    z6Scheduler._handle_template_overlap.<locals>.<listcomp>)r   r9  r   )re   r  r  epilogue_template_nodesnew_possible_fusionsr  r  rf   rf   rg   _handle_template_overlap  s   z"Scheduler._handle_template_overlapc           	      C  s   |  | t|}ttjr!td |D ]
}td|  qi }i }g }| ||}t	j
s3t	jr?t	jr?t	jr?| || | ||||| | || | || |  |ri| ||||| | || t|dd d}| |}|S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                 S  r  rq   ra  r  rf   rf   rg   rS  3  r  z+Scheduler.fuse_nodes_once.<locals>.<lambda>r  )r  r   r  r  r  rs  r  r  get_possible_fusionsr%   rt  ru  r8  r6  r  r  r  r  clearr  rh  )	re   r  r#  r  r   r  r  r  r  rf   rf   rg   r$    s\   

	
zScheduler.fuse_nodes_oncerI  c           	        s<  t | j}d}t| j}td| tt| D ]a\}}t|}t|dk r)q|dur3||kr3 nH| 	|s?td| q|d7 }t
jdk}t|d j|d|d td	t|| |D ]}|| q^|  | j fd
d  D  qt|dd d| _| | j| _td||t| j | | j dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                   r  rf   rZ  r  r[  rf   rg   r\  [  r]  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>c                 S  r  rq   ra  r  rf   rf   rg   rS  ]  r  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   rt  r  r  r  r  r  speedup_by_combo_kernelr%   r  r   r  r  r  r  r  r   r  rh  r  )	re   rI  r  r\  num_nodes_orignumr]  r  r   rf   r[  rg   rz  7  sV   





rE  c                 C  s   |D ]}| | j qd S rq   )r  r  )re   r  r   rf   rf   rg   r  g  s   zScheduler.prune_redundant_depsc           
        s   g t tttf   d fdd}tt}|D ]}|r$q| D ]	}|| | q(q|	 D ]}|| q7t
jrett}|D ]}t|dd}	|	rY||	 | qH|	 D ]}|| q^jjd	d
 tdt S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  rz   r  c                   s   t | D ]E\}}| |d |d tj  D ]3}||f}|v r q| || r2| q| s:| rH|| rH||f qqd S r  )r  r%   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r  r  )r  node1_indexrx   ry   r  r#  r  r  re   rf   rg   check_all_pairsv  s*   
z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr   NT)r  reversezfound %d possible fusionsr  r  rz   r  )r   r   r[   rO  r   r   unfusable_noder   r   r   r%   aggressive_fusionrp  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )
re   r  r#  r  buffer_names_groupingr   r   node_groupinggroup_groupingr   rf   r  rg   r  k  s6   




zScheduler.get_possible_fusionsc                   s   t t  d fdd| j | j B |jj |jj B   tfdd D }|rAt||d	 |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r   r[   rz   ri   c                   s^   t | tr-| vr-|  |   rdS t| j@ p,tfdd| j  D S dS )NFc                 3      | ]
} j | V  qd S rq   r  r  
found_pathre   rf   rg   r     
    
zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)r   r   r  r   issubsetri   r   r   r   combined_ancestorscombined_namesr  re   visitedrf   rg   r    s   

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3  r  rq   r  r  r  rf   rg   r     r  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycleNr   )r   r   r   _dictr   r   r   r  )re   rx   ry   cyclerf   r  rg   r    s   
z"Scheduler.will_fusion_create_cyclec              	     s   ddl m  dfdd}||}||}t fd	d
|D }t fdd
|D }||}d}	|D ]}
z
|	t|
d 7 }	W q4 tyK   Y  dS w ||}tjj	
|	d| r^dS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   buffer_reuse_keyr   r[   rz   list[ir.Buffer]c                   sL   g }| j jD ]} j|j}|r#t|jdkr#|j r#|	|j q|S r  )
r   r   r'  r  r   r   r  r   has_tensor_outputr   )r   rX  r  r   rd   rf   rg   _find_single_user_inputs  s   zKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsc                 3  r  rq   rf   r[  r  rf   rg   r     r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>c                 3  r  rq   rf   r[  r  rf   rg   r     r  r   r   F    TN)r   r[   rz   r  )r  r  r   intersectionr   ri  r  rV   r   r   statically_known_gt)re   rx   ry   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingrf   )r  re   rg   can_fusion_increase_peak_memory  s$   
z)Scheduler.can_fusion_increase_peak_memory	thresholdc                 C  s  t dd | D dd | D  }t dd |jjD }t dd |jjD }||@ }t  }|jjD ]}	| |	j|rD||	j q5t dd |jjD t dd |jjD B }
t d	d |jjD t d
d |jjD B }|
| }|| }||B }t||kS )Nc                 S  rw  rf   rZ  r  rf   rf   rg   r     rx  zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>c                 S  rw  rf   rZ  r  rf   rf   rg   r     rx  c                 s  r  rq   r  r   rf   rf   rg   r     r  zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>c                 s  r  rq   r  r   rf   rf   rg   r     r  c                 s  r  rq   r  r   rf   rf   rg   r   &      
c                 s  r  rq   r  r   rf   rf   rg   r   (  r  c                 s  r  rq   r  r   rf   rf   rg   r   +  r  c                 s  r  rq   r  r   rf   rf   rg   r   -  r  )	r   r   r   rq  r   $can_buffer_be_removed_through_fusionr   r  r   )re   rx   ry   r  fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionrI  all_read_namesall_write_namesunique_readsunique_writesunique_io_buffersrf   rf   rg   (fusion_prevent_too_many_reads_and_writes
  s:   z2Scheduler.fusion_prevent_too_many_reads_and_writesc                 C  s*   t t|j|j t|j|j }|dkS )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heuristic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )rn  r  rC  rD  )re   rx   ry   proximity_scorerf   rf   rg   are_long_distant_nodes:  s
   z Scheduler.are_long_distant_nodescommon_buf_names'Union[tuple[str, ...], OrderedSet[str]]c                 C  sb  i }dd |j  D }dd |j  D }|D ]}tj|}|| }	|| }
t|	tr2t|
tsAdt|	 dt|
 ||< q|	 |
 krXd|	  d|
  ||< qt	|	j
t	|
j
krgd||< q|	 }|
 }||kr~d| d| ||< q|	 |
 krd	|	 d|
 ||< qd
}t|tjsd|j }d|	 d|
 d| ||< qt|S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                 S     i | ]}|j |qS rf   r  r   rf   rf   rg   r\  `  rx  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>c                 S  r  rf   r  r   rf   rf   rg   r\  a  rx  znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: rx  zLayout: zUnknown reason: z. )r   r  rV   r   r?  r   r1   r   r   rU   r   
get_offsetnormalize_with_stride_orderr(   rB  r  r   )re   rx   ry   r  reasonsnode1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strrf   rf   rg   decide_fusion_fail_reasonU  sD   
z#Scheduler.decide_fusion_fail_reasonc                 C  s  t jsdS tdd ||fD rdS |j }|j }||@ }|s$dS tdd |jD }|| r4dS t|dkr<dS t|jjdksLt|jj	dkrNdS t
t|jj}t
t|jj	}t|trht|tsjdS dd |jj	D }	|j|	vrzdS |	|j }
t|
tsdS |
 }
|
j|jkr|
j|jkrdS |j|jkst|jdkrdS t|jjdkrdS |jjrdS d	|jjv rd
|jjv sJ tdd |j D }t|dkrdS t
t|}||jjd	 krd	}d
}n||jjd
 ksJ d
}d	}ddlm} |jjd }t|dkrdS g }tj|D ]}|tjj !| qt"|}|||d }|du r7dS |jj| |jj|< ||jj|< |#dd | $||}t|t%s[J t&'d| |S )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c                 s  r  rq   r  r  rf   rf   rg   r     r  zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>c                 s  r  rq   r  r   rf   rf   rg   r     r  r   c                 S  r  rf   r  r   rf   rf   rg   r\    rx  zBScheduler.shared_data_after_inverting_indexing.<locals>.<dictcomp>r   index0index1c                 s  s    | ]}|V  qd S rq   rf   )r   rY  rf   rf   rg   r     r=  r   )generate_inverse_formulaNTFz!Shared memory after inversion: %d)(r%   r%  r   r   buffer_namesr   rP  r   r   rq  r  rH  r   r1   r   r  r   r   rg  r   r   	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr.  varsr   Add	make_argsr   rV   r   r   combine_modular_indexing_pairsr   r
  r  r   r  r  )re   rx   ry   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writenode1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr.  r/  simplified_termstermsimplified_read_exprinverse_formulascorerf   rf   rg   $shared_data_after_inverting_indexing  s   

 



z.Scheduler.shared_data_after_inverting_indexingc                 C  s  t jrtdd ||fD rdS | s| rdS |j }|j }||@ }|s,dS dd |j D }dd |j D }g }|D ]#}	||	 }
||	 }|
 | krg|t	j
jj|
 dd|
|f qDt|dkrpdS t|tdd	\}}
}t|
trt|tsdS |
j|jkr|
 | kr| |
S dS d
}| s||
|}n| s|||
}ntd| |  |rtt| ||S dS )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c                 s  r  rq   r+  r  rf   rf   rg   r   2  s    
z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>r   c                 S  r  rf   r  r   rf   rf   rg   r\  D  rx  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>c                 S  r  rf   r  r   rf   rf   rg   r\  E  rx  r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s) r%   r  r   r  r   r/  r  r!  r   rV   r   r   	size_hintr   r   rn  r  r  r   r1   r   r  dep_size_hintr   r  r#  r  r  r  r  r   r  )re   rx   ry   r7  r8  r9  r#  r$  
candidatesr   r%  r&  _numel	reorderedrf   rf   rg   !shared_data_after_reordering_loop"  sf   


z+Scheduler.shared_data_after_reordering_loopc                 C  s$   t |ttfo|  ot|j S )z>
        Is this node unfusable under any conditions.
        )r   rI  r  r  rS   r   r_  rf   rf   rg   r  }  s
   
zScheduler.unfusable_nodeprologue_noder  r  r  c           	      C  s   |  tjjkr
dS | }| }d}||| kr |d dS tdd | D }|tj	j
jjfkr:|d dS ddd}|| jrP| sP|d dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc                 s  s:    | ]}|j d ur|j  D ]}|jdkr|jV  qqd S )Ncall_function)r   r  r  r  r   r  r   rf   rf   rg   r     s    

zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsrt  torch.dtyperz   ri   c                 S  s   | j dko| jS )Nr   )itemsizeis_floating_point)rt  rf   rf   rg   low_prec_fp  r  zGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fpzVprologue fusion that must be upcast to fp32 not profitable for low precision templatesN)rt  rS  rz   ri   )r   rV   r   invoke_quant_opsr.  r/  r   r   r~  r  r  constant_pad_ndr  r  rt  r  )	re   rO  r  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr!  rV  rf   rf   rg   (check_prologue_fusion_heuristics_fusable  s4   

z2Scheduler.check_prologue_fusion_heuristics_fusable/Optional[tuple[int, SchedulerNode, sympy.Expr]]c                   s  t |tr
t |tsdS t |jtjrt |jtjsdS | s$| r&dS tjdkr-dS |j|j}}|\}}|\}}|	 sP|	 sP||ksPt
|t
|krRdS t
|jjdksbt
|jjdkrddS  tt|jj}	 tt|jj}
t|	|
tjkrdS d fdd	}||s||rdS g }tt||D ]\}\}}||kr|| qt
|dkrdS |d
 }|| || }}tjj||r|||fS tjj||r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr!  r   r   r[   rz   ri   c                   s`   | j jD ])}|j jv r j|j }n j|j}|r-tjj	|| r-t
|jts- dS qdS r  )r   r   r   r&  r'  r  rV   r   r!  r  r   r   r  )r   r  r  rd   rf   rg   has_reusable_buffer  s   
zIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_bufferr   r   )r   r   r   r(   r   r  r%   r"  r  r   r   r   rq  rJ  r  rH  rn  small_memory_access_thresholdr  r5  r   rV   r   r   statically_known_lt)re   rx   ry   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr^  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2rf   rd   rg   "get_expand_dim_for_pointwise_nodes  s`   
 


z,Scheduler.get_expand_dim_for_pointwise_nodesFTcan_reorderr  c                   s  ||u rdS t |tr||S t |trdS t||}| r-| | ||r-dS t |ts7t |tr=|d dS t |t	t
frN| sN|d dS t |t	t
fr_| s_|d dS | |j@ rl|d dS | r>tjsz|d dS | s| r|d dS | }t |tjs|d	 dS | }td
d |jD | }| |@ r|d dS | s| r|d dS |   dd D ]}	|	 }
|
D ]}t fdd|jD s|d   dS qqt |ts|gndd |jD }t|dksJ |d }t d jdkr.t d jd jdkr. d jd jd j |u s4|d dS | !|||s>dS | rW| sQ| sQtj"sW|d dS | t#j$j%@ si| t#j$j%@ ro|d dS | }| }||kr|d|| dS ~| j&|||d}t |t'sJ |r|tj(k rtj)r| *||}|dkr|}tj+r| ,|| }r|\}}}|-|| | &||}t |t'sJ tj.r|tj(k r| /||}|dkr|}t01t2j3rt04d|5 |5 | t#j67| |||sdS | |j@ r-| 8||o,t#j68| |||o,| |8||S t#j69| |||o?| |9||S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc                 s  r  rq   rZ  )r   r:  rf   rf   rg   r   g  r  z%Scheduler.can_fuse.<locals>.<genexpr>z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr   c                 3  s    | ]}|j  v V  qd S rq   r   r8  prologue_nodesrf   rg   r   w  r  z7template prologue can only fuse nodes with a single usec                 S  re  rf   r  r  rf   rf   rg   r   ~  rf  z&Scheduler.can_fuse.<locals>.<listcomp>r   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data):r   r  r  r  r  r  r   can_fuse_multi_outputs_templaterc  rI  r  r   r   r%   r8  r   r  r(   r  get_allowed_prologue_inpsr   r  r  r  r   r^  r   r  r   r   r   rI  r   r\  r6  rV   r   no_fuse_buffer_namesr  r   score_fusion_memory_thresholdr  rN  $expand_dimension_for_pointwise_nodesrp  r  r%  rH  r#  r  r  rs  r  r  r  r   can_fuse_verticalcan_fuse_horizontal)re   rx   ry   rq  r  r  r  ry  unsupported_prologue_argsr   	node_outsr   template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerf   rr  rg   r   $  s$  









zScheduler.can_fusec                 C  s*  |  }t||}tt}|jD ]}| j|j|j}t|t	r(| 
|||r(q|| | q|jjD ]&}t|ts<q4|| j|j|j}	|	rZ|	D ]}
| |
|rY|	|
 qLq4tdd tj| D }||@ rt|d dS | }|D ]}| j|  }|| j| j@ r|d  dS qzdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c                 s  r  rq   r  r   rf   rf   rg   r     r  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   rP  rG  r  r   r   r3   r  r   r   rq  r1   fusable_read_and_writer  r   r  r  r6  r   r   r'  r  r  r   )re   rx   ry   node1_buf_namesr  remaining_deps_by_namer   r   cd	remainingr  remaining_depsnode1_op_namesr  rf   rf   rg   ry    sB   




zScheduler.can_fuse_verticalweak_depr3   c           	        s   j | vr	dS fdd|jjD }t|dkrdS |d ttr'dS tts.J tj	t
jr7dS | jj  |g}t|trH|j}d}|D ]"} fdd|jjD }|s\qL|d7 }tfdd|D sn dS qL|dkS )	NFc                   s   g | ]
}|j  jkr|qS rf   )r   r  )r   ri  )r  rf   rg   r     
    z.Scheduler.fusable_weak_dep.<locals>.<listcomp>r   r   c                   s   g | ]	}|j  kr|qS rf   r  r   r  )rZ  rf   rg   r   -  s
    
c                 3  sB    | ]}t |tot|jtj o|j jko|j jkV  qd S rq   )r   r1   r    r   r"   TMPr   r  )ri  rf   rg   r   5  s    



z-Scheduler.fusable_weak_dep.<locals>.<genexpr>)r   r  r   rq  r   r   r2   r1   r    r   r"   r  r  r  r  r   r   r   )	re   r  rx   ry   mutating_writesrelevant_reading_nodesnum_concurrent_readsreading_noderelevant_readsrf   )rZ  r  ri  rg   r    s>   



zScheduler.fusable_weak_depr  r0   ri  r1   c                 C  s  t |trY| j|j|j}||jks!t|jtjs!t|jtjr#dS t	j
r4|j|jkr4| }| }| |jr<dS |j|jkoXt|jt|jkoX|jd t|j |jkS t |tr| j|j|j}| j|j|j}|j|jkr|jd ur||krdS dS r,  )r   r1   rG  r  r   r    r   r"   r  r%   r  r   r  r  rL  r   r   r2   )re   r  ri  	read_name
write_namerf   rf   rg   r  C  s4   



z Scheduler.fusable_read_and_writer   r  c                 C  s   t j||S rq   )rV   r   get_dep_size_hint)re   r   r  rf   rf   rg   rJ  j  r7  zScheduler.dep_size_hintreturn_is_mix_order_reductionint | tuple[int, bool]c                   s  fdd}|rt |rt |}||dS t|jjt|jj }tjjtjj }	t||	d t||	k rd||	krF|}fdd|jj|jjB D }
|t	 fdd|
D d	S |jj|jjB jjjjB @ }|t	fd
d|D d	S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        c                   s    r| |fS | S rq   rf   )rG  is_mix_order_reduction)r  rf   rg   _construct_return_valuez  s
   z>Scheduler.score_fusion_memory.<locals>._construct_return_valueTr  c                   s(   g | ]}| j jv s| j jv r|qS rf   )r   r   rq  r   )ry   rf   rg   r     s
    z1Scheduler.score_fusion_memory.<locals>.<listcomp>c                 3  s    | ]	} | V  qd S rq   rJ  r   )r  re   rf   rg   r     r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>Fc                 3  s    | ]}  |V  qd S rq   r  r   rd   rf   rg   r     r  )
r   r   r   r   r   r   rq  rD  rn  r   )re   rx   ry   r  r  r  r  rG  node1_dep_lennode2_dep_lenr  common_memory_depsrf   )r  ry   r  re   rg   r  m  s*   


zScheduler.score_fusion_memoryc                 C  s   t |dkr|S i }|D ]2\}}| | ksJ | }t| |||}||vr5||fg||< q|| ||f qt| t	ddd }t |dksTJ |S )Nr   r  r   )
r   r   r   r  get_fusion_pair_priorityr   rD  r  r  r  )re   r  "possible_fusions_group_by_priorityrx   ry   r  fusion_pair_priority&possible_fusions_with_highest_priorityrf   rf   rg   r    s.   
z4Scheduler.get_possible_fusions_with_highest_priorityr{   r   c                 C  s   t jj| g|R  S )z-
        Shim for list.sort(key=...)
        )rV   r  score_fusionr@  rf   rf   rg   r    s   zScheduler.score_fusion_keyc                 C  s<   t tj }t| jD ]}||| j ||j	 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rV   r   r|  r   r  r  r  r  rB  )re   r  r   rf   rf   rg   r    s
   zScheduler.compute_last_usagec                 C  s   t | jtjj tjjj D ]Q}|| jv r'| j| }| r&tjj	|j
 q|tjjv r_tjj| }t|tjrAtjj	| qt|tjrHq|j}t|tjrU| sWJ tjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r  r  rV   r   r  r!  freedr'  r/  codegen_freer   rP  r   r(   rB  r  r   rK  is_input_bufferr  )re   r   r   r:  storagerf   rf   rg   free_buffers  s4   


zScheduler.free_buffersc                 C  s$   | j  D ]}|  q|   d S rq   )rZ  r   flushr  )re   r  rf   rf   rg   r    s   
zScheduler.flushscheduler_noderI  c                 C  s   t |tsJ td d  d7  < ttdd |  |  W d    n1 s,w   Y  |j}t |t	j
sCJ dt||tjj |   d S )NrU  extern_callsr   F)increase_kernel_countztype(node)=)r   rI  r   rV   set_kernel_handlerr-   r  r  r   r(   r  r   r  r   r!  r  )re   r  r   rf   rf   rg   codegen_extern_call  s   
zScheduler.codegen_extern_callBaseSchedulingc                 C  s   t |jr|jd usJ | dtj| t|j}|d u r(td|j t sR|jdkrBt	j
| }jdk rBt|t t |jrR|jdksRtt || S )Nz( should have been normalized in loweringzUnsupported device type: r      r  )rQ   r   r   rV   r   add_device_infor,   r  r#   r~  r   get_device_propertiesmajorr4   inspectcurrentframer5   )re   r  device_schedulingdevice_propsrf   rf   rg   create_backend  s   

zScheduler.create_backendc                 C  s0   |d usJ || j vr| || j |< | j | S rq   )rZ  r  r  rf   rf   rg   r    s   

zScheduler.get_backendc                   s`   dfdd  fdd|  D }t| }|r.t|td	d
\}}tjj	| d S d S )Nr  torch.fx.Noderz   r   c                   s2   |  j vr j dd t| jjD   j |  S )Nc                 S  r  rf   rf   r  rf   rf   rg   r\  !  r]  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  r  r  r   r  r  rd   rf   rg   	get_order  s   

z*Scheduler.enter_context.<locals>.get_orderc                   s4   i | ]}|j d ur|j  D ]	} ||fd qqS rq   r  rR  )r  rf   rg   r\  %  s    
z+Scheduler.enter_context.<locals>.<dictcomp>r   r  )r  r  rz   r   )
r   r   r   rn  r  r  rV   r   r!  enter_context)re   r   r!  rv  lastrf   )r  re   rg   r    s   
zScheduler.enter_contextr   r  rA  c                   sP   z| j | j}W n
 ty   Y dS w t fdd|D o'|| jvo'|| jvS )NFc                 3  s"    | ]}|j p|  v V  qd S rq   )r%  r  r8  r  rf   rg   r   8  s     zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>)r'  r  KeyErrorr   rG  r  )re   r   r  r  rf   r  rg   r  0  s   z.Scheduler.can_buffer_be_removed_through_fusionc                 C  s8  |j }t|tjjjr.|j }r.t|\}}|tj	v s |tj	v r.t|tj
js)J d| S tjjjjs;tjdu r;dS t|trS|jD ]}| |}|rP|  S qCdS |j dusZJ | se|  dS t|j tjrndS t|j tjrwdS t|j ddrdS t|j rd	S | | }r|S tjjrt|rd
S dS )z
        Return the reason why we should partition the inductor graph on this node,
        or None if the node is cudagraphable.
        zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r~  r  r(   r	  rJ  rL   r%   custom_should_partition_ops_ops
OpOverloadr   rZ   rD   wrapperr   r   should_partitionrQ   r   
DeviceCopyConditionalrp  rP   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsr3  )re   r   r4  r  op_overload_packet_nameop_overload_namer  r  rf   rf   rg   r  =  sJ   








zScheduler.should_partitionr*  c                 C  s   t  }tjs|S | jD ]H}|j}|du rqt|tjjj	sq|j
}|du r&qt|\}}|tjvr7|tjvr7q| D ]}tjj|}t|tjtjfrR|| q;q|S )zc
        Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
        N)r   r%   cudagraph_unsafe_unbacked_opsr  r   r   r~  r  r(   r	  rJ  rL   r  rV   r   r   r   r!   r"   UNBACKED_INTUNBACKED_FLOATr  )re   unsafe_symintsr   r4  r  r  r  symrf   rf   rg   &_get_cudagraph_unsafe_unbacked_symints|  s.   



z0Scheduler._get_cudagraph_unsafe_unbacked_symintsc                 C  sZ   |   }|sd S t|}|D ]}tjj|}|jD ]}||v r)d|     S qqd S )Nz'uses cudagraph-unsafe unbacked symint: )r  r3  rV   r   r   r   r   )re   r   r  node_symbolsr  simplified_symfree_symrf   rf   rg   r    s   
z0Scheduler._uses_cudagraph_unsafe_unbacked_symint;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]]c                 C  s@   i }| tjj | jD ]}|j D ]	\}}|j||< qq|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rV   r   rP  r  rK  r  r   )re   r  r   r   scheduler_bufferrf   rf   rg   get_name_to_nodes  s   
zScheduler.get_name_to_nodes
signatureslist[GraphPartitionSignature]c           
      C  s   dd t tjjD }dd t tj D }g tj_t |D ]7\}}|jr'qg }|jD ]
}||	| q,g }|j
D ]}	||	|	  q<tjjt||||j qdS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                 S  r  rf   rf   r   rj  r   rf   rf   rg   r\    rb  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>c                 S  r  rf   rf   r  rf   rf   rg   r\    rb  N)r  rV   r   rP  r|  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rM   constant_names)
re   r  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingr   output_mappingr   rf   rf   rg   compute_graph_partition_maps  s2   


z&Scheduler.compute_graph_partition_maps	partitionr\   r  c                   s   ddd ddd	}t  jd
d |D  }|j fdd| D   ||}t  }|D ]}tjj|}||j q,t t	|t
ddS )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        r   0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]rz   r*  c                 S  s8   t | tjr	t S t | tjrt| S tdt|  )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r(   rB  r   rc  r/  r  r   r   rf   rf   rg   get_input_node_symbols  s
   zKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbolssymbolsc                 S  s   t dd | D S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c                 s  s.    | ]}t |tjtjtjtjfr|V  qd S rq   )r!   r"   SIZEFLOATr  r  r  rf   rf   rg   r     s    
zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>r   )r  rf   rf   rg   filter_symbols  s   zCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbolsc                 s  r1  rq   r2  r  rf   rf   rg   r     r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>c                 3  s    | ]	\}} |V  qd S rq   rf   )r   rv  r   r  rf   rg   r   !  r  r   r  N)r   r  rz   r*  )r  r*  rz   r*  )r   r^  r  r  rV   r   r   r   r   r  r  
attrgetter)re   r  r  r  candidate_symbolsr  r3  symplified_srf   r  rg   !get_graph_partition_symbol_inputs  s   

z+Scheduler.get_graph_partition_symbol_inputs
partitionslist[PartitionType]skip_cudagraphs
list[bool]c                   s  g }t tj } dfddtt|t|D ]\}}t  }|D ]
}||j	  q'|
|}	tjdd |D }
t d	d |
j|
jB D | }t fd
d|D }t   |D ]} |j qafdd | D }|| fdd|D } fdd|D } fdd|D }|	| t fdd|	D }	fdd|	D }dd |D }||}t||||||}|| |||	 }q|ddd S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        r  r   rz   ri   c                   sJ   j | d}|du rdS t|jjtr#j| d }r! |S dS dS )z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)r'  r  r   r   r  r<   r  )r  r   rZ  )is_unallocated_bufferre   rf   rg   r  :  s   zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_bufferc                 S  rV  rf   rW  r  rf   rf   rg   r   ]  rX  z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>c                 S  s   g | ]
}t |ts|jqS rf   )r   r3   r   r  rf   rf   rg   r   d  s    c                 3      | ]
} j ||V  qd S rq   r  r  r  rd   rf   rg   r   m  r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>c                   s   g | ]}| v r|qS rf   rf   r  r  rf   rg   r   y  s
    c                   r  rf   rf   r  r  rf   rg   r\    r  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>c                   s   i | ]}|v r|| v qS rf   rf   r  r  r  rf   rg   r\    r  c                   s    g | ]}|v r| vr|qS rf   rf   r  r  rf   rg   r     
    c                 3  r  rq   r  r  rd   rf   rg   r     r  c                   s   g | ]
} |s| qS rf   rf   r  )r  r  rf   rg   r     s    c                 S  s   g | ]
}|t jjv r|qS rf   )rV   r   r^  r  rf   rf   rg   r     r  Nr   )r  r   rz   ri   )r   rV   r   r|  r  r5  r   r  rK  r   r  r'   r\  r]  r   rq  rB  r  r9   r   r^  )re   r  r  r  unmet_output_namesr  r  output_namesr   returned_output_namesr   partition_input_namesextra_input_namesr  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturerf   )r  r  r  re   rg   get_graph_partition_signature.  s   







	z'Scheduler.get_graph_partition_signaturer  r9   c                 C  s^   dd |j  D }dd |j D }dd |jD }dd |jD }t|j||||j|S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                 S  "   i | ]\}}|t jjvr||qS rf   rV   r   r  )r   r   r  rf   rf   rg   r\    
    zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>c                 S  r  rf   r  )r   r   r  rf   rf   rg   r\    r	  c                 S  s    g | ]}|  tjjvr|qS rf   )maybe_get_namerV   r   r  r  rf   rf   rg   r     r  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>c                 S  s   g | ]
}|t jjvr|qS rf   r  r  rf   rf   rg   r     r  )r  r  r  r  r  r9   r  r  )re   r  r  r  r  r  rf   rf   rg   .clean_removed_buffer_from_partition_signatures  s(   z8Scheduler.clean_removed_buffer_from_partition_signaturesc                   s  ddl t g  g dd t|D d fd	d
dfdd}|D ]}t|jj|< | dkr=| q)g }d}|t|k rsL rr`\}}|| || sN rt \}}|| ||  sb|d7 }|t|k rsL sL|t|krtd|S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                 S  r  rf   rf   )r   rj  r   rf   rf   rg   r\    r]  z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>r   r[   rz   r  c                   s6   |  | f} | r| d S  | d S rq   )r  heappush)r   node_with_index)cudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesre   rf   rg   insert_pending_nodes  s   
zHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodesc                   sF   | j jD ]}| dksJ |  d8  < | dkr  | qd S )Nr   r   )rE  
succ_nodes)r   	succ_node)r  node_to_indegreerf   rg   update_indegree  s   zCScheduler.reorder_for_minimizing_partition.<locals>.update_indegreer   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r[   rz   r  )	r  r4  r  r   rE  
pred_nodesheappopr   r  )re   r  r  r   rq  	num_itersrv  rf   )r  r  r  r  r  r  re   rg    reorder_for_minimizing_partition  sP   

z*Scheduler.reorder_for_minimizing_partitionc           
      C  sp   ddl m}m} ttj }||| j| jttjj	
 |\}}| |}||||\}}	||d k r6|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_inforP  )r{  r  r  r   rV   r   r|  r'  r  rP  r   r  )
re   r  r  r  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryrv  rf   rf   rg   r     s    
z0Scheduler.maybe_reorder_for_minimizing_partitionc                 C  sz   g }g }g }d	dd}|D ])}|  |du}|r%t|jdkr%|| q|r1||r1|| q|| q|| | S )
a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        r   r[   rz   ri   c                 S  s2   |   D ]}|jD ]}t|jts  dS q	qdS r,  )r^  r  r   r   r-  )r   r   r.  rf   rf   rg   only_output_userM  s   
zPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userNr   r   )r  r   rP  r   )re   r  frontmiddlebackr!  r   r  rf   rf   rg   r  ?  s   

z6Scheduler.reorder_for_partition_with_simple_dependency9tuple[list[PartitionType], list[GraphPartitionSignature]]c                 C  s   g }d}g }g }| j D ]"}| |du}|r&||kr&|| || g }|}|| q|r:|| || | j||d}| | | || ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        TN)r  r  )r  r  r   r  r  _log_graph_partitions)re   r  r  cur_partitionr  r   node_should_partitionr  rf   rf   rg   r  _  s*   





zScheduler.graph_partitionc           
   
   C  s   t tjsd S tdd tjjD }|sd S tdd |D }t	|| }t 
dt	||| tt||D ]*\}\}}t 
d|t	||jrIdndt	|jt	|j |jra|D ]}	| |	 qYq7d S )Nc                 s  r1  rq   )rQ   )r   r  rf   rf   rg   r     r  z2Scheduler._log_graph_partitions.<locals>.<genexpr>c                 s  s    | ]}|j sd V  qdS )r   N)r  r  rf   rf   rg   r     r  zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)cudagraphs_logr  r  rs  r   rV   r   device_typesr   r   r  r  r5  r  r  r  _log_non_cudagraphable_node)
re   r  r  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  r  r  r   rf   rf   rg   r&    s6   zScheduler._log_graph_partitionsc           
      C  s   |  |}|s	dS | }|jdur|j nd}d| g}t|jj}|d|  |durK|j dddd |j	D  d}|d	|  t
d
|d| |durr|jdd}|rt| dD ]}	t
d|	 qidS dS dS )z)Log details for a non-cudagraphable node.Nzreason=zir=rf  r{  c                 s  r1  rq   )r   r  rf   rf   rg   r     r  z8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>ra  zfx=z
    %s: %sr  ri  z         %s)r  r  r   rW  r   rr   r   r  r%  r  r*  r  r  r  stripsplit)
re   r   r  r  rZ  partsir_typefx_strr  linerf   rf   rg   r,    s(   
$z%Scheduler._log_non_cudagraphable_nodec                 C  sL   t d tjjjr|  n| | j	 W  d    S 1 sw   Y  d S )NScheduler.codegen)r   r~  r  r%   r  _codegen_partitions_codegenr  rd   rf   rf   rg   r    s   


$r6  c           	      C  s   ddl m} tjj}t| j}tj B tjjdd| ||d | 	| t
tjj|s0J | |}|tjj_tjj  tjj}tjjtjj\}}W d   n1 sYw   Y  tjj|| tjj|| tjjjdd |jD  dS )	z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                 S  rw  rf   rZ  r  rf   rf   rg   r     rx  z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>)r  r9  rV   r   r!  r  r]  set_current_wrapper_codeinit_wrapper_coder8  r   r  r>  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  r  )	re   r  r  r9  r=  graph_partition_id
graph_namepartition_coderv  rf   rf   rg   _codegen_partition_wrapper  s.   





z$Scheduler._codegen_partition_wrapper'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )Nrz   Iterator[None]c                   3  s       jr#tjjr#jjd usJ dtjjjj zd V  W jr7tjjr7tjj	  d _d S jrKtjjrKtjj	  d _w )Ndevice should have an index)
%update_graph_partition_default_devicerc  rG   r   r   rV   r   r!  codegen_device_guard_entercodegen_device_guard_exitrf   r  re   r  rf   rg   ctx  s.   
z1Scheduler.use_default_device_context.<locals>.ctx)rz   rL  )
contextlibcontextmanager)re   r  r  rR  rf   rQ  rg   use_default_device_context  s   z$Scheduler.use_default_device_contextc                 C  s   t |dkr|d jsd S ddd}ddd}d }t||D ]\}}|js+||} nq|d u r2d S t||D ]\}}|jrF|||sF d S q7|| _d S )Nr   r   r  r\   rz   r  c                 S  s   | d   }|d usJ |S r   r   )r  partition_devicerf   rf   rg   get_cudagraph_partition_device  s   zWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_devicetarget_deviceri   c                 S  s$   | D ]}|  }||kr dS qdS r,  rV  )r  rY  r   r  rf   rf   rg   all_on_target_device  s   zMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device)r  r\   rz   r  )r  r\   rY  r  rz   ri   )r   r  r5  rc  )re   r  r  rX  rZ  cudagraph_partition_devicer  r  rf   rf   rg   rN    s&   	

	
z/Scheduler.update_graph_partition_default_devicec                 C  s  |   \}}t|dkrtd d  t|7  < | ||0 t||D ]"\}}t|dks7J dt| |jr@| | q$| || q$W d   n1 sQw   Y  t| j	}t
jj| |dkrt
jjdusnJ |tt
jjksJ d| dtt
jj dS dS )	z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   rU  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   r   rU  r5  r  r8  rJ  r  r]  rV   r   r!  set_all_partition_namesr  )re   r  r  r  r  num_partitionsrf   rf   rg   r7  :  s,   
zScheduler._codegen_partitionsc              	   C  s  t jr@dd l}t }t }t|D ]-}|jdkr#|j|j	j
jkr# n|j|jf}||vs:J d|j d|j d|| q| j| _| jd u sKJ | jrXt jjrXtjj  |D ]X}ttjrztd| |  W n ty   td|  Y nw | | |   }r|| jks|! s|" r| #  || jkr| jrt$| jj%rtjj&  || _t$|j%r|j'd usJ dtjj(|j' || _)| j*+|j, |" r|-t.|/ \}	}
}| 0|1|
||	 ns|! rt23t4|}| 5| nb|6 r7t23t7|}| 0|}d	d
l8m9} d	dl:m;} t<|||fr(|}n	t=dt%| |>| n,t<|t?rF| 0|@| nt<|tAtBfrW| 0|C| nt<|tDs_J |E  t jjFro| 0|G  | jH+|I  | jJ+|K  t<|tDs|  }|d ur|j%dkr| 0|L r| #  tMdd |/ D r|| _qZd | _qZ| j| jkr| jd usJ t$| jj%rtjj&  d | _| #  d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0rM  r   )CUDACombinedSchedulingr  ztype(self)=r  c                 s  rw  rq   )r   r   r  rf   rf   rg   r     r  z%Scheduler._codegen.<locals>.<genexpr>)Nr%   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  rc  r  r`  r   autotune_at_compile_timerV   r   r!  write_get_raw_stream_headerrt  r  r  rs  r  r  r`  rs  r  r   r  r  r  rG   r   rP  r   rO  ra  r  r  rB  r  r   r   r  codegen_templater  r  rI  r  r  r   codegen.cuda_combined_schedulingra  r  r  r   rm  codegen_combo_kernelr  codegen_mix_order_reductionr   r   codegen_noder  r  debug_sync_kernelcodegen_syncr  r  r  r   ready_to_flushr   )re   r  r~  stackr  framer  r   r  r  r  r  backend_ra  r  r  rf   rf   rg   r8  Z  s   














zScheduler._codegen(tuple[float, float, list[Optional[str]]]c                 C  s<   |d   }| tj_|| _|dusJ | |}|||S )r)  r   N)r   rV   r   r   r  r  benchmark_combo_kernel)re   r]  node_benchmark_resultsr  r  rf   rf   rg   ry    s   
z Scheduler.benchmark_combo_kernelc                   s  |}|d    t fdd|D sJ dtjsdS ddlm} dg }}i }t|D ]Z\}}| }	| |	r?t	
d z| |	\}
}|
|f||< t|
r\t	
d	| W  d
S W n  |y} } zdt|v rxt	
d W Y d}~ dS  d}~ww ||
7 }|| q-z| ||\}}}W n |y } zdt|v rt	
d W Y d}~dS  d}~ww || dk p|dk }t	tjr||ks|rt	
dt|| d nt	
dt|| d || |k p|S )rl  r   c                 3  s    | ]	}|   kV  qd S rq   rV  r  r  rf   rg   r     r  z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>z<All nodes in a combo kernel group must be on the same deviceTrm  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speeduprr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r%   ry  r  rn  r  r   r_  r  r  r*  r  r  r   r   r  r  rs  rA   rB   )re   r  subkernel_nodesrn  rp  
path1_listrz  r  r  r]  rr  r  r   rq  	ms2_clone_path2_listsmall_kernelrf   r  rg   r    s|   


	z!Scheduler.speedup_by_combo_kernelr  	ir.Layoutc                 C  s"   | j | }|jd usJ |j S rq   )r'  r   
get_layout)re   r  r   rf   rf   rg   get_buffer_layout<  s   

zScheduler.get_buffer_layoutc                 C  sr   | j D ]3}| r6|jjD ](}tjj|j}|r5t	|dkr5t
|jttfs5| g kr5tjj|j qqd S r  )r  rQ   r   r   rV   r   r  r  r   r8   r   r  r<   r;   r  zero_dim_cpu_tensor_listr  )re   r   r  r  rf   rf   rg   rb  A  s   

z$Scheduler.update_zero_dim_cpu_tensor)r  r=  rz   r  )rz   r  r=  )r  r6  rz   r  r:  )r  r   rz   r  )r   rW  rz   r[   )rL  r  rz   ri   r  )r  r[   rz   r  )rz   r  r  r  rz   r(  rq   r  r  r-  ri   r.  r/  rz   r   )r2  r   r  r  rz   r(  )r4  r  rz   ri   )
rW  r  r4  r  r  r   r   r   rz   r  )r]  r  rz   ri   )r  r  r.  r/  rz   r`  )rx   r[   ry   r[   rz   r_   )r   r[   rz   r[   )rx   r[   ry   r[   r  r  rz   r[   )rx   r[   ry   r[   r  rm   r  r  )r  r  r  r  rz   r  )
r  r  r  r  r  r  r  r  r#  ri   )r  r  r  r  )r  r  r  r  )r  r  r#  ri   rz   r  )rI  r/  rz   r  r  )r  r  r#  ri   rz   r  r   )rx   r[   ry   r[   r  r   rz   ri   )rx   r[   ry   r[   r  r  rz   r   r   r   )rO  r[   r  r[   r  r  rz   ri   )rx   r[   ry   r[   rz   r]  )FT)
rx   r[   ry   r[   rq  ri   r  ri   rz   ri   )r  r3   rx   r[   ry   r[   rz   ri   )r  r0   ri  r1   rz   ri   r  )r   r0   r  ri   rz   r   )TFT)rx   r[   ry   r[   r  ri   r  ri   r  ri   rz   r  )r  r  rz   r  )r  r{   rz   r   )r  rI  rz   r  )r  r  rz   r  )r  r6  rz   r  r  )r   r   r  rA  rz   ri   )r   r[   rz   r  )rz   r*  )rz   r  )r  r  rz   r  )r  r\   r  r  rz   r*  )r  r  r  r  rz   r  )r  r9   rz   r9   )rz   r%  )r  r  r  r  rz   r  )r  r\   r  r9   rz   r  )r  r  r  r  rz   rK  r]  r  rz   rx  )r  r  rz   ri   )r  r   rz   r  )arr   rs   rt   r   rV  r?  rd  propertyr  setterr  r  rA  rm  rY   r  ri  r  rh  r  r  rj  r  r  r  r*  r0  r3  r=  rs  rN  r_  rk  r  r  r  r  r  r  r  r  r$  rz  r  r  r  r
  r  r  r*  rH  rN  r  r\  rp  r   ry  r  r  rJ  r  r  r  r  r  r  r  r  r  r  r  r  rE   r  r  r  r  r  r  r  r  r  r  r  r&  r,  r  rJ  rU  rN  r7  r8  ry  r  r  rb  r  rf   rf   r  rg   r   :  s     X$  M-#&X)
  yFQG08.90> [
;f 853'5 ?#)D $A "$+/  Pc                      s   e Zd ZdN fddZdOddZdPddZdQddZdQddZdQddZdRddZ	dSddZ
dTd%d&Z	'dUdVd-d.ZdWd1d2ZdXd4d5ZdOd6d7ZdYd8d9ZdOd:d;ZdZd=d>Zd[dAdBZd\dDdEZd]dHdIZ	'dUd^dLdMZ  ZS )_r  r   Optional[Scheduler]c                   s   t    || _d S rq   )r  rV  r   rU  r  rf   rg   rV  R  s   

zBaseScheduling.__init__rz   r  c                 C  s   | j r
| j   d S d S rq   )r   r  rd   rf   rf   rg   free_buffers_in_schedulerV  s   z(BaseScheduling.free_buffers_in_schedulerr  r  OrderedSet[BackendFeature]c                 C  s   t  S )z0Return a set of .codegen.common.BackendFeature()r   r  rf   rf   rg   get_backend_featuresZ  s   z#BaseScheduling.get_backend_featuresrx   r[   ry   ri   c                 C  r  )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  rf   rf   rg   ry  ^     z BaseScheduling.can_fuse_verticalc                 C  r  )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  rf   rf   rg   rz  f  r  z"BaseScheduling.can_fuse_horizontalc                 C  rw  )au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Frf   r  rf   rf   rg   rt  n  s   
z.BaseScheduling.can_fuse_multi_outputs_templater   c                 C  sR   |  s|  rt||S t||rt||S t|tr#||S t||S )z 
        Fuse two nodes
        )	r  r  rl   r   r   r  r   r  r   r  rf   rf   rg   rl   z  s   


zBaseScheduling.fuser  r'  "tuple[tuple[sympy.Expr, ...], ...]c                 C  r  )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )re   r  rf   rf   rg   r    r  zBaseScheduling.group_fnr  epilogue_nodesr  rs  r  c                 C  r  )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )re   r  r  rs  rf   rf   rg   rm    s   zBaseScheduling.codegen_templateNr  r-  r.  r/  r   c                 C  r  zD
        Generate a kernel given a list of pre-fused nodes.
        r  )re   r  r-  r.  rf   rf   rg   r0    s   	z.BaseScheduling.generate_kernel_code_from_nodesr   (Union[FusedSchedulerNode, SchedulerNode]c                 C  r  r  r  r_  rf   rf   rg   rq       zBaseScheduling.codegen_noder  c                 C  r  rq   r  r_  rf   rf   rg   rp    ry  z*BaseScheduling.codegen_mix_order_reductionc                 C  r  )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rd   rf   rf   rg   rs    r  zBaseScheduling.codegen_syncc                 C  rw  )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Frf   rd   rf   rf   rg   rt       zBaseScheduling.ready_to_flushc                 C  r  )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rd   rf   rf   rg   r    r  zBaseScheduling.flushr(  c                 C  r  )r)  r  r@  rf   rf   rg   r*       z$BaseScheduling.benchmark_fused_nodesr2  r   c                 C  r  )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )re   r2  rf   rf   rg   r3    r  z)BaseScheduling.benchmark_codegened_moduler   c                 C  rw  )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   rf   r  rf   rf   rg   r    r  z'BaseScheduling.get_fusion_pair_priorityr]  rx  c                 C  r  )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  )re   r]  rz  rf   rf   rg   ry    r  z%BaseScheduling.benchmark_combo_kernelnode_schedulerb  c                 C  s2   |rddl m} |||}tjj|| d S d S )Nr   )'set_kernel_post_grad_provenance_tracing)rl  r  rV   r   r!  write_provenance_debug_handle)re   r  rb  r  debug_handlerf   rf   rg   codegen_comment  s   zBaseScheduling.codegen_comment)r   r  r:  )r  r  rz   r  r   r  )r  r'  rz   r  )r  r[   r  r  rs  r  rz   r  rq   r  )r   r  rz   r  )r   r  rz   r  r;  r  )r2  r   rz   r(  r   r  )r  r  rb  r  rz   r  )rr   rs   rt   rV  r  r  ry  rz  rt  rl   r  rm  r0  rq  rp  rs  rt  r  r*  r3  r  ry  r  r  rf   rf   r  rg   r  Q  s.    














	

	r  )rz   r  )r  r[   rz   r   )r  r[   rz   r  )r  r[   rz   rL  )r  r   rz   r   )r   r[   r  r  r'  rJ  rz   r  )rT  rU  rz   r  )rT  rU  r   r   r   r  rz   r  )rf   )r  r  r  r0  r  r  rz   r  )r  r  r  r  rz   r  r  r;  )r   r)  rz   r*  )r   r[   rz   r*  )rx   r[   ry   r[   )
__future__r   rO  rS  r>  r  r  r  r  r  r  r  r  r  rd  r  r   r   concurrent.futuresr   r   r   r   r	   r
   r   r   r   typing_extensionsr   torch.utils._ordered_setr   r(   r   collections.abcr   r   r   typesr   r   r~  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._sympy.symbolr    r!   r"   torch.utils._tritonr#   rx  r$   r%   r&   r'   r)   analyze_preserves_zero_maskr*   codegen.commonr+   r,   r-   comm_analysisr.   r/   r0   r1   r2   r3   excr4   r5   fx_utilsr6   r7   r8   r9   r:   r;   r<   r   r=   r{  r>   r?   runtime.hintsr@   runtime.runtime_utilsrA   rB   r   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   virtualizedrV   	getLoggerrr   rt  _logginggetArtifactLoggerr  r#  r  r*  r   r\   ru   r]   r^   	dataclassr_   rw   r   r   r@  r[   rp  rf  re  r  rk  r  r  r-  r  rI  r  r   r_  rd  r   r  r  rc  r  r  r"  r#  r\  r[  r(  r/  r3  r7  r9  r:  r<  r   r  rf   rf   rf   rg   <module>   s0   $  P

  k      



&


-  
Q
  ^  |h
.







                                7