o
    ei                    @  sH  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dlm$Z$m%Z% d dl&m'Z' d d	lm(Z( d d
l)m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7m8Z8 d dl9Z9d dl:m;  m<Z< d dl=m>Z? d dl9m@Z@mAZA d dlBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJ d dlKmLZLmMZMmNZN d dlOmPZPmQZQmRZR d dlSmTZU d dlVmWZWmXZX d dlYmZZZ d dl[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZg d dlhmiZi d dljmkZkmlZlmmZmmnZnmoZo d dlpmqZqmrZr d dlsmtZt d dlumvZvmwZw d dlxmyZymzZzm{Z{m|Z|m}Z}m~Z~ d dlmZ d dlmZ d d lmZmZmZ d d!lmZ d d"lmLZ d d#lmZmZmZ d d$lmZmZ d d%lmZ d d&lmZmZmZ d d'lmZ d(d)lmZ d(d*lmZ d(d+lmZ d(d,lmZ d(d-lmZ d(d.lmZ e4d/Ze3r9d d0lmZmZmZmZ d d1lmZ d(d2lmZ d(d3lmZ d(d4lmZ d(d5lmZ d(d6lmZmZ d(d7lmZmZ d(d8lmZmZ d(d9lmZ d(d:lmZ ejd;kZeLjZe9jĠed<Ze9jĠed=ZeeơZdd@dAZddCdDZddGdHZG dIdJ dJZG dKdL dLe΃ZG dMdN dNe΃ZddOdPZddSdTZdddYdZZ	Uddd_d`ZdddcddZG dedf dfZ	U	V	U	d ddjdkZאddmdnZ	o	odddudvZejG dwdx dxZېddzd{ZܐdddZG dd dejރZߐdddZdddZed	ddZdddZejG dd dZG dd deZG dd dZd
ddZdddZG dd de0e ZejG dd deZG dd dee ZejdddZezG dd dZG dd dZdaded< dddZejew dZejedZejdddZdddĄZddddǄZezG ddɄ dɃZddd΄ZezG ddЄ deZezG dd҄ deZezG ddԄ deZ dddلZddd܄ZezG ddބ dރZdddZed	ddZG dd dZe$dddZG dd dZezG dd deZ	ezG dd dZ
G dd dZG dd deZG dd deZdS (      )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)	lru_cachepartial)Path)_TemporaryFileWrapper)timetime_ns)
ModuleType)AnycastGenericNoReturnOptionalTYPE_CHECKINGTypeVarUnion)overrideSelf)SymIntTensor)get_interface_for_device)	SkipFrame)CompileEventLoggercountersdynamo_timedget_metrics_context)configexcmetrics)custom_backend_codegen_configscustom_backend_passesinit_backend_registration)compile_utils)rocm_compile_commandrocm_compiler)in_toplevel_process)_LINKER_SCRIPT_set_gpu_runtime_env_TORCH_PATHconvert_cubin_to_obj
CppBuilder
CppOptionsCppTorchDeviceOptionsget_compiler_version_infoget_ld_and_objcopy&get_name_and_dir_from_output_file_pathnormalize_path_separatorrun_asm_build_object)pick_vec_isa)CustomGraphModulePassCustomGraphPassCustomGraphPassTypeCustomPartitionerFnCustomPartitionerFnType)has_frozen_paramsis_frozen_param)_reload_python_module)	cache_dirdefault_cache_dir)ALIGN_BYTESclear_on_fresh_cachedetermine_aoti_mmap_flagsis_linux
is_windowsXPU_KERNEL_FORMAT)FakeScriptObject)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)log_cache_bypass)r"   )CacheArtifactCacheArtifactFactoryCacheArtifactManager)TensorPropertiesWeights)CUSTOM_OBJ_FILENAME_PREFIX)has_hintShapeEnv	size_hint)
OrderedSet   CompiledFxGraph)create_cache)autotune_cache)AutotuneCacheBundler)TritonBundler)VT)Callable	GeneratorKeysViewSequence)Future)_CompileFxKwargs)BuildOptionsBaseGraphLowering)ChoiceCaller)CompiledFxGraphConstants
OutputCode)
JsonDataTyRemoteCache)HalideInputSpec
HalideMeta)CachingAutotuner)	InputTypewin32output_code
autotuningreturnstrc                   C  s   t jjd u rdS dS )N
cubin_path
hsaco_path)torchversionhip r~   r~   c/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/_inductor/codecache.pyget_cpp_wrapper_cubin_path_name   s   r   devicec                 C  s,   | dkrt jjd u rdS dS | dkrtS dS )Ncudacubinhsacoxpu )r{   r|   r}   rH   r   r~   r~   r   get_kernel_bin_format   s
   r   device_typedict[str, str]c                 C  s4   t jt ttjj  tt	| 
 d}|S )zJ
    Gets all the current device information used to compile the .so.
    )AOTI_PLATFORMAOTI_MACHINEAOTI_CPU_ISAAOTI_COMPUTE_CAPABILITY)sysplatformmachinerx   r{   	_inductorcpu_vec_isar8   upperr   get_compute_capability)r   metadatar~   r~   r   get_device_information   s   
r   c                   @  sV   e Zd ZeejdddZeeejdddZdd	d
Z	dddZ
dddZdS )	CacheBaserw   dict[str, Any]c               	   C  s   ddl m} m} | r| }nd }z9dd id|id}tjtj }tjjd ur9|j|d d< tjj|d d< n|j	|d d< tjj
|d d	< W n ttfyV   i }Y nw ttj|d
dd |d< |S )Nr   )
HAS_TRITON
triton_keynametriton)r   r|   r   r|   r   r}   T)	sort_keysutf-8hash)%torch._inductor.runtime.triton_compatr   r   r{   r   get_device_propertiescurrent_devicer|   r   gcnArchNamer}   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   r   triton_versionsystemdevice_propertiesr~   r~   r   
get_system   s6   zCacheBase.get_systemr   c                   C  s   t tjt dt d S )Ncacher   )r   ospathjoinrA   r   r   r~   r~   r~   r   get_local_cache_path   s   zCacheBase.get_local_cache_pathNonec                 C  s   t  | _d S N)r   r   r   selfr~   r~   r   __init__   s   zCacheBase.__init__c                 C  sT   |   }| s
i S t|}t|}W d    |d S 1 s!w   Y  |d S )Nr   )r   is_fileopenr   load)r   local_cache_pathlocal_cache_fplocal_cacher~   r~   r   get_local_cache   s   

zCacheBase.get_local_cacher   c                 C  s0   |   }tt|tj| j|ddddd d S )N)r   r      indentT	make_dirs)r   write_atomicrx   r   r   r   )r   r   r   r~   r~   r   update_local_cache   s   
zCacheBase.update_local_cacheN)rw   r   )rw   r   rw   r   )r   r   rw   r   )__name__
__module____qualname__staticmethod	functoolsr   r   rD   r   r   r   r   r~   r~   r~   r   r      s    $

r   c                   @  s    e Zd ZdddZdd
dZdS )
LocalCachekeysrx   rw   dict[str, Any] | Nonec                 G  s0   |   }|}|D ]}||v r|| }q d S |S r   )r   )r   r   r   	sub_cachekeyr~   r~   r   lookup   s   
zLocalCache.lookupvaluer   r   c                G  sL   |   }|}|dd D ]}||i  || }q|||d < | | d S )Nr   )r   
setdefaultr   )r   r   r   r   r   r   r~   r~   r   	set_value   s   
zLocalCache.set_valueN)r   rx   rw   r   )r   rx   r   r   rw   r   )r   r   r   r   r   r~   r~   r~   r   r      s    
r   c                   @  s   e Zd Z	ddddZdS )PersistentCacheNchoiceslist[ChoiceCaller]oprx   inputs	benchmark1Callable[[Any], dict[ChoiceCaller, float]] | Nonehint_override
int | Nonerw   dict[ChoiceCaller, float]c           
        s   t  |dur| d| n| i d fdd}tjr%|  ni }||sl|durl|tfd	d
D s@J |i  |  i i   D ]\}}	|	|    | < qV| 	| S )a  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            2. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        N_r   r   rw   boolc                   s\   d}D ]'}|  }|| i  i i v r'|     | |< qd} |S |S )z2Check if `cache` contains data for all the choicesTF)hash_keyget)r   hitchoicechoice_hash	cache_keyr   r   	precisiontimingsr~   r   check_cache#  s   z+PersistentCache.lookup.<locals>.check_cachec                 3  s    | ]}| v V  qd S r   r~   ).0r   )r   r~   r   	<genexpr>7      z)PersistentCache.lookup.<locals>.<genexpr>)r   r   rw   r   )
r{   get_float32_matmul_precisionr"   autotune_local_cacher   allr   itemsr   r   )
r   r   r   r   r   r   r   r   r   timingr~   r   r   r     s   
zPersistentCache.lookupr   )r   r   r   rx   r   rx   r   r   r   r   rw   r   )r   r   r   r   r~   r~   r~   r   r     s    r   c                  C  s.   t jt d} t j| st j| dd | S )NlocksTexist_ok)r   r   r   rA   existsmakedirs)lock_dirr~   r~   r   get_lock_dirB  s   r   databytesc                 C  s&   t t|  d d d S )N3   r   )base64	b32encoder   r   digestdecodelower)r   r~   r~   r   sha256_hashI  s   &r  r   codestr | bytesextrac                 C  sL   t | tr| n| d}|r t |tr|n|d}|d | }dt| S )Nr   s   ||c)
isinstancer   r   r  )r  r	  hashing_strextra_br~   r~   r   	code_hashN  s
   r  basename	extensionspecified_dirtuple[str, str, str]c                 C  sb   |rt j|r|}nt jt |}nt jt | dd }t j||  d| }| ||fS )NrY      .)r   r   isabsr   rA   )r  r  r  subdirr   r~   r~   r   get_pathV  s   
r  content	hash_typec                 C  s:   |dv r	t | |S |ddthv rt t| S td| )N>   ptxspvr  amdgcnr   r   zUnknown hash type )r  rH   reprr   )r  r	  r  r~   r~   r   get_hashd  s
   
r  c                   @  s:   e Zd ZdZ	dddddddZdddZdddZdS )WritableTempFileao  
    Avoid "Permission denied error" on Windows:
      with tempfile.NamedTemporaryFile("w", suffix=".gv") as temp_file:
        # Not writable on Windows:
        # https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile

    Example:
        with WritableTempFile("w", suffix=".gv") as temp_file:
            tree.to_dotfile(temp_file.name)
    wN)encodingsuffixmoderx   r!  r   r"  rw   r   c                C  s   || _ || _|| _d S r   )r#  r!  r"  )r   r#  r!  r"  r~   r~   r   r   x  s   
zWritableTempFile.__init___TemporaryFileWrapper[Any]c                 C  s    t j| j| j| jdd| _| jS )NF)r!  r"  delete)tempfileNamedTemporaryFiler#  r!  r"  	temp_filer   r~   r~   r   	__enter__  s   zWritableTempFile.__enter__exc_typeexc_valexc_tbc              
   C  sP   | j   z
t| j j W d S  ty' } ztrn|W Y d }~d S d }~ww r   )r(  closer   unlinkr   OSError_IS_WINDOWS)r   r*  r+  r,  er~   r~   r   __exit__  s   
zWritableTempFile.__exit__)r   )r#  rx   r!  r   r"  r   rw   r   )rw   r$  )r*  r   r+  r   r,  r   rw   r   )r   r   r   __doc__r   r)  r2  r~   r~   r~   r   r  l  s    
r  r   
str | Nonetuple[str, str]c           	      C  sL   |d u rt |  ||}t|||\}}}tj|s"t|| dd ||fS )NTr   )r  stripr  r   r   r   r   )	r  r  r	  r  r  r   r  _subdirr   r~   r~   r   write  s   r8  textc                 C  s   t | dd S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtrY   r8  )r9  r~   r~   r   
write_text  s   r<  Fpath_r   r   encode_utf_8r   c                 C  s   t |ttfsJ dt| }|r|jjddd |jdt  dt	  d }t |tr0dnd}|j
||r9dnd d	}|| W d    n1 sMw   Y  z	|j|d
 W d S  tyt   tse tj||d t| Y d S w )Nz6Only strings and byte arrays can be saved in the cacheT)parentsr   r  z.tmpr   wbr   r!  )target)srcdst)r  rx   r   r   parentmkdirr   getpid	threading	get_identr   r8  renameFileExistsErrorr0  shutilcopy2remove)r=  r  r   r>  r   tmp_path
write_modefr~   r~   r   r     s&    r   c                   @  s"   e Zd ZU dZded< ded< dS )TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    rM   tensor_metadata	list[Any]valuesNr   r   r   r3  __annotations__r~   r~   r~   r   rR    s   
 rR  xc                 C     | S r   r~   rX  r~   r~   r   _ident     r[  tr   rM   c                 C  s&   t | }t| dstj|ddd}|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    _is_inductor_staticr   N)storage_offsetstorage_bytes)rK   hasattrdataclassesreplace)r]  metar~   r~   r   %extract_tensor_metadata_for_cache_key  s   
re  c                      s   e Zd ZdZ	d/d0 fd	d
Zd1ddZd2ddZd3ddZd4ddZd5ddZ	d6d!d"Z
d7d%d&Zd8d(d)Zd9d-d.Z  ZS ):FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    Fgmtorch.fx.GraphModulehas_user_defined_triton_kernelsr   rw   r   c                   s   t  | _t | j tj | _| jt	t
| jtjt
| jtjjjt
| jtjt
| jtjjjjt
| jtt
| ji |rQt
| j| j|j< d| _dS )a2  
        Create an FX graph pickler. If include_non_inlined=True, then pickling will
        include the _values_ for all Tensors. (Note that any tensors are constants
        attached as attributes to the GraphModule). Otherwise, pickling will include
        only the metadata for these tensors.
        TN)ioBytesIO_streamsuperr   copyregdispatch_tabler   updaterL   r   r
   _reduce_fake_tensorr{   r   _reduce_tensornn	parameter	Parameterr   _reduce_symintfxexperimental_backward_stateBackwardState_reduce_unsupportedrI   _reduce_fake_script_object_reduce_graph_module	__class__fast)r   rg  ri  r~  r~   r   r     s&   

zFxGraphCachePickler.__init__r]  r   .tuple[Callable[[T], T], tuple[TensorMetadata]]c                 C  s   t |}t|ffS )z7
        Custom reducer to pickle FakeTensors.
        )re  r[  )r   r]  r   r~   r~   r   rq    s   
z'FxGraphCachePickler._reduce_fake_tensorHtuple[Callable[[T], T], tuple[TensorMetadata | TensorMetadataAndValues]]c                 C  s   ddl m} |jrtdt|}t|r||st|ffS t }|	 }t | }|dkr9t
d|dd tt||ffS )z
        Custom reducer to pickle Tensors.  If we see tensors, we know they're constants
        stored as attributes on the GraphModule.
        rY   ri   zmkldnn tensors unpickleableg      ?z0FX graph cache copying of a large constant took z.1zs. Please file an issue.)graphrj   	is_mkldnnBypassFxGraphCachere  r?   can_inline_constantr[  r   tolistwarningswarnrR  )r   r]  rj   r   startrU  elapsedr~   r~   r   rr    s   

z"FxGraphCachePickler._reduce_tensorsr   #tuple[Callable[[T], T], tuple[str]]c                 C  s   t t|ffS )z3
        Custom reducer to pickle SymInts.
        )r[  rx   r   r  r~   r~   r   rv  =  s   z"FxGraphCachePickler._reduce_symintr   r   c                 C     t d)z{
        Custom reducer to handle any objects that we don't support and therefore
        raise to bypass caching.
        zReduce unsupported)r  r  r~   r~   r   r{  F     z'FxGraphCachePickler._reduce_unsupported&tuple[Any, tuple[dict[str, Any], str]]c                 C  sH   |  \}\}}|d }tdd|}tdd|}||d< |||ffS )a  
        Custom reducer for graph module to handle irrelevant data for user
        defined triton kernels
        Essentially what we are doing here is a huge hack where user defined
        triton kernel contain a dynamo time side table and the arguments to the
        call_function are indices into this side table. These arguments are not
        for hashing purposes since we included the source code into the cache
        key and the numbers are prone to give false negatives due to ordering.
        _codezkernel_idx = \d+r   zconstant_args_idx = \d+)
__reduce__resub)r   rg  fnr   importsr  r~   r~   r   r}  M  s   z(FxGraphCachePickler._reduce_graph_modulerI   *tuple[Callable[..., Any], tuple[Any, ...]]c                 C  sR   |j d urt|j }t|rt|rt|st|jffS t|j|j|j ffS r   )	real_objtypeopaque_objectis_opaque_typeshould_hoisthas_membersr[  script_class_namewrapped_obj)r   r]  clsr~   r~   r   r|  `  s   

z.FxGraphCachePickler._reduce_fake_script_objectobjr   c                 C  s   zZz|  | | j W W | jd | jd S  tttjt	fy6 } zt
jddd td|d}~w tyZ } zdt|v rUdt|v rUt
jddd td| d}~ww | jd | jd w )z<
        Pickle an object and return a byte string.
        r   zFailed to pickle cache keyTexc_infoNpybind11zis not pickleable)dumprl  getvalueseektruncate	TypeErrorAttributeErrorpicklePicklingError
ValueErrorlogwarningr  r   rx   )r   r  r1  r~   r~   r   r   p  s$   


	zFxGraphCachePickler.dumpsrx   c                 C  s   |  |}t|S )zE
        Serialize an object and return a hash of the bytes.
        )r   r  )r   r  serialized_datar~   r~   r   r    s   
zFxGraphCachePickler.get_hashinpFxGraphHashDetails	list[str]c           
        s   d fdd}g }t | D ]k\}}t|tr>tt|D ]} || }|d| d| d| d	|||   qqt|tre| D ]\}}	 |	}|d| d| d| d	||	  qGq |}|d| d| d
||  q|S )z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        r  r   rw   rx   c                   s   t | tjrtt| S t | tr(| jddd}t|dkr |S |d d d S t|  j	v r<t j	t|  | d S t| S )Nr   rc  )errors   z...rY   )
r  r{   r   rx   re  r   r  lenr  ro  )r  valr   r~   r   get_str  s   
 z0FxGraphCachePickler.debug_lines.<locals>.get_str[z] z]: z: Nr  r   rw   rx   )	varsr   r  listranger  r  appenddict)
r   r  r  linesattrr  iihkvr~   r   r   debug_lines  s    
,

(
"zFxGraphCachePickler.debug_linesF)rg  rh  ri  r   rw   r   )r]  r   rw   r  )r]  r   rw   r  )r  r   rw   r  )r  r   rw   r   )rg  rh  rw   r  )r]  rI   rw   r  )r  r   rw   r   r  )r  r  rw   r  )r   r   r   r3  r   rq  rr  rv  r{  r}  r|  r   r  r  __classcell__r~   r~   r  r   rf    s    

%
	
"
	



rf  rootslist[str] | Noneprefixhasherhashlib._Hashc              	   C  s   t t| |dd dD ]L}|j|jd }|d usJ |j}|d us%J t|d}||j	d ||
  W d    n1 sEw   Y  |jrXt|j|j d| qd S )Nc                 S  s   | j S r   )r   rZ  r~   r~   r   <lambda>      z!build_code_hash.<locals>.<lambda>r   rbr   r  )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   rp  r   readispkgbuild_code_hashsubmodule_search_locations)r  r  r  libspecmodulerQ  r~   r~   r   r    s   r  funcCallable[[], bytes]c                   s@   g  d fdd}d fdd}d fd	d
}||_ ||_|S )z
    This function is a reimplementation of functools.lru_cache with a
    set function that allows prepopulating the cache.
    rw   r   c                     s    t  dkr    d S Nr   r  r  r~   _cacher  r~   r   wrapper  s   z torch_key_cache.<locals>.wrapperr  r   c                   s   t  dksJ  |  d S r  r  )r  r  r~   r   set_val  s   z torch_key_cache.<locals>.set_valc                     s       d S r   )clearr~   r  r~   r   r       ztorch_key_cache.<locals>.clearNrw   r   )r  r   rw   r   r   )setr  )r  r  r  r  r~   r  r   torch_key_cache  s   r  c                  C  sx   t ddd, t sddd	} | tW  d
   S ddlm} |d dW  d
   S 1 s5w   Y  d
S )zS
    Compute a key that contains relevant information about torch source files
    inductor_codecache_torch_keyF)log_pt2_compile_eventrootrx   rw   r   c              	     s   d}t jt  fdd|D }t }|tj	d t
| gd| |D ]$}t j|rKt|d}||  W d    n1 sFw   Y  q'| S )N)z"codegen/aoti_runtime/interface.cppz	script.ldc                   s   g | ]	}t j |qS r~   )r   r   r   r   rX  inductor_rootr~   r   
<listcomp>  s    z4torch_key.<locals>.get_code_hash.<locals>.<listcomp>r   r   r  )r   r   dirname__file__r   r   rp  r{   __version__r   r  r   r   r  r  )r  extra_filesr  r   rQ  r~   r  r   get_code_hash  s   z torch_key.<locals>.get_code_hashNr   )parutilztorch/src_hash.txtascii)r  rx   rw   r   )	r    r"   	is_fbcoder.   libfb.pyr  get_file_contentsrstripr   )r  r  r~   r~   r   	torch_key  s   
$r   c                   C  s   t jtS r   )r   r   r  r  r~   r~   r~   r   get_inductor_root  r  r  c                   @  s   e Zd ZU dZded< dS )OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    rT  r   NrV  r~   r~   r~   r   r     s   
 r  c                   @  s   e Zd ZdZdS )r  zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   r3  r~   r~   r~   r   r  
  s    r  c                   @  s>   e Zd ZdZdgZdddZdddZdddZdddZdS ) r  zz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    graph_idrg  rh  example_inputsSequence[InputType]	fx_kwargsrg   inputs_to_checkSequence[int]rw   r   c                 C  s  || _ || _tj| _i | _t| D ] \}}|| jvr3t|t	t
fv r.tt|| j|< q|| j|< qddlm}m}m}	 ddlm}
 g | _|d ur| D ][}t|tjjsYqOt|jjd|d|jjd|	dD ]>}ddlm} ||jd }d }t||r|j rt!tdd	 |j D }|j"}|
|}|#|jd
 }| j$|||f qkqO|| _%t&dd	 |D  }|rtj'( rtj') | _*t+ t, tj-j.j/f| _0tj1j2j3j4tj1j2j3j5tj1j2j3j6f| _7t8 | _9t:; | _<t=j>dd| _?| @t=jA| _AtjBj=jC| _D| @t=jE| _E| @t=jF| _F| @t=jG| _G| Ht=jI| _I| Ht=jJ| _JtK  tLtM| j@tNO | _Ndd tP D | _P| Qt=jR| _Si | _TtUV }|d urf|jTrhdd t|jT dd dD | _Td S d S d S )Nr   )kernel_side_table triton_kernel_wrapper_functionaltriton_kernel_wrapper_mutation)9user_defined_triton_kernel_transitive_closure_source_codecall_function)r   rB  )	Autotuner
kernel_idxc                 s  s*    | ]}t d d |  D V  qdS )c                 s  s    | ]}t |V  qd S r   rx   )r   kvr~   r~   r   r   Q  r   z8FxGraphHashDetails.__init__.<locals>.<genexpr>.<genexpr>N)r  
all_kwargsr   )r   r
  r~   r~   r   r   P  s
    
z.FxGraphHashDetails.__init__.<locals>.<genexpr>constant_args_idxc                 s  s    | ]	}t |tjV  qd S r   )r  r{   r   r  r~   r~   r   r   f  s    Fignore_private_configsc                 S  s&   i | ]\}}|d ur||j ddqS )NFr  )save_config_portable)r   r   custom_configr~   r~   r   
<dictcomp>  s
    z/FxGraphHashDetails.__init__.<locals>.<dictcomp>c                 S  s   i | ]	\}}t ||qS r~   r  )r   symr  r~   r~   r   r    s    c                 S  s   t | d S r  r  rZ  r~   r~   r   r    s    z-FxGraphHashDetails.__init__.<locals>.<lambda>r  )Wrg  r  cconfigcache_key_tagr  r  r   EXCLUDED_KWARGSr  r  rX   r  *torch._higher_order_ops.triton_kernel_wrapr	  r
  r  torch._inductor.codegen.wrapperr  user_defined_triton_sourcemodulesr  r{   rw  GraphModule	itertoolschainr  
find_nodestriton.runtime.autotunerr  
get_kernelkwargsconfigsrx   r  get_constant_argsr  r  anyacceleratoris_availablecurrent_device_indexdefault_cuda_device_index$are_deterministic_algorithms_enabled-is_deterministic_algorithms_warn_only_enabledutilsdeterministicfill_uninitialized_memory!deterministic_algorithms_settingsbackendsr   matmulfp32_precision&allow_fp16_reduced_precision_reduction&allow_bf16_reduced_precision_reductioncuda_matmul_settingsr   torch_versionr   r   system_infor"   r  inductor_config_get_custom_pass_detailpost_grad_custom_pre_pass
_functorchbundled_autograd_cacheprecompile_enabledpost_grad_custom_post_passjoint_custom_pre_passjoint_custom_post_pass_get_custom_pass_detail_unsafe_pre_fusion_custom_pass_fuse_ddp_communication_passesr'   tuplemapr&   rU  r%   !_get_custom_partitioner_fn_detailcustom_partitioner_fn_custom_partitioner_fnvar_to_hint_overrideFxGraphCache_get_shape_env)r   rg  r  r  r  r  r  r	  r
  r  r  r  noder  kernelr(  kernel_sourceconstant_argsno_tensor_inputs	shape_envr~   r~   r   r     s   


#



zFxGraphHashDetails.__init__custom_passr   
Any | Nonec                   sf   |sd S t |tr fdd|D S t |tr|S t |tr"| S t|r(d S tdtt| )Nc                   s   g | ]}  |qS r~   )rF  r  r   r~   r   r        zEFxGraphHashDetails._get_custom_pass_detail_unsafe.<locals>.<listcomp>zunknown config type: )r  r  rx   r:   uuidcallabler   r  r   rW  r~   r   r   rF    s   


z1FxGraphHashDetails._get_custom_pass_detail_unsafe+CustomGraphPassType | CustomGraphModulePassc                 C  s"   |sd S t |ttfsJ | S r   )r  r:   r9   rZ  r\  r~   r~   r   r>    s   z*FxGraphHashDetails._get_custom_pass_detailrL  r=   c                 C  s   |sd S t |tsJ | S r   )r  r<   rZ  )r   rL  r~   r~   r   rK    s   z4FxGraphHashDetails._get_custom_partitioner_fn_detailN)
rg  rh  r  r  r  rg   r  r  rw   r   )rW  r   rw   rX  )rW  r]  rw   rX  )rL  r=   rw   rX  )	r   r   r   r3  r  r   rF  r>  rK  r~   r~   r~   r   r    s    
 
$
r  rg  rh  r  r  r  rg   r  r  tuple[str, list[str]]c           
      C  sf   t | |||}t|jdk}t| |}d|| }||}d|}	td| d|	  ||fS )z=
    Generate a unique hash of the FX graph for caching.
    r   rQ  
z$FX graph cache hash details for key z:
)	r  r  r  rf  r  r  r   r  debug)
rg  r  r  r  detailsri  picklerr   r  	debug_strr~   r~   r   compiled_fx_graph_hash  s   	


rd  time_saved_nsintc                 C  s|   t j r
t j sdS t| d }t r-t jd}t	
d|| |t|| d 7 }t	
d| tjt|d |S )z}
    Ephemerally increases the NCCL timeout when compiling for a distributed job
    Returns amount of seconds increased
    r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)r{   distributedr,  is_initializedrf  r"   r  _utils_internaljustknobs_getval_intr  infodistdistributed_c10d"_add_ephemeral_timeout_for_all_pgsr   )re  increased_timeout_secfudge_factorr~   r~   r   .add_ephemeral_timeout_increase_for_distributed  s$   rs  c                   @  sd   e Zd ZdZed(ddZed)ddZed*ddZed+ddZed,d"d#Z	ed-d%d&Z
d'S ).GuardedCachezJ
    Mixin for caches that have guards associated with their entries.
    r  type[GuardedCache[T]]_keyrx   rw   c                 C  r  )Nz.Implement _get_tmp_dir_for_key on parent classNotImplementedError)r  rv  r~   r~   r   _get_tmp_dir_for_key     z!GuardedCache._get_tmp_dir_for_keyr   	local_hitr   
local_miss
remote_hitremote_missr   c                 C  r  )Nz(Implement _record_result on parent classrw  r  r   r{  r|  r}  r~  r~   r~   r   _record_result  s   	zGuardedCache._record_resultlocalremote_cacheRemoteCache[JsonDataTy] | None,Generator[tuple[T, bytes, bool], None, None]c           
   	   c  s@   |rW|  |}tj|rWtt|D ]A}|drqz)ttj||d}|	 }t
||dfV  W d    n1 s@w   Y  W q tyV   tjddd Y qw |rz1|| }d urt|tsjJ |d }	t|	ttfswJ t|	}t
||dfV  W d S W d S  ty   tjd| jdd Y d S w d S )	Nr  r  Tz,fx graph cache unable to load compiled graphr  r   Fz %s unable to load compiled graph)ry  r   r   r   r  listdir
startswithr   r   r  r  loads	Exceptionr  r  r   r  r  rx   r   r  	b64decoder   )
r  r  r  r   r  r   rQ  r  
cache_datar   r~   r~   r   iterate_over_candidates"  sD   



z$GuardedCache.iterate_over_candidatesevaluate_guards5Callable[[str, list[int] | list[torch.SymInt]], bool]hints	list[int]-tuple[T | None, bytes | None, dict[str, str]]c                 C  s   d}d}d}d}	d}
|  |||D ]1\}}}
t|dsJ |js(|}|}d} nt||j|}|r=|}|}d}|j}	 nd}|j}	qd|i}|	durO|	|d< |duoT|
}|duo[|
 }|du sb|oc|}|du ok|du}| j|||||d	 |||fS )
aY  
        Find the first cache entry in iterate_over_candidates that passes `evaluate_guards`.

        Args:
            key: The cache key to look up
            local: Whether to check the local cache
            remote_cache: The remote cache to check, if any
            evaluate_guards: Function that evaluates whether a guard passes the check,
                given a list of hint values and the guard expression.
            hints: List of symint hints paired with evaluate_guards

        Returns:
            A tuple of (graph, pickled_content) if found, or (None, None) if not found
        N	full_missFguards_exprr   
guard_misscache_status_detailedcache_status_guard_expr)r{  r|  r}  r~  )r  ra  r  r   r  )r  r   r  r  r  r  r  pickled_contentresult_statussample_guards_exprin_local	candidater  r   rm  r{  r}  r|  r~  r~   r~   r   find_guarded_entryF  sL   
zGuardedCache.find_guarded_entryr   r  list[torch.SymInt]c                 C  s   dd |D S )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        c                 S  s$   g | ]}t |tjrt|r|qS r~   )r  r{   r   rU   r   r  r~   r~   r   r    s   $ z7GuardedCache._filter_backed_symints.<locals>.<listcomp>r~   )r  r   r~   r~   r   _filter_backed_symints  s   z#GuardedCache._filter_backed_symintsShapeEnv | Nonec                 C  s"   t jj }|r|jsdS |jjS )zG
        Helper to get the shape env from the tracing context.
        N)r{   _guardsTracingContexttry_get	fake_moderV  )r  ctxr~   r~   r   rP    s   
zGuardedCache._get_shape_envN)r  ru  rv  rx   rw   rx   )r  ru  r   rx   r{  r   r|  r   r}  r   r~  r   rw   r   )
r  ru  r  r   r  r  r   rx   rw   r  )r  ru  r   rx   r  r   r  r  r  r  r  r  rw   r  )r  ru  r   r  rw   r  )r  ru  rw   r  )r   r   r   r3  classmethodry  r  r  r  r  rP  r~   r~   r~   r   rt    s    
#N	rt  c                   @  s,   e Zd Zed	ddZeed
ddZdS )InductorCacheArtifactrw   r   c                 C  s   t | j| j d S r   )rO  _write_to_local_cacher   r  r   r~   r~   r   populate_cache  s   z$InductorCacheArtifact.populate_cacherx   c                   C     dS )Ninductorr~   r~   r~   r~   r   r       zInductorCacheArtifact.typeNr   rw   rx   )r   r   r   r   r  r   r  r~   r~   r~   r   r    s    r  c                   @  s   e Zd ZdZedEddZedFd	d
ZedGddZedHddZ	e	dIdJd$d%Z
edKd(d)ZedLd,d-ZedMd0d1ZedMd2d3ZedNd:d;ZedOd<d=Ze	dIdPdAdBZedQdCdDZdS )RrO  a6  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metadata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    rw   rx   c                   C  s   t jt dS )zS
        Get the toplevel temporary directory for storing compiled graphs.
        fxgraph)r   r   r   rA   r~   r~   r~   r   _get_tmp_dir  s   zFxGraphCache._get_tmp_dirr  type[FxGraphCache]r   c                 C  s   t jt |dd |S )zA
        Return the disk location for a given cache key.
        rY   r  )r   r   r   rO  r  r  r   r~   r~   r   ry    s   z!FxGraphCache._get_tmp_dir_for_keyr{  r   r|  r}  r~  r   c                 C  sp   |r	t t jd |rt t jd t t jd| |r#t t jd |r6t t jd t t jd| dS dS )zG
        Called by GuardedCache to record hit/miss statistics.
        !inductor_fx_local_cache_hit_count"inductor_fx_remote_cache_hit_count!inductor_fx_remote_cache_hit_keys"inductor_fx_local_cache_miss_count#inductor_fx_remote_cache_miss_count"inductor_fx_remote_cache_miss_keysN)r   try_increment_topleveladd_to_set_toplevelr  r~   r~   r   r    s>   zFxGraphCache._record_resultr  r[   
cache_infor   	constantsrl   -tuple[CompiledFxGraph | None, dict[str, Any]]c                   s  j  }r4t|}| }dur4t||d< tjd|jd tjd|jd t|jdkr4ttj	d z
| dd	lm} |jdurK|j W n tyY   d|f Y S w t }jtj|d
 tjj td  j7  < td td  tddd fddd tdfddd td fddfddd tddd fddd tddd fddd t  rԈjrt  dj |fS )ah  
        Cache specific post compile steps that need to run if we find a graph in the cache
        This includes putting bundled triton artifacts in the right place,
        reloading the PyCodeCache artifact, etc.

        These don't always happen (i.e. on a cache miss, so they are in a separate function from
        CompiledFxGraph.post_compile)
        Ntriton_bundler_metainductor_compile)cached_kernel_nameszAOTAutogradCache.inductor_loadr   num_triton_bundlesrY   ri   r  r  zOutput code: 
%szOutput code written to: %sartifactc                   S  
   dddS )Nfx_graph_runnablestringr   r!  r~   r~   r~   r~   r   r  H     z5FxGraphCache.cache_hit_post_compile.<locals>.<lambda>c                         j S r   )runnable_graph_strr~   r  r~   r   r  L  r  )metadata_fn
payload_fninductor_post_grad_graphc                     r  r   )inductor_post_grad_graph_strr~   r  r~   r   r  P  r  r  inductor_output_codec                     s    t j dS )N)filename	file_path)r   r   abspathr~   )artifact_pathr~   r   r  T  s   
c                         S r   r~   r~   r  r~   r   r  X      c                   S  r  )N*inductor_provenance_tracking_node_mappingsr   r  r~   r~   r~   r~   r   r  \  r  c                     r  r   )inductor_provenance_mapping_strr~   r  r~   r   r  `  r  c                   S  r  )N0inductor_provenance_tracking_kernel_stack_tracesr   r  r~   r~   r~   r~   r   r  d  r  c                     r  r   )$inductor_provenance_stack_traces_strr~   r  r~   r   r  h  r  inductor_provenance)!_triton_bundler_   read_and_emitrx   r   try_add_pt2_compiler  r  r  r  after_deserializationr  rj   save_output_codesource_coder/  r]   inductor_meta_from_configr^   begin_compiler$   CachedMetricsHelperapply_deltasmetrics_deltasr   counter_deltasoutput_code_logr`  rJ   r!   in_progressr  
add_to_set)r  r  r  bundler  rd  rj   inductor_metar~   )r  r  r  r   cache_hit_post_compile  s~   









	z#FxGraphCache.cache_hit_post_compileNr  r  r  r  r  r  <Callable[[str, list[int] | list[torch.SymInt]], bool] | Nonec                 C  s   t  }|dus
J t |}dd |D }tjrdd }|du r$|j}t }	t | ||||\}
}}|	| |
du r@d|	fS |durMt	
t | | |
jrft||
j|}|du s^J td| |j t |
|	|S )a  
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        `constants` tracks a list of constants, or a way to obtain the list of constants
        associated with a given cache entry
        `evaluate_guards` allows AOTAutogradCache and other callers to customize
        what constitutes a guard success. Normally, a guard hit happens if
        `shape_env.evaluate_guards_expression` returns True.
        Nc                 S  s   g | ]}t |qS r~   )rW   r  r~   r~   r   r        z.FxGraphCache._lookup_graph.<locals>.<listcomp>c                 S  r  )NTr~   )rX  yr~   r~   r   r    r  z,FxGraphCache._lookup_graph.<locals>.<lambda>Tz*fx graph cache key %s post-load guards: %s)rO  rP  r  r"   &unsafe_skip_cache_dynamic_shape_guardsevaluate_guards_expressionr  r  rp  rQ   record_artifactr  r  r  r   r  r`  guardsr  )r   r  r  r  r  r  rV  symintsr  r  r  r  
guard_infocheckr~   r~   r   _lookup_graphs  s4   




zFxGraphCache._lookup_graphr  r   c                 C  sH   t | }tj|stj|dd tj|t|}t||dd d S )NTr   r   )	rO  ry  r   r   r   r   r   r  r   )r   r  r  r   r~   r~   r   r    s
   
z"FxGraphCache._write_to_local_cachecompiled_graphrm   c                 C  sV  ddl m} t||sJ dt| dt }|dusJ t|}||}|j||d|_	t
|}	|	  zt|	}
W n ty[   tjddd	 td
 d  d7  < Y dS w z3tt | |
 |rnt| |
 |rt|	jpudd }t|
d|d}|| | W dS W dS  ty   tjddd	 td
 d  d7  < Y dS w )z=
        Store a serialized CompiledFxGraph on disk.
        rY   rZ   zserialization for z NYIN)placeholdersr  z1fx graph cache unable to serialize compiled graphTr  r  fxgraph_cache_pickle_errorr   g    .Ar  )r   time_taken_msz!fx graph unable to write to cachefxgraph_cache_write_error)
compile_fxr[   r  r  rO  rP  r  get_pruned_guardsproduce_guards_expressionr  r   prepare_for_serializationr  r   r  r  r  r   rQ   r  r  r  rf  _time_taken_nsr  	b64encoder  put)r   r  r  r  r  r[   rV  r  r  disk_compiled_graphr  r  r  r~   r~   r   _save_graph  sN   	


zFxGraphCache._save_graphrg  rh  c                 C  s   |   D ]<}t|tjjsq|jjD ]-}t|jtjj	r+|j
 s+td|j  |jdkr?tt| |jtjjr?tdqqd S )Nz!Can't cache HigherOrderOperator: getattrzCan't cache torchbind objects)r   r  r{   rw  r!  r  nodesrB  _opsHigherOrderOperator	cacheabler  r   r   r  _CScriptObject)rg  r  rQ  r~   r~   r   _check_for_hop  s$   zFxGraphCache._check_for_hopc                 C  s  t jt jfD ]}|rt|tr| stdqt jt jfD ]}|r/t|tr+| s/tdqt j	dur?tt j	ts?tdt j
D ]}t|rQt|tsQtdqBt| r`tjds`tdt jjrhtdd	d
lm} |jrxtd tt du rtd tdt|  dS )z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        z!Unsupported post grad custom passzUnsupported joint custom passNz#Unsupported _pre_fusion_custom_passz(Unsupported _fuse_ddp_communication_passz,pytorch/inductor:allow_freezing_with_cachingz$Skipping graph with frozen constantszORuntime constant folding can introduce constants that aren't static across runsr   )CompilerBisectorz$dont cache graph when bisect enabledzfx graph cache no shape envzNo shape env)r"   r?  rC  r  r:   rZ  r  rD  rE  rG  rH  r[  r>   r{   rk  justknobs_checkaot_inductoruse_runtime_constant_folding!torch._inductor.compiler_bisectorr  bisection_enabledr  r`  rO  rP  r  )rg  pr  r~   r~   r   _check_can_cache  s>   



zFxGraphCache._check_can_cacher  rg   r  r  remote3tuple[tuple[str, list[str]] | None, dict[str, Any]]c           	   
   C  s   zt |  t| |||\}}W n8 tyH } z,td d  d7  < td| |r1tdt| dt|t	 d}d|fW  Y d}~S d}~ww ||fi fS )	a  
        Checks that the inductor input is cacheable, then computes
        and returns the cache key for the input.
        Returns (key_info, cache_info) where:
        - key_info is (hash_key, debug_lines), and
        - cache_info will contain debug info in the event of BypassFxGraphCache.

        NB: It is possible to have this function return a union instead. But
        I personally believe it is more annoying/difficult to read in that format.
        r  fxgraph_cache_bypassrY   z%Bypassing FX Graph Cache because '%s'bypass_fx_graphbypass)cache_statecache_bypass_reasoncache_event_timeN)
rO  r  rd  r  r   r  rm  rN   rx   r   )	rg  r  r  r  r  r   r  r1  r  r~   r~   r   prepare_key:  s$   
zFxGraphCache.prepare_keyc                  C  s   d} t | t ddS )zK
        Attempts to load the remote cache, returns None on error.
        zfx-graph-v1FbRemoteFxGraphCacheRemoteFxGraphCache)r\   r"   r  )cache_idr~   r~   r   get_remote_cache_  s   zFxGraphCache.get_remote_cacher  r  is_backwardc                 C  s   t | |||||\}}	i |	| |t d}	|durTtd|  td d  d7  < d|	d< |j }
durP|
|	d	< ttj	d
|
d  t
|
 }dkrP||	d< ||	fS td|  td d  d7  < d|	d< ||	fS )z
        Lookup the graph with the given key, and return results and metadata.
        Doesn't do any logging on its own, because AOTAutograd handles a cache miss
        differently from FXGraphCache.
        )r   
componentsr  Nzfx graph cache hit for key %sr  fxgraph_cache_hitrY   r   r  re   distributed_ephemeral_timeout_usi  r   ephemeral_timeout_increasezfx graph cache miss for key %sfxgraph_cache_missmiss)rO  r  r   r  rm  r   r  r   r  r  rs  )r   r  r  r  r  r$  r  r  r  r  re  ephemeral_increaser~   r~   r   load_with_keyl  s>   zFxGraphCache.load_with_keyc                   C  s*   z
t t  W dS  ty   Y dS w )z.
        Clear out the on-disk cache.
        N)rL  rmtreerO  r  FileNotFoundErrorr~   r~   r~   r   r    s
   zFxGraphCache.clearr  )r  r  r   rx   rw   rx   )r  r  r   rx   r{  r   r|  r   r}  r   r~  r   rw   r   )r  r[   r  r   r  rl   rw   r  r   )r   rx   r  r  r  r   r  r  r  rl   r  r  rw   r  )r   rx   r  r   rw   r   )r   rx   r  rm   r  r  r  r   r  r  rw   r   )rg  rh  rw   r   )rg  rh  r  r  r  rg   r  r  r  r   rw   r  )rw   r  )r   rx   r  r  r  r  r  r   r  r  r$  r   r  rl   r  r  rw   r  r   )r   r   r   r3  r   r  r  ry  r  r  r  r  r  r  r  r  r#  r,  r  r~   r~   r~   r   rO    s<    *e::2$
1rO  r   c                 C  sB   ddd}	 |  | rtj| S |  drtj| S | dfS )Nrw   rx   c                   S  s   t rdS dS )Nz.pyd.sor0  r~   r~   r~   r   get_module_ext_type  s   z;split_aot_inductor_output_path.<locals>.get_module_ext_typez.pt2r   r  )endswithr   r   split)r   r1  r~   r~   r   split_aot_inductor_output_path  s   

r4  c                   @  sT   e Zd ZU i Zded< eejZe		ddddZ	edddZ
edddZdS )CudaKernelParamCachezdict[str, dict[str, Any]]r   Nr   rx   paramsdict[str, str | None]r   bin_typeasmr4  asm_typerw   r   c                 C  s8  d }t jjrt jjsJ d|d sJ d|d }t|||tt jjd |d\}}	t|	\}}t jj	rSddt
dd	d
i}
||
v sEJ dtj|	\}}||
|  }	d}t jj	s]t jjrtjjd u sg|r|r|smJ d|ssJ d|dv ry|nd}t|||tt jjd |d\}}|	|t < ||d< || j|< d S )Nz:package_cpp_only requires triton kernel names to be uniquemangled_namezMissing kernel namer   )r  r  r   r   z.fatbinz.spvr   z.hsacoz8multi_arch_kernel_binary only supported in CUDA/XPU/ROCmr   zMissing kernel assembly codezMissing kernel assembly type>   r  r  r  r  r9  )r"   r  package_cpp_onlyr   unique_kernel_namesr8  r4  output_pathr5   emit_multi_arch_kernelrH   r   r   splitextr{   r|   r}   r   r   )r  r   r6  r   r8  r9  r:  r  r   bin_pathbin_type_to_ext	base_pathasm_path	hash_kindr~   r~   r   r    sh   







zCudaKernelParamCache.setr   c                 C  s   | j |d S r   )r   r   r  r~   r~   r   r        zCudaKernelParamCache.getKeysView[str]c                 C  s
   | j  S r   )r   r   r  r~   r~   r   get_keys     
zCudaKernelParamCache.get_keysNN)r   rx   r6  r7  r   rx   r8  rx   r9  r4  r:  r4  rw   r   )r   rx   rw   r   )rw   rG  )r   r   r   r   rW  r   r  cache_clearr  r  r   rI  r~   r~   r~   r   r5    s   
 
Kr5  c                   @  s   e Zd ZdZedddZdS )AotCodeCompilerz.
    Compile AOT Inductor generated code.
    r  rj   wrapper_coderx   kernel_codeserialized_extern_kernel_nodesr4  r   additional_filesr  rw   list[Union[str, Weights]] | strc          a        sb  |}t   t }tddt|jdd}	t|	 }
t o%dko%j	t	tj
j\}}tj
jr;d
f
dt
d|
|tj
jd	\}d
 d td|
|tj
jd	\}ddtj
jsttjtjtjtdddd7}tj
j}d| }| dd| dd| dd| dtd||d\}W d   n1 sw   Y  td}	 |
ddf |  tjj|jdd W d   n1 sw   Y  tj
jr|  tj
js|  tj
js|  t!"d t!"d t#d fd!d"
fd#d"d$ t#d fd%d"fd&d"d$ tj
jsBt!"d' t#d fd(d"fd)d"d$ t$}t$}|j%| & sZj'd*d+ t(t$d, }d	fd2d3}d4d5l)m*} t+ }|tj||d6 t,d7}|` |rt(|-d8}t|d9}|| W d   n	1 sw   Y  tj
jr| | tj
j.}|d:< |/t0 t(|1|j2 d;}tj
j.3 D ]\}}t4|t(rt4|t(sJ d<qt|d9}|t56tj
j. W d   n	1 sw   Y  t(|1|j2 d;}t78|| tj
jr0| | tj
js0| | |r7tj
jnt(|-d=} t9fd>d?j:D ddDdE tj
j;s[tj
j<dFkrkdG fdHd?j:D }!ndG}!tj
j<dIkrt=fdJdKj:D }"| |" t>|!}#t?|#\}$}%|$r|%rt@dLd}&|$r|j2 dM}'t(|1|'}&j|%|$	|dN}(tdd*tj
j dO|(})tddPd*i|(}*tj
jArtBstCjdQ}+tD|+|
fdRtj
j i|(|)_EtF },rtD|,|
fi |(|*_Ett(|j2t(|j%|)dS}-|- }.|-G }/tt(|j2t(|j%|*dS}0|0 }1|0G }2tHdT|. tHdU|1 tj
jr^t(|1|j2 dV}3|)I|3 | |3 |-J| |-K| | | n)z|-L  W n tMjNtOfy }4 zdWt(|4v r|t@dX|4|4d}4~4ww |0L  |%s|!}5d4}6|$rtPQdY|#}5|&dusJ t|&dZ}7|7|! W d   n	1 sw   Y  | |& ntRtStTUd4tTVtTjWjXd[Y }6tPQd\|#d] |6}5||5tZj[}8d4}9i }:t\j]3 D ]G\}9\};}<t4|<tTj^j_j`r|<ja}<t4|<tTjbjcsJ td |9 }=tHd^|;|= |=|:|;< tTjbe|<}>tj|j%|=}?tf|?|>d* | |? q|:r`tj|j%d_}@t|@d9}|t56|: W d   n	1 sVw   Y  | |@ tTjgjhrhti ntj }A|Ajk8 }B|Ajkl  |Brtj
jmrJ d`g }Cg }DtBsQtn	\}E}Ftotjpjqdai }Gtrjs3 D ]\}H}I|H|Gvrq|Idb  }Jr|D |J |Itt  }Ktj
jmr@dckr@tTjgjhdu rtuv }Ltuw  dd|J de|K df|L dg|L df|L dh|L di}Mztxjy|Mz d*d*d*dj W nM txj{y }4 zt||M dk|4j} dl|4j~ tZj~dm  d}4~4ww d4dnlm}N tj&|Js(t@do|J dp|N|J|Kddq}O|Os:t@dr|H dstH"dt|K tj
jrO|C t|K|H|E|F qt| \}P}Qt|j	du}R|/|2|8g|B|C}St|P|S|Q|RdS}T|T }U|TG } tHdv|U tdw}|d |dx|. d |dy|U d W d   n	1 sw   Y  tdw}|d |dx|1 d |dy|U d W d   n	1 sw   Y  tj
jrpt(|1|j2 dz}V|RI|V | |V | t |%r0t(|1|j2 d{}Wt|WdZ}7|7|! |7tPQdY|6 W d   n	1 s%w   Y  | |W n| |8 |TK||8 tj
jmrStTjgjhdu rS|T||D ||D ng |B|C}S||S |SD ]	}X|TK||X q`|T| nn|TL  |SD ]}Y|Y|Bv rqvt|Y qv|%rtj
jd|krt@d}ddd}Z|Z }[tXd|[}\t| d%}]|] }^|]d|\|^|\    |]|! |]tPQdY|6 W d   n	1 sw   Y  tj
jr| |  W d   n	1 sw   Y  tjjd4kr(tTjj }_tj|j%d}`t|`d9}|t5j6|_dd W d   n	1 sw   Y  | |` tj
jr/|S | S )z
        Returns the .so path, or returns a list of files that were generated if
        config.aot_inductor.package=True.
        oi)vec_isar   aot_moder   sourcesBuildOptioncpur_  r   zwrapper.cpp)r	  r  r   z.// Triton kernels are embedded as comments in z
kernel.cppcsrcr  aoti_runtimezmodel.hAOTInductorModelz<AOTInductorModel><>zAOTInductorModel((zAOTInductorModel :z :r  r  r   Nzw+cpp)r  zWrapper code written to: %szKernel code written to: %s
graph_dumpc                        dd dS )Ninductor_aot_wrapper_coderb  r   r  r  r~   r~   )wrapper_pathr~   r   r       z)AotCodeCompiler.compile.<locals>.<lambda>c                     r  r   r~   r~   )rN  r~   r   r    r  r  c                     rd  )Ninductor_aot_kernel_coderb  rf  r~   r~   )kernel_pathr~   r   r    rh  c                     r  r   r~   r~   )rO  r~   r   r    r  zHeader code written to: %sc                     rd  )Ninductor_aot_header_coderb  rf  r~   r~   )header_pathr~   r   r    rh  c                     r  r   r~   r~   )header_coder~   r   r    r  Tr   zCMakeLists.txtconstsr   r   rx   rw   c                   sP  t jj}|dkr$jtj @ rt| dkrtdd nd d}n|dkr-d d	}n|d
kr6d}d}nt	d| dkrCd}t| dk}t| dk}d) fdd}d*dd}d+ fdd}|rn|| t
||\}	}
n|rx|t
|\}	}
n|| t
|\}	}
t|	|
tt jjd \}}t|}tjd!d"}tt|jt|t|j|d#}| }|du r|rtt||t|j n|  |r!|r!t|d$H}|d |d}tjd%kr|d&n|d'}|d(ksJ || d}|t| k r|| |d  }||7 }|t| k sW d    n	1 sw   Y  t| |S ),Nlinuxi 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z.ldata, "aw"z.lrodata, "a"r   darwinz__DATA,__datar   rt   FzUnsupported platform: r   r  r   rn  r   align_bytesrf  symbol_prefixrx   is_large_constsr   rw   r5  c                   s   d  d}|d| d7 }|d| d7 }|| d7 }|s3| D ]
}|d| d7 }q!| s2|d7 }n|d	7 }|d
t | d  d7 }|d| d7 }|| d7 }|dfS )N
	.section	r_  		.balign 	.globl	_binary_constants_bin_start
_binary_constants_bin_start:
z	.byte z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    .globl	_binary_constants_bin_end
_binary_constants_bin_end:
z	weights.Sr  )rn  rq  rr  rs  
consts_asmr
  section_attrr~   r   format_consts_to_gnu_asm  s   zRAotCodeCompiler.compile.<locals>._compile_consts.<locals>.format_consts_to_gnu_asmc                 S  s   t | }d}|}|d| d7 }|d| d| d7 }d}| D ]}|| d7 }|d	 }|d
 dkr6|d7 }q|d7 }|d| d| d7 }|dfS )Nz#if defined(__clang__) || defined (__GNUC__)	
#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))	
#else	
#define ATTRIBUTE_NO_SANITIZE_ADDRESS	
#endif	
	
ATTRIBUTE_NO_SANITIZE_ADDRESS	
zalignas(z	) extern zunsigned char z_binary_constants_bin_start[z] = {	
r   , rY      z	
z};	
z) extern unsigned char * z_binary_constants_bin_end;	
zweights.cppr}  )rn  rq  rr  consts_size	asan_attr	const_cppcount_bytesr
  r~   r~   r   format_consts_to_cpp  s   zNAotCodeCompiler.compile.<locals>._compile_consts.<locals>.format_consts_to_cppc                   sx   t r
d}d}||fS d  d}|d|  d7 }|d| d7 }|| d7 }|d	| d
7 }|| d7 }d}||fS )a  
                This function handles zero-sized constants because the C++ standard prohibits zero-length arrays:
                https://stackoverflow.com/questions/9722632/what-happens-if-i-define-a-0-size-array-in-c-c

                On Windows (MSVC):
                    The compiler reports error C2466 for zero-sized arrays:
                    https://learn.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2466
                    Solution: Use assembly compilation to handle this case.

                Why not use Win32 assembly for all paths?
                    ml64 only supports alignment up to 16 bytes, which isn't optimal for performance.

                Cross-platform implementation:
                    Linux: Added '-pedantic' to disable zero-sized arrays in C++ compiler
                    Windows: MSVC naturally rejects zero-sized arrays by default
                z
option casemap:none
.data
?_binary_constants_bin_start@@3PAEA:
align 16
?_binary_constants_bin_end@@3PAEA:
align 16
public ?_binary_constants_bin_start@@3PAEA
public ?_binary_constants_bin_end@@3PAEA
end
r9  rt  r_  ru  rv  rw  rx  rz  r{  r|  Sr0  )rq  rr  asm_codeasm_extr  r~   r   get_zero_consts_asm_code  s   
zRAotCodeCompiler.compile.<locals>._compile_consts.<locals>.get_zero_consts_asm_codera  T)r   rV  compile_onlyuse_relative_pathr   rX  
output_dirrY  zr+blittles   ͫxV4s   4Vxr   )
rn  r   rq  rf  rr  rx   rs  r   rw   r5  )rn  r   rq  rf  rr  rx   rw   r5  )rq  rf  rr  rx   rw   r5  )r"   r  use_consts_asm_buildmutated_buffersrX   r  r   r  r  r   rC   r8  rx   model_name_for_generated_filesr   r2   rV  r0   stemrE  get_target_file_pathr7   buildr   r  r  r   	byteorderfindr   rN  )rn  r   use_asm_buildrr  rs  is_zero_size_constsr  r  r  consts_codecode_extr   consts_sobject_build_optionsobject_builderconsts_orQ  hdr	start_idxposrc)r   r  specified_sub_dirr  r  r   _compile_consts  s   
.








z0AotCodeCompiler.compile.<locals>._compile_constsr   FileLock.locktimeoutz.jsonr   AOTI_DEVICE_KEYz_metadata.jsonz"Metadata must only contain stringsr/  c                 3  s&    | ]}| j vr |jV  qd S r   )folded_constantsget_original_value_of_constantis_cudar   r   r  r~   r   r     s    

z*AotCodeCompiler.compile.<locals>.<genexpr>r]  torch.Tensorall_cudar   c           	      S  s   ddd}dd l }|  dkrdS | jr#tjj| }tjj| }n|  	 }| }|
 }||||j| }t|j}|rF|S ||S )	N	raw_bytesr   rw   c                 S  s$   |  t| t d t t d}|S )NrY       )ljustr  rC   )r  padded_bytesr~   r~   r   _pad_to_alignment  s
   zEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentr       )r  r   rw   r   )ctypesnumelr  r{   opsmkldnndata_ptr_nbytesuntyped_storagerZ  nbytesr   POINTERc_ubyter   contents)	r]  r  r  r  r  r  t_cpu	raw_arrayr  r~   r~   r   	_to_bytes  s    
	
z*AotCodeCompiler.compile.<locals>._to_bytesbinary_blobr  c                 3  s*    | ]}|j vr |V  qd S r   )r  r  r  )r  r  r  r~   r   r     s    
pickle_weightsc                   s6   i | ]}| j vr j|  |t j| fqS r~   )r  allocated_constant_namer  rR   r  r  r  r~   r   r    s    
z+AotCodeCompiler.compile.<locals>.<dictcomp>z?use_external_weights and  use_mmap_weights cannot both be True.z_weights.blob)rV  r   use_mmap_weightsuse_mmap_weights_externalr  rU  r  min_optimizer  )rV  r  r  z#aot wrapper compilation command: %sz"aot kernel compilation command: %sz_compile_flags.jsonz is too big to optimizezUPlease use torch._inductor.config.aot_inductor.compile_wrapper_opt_level = 'O0' flag.qr@  )rY   qqry  zsaving script object %s as %szcustom_objs_config.jsonz<TODO: add emit_multi_arch_kernel support for cutlass kernels_kernel_name_to_bodyr9  r   z	 -fatbin z -o z -gencode arch=compute_z,code=compute_z	,code=sm_ )capture_outputr9  r  z failed with:
stdout:
z	
stderr:
)file)%compile_multiarch_bundle_from_llvm_irz7Multi-arch ROCm compilation requires LLVM IR file, but zD not found. Ensure asm_type='ll' is captured in triton_heuristics.py)llvm_ir_pathoutput_bundle_pathtarget_archsz/Failed to compile multi-arch bundle for kernel z>. Check that ROCm toolchain is available and LLVM IR is valid.zCreated multi-arch bundle: %s)rU  r   rV  r  zaot linkage command: %saz// Compile cmd
// z// Link cmd
// z_linker_flags.jsonz_serialized_weights.binwindowszKwhen cross_target_platform is windows, use_mmap_weights should not be true.rf  c                    sx   t r2ddlm} m}m} ddlm mm G  fddd|}| }|j	
| | |j}|S dd l}| }|S )Nr   )byref	StructurewindllDWORDLPVOIDWORDc                      sR   e Zd Zdfdfd fdfdfd fd fd fd	 fd
fdfgZdS )zCAotCodeCompiler.compile.<locals>.get_page_size.<locals>.SYSTEM_INFOwProcessorArchitecture	wReserved
dwPageSizelpMinimumApplicationAddresslpMaximumApplicationAddressdwActiveProcessorMaskdwNumberOfProcessorsdwProcessorTypedwAllocationGranularitywProcessorLevelwProcessorRevisionN)r   r   r   _fields_r~   r  r~   r   SYSTEM_INFOR
  s    r  )r0  r  r  r  r  ctypes.wintypesr  r  r  kernel32GetSystemInfor  resourcegetpagesize)r  r  r  r  sisys_page_sizer  r~   r  r   get_page_sizeG
  s   z.AotCodeCompiler.compile.<locals>.get_page_sizei @  za+b    zkernel_information.jsonr   r   )rn  r   r   rx   rw   rx   )r]  r  r  r   rw   r   r~   )rw   rf  )r-   r8   r0   r2   rV  r  get_command_liner"   r  r4  r  r>  r<  r   r8  r  dynamic_linkager   r   r   r  r  r  rc  r  
writelinesflushr`   r`  ru   r   packager  r  rm  rJ   r   rE  r   rF  rx   torch.utils._filelockr  r   LOCK_TIMEOUTwith_suffixr   rp  r   	with_namer  r   r  r   r   rL  r   r   r  package_constants_in_so package_constants_on_disk_formatrS   r  rE   r   precompile_headersr0  _get_cpp_wrapper_header_precompile_headerprecompiled_header_get_cpp_prefix_headerr  r  save_flags_to_jsonsave_compile_cmd_to_cmakesave_src_to_cmaker  r#   CppCompileErrorr   structpackr   rf  r{   randintiinfoint64maxitemr   r   	enumeratetorchbind_constants_libraryfake_class_registryrI   r  r  r  rT   _pickle_saver   r|   r}   ROCmCodeCacheCUDACodeCacheaot_kernels_or  r?  r4   r  r  rN  r5  r   r   cuda_compile_utils_nvcc_arch_as_compile_option_cuda_compiler
subprocessrunr3  CalledProcessErrorprintstdoutstderr$torch._inductor.rocm_multiarch_utilsr  embed_kernel_binaryr/   r5   r,   save_kernel_asm_to_cmakeextendsave_link_cmd_to_cmakerN  cross_target_platformtelltraceprovenance_tracking_levelr   create_kernel_information_json)ar  r  rN  rO  rP  r   rQ  generated_filespicked_vec_isavec_isa_cmd_gencpp_commandspecified_output_pathspecified_artifact_namewrapper_keyr   rQ  model_class_name
class_namer]  wrapper_path_operatorkernel_path_operator
cmake_pathr  r  r   lockextern_kernel_nodes_jsonr   	meta_jsonr  r  kernel_meta_json	output_soserialized_weightsweights_dictr  use_external_weightsr  external_weights_pathexternal_weights_filenamecompile_commandwrapper_build_optionskernel_build_optionsheader_file
cpp_prefixwrapper_builderwrapper_compile_cmd	wrapper_okernel_builderkernel_compile_cmdkernel_ocompile_flagsr1  aot_constantsmagic_number	f_weightsr  custom_obj_idxqual_name_to_idr   constantcustom_obj_namecustom_obj_bytescustom_obj_pathconstants_config_jsongpu_codecachegpu_kernels_ocubins_o	asm_filesldobjcopykernelskernel_namer   asm_file
cubin_filecurrent_archcmdr  successoutput_namer  so_build_optionsobj_srcs
so_builderlink_cmdlinker_flagsweight_filer  o_filer  
page_size_	page_sizef_soso_sizekernel_infokernel_info_jsonr~   )r  r  r   r  rm  rl  rO  rj  r  r  rN  rg  r   compile  s  











	


	



 @








 



	













	














$



      t

zAotCodeCompiler.compileN)r  rj   rN  rx   rO  rx   rP  r4  r   rx   rQ  r  rw   rR  )r   r   r   r3  r  ru  r~   r~   r~   r   rM    s    rM  zCDLL | None_libgompr   argsr    list[c_void_p] | c_void_p | Nonec                   sP  d fdd  fdd|D }|  dsJ | d	 d }t| d
D ]\}}|dkr1t|}t||}q$t|sAJ | d t }t|j	j
|D ]\}}|jrW|||j< qK|rb|t| d = ||i |}	|	d u rod S t|	ttfrdd |	D }	|	D ]}
t|
tjsJ | d qtjj|	S t|	tjsJ | d tjj|	S )Nargr   rw   c                   sJ   t t| dkrtjj| S t| ttfr#t|  fdd| D S | S )Nz<class 'PyCapsule'>c                 3  s    | ]} |V  qd S r   r~   r   r  convert_argr~   r   r   
  r   z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>)	rx   r  r{   r  _aoti&alloc_tensor_by_stealing_from_void_ptrr  r  rI  )ry  r{  r~   r   r|  
  s
   z&custom_op_wrapper.<locals>.convert_argc                   s   g | ]} |qS r~   r~   r   ry  r{  r~   r   r  
  r  z%custom_op_wrapper.<locals>.<listcomp>z
torch.ops.z, can not be called through custom_op_wrapperr  r   z, can not be loaded through custom_op_wrapperc                 S  s"   g | ]}|d u rt g n|qS r   )r{   tensor)r   rr~   r~   r   r  
  s   " z returns a list of non-tensorsz returns a non-tensor)ry  r   rw   r   )r  r  r3  	importlibimport_moduler  r[  r  zip_schema	arguments
kwarg_onlyr   r  r  r  rI  r{   r   r  r}  #unsafe_alloc_void_ptrs_from_tensors!unsafe_alloc_void_ptr_from_tensor)r   rw  converted_argsr  rT  r  r'  func_argconv_argresultr  r~   r{  r   custom_op_wrapper
  s8   	

r  precompiled_headersr   headerhashable_cmd_linerD  c              	   K  s  t rJ dt >}t|d }|d|  d tt|d d t|tdi |ddid}|  ddd}||	 }W d    n1 sKw   Y  tdi |ddi}t
d|  dd|| t|  td\}	}
t|
|
|d}tjtdd ttjt|	 d|f |
S )Nz>CppBuilder does not currently support precompiling on Windows!z
header.hppz
#include <z>
preprocessingTrW  r  rx   rw   c                 S  s$   t jdd| fddd}|j d S )zReading the whole preprocessed header in for hashing is very expensive,
            but calling a fast hashing utility in a subprocess is cheap.opensslsha512T)r  r9  r   )r  r  r"  r3  )r  
cmd_outputr~   r~   r   _get_file_checksum
  s   z._precompile_header.<locals>._get_file_checksumprecompilingr  )r  r  r	  r  r   r  r~   )r  rx   rw   rx   )r0  r&  TemporaryDirectoryr   r<  r0   rx   r2   r  r  r8  r3   get_compiler_HEADER_DIRr   r   _HEADER_LOCK_DIR_worker_compile_cppr   r   )r  r  rD  preprocessing_dirpreprocessing_headerpreprocessorr  preprocessor_hashheader_build_optionheader_hashheader_full_pathcpp_builderr~   r~   r   r  
  sJ   


	



r  c                 C  s   |  drdS d S )NrZ  z torch/csrc/inductor/cpp_prefix.h)r  r   r~   r~   r   r    s   
r  rV  c                 C  sJ   | j dddd }tjjo|dk}d|rdnd d	|r d
 dS | dS )zGiven a device type (and optionally whether we're in AOT Inductor mode), returns
    the path to the cpp_wrapper header file to be precompiled.:rY   )maxsplitr   rZ  ztorch/csrc/inductor/aoti_includecpp_wrapper/	array_refz.h)r3  r"   r  allow_stack_allocation)r   rV  base_deviceis_array_refr~   r~   r   r    s   
r  c                   @  s   e Zd ZU dZi Zded< eejZi Z	ded< ed#ddZ
ed#ddZed$ddZe				d%d&ddZed'd!d"ZdS )(CppCodeCachezCompiles and caches C++ libraries.  Users of this class supply the source code to
    be compiled, while compilation flags are set by CppBuilder.*dict[str, Callable[[], CDLL | ModuleType]]r   r   cpp_compile_command_flagsr   rx   r   rw   CDLL | ModuleTypec                 C  s
   t | S r   )r   LoadLibrary)r   r   r~   r~   r   _load_library_inner   rJ  z CppCodeCache._load_library_innerc              
   C  s   z|  ||}||_|W S  ttfyU } z;dt|v r8tjdr8t	da
|  ||}||_|W  Y d }~S dt|v rPt| dt  dt  d| d }~ww )Ngompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorr/  rx   r   r   r   r   r  rv  r&  
gettempdir)r  r   r   r  r1  r~   r~   r   _load_library$  s*   
zCppCodeCache._load_libraryr   r4  c                 C  r  )z_
        Given a device type, returns the path to a CPP header file to be precompiled.
        Nr~   r  r   r~   r~   r   _get_uncompiled_header;  s   z#CppCodeCache._get_uncompiled_headerrZ  Nr~   	main_coder   	submit_fnr   extra_flagsSequence[str]optimized_codec                   sL  i j ||t t d}t  tdt||dud|}tdddi|}dd
d}	|	|}
|	|}t|d| d|
 d\}|rPt|d|d\}}ntj	}j
vr!ddlm} tjt d }ddtjrts| }rt||
fd|dui||_|rt| }rt||fi ||_t|\}}t||||d}|rt|\}}t||||d}t|| | gtdi ||d}tt||||ft|  ntt||ft|  d fdd}|dur||td tj s|W d   n	1 sw   Y  |j
< j
 S )z\Compile and load a C++ library.  Returns a callable that returns the loaded
        library.)r   r  r  rU  Nr  r  Tbuild_optionrh   rw   rx   c                 S  s   t dd| d S )zWriting the code to file will calculate a hash, which we need to vary if
            the command line flags change.  This implements a mostly-generic way of
            validating that.rS  rT  rW  )r0   r  )r  r~   r~   r   get_hashable_command_lineh  s
   z:CppCodeCache.load_async.<locals>.get_hashable_command_linezmain.cppr  r	  zoptimized.cppr   r  r  r  )r   rX  rY  r  r   c                    sF   d u r!d ur    } | d u sJ  d us!J S r   )r  r  )r  binary_pathr  futurer   r  	worker_fnr~   r   load_fn  s   z(CppCodeCache.load_async.<locals>.load_fnr  r~   )r  rh   rw   rx   rw   r   )r  r"   r  r8   r-   r2   r   r8  r   devnullr   r  r  r   r   r   cpp_cache_precompile_headersr0  r  r  r  r  r5   r0   r  r   r
   r  r6   r  r   )r  r  r   r  r  r  rD  main_build_optionoptimized_build_optionr  main_cmd_lineoptimized_cmd_line	main_pathr   optimized_pathr  	lock_pathr  	main_namer  main_builderoptimized_nameoptimized_builderlinkerr  r~   r  r   
load_asyncB  s   






zCppCodeCache.load_asyncrw  r'  c                 O     | j |i | S r   )r  r  rw  r'  r~   r~   r   r        zCppCodeCache.load)r   rx   r   rx   rw   r  r   rx   rw   r4  )rZ  Nr~   N)r  rx   r   rx   r  r   r  r  r  r4  rw   r   rw  r   r'  r   rw   r   )r   r   r   r3  r   rW  r   r  rL  r  r  r  r  r  r  r   r~   r~   r~   r   r    s(   
 
 r  r  cpp_buildersSequence[CppBuilder]c                 C  s`   ddl m} || td |D ]}tj| s|  qW d    d S 1 s)w   Y  d S )Nr   r  r  )r  r  r  r   r   r   r  r  )r  r  r  builderr~   r~   r   r    s   "r  c                   @  s   e Zd ZU i Zded< eejZdddZdZ	dZ
dZed	Zed)ddZed*ddZe					d+d,d#d$Zed-d'd(ZdS ).CppPythonBindingsCodeCacher  r   FTinclude_pytorchsharedrR  zkernel({}); Py_RETURN_NONE;r   a  
        // Python bindings to call {entry_func}():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>
        #include <cerrno>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {{
            static_assert(std::is_pointer_v<T>, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }}
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }}
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }}
        template <> inline float parse_arg<float>(PyObject* args, size_t n) {{
            auto result = PyFloat_AsDouble(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1.0 && PyErr_Occurred()))
                throw std::runtime_error("expected float arg");
            return static_cast<float>(result);
        }}

        {extra_parse_arg}

        static PyObject* {entry_func}_py(PyObject* self, PyObject* args) {{
            try {{
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != {arg_len}))
                    throw std::runtime_error("requires {arg_len} args");
                {call_entry_func}
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}

        static PyMethodDef py_methods[] = {{
            {{"{entry_func}", {entry_func}_py, METH_VARARGS, ""}},
            {{NULL, NULL, 0, NULL}}}};

        static struct PyModuleDef py_module =
            {{PyModuleDef_HEAD_INIT, "{entry_func}", NULL, -1, py_methods}};

        PyMODINIT_FUNC PyInit_{entry_func}(void) {{
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {{
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }}

            char* endptr = nullptr;
            errno = 0;
            uintptr_t addr = std::strtoull(str_addr, &endptr, 10);
            if(errno != 0 || endptr == str_addr || addr == 0) {{
                PyErr_SetString(PyExc_RuntimeError, "Failed to parse _TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
                return nullptr;
            }}
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            PyObject* module = PyModule_Create(&py_module);
            if (module == NULL) {{
                return NULL;
            }}
            #ifdef Py_GIL_DISABLED
                PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
            #endif
            return module;
        }}
        r   rx   r   rw   r   c                 C  s   t tjjjjtjd< | d| j }zt	j
| W S  ty"   Y nw tj||}|d us0J tj|}|t	j
|< |jd usBJ |j| |S )N'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRr  )rx   r{   r  _dynamor  '_torchinductor_pyobject_tensor_data_ptrr   environentry_functionr   r   KeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)r  r   r   module_namer  r  r~   r~   r   r  _  s    


z.CppPythonBindingsCodeCache._load_library_innerr   r4  c                 C     t |S r   )r  r  r~   r~   r   r  r  rz  z1CppPythonBindingsCodeCache._get_uncompiled_headerrZ  r   Nr~   argtypesr  r  r   num_outputsrf  r  r   r  rO  c                   sv   d dd t|D } jjt| j| j jj|dd}	 j||	 ||||ddd fd
d}
|
S )aV  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            main_code: C++ source code containing ENTRY_FUNCTION().  Will be built at
                -O3 if kernel_code is None (to maximize performance in any kernels that
                are present), or -O1 otherwise (to minimize compile time).
            kernel_code: If present, C++ source code that will be built at -O3 and
                linked to main_code.

        Returns:
            A python version of ENTRY_FUNCTION()
        r  c                 s  s.    | ]\}}d | dd d| dV  qdS )z
parse_arg<zconst r   z>(args, )N)rc  )r   nargtyper~   r~   r   r     s
    
zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>)	array_len)arg_lencall_entry_func
entry_funcextra_parse_arg)r  r  r  Nrw   r   c                     s(   d u r t tsJ t jS r   )r  r   r  r  r~   r  
get_resultr  r~   r   r    s   z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.futurer  )	r   r  suffix_templateformatr  call_entry_functionr  r  r  )r  r  r  r   r  r  r  rO  	parseargsr"  r  r~   r  r   load_pybinding_asyncv  s&   

z/CppPythonBindingsCodeCache.load_pybinding_asyncrw  r'  c                 O  r  r   )r  r  r~   r~   r   load_pybinding  r  z)CppPythonBindingsCodeCache.load_pybinding)r   rx   r   rx   rw   r   r  )rZ  r   Nr~   N)r  r  r  rx   r   rx   r  rf  r  r   r  r  rO  r4  rw   r   r  )r   r   r   r   rW  r   r  rL  r  r  r  r  textwrapdedentr  r  r  r  r  r  r~   r~   r~   r   r    s2   
 
e4r  c                   @  sN   e Zd ZU i Zded< eejZdddZdZ	dZ
edZedddZdS )CppWrapperCodeCacher  r   Tr  inductor_entry_cppzreturn inductor_entry_cpp({});a	  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {{
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }}
            return result;
        }}

        static inline PyObject* pack_tensor_handle_list(const std::array<AtenTensorHandle, {array_len}>& arr) {{
            PyObject* result = PyList_New({array_len});
            for (size_t i = 0; i < {array_len}; i++) {{
                PyObject *elem =
                    arr[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(arr[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }}
            return result;
        }}

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {{
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }}

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {{
            // For outputs, we only allocate an array to hold returned tensor handles,
            // not the actual output tensor storage.
            std::array<AtenTensorHandle, {array_len}> output_handles{{}};
            try {{
                inductor_entry_impl(input_handles.data(), output_handles.data());
                if (PyErr_Occurred()) {{
                    return nullptr;
                }}
                return pack_tensor_handle_list(output_handles);
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}
        r   rx   rw   r4  c                 C  r  r   )r  r  r~   r~   r   r    rz  z*CppWrapperCodeCache._get_uncompiled_headerNr  )r   r   r   r   rW  r   r  rL  r  r  r  r  r  r  r  r  r~   r~   r~   r   r    s   
 
6r  c                   @  s  e Zd ZU i Zded< eejZdZded< e	
dZee	
d Zee	
d Ze	
d	Zed3ddZed4ddZeejd5ddZed6ddZeejd7d d!Zeejd7d"d#Ze	d8d9d(d)Zed:d,d-Zed5d.d/Zed;d1d2ZdS )<HalideCodeCachez*dict[str, Callable[[], ModuleType | CDLL]]r   Nr4  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        r   rx   ry  rp   r   r   rw   r  c                 C  sn  |j d usJ |jd urt|j t|jksJ |jd usJ |jp$|j d|j }|r:d| d}d}d}d}nd}d}d| d}d	}g }	t|j |jD ]\}
}|	d
|
 d| d qOd| dt|	dkrwd| dd|	 dnd| d| d| d| d| d| d| d| d| d| d|	  d| dt|	 d| d| d| dg
S )Nz + zreinterpret_cast<uint64_t>(r  cuda_interfacenullptrhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(halide_buffer_flag_host_dirtyzhalide_dimension_t(0, r  zhalide_buffer_t ;r   zhalide_dimension_t z_dims[] = {z};zhalide_dimension_t * z_dims = nullptr;z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestrider  offsetalias_ofr   r  r  r   halide_type)r  r   ry  r   r  r   device_interfacehostflagsdimssizer  r~   r~   r   _codegen_bufferC  s:   "

zHalideCodeCache._codegen_bufferrd  rq   
headerfileobjectc           
      C  s   |  }|d|jv u sJ d|jv sJ g }g }t|jD ]+\}}| r;|d|  || d| || qd|jvsBJ ||j	 qd
dd |D  }|rZ| jn| j}|j| |red	nd
|d
dd |jD |d
|d}	|	S )Nuser_context
no_runtimez&hl_buf_hl_buf_*r_  c                 S  s   g | ]}d | qS )    r~   )r   liner~   r~   r   r  w  rY  z1HalideCodeCache._codegen_glue.<locals>.<listcomp>HalideRuntimeCuda.hzHalideRuntime.hr  c                 s  s.    | ]}|j d u r|  d|j V  qd S )Nr  )r  bindings_typer   rz  r~   r~   r   r     s    
z0HalideCodeCache._codegen_glue.<locals>.<genexpr>)halideruntime_hr  argdefsbuffersbuffer_names)r  rB  r  r  	is_bufferr  r'  r  ctyper   r   lstripglue_template_cudaglue_template_cppr  find_header)
r  rd  r  r  r&  r'  rT  ry  glue_template	glue_coder~   r~   r   _codegen_glueg  s2   

zHalideCodeCache._codegen_gluec                 C  s:   t ddt d}| }td| j| j| j|gdS )NOIrW  r_  r   )	r0   r1   r  r  r   r,  r+  standalone_runtime_cuda_initr   )r  command_gencommand_liner~   r~   r   config_hash  s    zHalideCodeCache.config_hashr"  errmsgc           	   
   C  s   t jjd}|d u s|jstdzX|jd }t|D ]H}|drezt	
dtj||g}W n
 t	jy<   Y qw td|d}|retjtj|d| }tj|retj|  W S qW t| ty{ } zt||d }~ww )	Nhalidez$halide python bindings not installedr   r/  lddz(/.*)/libHalide.sor   rY   )r  	machinery
PathFinderr  r  r   r   r  r2  r  check_outputr   r   SubprocessErrorr  searchr  r  groupr   r  )	r"  r7  r  r>  r  outmr   r1  r~   r~   r   _search_for_file  s4   


z HalideCodeCache._search_for_filec                 C  sV   d|    d}dtjv rtjtjd |}tj|r|S d| d}t||S )Nlibautoschedule_r/  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r  r   r  r   r   r   r  rB  )r   sofiler   r7  r~   r~   r   find_libautoschedule  s   

z$HalideCodeCache.find_libautoschedulec                 C  s   dt jv rt jt jd | }t j|r|S dt jv r5t jt jt jd d|  }t j|r5|S d|  d}td|  |S )NHALIDE_INCLUDErD  z../include/rE  z7, set env HALIDE_INCLUDE to the directory containing it)r   r  r   r   r   r  r  rB  )r   r   r7  r~   r~   r   r-    s   


zHalideCodeCache.find_headerr  r  r   Callable[[], Any]c              
     s  t tt|t|  |fddd }tj|dd d t|d }t|d }t|d }t|d	 }t|d
 }	tj	| }
g }|
rwt
|| tj|ddd| ddddg
}|jrf|d| |jg ||  |ttj| dd |jD }| r|d | j|| ||||  f|
r|jnd | rdndd |
r|tt| tt|	|}|r||jn|  d fdd}|S )Nr  r8     Tr   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdoner:  z-grR  -oz-fhalide_kernelz-ezstatic_library,h,schedulez-pc                 S  s   g | ]}|j d u r| qS r   )r  r#  r  r~   r~   r   r    s    z9HalideCodeCache.generate_halide_async.<locals>.<listcomp>	uintptr_tr   rZ  )r  r  r   rw   rI  c                     s   r    S r   r~   r~   bindings_futurewait_for_compiler~   r   r     s   z3HalideCodeCache.generate_halide_async.<locals>.load)rw   rI  )r   r  r  r  r6  r   r   rx   r   r   r   r   
executable	schedulerr'  rG  rw  r  r   r
   r  
check_callr  r  r  r0  build_standalone_runtimetouch_worker_task_halider  )r  rd  r  r  dirpathgenfilelibfiler  donefilelockfileneed_compilejobsre  binding_typestaskr   r~   rO  r   generate_halide_async  sr   	



z%HalideCodeCache.generate_halide_asyncrw  r'  c                 O  r  r   )ra  r  r~   r~   r   generate_halide  r  zHalideCodeCache.generate_halidec              	   C  s  | j rtj| j r| j S tj rdnd}d}|dkrdnd}| j r0tj| j r,J t }nt }t	|d| d| 
   }tj|dd	 t|d
 }t|d }t|d }t|d }	t|| }
tj|sdd l}ddlm} ||t] tj|st|d}|dkr|| j| d W d    n1 sw   Y  ||	|| t|
\}}t|||	g|t|dd}tt|  t | W d    n1 sw   Y  tj|
sJ |
| _ |
S )Nr   rZ  zlibStandaloneHalideRuntime.soz	host-cudar  zhalide-runtime--Tr   rK  r:  z	hooks.cppzstandalone_halide_runtime.ar   r  r   r"  )r   r  )!r  r   r   r   r{   r   r,  rB   rA   r   r6  r   rx   r8  r  r  r  r   r8  r3  r  r-  compile_standalone_runtimeTargetr5   r0   r2   r  rT  shlexr3  r  rV  )r  r   libnamerB  baserX  	done_file	lock_file	hook_filea_fileso_filehlr  rQ  r   r  halide_cmd_genr~   r~   r   rU    sf   	z(HalideCodeCache.build_standalone_runtimer   c                 C  r  )z5Header precompiling is currently disabled for halide.Nr~   r  r~   r~   r   r  X  r  z&HalideCodeCache._get_uncompiled_header)r   rx   ry  rp   r   r   rw   r  )rd  rq   r  r  rw   rx   r  )r"  rx   r7  rx   rw   rx   )r   rx   rw   rx   r   )rd  rq   r  rx   r  r   rw   rI  )rw  r   r'  r   rw   rI  r  )r   r   r   r   rW  r   r  rL  r  r  r  r  r,  r+  r3  r  r  r0  r   r6  rB  rG  r-  ra  rb  rU  r  r~   r~   r~   r   r    sP   
 
	!#!D:r  r\  r^  list[partial[Any]]c                 C  sl  ddl m} z"|| t |D ]}|  qW d    W d S 1 s!w   Y  W d S  tjy } ztjddkrt|dd^}}}tj	
|drt| }d}	||	d	ks\J G d
d d}
|d}t|tsoJ |
 ||d	 < ttddg|dd}||	|}tdd}||  W d    n1 sw   Y  td| | d }~ww )Nr   r  HALIDE_REPRO1re  )r   r   r   pythonz    hl.main()rY   c                   @     e Zd ZdddZdS )z _worker_task_halide.<locals>.Outrw   rx   c                 S  r  )Nr@  r~   r   r~   r~   r   __repr__o  r\  z)_worker_task_halide.<locals>.Out.__repr__Nr  )r   r   r   ru  r~   r~   r~   r   Outn      rv  rL  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        r   r   zwrote repro.py: )r  r  r  r  r=  r   r  r   r  r   r  r  r   	read_textcountindexr  rf  r  r   r  rc  r   r8  r*  r   )r\  r^  r  jobr1  rs  scriptre  r  mainrv  cireplfdr~   r~   r   rW  ^  sD   &
rW  r  c                 C  s0   t | d	 W d    d S 1 sw   Y  d S )Nr  )r   )r  r~   r~   r   rV    s   "rV  c                   @  s   e Zd ZU g Zded< i Zded< i Zded< ed&d'ddZed&d(ddZ	e		d)d*ddZ
ed+d,dd Zeejd-d$d%ZdS ).PyCodeCachezlist[ModuleType]r   zdict[str, ModuleType]modules_no_attrz dict[str, list[tuple[Any, ...]]]linemapsr   r  rx   r	  rw   r5  c                 C  s   t |d|dS Npyr  r;  )r  r  r	  r~   r~   r   r8    rF  zPyCodeCache.writer   c                 C  s   t |d|d\}}| ||S r  )r8  load_by_key_path)r  r  r	  r   r   r~   r~   r   r     s   zPyCodeCache.loadNr   r   linemaplist[tuple[int, str]] | Noneattrsr   c           	      C  s   |d u rg }|d u r|| j v r| j | S t }t|||d}|r)tt| | j|< |d ur<| D ]
\}}t||| q1|rM|d u rG|| j |< | j	| |S )N)set_sys_modules)
r  r+   r@   r  r  r  r   setattrr   r  )	r  r   r   r  r  in_toplevelmodr  r  r~   r~   r   r    s    

zPyCodeCache.load_by_key_pathFpurger   r   c              	   C  sT   |r| j D ]}z|jsJ t|j W q ty   Y qw | j   | j  dS )z
        Clear the in-memory module cache. If purge=True, also delete all the
        corresponding on-disk source files.
        N)r   r  r   rN  r.  r  r  )r  r  r  r~   r~   r   rL    s   


zPyCodeCache.cache_clearlinenorf  list[dict[str, Any]] | Nonec                 C  sn   || j vrd S t| j | dkrd S | j | \}}t||}|dkr$d S ||d  }|s.d S d	dd}||S )
Nr   rY   stack_tracerx   rw   list[dict[str, Any]]c                 S  s"   d}t || }dd t|D S )Nz"File "(.+)", line (\d+), in (.+)\nc                 S  s"   g | ]\}}}|t ||d qS ))r  r!  r   )rf  )r   rQ  lr  r~   r~   r   r    s    zPPyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace.<locals>.<listcomp>)r  findallreversed)r  regexmatchesr~   r~   r   parse_stack_trace  s
   z<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace)r  rx   rw   r  )r  r  r   )r  r   r  r  r  r  entryr  r~   r~   r   stack_frames_for_code  s   



z!PyCodeCache.stack_frames_for_coder   )r  rx   r	  rx   rw   r5  )r  rx   r	  rx   rw   r   rK  )
r   rx   r   rx   r  r  r  r   rw   r   r  )r  r   rw   r   )r   rx   r  rf  rw   r  )r   r   r   r   rW  r  r  r  r8  r   r  rL  r   r   r  r~   r~   r~   r   r    s"   
 !r  ra  r  rr   c                 C  s   t t|| S r   )r  r  r   )ra  r  r~   r~   r   _load_triton_kernel_from_source  s   r  c               	   C  s   t  r<tjdd)} t| }|  W  d   W  d   S 1 s(w   Y  W d   n1 s7w   Y  t	 }t
t jjgd| | S )zz
    Compute a key representing the state of the CUTLASS library.

    Note: OSS and fbcode will have different keys.
    cutlass_libraryzsrc_hash.txtNr   )r"   r  r  	resourcesr   r   r  r   r   r   r  cutlasscutlass_dirr  )resource_pathresource_filecombined_hashr~   r~   r   cutlass_key  s   
Lr  c                   @  sV   e Zd ZdZdddZddd	Zdd
dZdddZdddZdddZ	dddZ
dS )
DLLWrapperz A wrapper for a dynamic library.lib_pathrx   rw   r   c                 C  s"   || _ d| _t|| _d| _d S )NFT)r  is_openr   r  DLL)r   r  r~   r~   r   r     s   
zDLLWrapper.__init__c                 C  s   | j r|   d| _ d S d S )NF)r  _dlcloser   r~   r~   r   r-    s   
zDLLWrapper.closec                 C  s   d }t  rtd }t|dstd}t|dr|j}nt r-dd l}|jddd}|j}ntd|d ur`t  rDtg|_	|| j
j d S t r^dd l}ddlm} |jg|_	|| j
j d S d S td	 d S )
Ndlclosezlibc.sor   r  T)use_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)rF   r   ra  r  rG   r  FreeLibraryrx  r   r  r  _handler  HMODULEr  r  )r   	f_dlclosesymsr  r  r  r~   r~   r   r  !  s4   


zDLLWrapper._dlcloser   Callable[..., None]c                   s4   | j std| j t| j| d fdd}|S )	NzCannot use closed DLL library: rw  r   rw   r   c                    s     |  }|rt d j d S )NzError in function: )r   r   )rw  errmethodr~   r   _wrapped_funcJ  s   z-DLLWrapper.__getattr__.<locals>._wrapped_funcrw  r   rw   r   )r  r   r  r  r  )r   r   r  r~   r  r   __getattr__D  s
   zDLLWrapper.__getattr__r   c                 C  rY  r   r~   r   r~   r~   r   r)  Q  r\  zDLLWrapper.__enter__rw  r   c                 G     |    d S r   r-  )r   rw  r~   r~   r   r2  T  r  zDLLWrapper.__exit__c                 C  r  r   r  r   r~   r~   r   __del__W  r  zDLLWrapper.__del__N)r  rx   rw   r   r   )r   rx   rw   r  )rw   r   r  )r   r   r   r3  r   r-  r  r  r)  r2  r  r~   r~   r~   r   r    s    

	

#

r  r>  c                 C  s   | d S )z,
    standard format for the error path
    z.errorr~   )r>  r~   r~   r   binary_error_path[  r  r  c                   @  s   e Zd ZU dZejG dd dZi Zded< g Z	ded< dZ
d	ed
< dZd	ed< ed8ddZeeddd9ddZed:ddZe	d;d<d d!Zed=d"d#Zeedd>d&d'Ze	d;d?d*d+Zed@d-d.Ze	d;dAd6d7ZdS )BCUTLASSCodeCacheaB  
    A cache for managing the compilation and loading source code specifically for CUTLASS.
    This class handles writing source code to files, compiling them into shared objects, and caching
    the results to avoid redundant compilations. It also manages error handling and logging for the
    compilation process.
    c                   @  s*   e Zd ZU ded< ded< dZded< dS )zCUTLASSCodeCache.CacheEntryrx   
input_pathr>  Nr4  
error_json)r   r   r   rW  r  r~   r~   r~   r   
CacheEntryk  s   
 r  dict[str, CacheEntry]r   r  r  r   rx   _SOURCE_CODE_SUFFIX_BACKENDrw   r   c                 C  s   | j   | j  d S r   )r   r  r  rH  r~   r~   r   rL  w     
zCUTLASSCodeCache.cache_clearr   )maxsizecaching_enabledr   caching_availablerX  c                 C  sN   | s	t d dS |sdS z
ddlm} | W S  ty&   t d Y dS w )ad  
        Get or create the class instance of the CUTLASSKernelBinaryRemoteCache.

        Args:
            caching_enabled: Whether binary remote caching is enabled
            caching_available: Whether we're in fbcode environment

        Returns:
            CUTLASSKernelBinaryRemoteCache: The class instance of the kernel binary remote cache
        z6CUTLASSKernelBinaryRemoteCache not requested, skippingNr   )CUTLASSKernelBinaryRemoteCachezECUTLASSKernelBinaryRemoteCache not available, remote caching disabled)r  r`  -torch._inductor.fb.kernel_binary_remote_cacher  r  )r  r  r  r~   r~   r   get_kernel_binary_remote_cache|  s   
z/CUTLASSCodeCache.get_kernel_binary_remote_cachec                 C     t r   rw  rH  r~   r~   r   _use_re_build     zCUTLASSCodeCache._use_re_buildN	src_filesdst_filedst_file_ext
extra_argsOptional[list[str]]c                 C  r  r   rw  r  r  r  r  r  r~   r~   r   _compile_command  s   z!CUTLASSCodeCache._compile_commandc                 C  r  r   rw  rH  r~   r~   r   _source_code_extra  r  z#CUTLASSCodeCache._source_code_extrar  r5  c                 C  sF   t jjrt| dgd|}|}n|  }t|| j|d\}}||fS z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        dummy_inputdummy_outputr  )r"   r  cutlass_hash_with_compile_cmdr  r  r  r8  r  )r  r  r  rD  r	  r   r  r~   r~   r   r8    s   zCUTLASSCodeCache.writer  r  c                 C  s  |dkr|  |d|\}}}| ||\}}|gd}}	n| ||\}}|gd}}	|| }
|
| jvrddlm} t }|tj||d t	d}|u |d	t
| j  | }t|}| jtjjohtj t d
}|d	urx||| tj|rt|dd}| }W d	   n1 sw   Y  t|\}}|d	urtjjr||tjj | |||| j|
< t||tj|s| ||||}t|d}|d |d| j  d|	 d| d W d	   n1 sw   Y  t! }t"#d| j |	| |$d}z&| % r$ddl&m'} ||tj(|tj)| n
t*j+|t*j,tj-d W nN t*j.yS } z| /|j01d|
|||| t||j0|d	}~w t2y} } zdt3|v rw| /t3||
|||| t|t3|||d	}~ww t! }| j  d|	 d||  d| }t"4| n	t"#d| j |	| |d	urtjjr||tjj | ||d	| j|
< W d	   n	1 sw   Y  | j|
 }|j5d	urt|j5\}}t||6d| j|
 j7||fS )z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        If dst_file_ext is "so", first compiles to ".o" and then links to ".so".
        Returns a tuple of dst_file_path, hash_key, source_code_path
        sorS  LinkingCompilationr   r  r  r  N)r  r  r   rA  r  r_  z// r  z cmd
// z	%s %s: %s)run_build_command)r#  envzCOMPILE FAILED WITHz took z seconds. Command: z-%s %s skipped: %s since output already exists)8ru  r8  r   r  r  r   r   r   r   r  r  r  r  r  r"   r  use_binary_remote_cacheforce_disable_cachesr  r   r   r   r  r   r  upload_to_binary_remote_cacher  binary_remote_cache_force_writer  r#   CUDACompileErrorr  r  r   r  r`  r3  r  triton.fb.re_build_helperr  r  r  r  r<  STDOUTr  r   _record_compile_erroroutputr  r  rx   rm  r  r   r>  )r  r  r  r  obj_pathr   r   r  r  operation_namekey_with_extr  r   r:  r>  
error_pathbinary_remote_cachefhr  	cmd_partserror_outputre  rQ  
start_timer  errorend_timelog_duration_msgcache_entryr~   r~   r   ru    s   	


"





 

bzCUTLASSCodeCache.compiletuple[DLLWrapper, str, str]c                 C  <   |dkrt d| d| | ||\}}}t|||fS z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   ru  r  r  r  r  dst_file_pathr   source_code_pathr~   r~   r   r   D     
zCUTLASSCodeCache.load	error_strr  r  r  r>  r  r   c           
      C  s   t ||g}| |||| j|< t|}t|ddd}	|	| W d    n1 s,w   Y  |d urCtjj	rE|
|tjj d S d S d S )Nr   r   rA  )r   r   r  r   r  r   r8  r"   r  r  r  r  )
r  r  r  r  r  r>  r  r  r  r  r~   r~   r   r  U  s   z&CUTLASSCodeCache._record_compile_errorr   )r  r   r  r   rw   rX  rw   r   r   
r  r  r  rx   r  rx   r  r  rw   rx   r  r  rx   r  rx   rw   r5  r  rx   r  rx   r  r  rw   r  r  rx   r  rx   rw   r  )r  rx   r  rx   r  r  r  rx   r>  rx   r  r   rw   r   )r   r   r   r3  rb  	dataclassr  r   rW  r  r  r  r  rL  r   r	   r  r  r  r  r8  ru  r   r  r~   r~   r~   r   r  c  s@   
 	 
r  c                   @  sB   e Zd ZdZdZedddZe	ddddZedddZdS )r  cuCUDArw   r   c                 C  s   t  S r   )r  use_re_buildrH  r~   r~   r   r  v  rz  zCUDACodeCache._use_re_buildNr  r  r  rx   r  r  r  c                 C  s   t j||||dS )N)r  )r  cuda_compile_commandr  r~   r~   r   r  z  s   zCUDACodeCache._compile_commandc                 C  s"   t t t t t g}|S r   )r  r  r  _nvcc_compiler_options_nvcc_host_compiler_optionsr  )r  r	  r~   r~   r   r    s   z CUDACodeCache._source_code_extrar  r   r  r  )	r   r   r   r  r  r  r  r  r  r~   r~   r~   r   r  q  s    r  c                   @  s~   e Zd ZU ejG dd dZi Zded< g Zded< dZ	dZ
edddZedddZe	ddddZed ddZdS )!r  c                   @  s   e Zd ZU ded< ded< dS )zROCmCodeCache.CacheEntryrx   r  r>  N)r   r   r   rW  r~   r~   r~   r   r    s   
 r  r  r   r  r  rb  Frw   r   c                   C  s   t j  t j  d S r   )r  r   r  r  r~   r~   r~   r   rL    r  zROCmCodeCache.cache_clearr  rx   r  r5  c                 C  s.   t tdgd|}t|| j|d\}}||fS r  )r  r)   r8  r  )r  r  r  cuda_commandr   r  r~   r~   r   r8    s   
zROCmCodeCache.writeNr  r  r  c                 C  sx  | j sd| _ tttt  | ||\}}|| jvrddlm	} t
 }|tj||d td}|v |dt| j  | }	tj|	st|g|	||}
t }|
d}ztj|tjdtjd}td	| W n tjy } zt||j|d}~ww t }d
||  d|
 }t| ntd||	 t||	| j|< W d   n1 sw   Y  | j| j ||fS )z
        Compiles source_code into a file with dst_file_ext extension,
        using the compile command specific for the ROCm platform.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        Tr   r  r  r  Nr  )r#  r9  r  zCompilation output: %szCompilation took z seconds. Compile command: z+Skip compiling %s: output %s already exists)!_logged_compiler_versionr  r`  r3   rx   r*   r8  r   r  r  r   r   r   r   r  r  r  r   r)   r   r3  r  r<  r  r  r   r#   r  r  rm  r  r  r>  )r  r  r  r  r   r  r  r   r:  r>  re  r  r  r  r  r  r  r~   r~   r   ru    sN   	


zROCmCodeCache.compiler  c                 C  r  r  r  r  r~   r~   r   r     r  zROCmCodeCache.loadr   r   r   r  r  )r   r   r   rb  r  r  r   rW  r  r  r  r   rL  r  r8  ru  r   r~   r~   r~   r   r    s    
 1r  c                   @  rt  )CodeCacheFuturerw   Callable[..., Any]c                 C  r  r   rw  r   r~   r~   r   r    r\  zCodeCacheFuture.resultNrw   r  )r   r   r   r  r~   r~   r~   r   r    rw  r  c                   @  s$   e Zd Z	dddd	Zdd
dZdS )LambdaFutureN	result_fnr  r  Future[Any] | Nonerw   r   c                 C  s   || _ || _d S r   )r  r  )r   r  r  r~   r~   r   r     s   
zLambdaFuture.__init__c                 C  s   |   S r   )r  r   r~   r~   r   r    s   zLambdaFuture.resultr   )r  r  r  r  rw   r   r  )r   r   r   r   r  r~   r~   r~   r   r    s    r  c                   @  s$   e Zd ZdZdddZddd	Zd
S )StaticAutotunerFuturezM
    A statically launchable CachingAutotuner, loaded from TritonBundler
    static_autotunerrr   rw   r   c                 C  s   || _ d | _d S r   )r  reload_kernel_from_src)r   r  r~   r~   r   r     s   
zStaticAutotunerFuture.__init__c                 C  sd   | j d usJ td | jj| j d | jjd| j d d | jW  d    S 1 s+w   Y  d S )Nz%StaticAutotunerFuture.warm_precompile)r  F)warm_cache_onlyreload_kernelstatic_triton_bundle_key)r  r    r  recheck_autotune_cache
precompiler   r~   r~   r   r    s   
$zStaticAutotunerFuture.resultN)r  rr   rw   r   )rw   rr   )r   r   r   r3  r   r  r~   r~   r~   r   r    s    
	r  r  )r   rx   rw   rx   )r   rx   rw   r   )r   r   rw   rx   r  )r  r  r	  r  rw   rx   )r  rx   r  rx   r  rx   rw   r  )r   r  )r  r  r	  rx   r  rx   rw   rx   )r   r  r   N)r  r  r  rx   r	  rx   r  rx   r  rx   r   r4  rw   r5  )r9  rx   rw   rx   )FF)
r=  rx   r  r  r   r   r>  r   rw   r   )rX  ra   rw   ra   )r]  r   rw   rM   )r  r  r  rx   r  r  rw   r   )r  r  rw   r  r  )
rg  rh  r  r  r  rg   r  r  rw   r^  )re  rf  rw   rf  )r   rx   rw   r5  )r   rx   rw  r   rw   rx  )r  rx   r  rx   rD  r   rw   rx   r  r  )r   rx   rV  r   rw   rx   )r  rx   r  r  rw   r   )r\  rx   r^  rp  rw   r   )r  rx   rw   r   )ra  rx   r  rx   rw   rr   )r>  rx   rw   rx   (  
__future__r   r  rn  rb  r   r   r  importlib.resourcesrj  r"  r   loggingr   r  r  r   r  rf  rL  r  r  r   r&  r  rH  r  bisectr   r   r  r   r   r   datetimer   r	   r
   pathlibr   r   r   r   typesr   typingr   r   r   r   r   r   r   r   typing_extensionsr   r   r{   torch._library.opaque_objectr  r  torch.distributedri  rn  r   r   torch._dynamo.device_interfacer   torch._dynamo.excr   torch._dynamo.utilsr   r   r    r!   torch._inductorr"   r#   r$   torch._inductor.codegen.commonr%   r&   r'   torch._inductor.codegen.cudar(   r  ,torch._inductor.codegen.rocm.compile_commandr)   r*   $torch._inductor.compile_worker.utilsr+   torch._inductor.cpp_builderr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   torch._inductor.cpu_vec_isar8   !torch._inductor.custom_graph_passr9   r:   r;   r<   r=   torch._inductor.freezing_utilsr>   r?   %torch._inductor.runtime.compile_tasksr@   %torch._inductor.runtime.runtime_utilsrA   rB   torch._inductor.utilsrC   rD   rE   rF   rG   rH   "torch._library.fake_class_registryrI   torch._loggingrJ   torch._subclasses.fake_tensorrK   rL   rM   torch._utils_internalrN   torch.compilerr  torch.compiler._cacherO   rP   rQ   )torch.export.pt2_archive._package_weightsrR   rS   "torch.export.pt2_archive.constantsrT   %torch.fx.experimental.symbolic_shapesrU   rV   rW   torch.utils._ordered_setrX   ru   r[   r  r\   runtimer]   runtime.autotune_cacher^   triton_bundlerr_   virtualizedr`   ra   collections.abcrb   rc   rd   re   concurrent.futuresrf   r  rg   r  rh   r  rj   irrk   rl   rm   rn   ro   runtime.hintsrp   rq   runtime.triton_heuristicsrr   r1  rs   r0  file_lock_timeoutr  _logginggetArtifactLoggerr   r  autotuning_log	getLoggerr  r   r   r   r   r   r   r   r  r  r  r  r  r8  r<  r   r  rR  r[  re  Picklerrf  r  r  r   r  r  r  r  r  rd  rs  rt  registerr  rO  r   r4  r5  rM  rv  rW  r  r   r   r  r  r  r  r  r  r  r  r  r  rW  rV  r  r  r  r  r  r  r  r  r  r  r  r~   r~   r~   r   <module>   s|   (8 




	A
7
	(

 M
!	 M    vY      |4? J BD  j*gK  &c