o
    ki                     @   sp  U d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZmZ ddlZddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$ er}ddl%m&Z& ddl'm(Z( e)e*Z+ddgZ,dZ-g a.e/e	 e0d< g a1e/e	 e0d< g a2e/e	 e0d< da3da4da5da6da7de8fddZ9de8fddZ:de8fddZ;de8fddZ<de8fdd Z=G d!d" d"Z>d[de8fd#d$Z?d[de8fd%d&Z@d\d'ejAd(eBdejAeCB fd)d*ZDd+d, ZEd\d'ejAd(eBdejAeFB fd-d.ZGde8fd/d0ZHd1e8de8dB fd2d3ZIde8dB fd4d5ZJde8fd6d7ZKdaLd8d9 ZMG d:d; d;ZNG d<d= d=eNZOG d>d? d?eNZPG d@dA dAeNZQG dBdC dCeNZRG dDdE dEeNZSdFdG ZTdHeNddfdIdJZUdHeNddfdKdLZVdHeNde8fdMdNZWejXjYdOdPdQdRe8ddfdSdTZZeZj[dRe8ddfdUdVZ\G dWdX dXej]j^Z_G dYd deZ`de`dB fdZdZadS )]a  
DebugMode is a debugging TorchDispatchMode that intercepts and logs runtime calls
to a hierarchical string dump. It logs real tensor, DTensor, and optionally FakeTensor
operations, with some additional handling for DTensor internals.

An example dump from an eager mode DTensor matmul:

    torch.mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0))  ->  dt$6: f32[8, 32]| S(0)
      aten::mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0))
        redistribute_input(1, S(0) -> R)
          redistribute_input(t$2: f32[1, 32], trace: S(0)->R)
            _c10d_functional::all_gather_into_tensor(t$2: f32[1, 32], 8, 0)  ->  t$3: f32[8, 32]
            _c10d_functional::wait_tensor(t$3: f32[8, 32])  ->  t$3: f32[8, 32]
        aten::mm(t$4: f32[1, 8], t$3: f32[8, 32])  ->  t$5: f32[1, 32]

This mode runs "under" compile, which means it hides itself during compilation, and is re-enabled
at runtime, and DebugMode-related operations won't show up in the compiled region.
DebugMode also provides some visibility into non-torch-dispatch calls (e.g. DTensor redistribute calls,
inductor-generated triton kernels), but requires special handling for these, since dispatch modes
can't intercept them by default.

The mode also provides some extensions for custom debugging (e.g. adding custom dispatch call hooks
via dispatch_hooks), or numerics debugging (e.g. tensor hashing for bitwise equivalence/closeness,
via log_tensor_hashes). These decorators allow annotating string dumps with additional per-call information,
for any region of runtime code.

Usage::

    with DebugMode() as debug_mode:
        result = some_pytorch_operation(tensor_input)
    print(debug_mode.debug_string())
    N)Callable)AnyTYPE_CHECKING)warning_once)
FakeTensorFakeTensorMode)_parse_stack_trace)dtype_abbrs)_get_current_dispatch_mode _get_current_dispatch_mode_stackTorchDispatchMode)keystrtree_alltree_maptree_map_onlytree_map_with_path)CapturedTraceback)	WeakIdRef)DeviceInterface
ModTracker	DebugModeget_active_debug_moderedistribute_input_DISPATCH_RECORD_HOOKS_DISPATCH_LOG_HOOKS_DISPATCH_PRE_LOG_HOOKSFreturnc                 C      dd dd | D  dS )N[, c                 S      g | ]}t |qS  str).0xr"   r"   a/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/utils/_debug_mode.py
<listcomp>`       z$_stringify_shape.<locals>.<listcomp>]join)shaper"   r"   r'   _stringify_shape_      r.   c                 C   s   dd dd | jD  dS )NzDM(r    c                 S   r!   r"   r#   )r%   sr"   r"   r'   r(   d   r)   z*_stringify_device_mesh.<locals>.<listcomp>))r,   r-   )meshr"   r"   r'   _stringify_device_meshc   s   r3   c                 C   r   )Nr   r    c                 S   r!   r"   r#   )r%   pr"   r"   r'   r(   h   r)   z(_stringify_placement.<locals>.<listcomp>r*   r+   )	placementr"   r"   r'   _stringify_placementg   r/   r6   c                 C   sV   i }|D ]}t | |rt| |||< qt|dkrdS dddd | D  dS )Nr    {r    c                 S   s   g | ]\}}| d | qS )=r"   r%   kvr"   r"   r'   r(   r   s    z)_stringify_attributes.<locals>.<listcomp>})hasattrgetattrlenr,   items)tensor
attributespairsattrr"   r"   r'   _stringify_attributesk   s   
 rF   c                 C   s   ddl m} || j| jS )Nr   DTensorSpec)&torch.distributed.tensor._dtensor_specrH   format_shard_order_str
placementsshard_order)specrH   r"   r"   r'   _stringify_dtensor_specu   s   rN   c                   @   s$   e Zd ZdddZdefddZdS )TensorIdTrackerr   Nc                 C   s   i | _ d| _d S Nr   )tensor_memonext_tensor_idselfr"   r"   r'   __init__|   s   
zTensorIdTracker.__init__c                    s   t j 1 t| d fdd}t||  jvr*jj <  jd7  _j  W  d    S 1 s9w   Y  d S )Nr   c                      s   j  d  d S N)rQ   popr"   orT   r"   r'   del_memo   s   z%TensorIdTracker._id.<locals>.del_memo   r   N)torch_C_DisablePythonDispatcherr   weakreffinalizerQ   rR   )rT   rB   rZ   r"   rX   r'   _id   s   
$zTensorIdTracker._idr\   )__name__
__module____qualname__rU   intrb   r"   r"   r"   r'   rO   {   s    
rO   c                 C   s   t | tjrPt| j  t| j t| | }|dur#d||  nd}t | tj	j
jr;d| d| dt| j S t | trHd| d| S d| d| S td	t|  )
z.Convert tensor to debug string representation.N$r7   dt: z| fttzUnsupported tensor type: )
isinstancer]   Tensorr	   dtyper.   r-   rF   rb   distributedrB   DTensorrN   _specr   RuntimeErrortype)rB   rC   rQ   tensor_debug_strid_strr"   r"   r'   _tensor_debug_string   s   "
rv   c                    s.   ddl m   fdd}t|| } t| S )Nr   rG   c                    s.   t | tjrt| S t |  rt| S | S rV   )rl   r]   rm   rv   rN   r&   rH   rC   rQ   r"   r'   to_str   s
   
z_arg_to_str.<locals>.to_str)rI   rH   r   r$   )argrC   rQ   ry   r"   rx   r'   _arg_to_str   s   
r{   rk   
use_scalarc                 C   s   t j @ |  s|  s|  } |  } |  r"| jt jd}n| jt j	d}|j
dd}|r<| W  d   S |W  d   S 1 sHw   Y  dS )a.  
    from Observer. Computes a hash for a tensor by converting it to float (if needed), making it contiguous,
    replacing NaN/inf values with fixed numbers, and then computing the L1 norm in float64 or complex128.
    This is used to generate a deterministic summary value for tensor comparison.
    rn   r[   )r4   N)r]   r^   r_   is_floating_point
is_complexfloat
contiguousto
complex128float64normitem)rk   r|   t_floatoutr"   r"   r'   norm_hash_fn   s   $r   c                 C   s(   t | | }tt | t |d}|| S )Ng|=)absmax)hash1hash2	numeratordenominatorr"   r"   r'   _compute_rel_diff   s   r   c                 C   s   t | tjjjr|  } |  r| jtjd}n| 	 r(| jtj
dtj}n| jtjd}|  dkr;t|}n
tjd|jtjd}|rK| S |S )z(
    wrapper over torch.hash_tensor
    r}   r   r"   )devicern   )rl   r]   ro   rB   rp   to_localr~   r   r   r   r   viewint64numelhash_tensorzerosr   uint64r   )rk   r|   t_cleanr   r"   r"   r'   hash_tensor_fn   s   r   c                     sP   ddl m  t  } | d d }  fdd| D } tj| } d| 	 S )Nr   uninteresting_filesc                    s   g | ]
}|j   vr|qS r"   )filename)r%   framer   r"   r'   r(      s    z$_get_stack_trace.<locals>.<listcomp>r7   )
%torch.fx.experimental.symbolic_shapesr   r   extractsummary	tracebackStackSummary	from_listr,   format)r   r"   r   r'   _get_stack_trace   s   
r   stack_trace_strc                    sV   t jtt  fdd}t| |d}|r)d|j d|j d|j	 d|j
 S d S )Nc                    s   |   tjj  S rV   )
startswithospathsep)filenamecode	torch_dirr"   r'   <lambda>   r)   z'_get_user_stack_trace.<locals>.<lambda>)	filter_fnzFile: :z in z, code: )r   r   dirnameinspectgetfiler]   r   r   linenor   r   )r   r   tracer"   r   r'   _get_user_stack_trace   s   $r   c                  C   s2   t j d urt j jd} | rd| S d S )N
traceback_r7   )r]   r^   _current_autograd_nodemetadatagetr,   )tbr"   r"   r'   _maybe_get_autograd_trace   s
   
r   c                 C   sN   t | tjjr| j}|S t| dr!t| dr!| j d| j }|S t| }|S )Nrd   rc   .)	rl   r]   _ops
OpOverloadre   r>   rd   rc   r$   )opop_namer"   r"   r'   _get_op_name   s   r   c                  C   sj   t s3tjtjt_ddlm}  | tj	j
jj ddlm} |tj	j
jdtddfdd}d	a dS dS )
zz
    Lazily apply dont_skip_tracing decorator to DebugMode._annotate, to avoid circular import/initialization issues.
    r   )_side_effectful_functions)register_loweringtagr   Nc                 S   s   t td d S )Nz7DebugMode._annotate() is a no-op for backend="inductor")r   logr   r"   r"   r'   _annotate_lowering  s   
z6_ensure_annotate_decorated.<locals>._annotate_loweringT)_annotate_decoratedr]   _dynamodont_skip_tracingr   	_annotatetorch.fx.noder   addopsdebug_mode_opsannotatedefaulttorch._inductor.loweringr   r$   )r   r   r   r"   r"   r'   _ensure_annotate_decorated  s   r   c                   @   s   e Zd ZdZ			ddedeeef dB deeef dB deddf
d	d
Z		dde
e dedB ddfddZ	ddede
e dedB ddfddZde
e defddZdefddZdS )
_DebugCallz3Base class for tracking operator calls in DebugModeNF
call_depthrecordr   stackr   c                 C   s0   || _ |rt | _t | _|| _|| _d | _d S rV   )r   r   stack_tracer   fwd_stack_tracer   r   
output_str)rT   r   r   r   r   r"   r"   r'   rU   (  s   
z_DebugCall.__init__rC   rQ   c                 C      t d)z
        To reduce memory consumption, this method stringifies args/kwargs, stores the result, and deletes original args/kwargs.
        z9Subclasses must implement stringify_args(), even if no-opNotImplementedErrorrT   rC   rQ   r"   r"   r'   stringify_args9  s   z_DebugCall.stringify_argsoutputc                    s:   t dd |r	dS t fdd|}dt| | _dS )z;Store stringified version of call output in self.output_strc                 S      | d u S rV   r"   rw   r"   r"   r'   r   J      z-_DebugCall.stringify_output.<locals>.<lambda>Nc                    s   t |  S rV   r{   rw   rC   rQ   r"   r'   r   L  s    z  ->  )r   r   r$   r   )rT   r   rC   rQ   r   r"   r   r'   stringify_outputC  s   z_DebugCall.stringify_outputc                 C   r   )Nz)Subclasses must implement string render()r   rT   rC   r"   r"   r'   renderO  s   z_DebugCall.renderc                 C   s
   |  g S rV   )r   rS   r"   r"   r'   __repr__R  s   
z_DebugCall.__repr__)NNFrV   )rc   rd   re   __doc__rf   dictr$   r   boolrU   listrO   r   r   r   r   r"   r"   r"   r'   r   %  sH    


r   c                       sz   e Zd ZdZ	ddededededdf
 fd	d
Z	dde	e
 dedB ddfddZde	e
 de
fddZdd Z  ZS )_OpCallzNormal operator callFargskwargsr   r   r   Nc                    s2   t  j||d || _|| _|| _d | _d | _d S Nr   )superrU   r   r   r   args_str
kwargs_str)rT   r   r   r   r   r   	__class__r"   r'   rU   Y  s   
z_OpCall.__init__rC   rQ   c                    s^   d  fdd| jD | _| jr&dd  fdd| j D  | _nd| _| `| `d S )Nr    c                 3   s    | ]	}t | V  qd S rV   r   r%   rz   r   r"   r'   	<genexpr>l  s    
z)_OpCall.stringify_args.<locals>.<genexpr>c                 3   *    | ]\}}| d t |  V  qdS r9   Nr   r:   r   r"   r'   r   p  
    
r7   )r,   r   r   r   rA   r   r   r"   r   r'   r   i  s   
z_OpCall.stringify_argsc                    s   | j d ur	| j }nd fdd| jD }| jd ur| j}n| jr4dd fdd| j D  }nd}t| jtj	j
rC| jj}nt| jdr[t| jdr[| jj d| jj }nt| j}| d	| | d
}| jrr|| j7 }| jr}|d| j 7 }|S )Nr    c                 3   s    | ]}t | V  qd S rV   r   r   rC   r"   r'   r   }  s    z!_OpCall.render.<locals>.<genexpr>c                 3   s(    | ]\}}| d t |  V  qdS r   r   r:   r  r"   r'   r     s    
r7   rd   rc   r   (r1   z  # )r   r,   r   r   r   rA   rl   r   r]   r   r   re   r>   rd   rc   r$   r   r   )rT   rC   r   r   r   base_strr"   r  r'   r   y  s*   




z_OpCall.renderc                 c   sH    | j d ur| j| j | j| jgE d H  d S | j| j| j| jgE d H  d S rV   )r   r   r   r   r   r   rS   r"   r"   r'   __iter__  s   
z_OpCall.__iter__FrV   )rc   rd   re   r   tupler   rf   r   rU   r   r$   rO   r   r   r  __classcell__r"   r"   r   r'   r   V  s0    
r   c                       sf   e Zd Z		d	d fddZ	ddee dedB ddfdd	Zdee defd
dZdd Z	  Z
S )_RedistributeCallFr   Nc                    sD   t  j||d || _|| _|| _|| _|| _t|t| _	d | _
d S r   )r   rU   rz   src_placementdst_placementtransform_info_stris_explicitrl   rf   is_outer_callarg_str)rT   rz   r	  r
  r  r   r   r  r   r"   r'   rU     s   

z_RedistributeCall.__init__rC   rQ   c                 C   s   t | j|| | _| `d S rV   )r{   rz   r  r   r"   r"   r'   r     s   z _RedistributeCall.stringify_argsc                 C   s   | j d ur	| j }nt| j| }| jd urd| j }nt| j|}t| j|}| d| }| jr5d}n| jr;d}nd}t | d| d| d}| j	rR|| j	7 }|S )	Nztrace: z -> z [implicit] z [explicit] r7   r  r    r1   )
r  r{   rz   r  r	  r
  r  r  REDISTRIBUTE_FUNCr   )rT   rC   r  placement_strsrc_placement_strdst_placement_str
annotationr  r"   r"   r'   r     s"   


z_RedistributeCall.renderc                 c   sV    | j d ur
| j }n| j}tV  | jr|| jgV  n|| j| jgV  i V  | jV  d S rV   )r  rz   r  r  r	  r
  r   )rT   rz   r"   r"   r'   r    s   
z_RedistributeCall.__iter__)FFr\   rV   )rc   rd   re   rU   r   r$   rO   r   r   r  r  r"   r"   r   r'   r    s     	
r  c                       sf   e Zd ZdZdededdf fddZ	ddee d	edB ddfd
dZ	dee defddZ
  ZS )_OutputPlacementCallz*Records output placement for a DTensor op.placements_strr   r   Nc                    s   t  | || _d S rV   )r   rU   r  )rT   r  r   r   r"   r'   rU     s   
z_OutputPlacementCall.__init__rC   rQ   c                 C      d S rV   r"   r   r"   r"   r'   r        z#_OutputPlacementCall.stringify_argsc                 C   s   d| j  S )Nz-> output: )r  r   r"   r"   r'   r        z_OutputPlacementCall.renderrV   )rc   rd   re   r   r$   rf   rU   r   rO   r   r   r  r"   r"   r   r'   r    s    
r  c                       s   e Zd ZdZdedeeef def fddZ	dde	e d	e
dB d
dfddZde	e d
efddZdddZdd Z  ZS )_TritonKernelCallz Triton kernel call from Inductorkernel_namer   r   c                    s.   t  | || _|| _d | _d | _d | _d S rV   )r   rU   r  r   r   
pre_hashespost_hashes)rT   r  r   r   r   r"   r'   rU     s   
z_TritonKernelCall.__init__NrC   rQ   r   c                    sX   t  rfdd| j D | _| jr'd fdd| j D | _d S d| _d S )Nc                    &   i | ]\}}t |tjr| |qS r"   rl   r]   rm   r:   hash_fnr"   r'   
<dictcomp>      
z4_TritonKernelCall.stringify_args.<locals>.<dictcomp>r    c                 3   r   r   r   r:   r   r"   r'   r     r   z3_TritonKernelCall.stringify_args.<locals>.<genexpr>r7   )_TRITON_INPUT_HASH_FNr   rA   r  r,   r   r   r"   )rC   r   rQ   r'   r     s   

z _TritonKernelCall.stringify_argsc                 C   s   d| j  d| j d}| jr)ddd | j D }dd| j  d	| d
 }nd}| jrIddd | j D }dd| j  d| d
 }nd}| | | dS )Nz	[triton] r  r1   r    c                 s   "    | ]\}}| d | V  qdS ri   Nr"   r:   r"   r"   r'   r     s     z+_TritonKernelCall.render.<locals>.<genexpr>z
    z# pre-kernel hashes: {r=   r7   c                 s   r$  r%  r"   r:   r"   r"   r'   r   (  s    
z# post-kernel hashes: {
)r  r   r  r,   rA   r   r  )rT   rC   r  pre_hashes_strpost_hashes_strr"   r"   r'   r     s,   


z_TritonKernelCall.renderdevice_interfacer   c                    sV   | |  trddd | j D i| _t  r' fdd| j D | _| `d S )Nr   c                 S   s*   i | ]\}}|t |tjr| n|qS r"   rl   r]   rm   cloner:   r"   r"   r'   r!  :  s    z._TritonKernelCall.finalize.<locals>.<dictcomp>c                    r  r"   r  r:   r  r"   r'   r!  @  r"  )synchronizecurrent_device_RECORD_TRITON_OUTPUTSr   rA   r   _TRITON_OUTPUT_HASH_FNr  )rT   r*  r"   r  r'   ra   4  s   
z_TritonKernelCall.finalizec                 c   s    | j d| j| jgE d H  d S )Nr"   )r  r   r   rS   r"   r"   r'   r  I  s   z_TritonKernelCall.__iter__rV   )r*  r   )rc   rd   re   r   r$   r   r   rf   rU   r   rO   r   r   ra   r  r  r"   r"   r   r'   r    s(    


r  c                       sX   e Zd ZdZ	ddededededdf
 fd	d
Zde	e defddZ
dd Z  ZS )_AnnotateCallzCustom annotation callFr   headerr   r   r   Nc                    s    t  j||d || _|| _d S r   )r   rU   r   r2  )rT   r   r2  r   r   r   r"   r'   rU   P  s   
z_AnnotateCall.__init__rC   c                 C   s   d| j  d| j S )Nr   ] )r2  r   r   r"   r"   r'   r   W  s   z_AnnotateCall.renderc                 c   s*    d| j  d| j di | jgE d H  d S )Nr   r3  r"   )r2  r   r   rS   r"   r"   r'   r  Z  s   z_AnnotateCall.__iter__r  )rc   rd   re   r   r   r$   rf   r   rU   r   r   r  r  r"   r"   r   r'   r1  M  s     r1  c                 G   s2   | | }|d urt |tstdt|j |S )Nz#hook must return None or dict, got )rl   r   AssertionErrorrs   rc   )hookr   r   r"   r"   r'   	_run_hookc  s   r6  callc                 C   sL   t r"t D ]}t|||||| }|d ur!| jd u ri | _| j| qd S d S rV   )r   r6  r   update)r7  functypesr   r   r5  hook_outr"   r"   r'   _run_dispatch_pre_log_hooksj  s   
r<  c           	      C   s   t r i }t D ]}t||||||}|d ur|| q|r || _trB| jd u r*i | _tD ]}t||||||}|d urA| j| q,d S d S rV   )r   r6  r8  r   r   r   )	r7  r9  r:  r   r   resultr   r5  r;  r"   r"   r'   _run_dispatch_hooksv  s&   

r>  c                 C   sX   t | tr
t| jS t | tr| jS t | tr!d| j d| j S t | t	r(t
S t| S )z>String identifying _DebugCall (e.g. func, kernel, module name)r   r3  )rl   r   r   r   r  r  r1  r2  r   r  r  r$   )r7  r"   r"   r'   _get_call_name  s   




r?  zdebug_mode_ops::annotater"   )mutates_argsr   c                 C   r  rV   r"   r   r"   r"   r'   r     r  r   c                 C   r  rV   r"   r   r"   r"   r'   _annotate_fake     rA  c                       sF   e Zd ZdZ fddZdejjdef fddZ	 fdd	Z
  ZS )
DebugInterpretera|  
    Interpreter class for running aot_eager compiled regions when DebugMode is active,
    instead of using the compiled code. This gives us access to fx.Node metadata to decorate
    and contextualize DebugMode logs (e.g. nn_module_stack, stack_trace, compiled region boundaries).

    Note: this is currently only enabled with DebugMode(run_compile_with_interpreter=True).
    c                    s`   t  | t | _| jd u rtdt| jj| _|| _| jj	
td| j d| jj d S )N No DebugMode is currently activeenter region (compile))r   rU   r   moderr   r   current_nn_module_stackbase_nn_module_stackbackend	operatorsappendr1  r   )rT   modulerJ  r   r"   r'   rU     s   
zDebugInterpreter.__init__nr   c                    s   | j d u r	td| j jr%|jdvr%| j | j|jdi |jdi  | j jrW|jdvrW|jdd  }d urW| j 	| t
 |W  d    S 1 sPw   Y  d S t
 |S )NrD  )placeholderr   nn_module_stackfwd_nn_module_stackr   )rG  rr   record_nn_moduler   _handle_fx_nn_module_stackrI  metar   record_stack_traceset_fx_stack_tracer   run_node)rT   rN  r   r   r"   r'   rW    s    


$zDebugInterpreter.run_nodec                    s   | j d u r	tdt j| }t| j jt| jk rttd t| j jt| jkr:| j 	  t| j jt| jks*| j j
td| j d| j j |S )NrD  z:unexpected handling of nn_module_stack in DebugInterpreterexitrF  )rG  rr   r   runr@   rH  rI  r   r   _exit_nn_module_callrK  rL  r1  rJ  r   )rT   r   r   r=  r   r"   r'   rY    s    

zDebugInterpreter.run)rc   rd   re   r   rU   r]   fxNoder   rW  rY  r  r"   r"   r   r'   rC    s
    rC  c                       sV  e Zd Zddddddddddddd	dS fddZdSdd	ZdSd
dZedefddZdSddZ	dSddZ
dTddZdd Zdd ZdTddZ fddZ fddZejdd  Zd!d" Zd#d$ ZdSd%d&Zd'ee d(eeeeef f dB d)eeeeef f dB ddfd*d+Zej		dUd,edB d-efd.d/ZdSd0d1Zd2ed3eeef defd4d5Z dVd6edB defd7d8Z!e"ej			dWd9e#dB d:e#dB d;e#dB fd<d=Z$e"ejd>d? Z%e"ej	dXdAe#eB ee B dBefdCdDZ&e"ejdEdF Z'e(dGdH Z)dIdJ Z*e"dKeddfdLdMZ+e"	dYdNedOedPedee fdQdRZ,  Z-S )Zr   FTN)record_torchfunctionrecord_faketensorrecord_realtensorrecord_tensor_attributesrR  store_original_argsrU  record_output
record_idsrecord_profiler_contextrecord_localtensorrun_compile_with_interpreterr   c                   s   t    dd l}t  d| _|| _|| _|| _|| _|pg | _	|| _
d | _| j
r-|   || _|| _|| _|	| _|
| _|| _|   d S )Nr   T)r   rU   torch.distributed.tensorr   supports_higher_order_operatorsr]  r^  r_  re  r`  rR  module_trackermodule_tracker_setupra  rU  rb  rc  rd  rf  reset)rT   r]  r^  r_  r`  rR  ra  rU  rb  rc  rd  re  rf  r]   r   r"   r'   rU     s(   

zDebugMode.__init__c                 C   s0   g | _ d| _t | _i | _d| _g | _d | _d S rP   )rK  r   rO   _tensor_memo_output_infoignored_record_functionsrH  fx_stack_tracerS   r"   r"   r'   rk  5  s   
zDebugMode.resetc                 C   s   || j |< dS )z5Assign IDs to output tensors and store in output_infoN)rm  )rT   op_indexr=  r"   r"   r'   _track_op_output>  s   zDebugMode._track_op_outputc                 C   s   dS )NTr"   )clsr"   r"   r'   ignore_compile_internalsE  rB  z"DebugMode.ignore_compile_internalsc                 C   s^   t rd S t|drd S | js|| j| jr| jnd  | jr'| j |_	|_
| j| d S )Nzprofiler::_record_function)_IN_INDUCTOR_BENCHMARKr$   r   ra  r   r`  rc  rl  ro  r   r   rK  rL  )rT   r7  r"   r"   r'   _record_callI  s   zDebugMode._record_callc                 C   s0   | j sd S ||| j| jr| j d S d  d S rV   )rb  r   r`  rc  rl  )rT   r7  r   r"   r"   r'   _record_call_outputZ  s   
zDebugMode._record_call_outputr"   c              	   C   sz   |d u ri }t |||| j| jd}| | z|  jd7  _||i |}| || |W |  jd8  _S |  jd8  _w )Nr   r[   )r   r   rU  ru  rv  )rT   r9  r:  r   r   r7  r=  r"   r"   r'   __torch_function__c  s   
 zDebugMode.__torch_function__c                    sZ   t  fdddD r|  jd7  _d S t d| j| jd}| j| |  jd7  _d S )Nc                 3   s    | ]}  |V  qd S rV   )r   )r%   prefixr   r"   r'   r   v  s
    
z3DebugMode._maybe_record_function.<locals>.<genexpr>)zCachingAutotuner.zInductorBenchmarker.zcompile_fx.<locals>.r[   zrecord functionr   )anyrn  r1  r   rU  rK  rL  rT   r   r7  r"   r   r'   _maybe_record_functiont  s   
z DebugMode._maybe_record_functionc                 C   sH   | j dk rtd| j  | j dkr|  j d8  _ d S |  jd8  _d S )Nr   z&ignored_record_functions is negative: r[   )rn  r4  r   rS   r"   r"   r'   _maybe_exit_record_function  s   


z%DebugMode._maybe_exit_record_functionc                 C   s  |d u ri }| j r4|tjjjjkr(t|dkr tdt| | |d  n|tjjj	j
kr4|   |tjjjju rTt|dkrKtdt| | |d  d S ddlm} d }tjjj|v rut|||| j| jd}| | tS t|v stt tr| jr|tjjjjkrt|||| jd | jd}| | n5||v r| j rt|||| jd | jd}| | nt|dkr| j!rt|||| jd | jd}| | |rt"||||| ||i |}|r| #|| t$|||||| |S )Nr[   zexpected 1 arg, got r   )LocalTensorr   )%rd  r]   r   profiler_record_function_enter_newr   r@   r4  r{  _record_function_exit_RecordFunctionr|  r   r   _handle_annotatetorch.distributed._local_tensorr}  ro   rB   rp   r   r   rU  ru  NotImplementedr   rl   r
   r   r^  primr   re  r_  r<  rv  r>  )rT   r9  r:  r   r   r}  r7  r=  r"   r"   r'   __torch_dispatch__  s   




zDebugMode.__torch_dispatch__c                    sZ   t d7 a | jrtj|  t   | jr| j  | j	r+tj
jddd| _| j  | S )Nr[   TF)	check_nan)_ACTIVE_DEBUG_MODE_COUNTr]  r]   r^   _push_on_torch_function_stackr   	__enter__rR  ri  rU  autogradset_detect_anomalyanomaly_for_tracesrS   r   r"   r'   r    s   


zDebugMode.__enter__c                    sN   t d8 a t j|  | jr| j  | jrtj  | j	r%| j
j|  d S d S Nr[   )r  r   __exit__rR  ri  r]  r]   r^   _pop_torch_function_stackrU  r  )rT   r   r   r"   r'   r    s   

zDebugMode.__exit__c                 c   s$    || _ z	d V  W d | _ d S d | _ w rV   )ro  )rT   r   r"   r"   r'   rV    s
   zDebugMode.set_fx_stack_tracec                 C   sB   t ||| jd | jd}| j| | j| |  jd7  _d S )Nr[   r   )r1  r   rU  rK  rL  rH  )rT   fqnr2  r7  r"   r"   r'   _enter_nn_module_call  s   zDebugMode._enter_nn_module_callc                 C   s   |  j d8  _ | j  d S r  )r   rH  rW   rS   r"   r"   r'   rZ    s   zDebugMode._exit_nn_module_callc                    sB   ddl m} |  _d fdd}d fdd} j|| d S )	Nr   r   r   c                    s    j | } |d d S )Nznn.Mod)ri  _get_mod_namer  )rM  inputr  rS   r"   r'   pre_fw_hook  s   z3DebugMode.module_tracker_setup.<locals>.pre_fw_hookc                    s       d S rV   )rZ  )rM  r  r   rS   r"   r'   post_fw_hook  r  z4DebugMode.module_tracker_setup.<locals>.post_fw_hookr\   )$torch.distributed._tools.mod_trackerr   ri  register_user_hooks)rT   r   r  r  r"   rS   r'   rj    s
   zDebugMode.module_tracker_setup
base_stackrP  rQ  c                 C   s   |pi }|pi }|r|rt d|}|r|n|}| j}|dd | D  }t|t| }t|t| }	|	D ]}
|   q8| jdk rHt dt|D ]}| ||rUdnd qL|| _dS )	a_  
        Called when DebugInterpreter observes nn_module_stack or fwd_nn_module_stack metadata
        from executing the compiled GraphModule.

        If the current module stack is mismatched with what's currently tracked in DebugMode
        (current_nn_module_stack), we adjust call depth and add new [nn.Module] log entries accordingly.
        zAExpecting at most one of nn_module_stack and fwd_nn_module_stack.c                 S   s   g | ]}|d  qS )r   r"   )r%   r<   r"   r"   r'   r(   <  r)   z8DebugMode._handle_fx_nn_module_stack.<locals>.<listcomp>r   z.Unexpectedly, DebugMode call_depth is negativeznn.Mod (compile)znn.Mod (compile bwd)N)r4  rH  valuessetrZ  r   sortedr  )rT   r  rP  rQ  is_fwdr   current_stack	new_stackenteredexited_r  r"   r"   r'   rS  "  s*   


z$DebugMode._handle_fx_nn_module_stackr  r  c                 c   s`    z&|  t||||| jd | j|d |  jd7  _d V  W |  jd8  _d S |  jd8  _w )Nr[   )r	  r
  r  r   r   r  )ru  r  r   rU  )rT   rz   r	  r
  r  r  r"   r"   r'   record_redistribute_callsO  s    	"z#DebugMode.record_redistribute_callsc                 C   sD   | j sdS ddlm} tt|t|}t|| jd }| | dS )z=Record output placements for a DTensor op as a separate line.Nr   rG   r[   )	rb  rI   rH   r$   r   rN   r  r   ru  )rT   output_specrH   r  r7  r"   r"   r'   record_output_placementsi  s   
z"DebugMode.record_output_placementsr  r   c                 C   s.   t ||| jd }|| j | j| |S r  )r  r   r   r`  rK  rL  )rT   r  r   r7  r"   r"   r'   record_triton_kernelu  s   zDebugMode.record_triton_kernelshow_stack_tracec           
         s2  |du r j n|}tj  |s'd fdd jD }|W  d   S g }d} jD ]T}d}t|dr>|jr>|j}nt|drI|jrI|j}d}|rQt	|}|rp||krp|r^|
d d|jd	  }|
|d
 |  |}dd|j  | j }	|
|	 q.d|W  d   S 1 sw   Y  dS )a@  
        show_stack_trace: option to display one-line stack trace summaries above groups
                        of operations (similar to gm.print_readable() style).
                        Requires record_stack_trace=True.
                        if None, uses self.record_stack_trace, otherwise overrides it.
        Nr'  c                 3   s*    | ]}d d |j   | j V  qdS )r&  N)r   r   r`  )r%   r   rS   r"   r'   r     s    

z)DebugMode.debug_string.<locals>.<genexpr>r   r   r7   r&  r[   z# )rU  r]   r^   DisableTorchFunctionr,   rK  r>   r   r   r   rL  r   r   r`  )
rT   r  r=  linesprev_stack_summaryr   r   stack_summaryindentliner"   rS   r'   debug_string}  sF   


$zDebugMode.debug_stringrecord_hooklog_hookpre_log_hookc                 c   s    | rt |  |rt| |rt| zdV  W | r!t   |r't  |r/t  dS dS | r7t   |r=t  |rDt  w w )ai  
        Allows installing post-hooks on arguments to intercepted __torch_dispatch__ calls;
        hook signatures are expected as (func, types, args, kwargs, result),
        i.e. __torch_dispatch__ args + return value.

        Logging hook outputs are stored in call.log and annotate calls in debug_string(),
        while recording hook outputs are just stored in call.record.
        For now hooks are expected to return dictionaries.

        pre_log_hook signature is (func, types, args, kwargs, call) and is executed before
        the operation. It allows capturing state before in-place mutations.
        N)r   rL  r   r   rW   )r  r  r  r"   r"   r'   dispatch_hooks  s.   



zDebugMode.dispatch_hooksc                  c   s`    dd } z't }da tj| d dV  W d   n1 sw   Y  W |a dS W |a dS |a w )zN
        Hook for storing cloned output tensors in .record["output"].
        c                 S   s   t dd |}d|iS )Nc                 S   s   t | tjr
|  S | S rV   r+  rw   r"   r"   r'   r         zADebugMode.record_outputs.<locals>.dispatch_hook.<locals>.<lambda>r   r   r9  r:  r   r   r=  r   r"   r"   r'   dispatch_hook  s   z/DebugMode.record_outputs.<locals>.dispatch_hookT)r  N)r/  r   r  )r  _old_record_tritonr"   r"   r'   record_outputs  s   zDebugMode.record_outputsr   r   hash_inputsc                 #   s    dd t | r| n(t| tr| nt| tr+fdd| D fddn	tdt|  fdd	  fd
d} fdd}z1rNt}at}atj	|rY|ndd dV  W d   n1 skw   Y  W ru|a|adS r}|a|aw )au  
        Installs hook for tensor hash logging.

        hash_fn: One of:
            - Custom-defined hash function
            - String: one of ("norm", "hash_tensor")
                - "norm": uses norm_hash_fn; basically tensor's L1 norm
                - "hash_tensor": uses torch.hash_tensor (XOR sum reduction)
            - List of strings: returns tuple of hashes from above options
        hash_inputs: if True, also hashes tensors in (args, kwargs), storing them in "input_hash".
        Input hashes are captured before the operation executes, so they reflect the state before
        any in-place mutations.
        c                 S   s@   t | tr	| dvrtd| tj| dkrtddS tddS )N)r   r   z/hash_type must be 'norm' or 'hash_tensor', got r   T)r|   )rl   r$   r4  	functoolspartialr   r   )	hash_typer"   r"   r'   hash_fn_option  s   z3DebugMode.log_tensor_hashes.<locals>.hash_fn_optionc                    s   g | ]} |qS r"   r"   r%   fn)r  r"   r'   r(     r)   z/DebugMode.log_tensor_hashes.<locals>.<listcomp>c                    s   t  fddD S )Nc                 3   s    | ]}| V  qd S rV   r"   r  rw   r"   r'   r     s    z@DebugMode.log_tensor_hashes.<locals>.<lambda>.<locals>.<genexpr>)r  rw   )fnsrw   r'   r     s    z-DebugMode.log_tensor_hashes.<locals>.<lambda>zRlog_tensor_hashes() expected hash_fn to be callable, str, or list[str], but found c                    s   t  fdd| S )Nc                    s   t | tjr
 | S d S rV   r  rw   r  r"   r'   r     r  zADebugMode.log_tensor_hashes.<locals>._tree_hash.<locals>.<lambda>r  )objr  r"   r'   
_tree_hash  s   z/DebugMode.log_tensor_hashes.<locals>._tree_hashc                    sF   dt | v sdt | v rdS r! ||f}tdd |s!d|iS dS )z:Pre-hook to capture input hashes before operation executesemptyr~  Nc                 S   r   rV   r"   rw   r"   r"   r'   r   '  r   zMDebugMode.log_tensor_hashes.<locals>._dispatch_pre_log_hook.<locals>.<lambda>
input_hash)r$   r   )r9  r:  r   r   r7  r  )r  r  r"   r'   _dispatch_pre_log_hook  s   z;DebugMode.log_tensor_hashes.<locals>._dispatch_pre_log_hookc                    sF   dt | v sdt | v rdS i } ||d< tdd | r!dS |S )z;Post-hook to capture output hashes after operation executesr  r~  Nhashc                 S   r   rV   r"   rw   r"   r"   r'   r   3  r   zJDebugMode.log_tensor_hashes.<locals>._dispatch_post_hook.<locals>.<lambda>)r$   r   r  r  )r  r"   r'   _dispatch_post_hook+  s   z8DebugMode.log_tensor_hashes.<locals>._dispatch_post_hookN)r  r  )
callablerl   r$   r   r   rs   r#  r0  r   r  )r   r  r  r  _old_input_hfn_old_output_hfnr"   )r  r  r  r  r  r'   log_tensor_hashes  sD   



zDebugMode.log_tensor_hashesc                   c   s    z
da dV  W da dS da w )z
        Context manager for disabling logging during inductor benchmarking,
        so logs don't contain all kernels launched from autotuning.
        TNF)rt  r"   r"   r"   r'   _benchmarking_inductorH  s
   z DebugMode._benchmarking_inductorc                 C   s
   t | jS rV   )r   rK  rS   r"   r"   r'   logsV  s   
zDebugMode.logsc                 C   s"   t |d| j| j}| j| dS )zHandles DebugMode._annotate()r   N)r1  r   rU  rK  rL  rz  r"   r"   r'   r  Z  s   zDebugMode._handle_annotater   c                 C   s   t jj|  dS )z
        If an active DebugMode exists, adds an "[annotate] <tag>" entry to the logs. Useful for contextualizing logs.
        Implemented with a custom op.
        N)r]   r   r   r   r   r"   r"   r'   r   _  s   zDebugMode._annotatelogs1logs2compare_inputsc                    sZ  t | t |krtdt |  dt | g  tt| |D ]\}\}}t|j}t|j}||krAtd| d| d| |}t|t|}	}
|	|
krdtd| d| d|	 d| d|
 d|	|j|jkrtd	| d d
| d|j d|j 
t|t	rt
|t
|krtd| d| d| qt|tr|j|jkrtd|j d| d|j d|j  fdd}|jdu|jdu}}||krtd|j d| d| d| |r||j|jdd |r|jdu|jdu}}||krtd|j d| d| d| |r||j|jdd qt|tr fdd}|jduo5d|jv }|jduo@d|jv }||krYtd| d d | d!| d"| 
|ri||jd |jd dd |r|jduovd#|jv }|jduod#|jv }||krtd$| d d | d%| d&| 
|r||jd# |jd# dd q S )'a  
        Compares tensor hashes between two DebugMode runs, for checking run-to-run numerical divergence.

        This first validates the two log sequences have identical structure (same operations, input shapes/dtypes, etc.),
        then compares tensor hash values, and returns a list of call outputs where mismatches were found.
        Expects input logs to have been run with log_tensor_hashes, and looks for hashes in .log["hash"] & .log["input_hash"]
        (or .post_hashes & .pre_hashes for triton kernels).

        note: skips checking log pairs where hashes aren't present, but will raise if present in one & not the other.

        Args:
            logs1: logs from the first DebugMode run (from debug_mode.logs)
            logs2: logs from the second DebugMode run
            compare_inputs: If True, also compare input tensor hashes (default: only output checking)

        Returns:
            List of dictionaries describing hash mismatches. Each dict contains:
                - call_type: "torch op" or "triton kernel"
                - call: Operator/kernel name
                - arg_name: For triton kernels, the argument name; None for torch ops
                - pytree_path: For torch ops, the pytree path to the differing tensor; None for kernels
                - hash1: Hash value from the first run
                - hash2: Hash value from the second run
                - rel_diff: Relative difference between hash values
                - is_input_hash: True if this is an input hash, False for output hash

        Raises:
            ValueError: If logs have different lengths, call types, operator names, or call depths

        Usage::

            # Run model first time
            with DebugMode() as debug_mode, DebugMode.log_tensor_hashes():
                model(x)
                logs1 = debug_mode.logs

            # Run again, in exactly the same way
            with DebugMode() as debug_mode, DebugMode.log_tensor_hashes():
                model(x)
                logs2 = debug_mode.logs

            mismatches = DebugMode.check_hash_mismatches(logs1, logs2)
            for m in mismatches:
                print(f"{m['call']}: hash diff {m['rel_diff']:.2e}")
        zLog lengths don't match:  vs z Call types don't match at index ri   zOperators don't match at index r   z] vs r*   zCall depths for z] don't match at index z(Redistribute calls don't match at index z(Triton kernel call args don't match for z
 at index z	:

log1: z

log2: c                    s   t |  t | krtdt |   dt |  | D ]#}| | || krC d|d | | || t| | || |d q d S )Nzhash key mismatch: r  ztriton kernel	call_typer7  arg_namepytree_pathr   r   rel_diffis_input_hash)r  keysr4  rL  r   )hashes1hashes2is_inputkeydifference_infor   r"   r'   compare_triton_hashes  s*   z>DebugMode.check_hash_mismatches.<locals>.compare_triton_hashesNz2Triton kernel post-hash presence inconsistent for z: log1 has post_hashes=z, log2 has post_hashes=Fr  z1Triton kernel pre-hash presence inconsistent for z: log1 has pre_hashes=z, log2 has pre_hashes=Tc                    s     fdd}t || | d S )Nc                    s6   ||kr  dd t| ||t||d d S d S )Nztorch opr  )rL  r   r   )keypathr   r   )r  r  r   r"   r'   _helper  s   zKDebugMode.check_hash_mismatches.<locals>.compare_op_hashes.<locals>._helper)r   )r  r  r  r  r  r  r'   compare_op_hashes  s   z:DebugMode.check_hash_mismatches.<locals>.compare_op_hashesr  z4Output hash presence inconsistent for triton kernel z] at index z: log1 has hash=z, log2 has hash=r  z3Input hash presence inconsistent for triton kernel z: log1 has input_hash=z, log2 has input_hash=)r@   
ValueError	enumerateziprs   rc   r?  r   rl   r  r  r  r   r  r  r  r   r   )r  r  r  ilog1log2
call1_type
call2_typer  op1_nameop2_namer  
has_post_1
has_post_2	has_pre_1	has_pre_2r  	has_hash1	has_hash2r"   r  r'   check_hash_mismatchesg  s   1

"$







zDebugMode.check_hash_mismatchesr\   )r"   N)NFrV   )NNN)r   Fr  ).rc   rd   re   rU   rk  rq  classmethodr   rs  ru  rv  rw  r{  r|  r  r  r  
contextlibcontextmanagerrV  r  rZ  rj  r   r$   r   r  r   rS  r  r  r  r  r  staticmethodr   r  r  r  r  propertyr  r  r   r  r  r"   r"   r   r'   r     s    
H
	


	

N


-


7#W
c                  C   s4   t dkrd S d } t D ]}t|tr|}  | S q| S rP   )r  r   rl   r   )
debug_moderG  r"   r"   r'   r   3  s   

rV   r  )br   r  r  r   loggingr   r   r`   collections.abcr   typingr   r   r]   torch._loggingr   torch._subclasses.fake_tensorr   r   torch.fx.graphr   torch.utils._dtype_abbrsr	   torch.utils._python_dispatchr
   r   r   torch.utils._pytreer   r   r   r   r   torch.utils._tracebackr   torch.utils.weakr   torch._dynamo.device_interfacer   r  r   	getLoggerrc   r   __all__r  r   r   __annotations__r   r   rt  r/  r0  r#  r  r$   r.   r3   r6   rF   rN   rO   rv   r{   rm   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   r  r  r  r1  r6  r<  r>  r?  library	custom_opr   register_fakerA  r[  InterpreterrC  r   r   r"   r"   r"   r'   <module>   s   !

  

1JGVH      M