o
    eip                     @  s~  U d dl mZ d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZ d dlZd dlmZ er=d dlmZ dd	lmZ g d
ZedZedZeejdsmedejjd< edejjd< edejjd< d dlmZmZmZ d3ddZd4ddZG dd deZ G dd dZ!e	dede"f f Z#de$d < e	!	"	d5d6d+d,Z%e	!	"	d5d7d/d,Z%	!	"	d5d8d2d,Z%dS )9    )annotationsN)Callable)overloadTYPE_CHECKING	TypeAliasUnion)	ParamSpecSelfTypeVar)Tensor)_POOL_HANDLE   )_dummy_type)is_current_stream_capturinggraph_pool_handle	CUDAGraphgraphmake_graphed_callables_R_P_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r   r   r   returnboolc                   C  s   t  S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r    r   r   [/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/cuda/graphs.pyr   -   s   r   r   c                   C  s   t jt S )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )torchcudar   r   r   r   r   r   r   6   s   r   c                      s   e Zd ZdZd'd( fddZ	
d)d* fddZd+ fddZd+ fddZd+ fddZd+ fddZ	d, fddZ
d+ fddZd- fd d!Zd. fd#d$Zd. fd%d&Z  ZS )/r   a-  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it will be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    F
keep_graphr   r   r	   c                   s   t  | |S N)super__new__)clsr    	__class__r   r   r#   ]   s   zCUDAGraph.__new__Nglobalpool_POOL_HANDLE | Nonecapture_error_modestrNonec                   s   t  j||d dS )a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )r(   r*   N)r"   capture_begin)selfr(   r*   r%   r   r   r-   `   s   zCUDAGraph.capture_beginc                      t    dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r"   capture_endr.   r%   r   r   r0   u   s   	zCUDAGraph.capture_endc                   r/   )a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r"   instantiater1   r%   r   r   r2      s   zCUDAGraph.instantiatec                   r/   )z,Replay the CUDA work captured by this graph.N)r"   replayr1   r%   r   r   r3         zCUDAGraph.replayc                   r/   )z1Delete the graph currently held by this instance.N)r"   resetr1   r%   r   r   r5      r4   zCUDAGraph.resetr   c                   
   t   S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r"   r(   r1   r%   r   r   r(      s   
zCUDAGraph.poolc                   r6   )z/Enable debugging mode for CUDAGraph.debug_dump.)r"   enable_debug_moder1   r%   r   r   r7      s   
zCUDAGraph.enable_debug_mode
debug_pathc                   s   t  |S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r"   
debug_dump)r.   r8   r%   r   r   r9      s   zCUDAGraph.debug_dumpintc                   r6   )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r"   raw_cuda_graphr1   r%   r   r   r;         
zCUDAGraph.raw_cuda_graphc                   r6   )a  Returns the underlying cudaGraphExec_t. ``instantiate`` must have been called if ``keep_graph`` is True, or ``capture_end`` must have been called if ``keep_graph`` is False. If you call ``instantiate()`` after ``raw_cuda_graph_exec()``, the previously returned cudaGraphExec_t will be destroyed. It is your responsibility not to use this object after destruction.

        See the following for APIs for how to manipulate this object: `Graph Execution <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH__EXEC.html>`_ and `cuda-python Graph Execution bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-execution>`_
        )r"   raw_cuda_graph_execr1   r%   r   r   r=      r<   zCUDAGraph.raw_cuda_graph_exec)F)r    r   r   r	   )Nr'   )r(   r)   r*   r+   r   r,   r   r,   r   r   )r8   r+   r   r,   )r   r:   )__name__
__module____qualname____doc__r#   r-   r0   r2   r3   r5   r(   r7   r9   r;   r=   __classcell__r   r   r%   r   r   B   s    	
r   c                   @  sD   e Zd ZU dZdZded< 			ddddZdddZdddZdS )r   a  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Ntorch.cuda.Stream | Nonedefault_capture_streamr'   
cuda_graphr   r(   r)   streamr*   r+   c                 C  s~   |d u r| j jd u rtj | j _|d u rdn|f| _|d ur!|n| j j| _| jd u r/tdtj| j| _	|| _
|| _d S )Nr   zcapture_stream must not be None)r&   rF   r   r   Streamr(   capture_streamAssertionErrorrH   
stream_ctxrG   r*   )r.   rG   r(   rH   r*   r   r   r   __init__   s   


zgraph.__init__r   r,   c                 C  sT   t j  t jjjrt  t j  t j	
  | j  | jj| jd| ji d S )Nr*   )r   r   synchronizecompilerconfigforce_cudagraph_gcgccollectempty_cache_C_host_emptyCacherL   	__enter__rG   r-   r(   r*   r1   r   r   r   rW      s   





zgraph.__enter__argsobjectc                 G  s   | j   | jj|  d S r!   )rG   r0   rL   __exit__)r.   rX   r   r   r   rZ     s   
zgraph.__exit__)NNr'   )rG   r   r(   r)   rH   rE   r*   r+   r>   )rX   rY   r   r,   )	r@   rA   rB   rC   rF   __annotations__rM   rW   rZ   r   r   r   r   r      s   
 
r   torch.nn.Module.r   _ModuleOrCallable   F	callablessample_argstuple[Tensor, ...]num_warmup_itersr:   allow_unused_inputr(   r)   c                 C     d S r!   r   r_   r`   rb   rc   r(   r   r   r   r        r   tuple[_ModuleOrCallable, ...]tuple[tuple[Tensor, ...], ...]c                 C  rd   r!   r   re   r   r   r   r     rf   1_ModuleOrCallable | tuple[_ModuleOrCallable, ...]3tuple[Tensor, ...] | tuple[tuple[Tensor, ...], ...]c           )        s  t  rt  rtdd}t| ts$d}| f} tttdf |f}nttttdf df |}g  t	| |D ]N\}}t|t j
jrlt|jdkrYt|jdkrYt|jdks]tdtdd | D sltd	t jjj| }	 t|	 td
d |	D stdq9dd  D }
dd | D  fddtt| D }dd tt| D }dd tt| D }|du rt n|}t j  t jt j \ t	| ||D ]M\}}}d\}}}t|D ]4}t jj|| }tdd |D }t|dkrt jj|tdd |D tdd |D d|d}q|||fD ]}~q qW d   n	1 s1w   Y  t j  g }g }t	| ||D ]8\}}}t jj ||d || }W d   n	1 sbw   Y  t jj!|\}}|t| || qEg }g }t	t"|t"|t"|D ]\}}}tdd |D } tdd |D }d}t|dkrt jj ||d! t jj|tdd |D tdd | D d|d}W d   n	1 sw   Y  g }!d}"|D ]}#|#j#r|dur|!||"  |"d7 }"q|!d qt|!}!||  ||! q|$  |$  d:d/d0}$g }%t%| D ]F\}&}|$||& ||& |& |
|& ||& ||& ||& ||& ||& 	}'t|t j
jrhd;d8d9}(|(||j&|'|j'|_'|%| q(|%|' q(|rv|%d S t|%S )<a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FT.r   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s  s    | ]}|j d u V  qdS )FNrequires_grad.0br   r   r   	<genexpr>      z)make_graphed_callables.<locals>.<genexpr>zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s  s    | ]	}t |tjV  qd S r!   )
isinstancer   r   )rn   argr   r   r   rp         zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S  s   g | ]}t |qS r   )len)rn   rX   r   r   r   
<listcomp>  s    z*make_graphed_callables.<locals>.<listcomp>c                 S  s*   g | ]}t |tjjrt| nd qS )r   )rr   r   nnModuletuple
parameters)rn   cr   r   r   rv     s    c                   s   g | ]
} | |  qS r   r   rn   iflatten_sample_argsper_callable_module_paramsr   r   rv     s    c                 S     g | ]}t j qS r   r   r   r   rn   _r   r   r   rv         c                 S  r   r   r   r   r   r   r   rv     r   N)NNNc                 s      | ]}|j r|V  qd S r!   rk   rn   or   r   r   rp     rq   c                 s  r   r!   rk   r|   r   r   r   rp     s    
c                 s  s     | ]}|j rt|V  qd S r!   rl   r   
empty_liker   r   r   r   rp     s    
)outputsinputsgrad_outputsonly_inputsallow_unused)r(   c                 s  s$    | ]}|j rt|nd V  qd S r!   r   r   r   r   r   rp     s    
c                 s  r   r!   rk   r   r   r   r   rp     rq   c                 s  r   r!   rk   r|   r   r   r   rp     rq   c                 s  s    | ]	}|d ur|V  qd S r!   r   r   r   r   r   rp     rt      	fwd_graphr   	bwd_graphmodule_paramstuple[torch.nn.Parameter, ...]len_user_argsr:   output_unflatten_spectorch.utils._pytree.TreeSpecstatic_input_surfacera   static_outputsstatic_grad_outputstuple[Tensor | None, ...]static_grad_inputsr   Callable[..., object]c	           
        s:   G 	fdddt jj d fdd}	|	S )	Nc                      sD   e Zd ZedfddZeejjjd fd
dZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.GraphedctxrY   r   r   r   ra   c                   sn   t D ]}|  ||  kr| ||  q   tts.tdt tdd D S )Nz"static_outputs must be tuple, got c                 s  s    | ]}|  V  qd S r!   detachr   r   r   r   rp     s    zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)rangedata_ptrcopy_r3   rr   ry   rK   type)r   r   r}   )r   r   r   r   r   r   forward  s   
zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardgradsc                   s   t |t krtdt | dt  t|D ]\}}|d ur0| | kr0|| q   ttsCtdt tdd D S )Nzlen(grads)=z != len(static_grad_outputs)=z&static_grad_inputs must be tuple, got c                 s  s$    | ]}|d ur|  n|V  qd S r!   r   rm   r   r   r   rp   3  s
    
zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)	ru   rK   zipr   r   r3   rr   ry   r   )r   r   ggrad)r   r   r   r   r   backward  s"   

zXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)r   rY   r   r   r   ra   )r   rY   r   r   r   ra   )
r@   rA   rB   staticmethodr   r   autogradfunctiononce_differentiabler   r   )r   r   r   r   r   r   r   r   r   Graphed  s    r   	user_argsrY   r   c                    s0   t jjj|  } jt|  }t jj|S r!   )r   utils_pytreearg_tree_leavesapplyry   tree_unflatten)r   flatten_user_argsout)r   r   r   r   r   functionalized9  s   zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r   rY   r   rY   )r   r   Function)
r   r   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   r   make_graphed_autograd_function  s   $*z>make_graphed_callables.<locals>.make_graphed_autograd_functionfuncr\   graph_training_stater   graphedCallable[_P, _R]orig_fwdc                   s   d	 fdd}|S )
Nr   _P.argsuser_kwargs	_P.kwargsr   r   c                    s&    j kr| i |S | i |S r!   )training)r   r   r   r   r   r   r   r   new_fwdZ  s   
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwd)r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   make_graphed_forwardT  s   z4make_graphed_callables.<locals>.make_graphed_forward)r   r   r   r   r   r   r   r:   r   r   r   ra   r   ra   r   r   r   ra   r   r   )
r   r\   r   r   r   r   r   r   r   r   )(r   is_autocast_enabledis_autocast_cache_enabledRuntimeErrorrr   ry   typingcastr   r   rw   rx   ru   _backward_hooks_forward_hooks_forward_pre_hooksrK   allbuffersr   r   r   appendr   r   r   rN   rH   rI   tree_leavesr   r   r   tree_flattenreversedrl   reverse	enumerater   r   ))r_   r`   rb   rc   r(   just_one_callable_sample_argsr{   rX   flatten_argper_callable_len_user_args"per_callable_static_input_surfaces
fwd_graphs
bwd_graphsmempoolr   r   grad_inputsr   outputs_gradr   vper_callable_static_outputs"per_callable_output_unflatten_specr   func_outputsflatten_outputsspec per_callable_static_grad_outputsper_callable_static_grad_inputsr   r   r   r   grad_idxrs   r   retr}   r   r   r   r~   r   r   %  s  I







@
)r   r   r?   )r^   FN)r_   r]   r`   ra   rb   r:   rc   r   r(   r)   r   r]   )r_   rg   r`   rh   rb   r:   rc   r   r(   r)   r   rg   )r_   ri   r`   rj   rb   r:   rc   r   r(   r)   r   ri   )&
__future__r   rR   r   collections.abcr   r   r   r   r   typing_extensionsr   r	   r
   r   r   
torch.cudar   _utilsr   __all__r   r   hasattrrU   __dict__torch._Cr   r   r   r   r   r   r   rY   r]   r[   r   r   r   r   r   <module>   sP   	

	tX	