o
    ki                     @   s  d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZ g d
ZG dd deZG dd deZG dd deeZG dd deZG dd deeZG dd deZG dd deeZG dd deZG dd deeZG dd deZdS )    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr
   c                       s   e Zd ZU dZdZg dZeed< eed< edB ed< e	ed< e	ed	< 	
					ddedededB de	d	e	ddf fddZ
dddZdddZdd Zdd Z		d fddZ  ZS )	_NormBasez,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   h㈵>皙?Treturnc           	   	      s(  ||d}t    || _|| _|| _|| _|| _| jr5ttj	|fi || _
ttj	|fi || _n| dd  | dd  | jr|| dtj|fi | | dtj|fi | |  |  | dtj	ddtjid	d
 | D  |  n| dd  | dd  | dd  |   d S )Ndevicedtypeweightbiasrunning_meanrunning_varnum_batches_trackedr   r   c                 S      i | ]\}}|d kr||qS r    .0kvr'   r'   d/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/nn/modules/batchnorm.py
<dictcomp>L       z&_NormBase.__init__.<locals>.<dictcomp>r   )super__init__r   r   r   r   r   r   torchemptyr    r!   register_parameterregister_bufferzerosonestensorlongitemsreset_parameters	selfr   r   r   r   r   r   r   factory_kwargs	__class__r'   r,   r1   &   sH   


	z_NormBase.__init__c                 C   s.   | j r| j  | jd | j  d S d S )Nr	   )r   r"   zero_r#   fill_r$   r=   r'   r'   r,   reset_running_statsV   s
   
z_NormBase.reset_running_statsc                 C   s.   |    | jrt| j t| j d S d S N)rD   r   r   ones_r    zeros_r!   rC   r'   r'   r,   r;   ^   s
   z_NormBase.reset_parametersc                 C   s   t rE   )NotImplementedErrorr=   inputr'   r'   r,   _check_input_dimd   s   z_NormBase._check_input_dimc                 C   s   dj di | jS )Nzj{num_features}, eps={eps}, momentum={momentum}, affine={affine}, track_running_stats={track_running_stats}r'   )format__dict__rC   r'   r'   r,   
extra_reprg   s   z_NormBase.extra_reprc           
   	      s   | dd }|d u s|dk r4| jr4|d }	|	|vr4| jd ur*| jjtdkr*| jntjdtjd||	< t ||||||| d S )Nversionr   r$   metar   r&   )	getr   r$   r   r2   r8   r9   r0   _load_from_state_dict)
r=   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrO   num_batches_tracked_keyr?   r'   r,   rR   m   s$   

z_NormBase._load_from_state_dictr   r   TTNNr   N)__name__
__module____qualname____doc___version__constants__int__annotations__floatboolr1   rD   r;   rK   rN   rR   __classcell__r'   r'   r?   r,   r      sF   
 	
0
	r   c                       sZ   e Zd Z						ddedededB ded	ed
df fddZded
efddZ  Z	S )
_BatchNormr   r   TNr   r   r   r   r   r   c           	         s*   ||d}t  j|||||fi | d S Nr   )r0   r1   r<   r?   r'   r,   r1      s   



z_BatchNorm.__init__rJ   c              
   C   s   |  | | jd u rd}n| j}| jr1| jr1| jd ur1| jd | jd u r.dt| j }n| j}	 | jr8d}n
| jd u oA| jd u }	 t	
|| jrL| jrO| jnd | jrV| jrY| jnd | j| j||| jS )N        r	         ?T)rK   r   trainingr   r$   add_re   r"   r#   F
batch_normr    r!   r   )r=   rJ   exponential_average_factorbn_trainingr'   r'   r,   forward   s:   



z_BatchNorm.forwardr[   )
r]   r^   r_   rc   re   rf   r1   r   rr   rg   r'   r'   r?   r,   rh      s*    	rh   c                       sX   e Zd ZU eed< eed< 						d	d fdd	Zd fd
dZdddZ  ZS )_LazyNormBaser    r!   r   r   TNr   c                    s   ||d}t  jd||ddfi | || _|| _| jr,tdi || _tdi || _| jrUtdi || _tdi || _	t
j	ddt
jidd | D | _d S d S )	Nr   r   Fr   c                 S   r%   r&   r'   r(   r'   r'   r,   r-      r.   z*_LazyNormBase.__init__.<locals>.<dictcomp>r'   r/   )r0   r1   r   r   r   r    r!   r   r"   r#   r2   r8   r9   r:   r$   )r=   r   r   r   r   r   r   r>   r?   r'   r,   r1      s4   
	
z_LazyNormBase.__init__c                    s(   |   s| jdkrt   d S d S d S )Nr   )has_uninitialized_paramsr   r0   r;   rC   r?   r'   r,   r;      s   z_LazyNormBase.reset_parametersc                 C   s   |   rJ|jd | _| jr1t| jtstdt| jts!td| j	| jf | j	| jf | j
rD| j	| jf | j	| jf |   d S d S )Nr	   z-self.weight must be an UninitializedParameterz+self.bias must be an UninitializedParameter)rt   shaper   r   
isinstancer    r   AssertionErrorr!   materializer   r"   r#   r;   rI   r'   r'   r,   initialize_parameters  s(   z#_LazyNormBase.initialize_parametersr[   r\   )	r]   r^   r_   r   rd   r1   r;   ry   rg   r'   r'   r?   r,   rs      s   
 (rs   c                   @      e Zd ZdZdddZdS )r   a  Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the variance is calculated via the biased estimator,
    equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
    moving average of the variance is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    r   Nc                 C   4   |  dkr|  dkrtd|   dd S d S Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrI   r'   r'   r,   rK   b     zBatchNorm1d._check_input_dimr\   r]   r^   r_   r`   rK   r'   r'   r'   r,   r     s    Fr   c                   @      e Zd ZdZeZdddZdS )r   aR  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                 C   r{   r|   r   rI   r'   r'   r,   rK     r   z LazyBatchNorm1d._check_input_dimr\   )r]   r^   r_   r`   r   cls_to_becomerK   r'   r'   r'   r,   r   g      r   c                   @   rz   )r   a  Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    r   Nc                 C   $   |  dkrtd|   dd S N   zexpected 4D input (got r~   r   rI   r'   r'   r,   rK        zBatchNorm2d._check_input_dimr\   r   r'   r'   r'   r,   r         Gr   c                   @   r   )r   aU  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                 C   r   r   r   rI   r'   r'   r,   rK     r   z LazyBatchNorm2d._check_input_dimr\   )r]   r^   r_   r`   r   r   rK   r'   r'   r'   r,   r     r   r   c                   @   rz   )r   a  Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    r   Nc                 C   r   N   zexpected 5D input (got r~   r   rI   r'   r'   r,   rK   @  r   zBatchNorm3d._check_input_dimr\   r   r'   r'   r'   r,   r     r   r   c                   @   r   )r   aU  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                 C   r   r   r   rI   r'   r'   r,   rK   b  r   z LazyBatchNorm3d._check_input_dimr\   )r]   r^   r_   r`   r   r   rK   r'   r'   r'   r,   r   E  r   r   c                       s   e Zd ZdZ							ddedededB d	ed
ededB ddf fddZdddZ	dddZ
dedefddZedddZ  ZS )r
   a  Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, correction=0)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    r   r   TNr   r   r   r   r   process_groupr   c	           
         s0   ||d}	t  j|||||fi |	 || _d S ri   )r0   r1   r   )
r=   r   r   r   r   r   r   r   r   r>   r?   r'   r,   r1     s   


zSyncBatchNorm.__init__c                 C   s$   |  dk rtd|   dd S )Nr   z expected at least 2D input (got r~   r   rI   r'   r'   r,   rK     r   zSyncBatchNorm._check_input_dimc                 C   s   | ddkrtdd S )Nr	   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rI   r'   r'   r,   _check_non_zero_input_channels  s
   z,SyncBatchNorm._check_non_zero_input_channelsrJ   c           	      C   s  |  | | | | jdu rd}n| j}| jr:| jr:| jdu r$td| jd | jdu r7d| j  }n| j}	 | jrAd}n
| j	du oJ| j
du }	 | jrR| jrU| j	nd}| jr]| jr`| j
nd}|op| joptj optj }|r|jjddd	tj fvrtd
tj  tjjj}| jr| j}tj|}|dk}|st|||| j| j||| jS |stdt|| j| j||| j|||	S )z(
        Runs the forward pass.
        Nrj   z$num_batches_tracked must not be Noner	   rk   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or zbn_training must be True)rK   r   r   rl   r   r$   rw   rm   itemr"   r#   r2   distributedis_availableis_initializedr   type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizern   ro   r    r!   r   sync_batch_normapply)	r=   rJ   rp   rq   r"   r#   	need_syncr   
world_sizer'   r'   r,   rr     s   





zSyncBatchNorm.forwardc                 C   s   |}t |tjjjjrStj|j|j|j	|j
|j|}|j
r:t  |j|_|j|_W d   n1 s5w   Y  |j|_|j|_|j|_|j|_t|drS|j|_| D ]\}}||| || qW~|S )aa  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        Nqconfig)rv   r2   nnmodules	batchnormrh   r
   r   r   r   r   r   no_gradr    r!   r"   r#   r$   rl   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechildr'   r'   r,   r   L  s6   $


z$SyncBatchNorm.convert_sync_batchnorm)r   r   TTNNNr\   rE   )r]   r^   r_   r`   rc   re   rf   r   r1   rK   r   r   rr   classmethodr   rg   r'   r'   r?   r,   r
   g  s:    i


cr
   )typingr   r2   r   torch.nnr   rn   r   torch.nn.parameterr   r   r   
_functionsr
   r   lazyr   r   r   __all__r   rh   rs   r   r   r   r   r   r   r'   r'   r'   r,   <module>   s&   wCHL"M"M"