o
    ei                     @   s4  U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZm Z m!Z! d dl"m#Z# d d	l$m%Z%m&Z&m'Z(m)Z)m*Z*m+Z+m,Z,m-Z- d d
l.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z7 d dl8m9Z9 g dZ:dZ;dZ<dZ=dZ>e?e@ ZAee2eejBeCeDe@f ZEeeEeFeE eGeE eHe@df f ZIeHe@eIf ZJeFeJ ZKeHe@eJeKB f ZLe? ZMe?e eNd< e jOdd ZPe
G dd dZQe
G dd deQZR			dcdejSde@d e@d!eTd"eTd#eAfd$d%ZUG d&d' d'ZVddd(d)ZWddd*dejSd+eGejXjYd,f d-eTd.e?ejS dB d/eQdB d#eRfd0d1ZZd2eHe@eIf d3eLd4eRd#dfd5d6Z[d7ejSejXjYB d8e@d#efd9d:Z\d;eHe@ef d4eRd#eHe@ef fd<d=Z]e^ dejSd4eRd#eHe@eIf fd>d?Z_e^ dejSd;eHe@eIf d4eRd#e4fd@dAZ`dBejXjYd#dfdCdDZad;eLd#eHe@eIf fdEdFZbdBejXjYd;eHe@eIf d4eRd#eLfdGdHZce^ dejSdIeGejXjYd,f d4eRd#eLfdJdKZddejSdBejXjYd3eLd4eRd#eLf
dLdMZee^ dejSdIeGejXjYd,f d;eLd4eRd#df
dNdOZfddd*dejSd.e?ejS dB d/eQdB d#eHe@eIf fdPdQZgddd*dejSdIejXjYeejXjY B d.e?ejS dB d/eQdB d#eLf
dRdSZhddd*dejSdIejXjYeejXjY B d.e?ejS dB d/eQdB d#eGeHe@eIf eLf f
dTdUZidejSd;eHejSeHe@eIf f eHe@eIf B d#eHe@eIf fdVdWZjddXdejSd2eHe@eIf d/eQdB d#e4fdYdZZkddXdejSdIejXjYeejXjY B d3eLd/eQdB d#df
d[d\ZlddXdejSdIejXjYeejXjY B d2eHe@eIf d3eLd/eQdB d#e4fd]d^ZmeddXdejSd/eQdB d#dfd_d`ZneddXdejSdIeGejXjYd,f d/eQdB d#dfdadbZodS )e    N)Callable	GeneratorIterable)asdict	dataclassfield)chain)Anycastno_type_checkUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater%   _patched_state_dictc                  c   s@    t  } t   zd V  W | rt   d S d S | rt   w w N)gc	isenableddisableenable)
is_enabled r;   q/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextQ   s   
r=   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dZeed< dS )r)   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiersN)__name__
__module____qualname____doc__r>   bool__annotations__r?   r@   rA   rB   rC   rD   rF   strr;   r;   r;   r<   r)   \   s   
 $r)   c                   @   s   e Zd ZU eedZeeejB e	ejB f e
d< eedZeeejB e	ejB f e
d< eedZee e
d< dZee
d< dZee
d< ejZee
d< eedZeej e
d	< d
S )_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rG   rH   rI   r   dictrP   rM   torchTensorr#   rL   rQ   setrR   rS   rK   rT   
contextlibnullcontextrU   r   listrV   nnModuler;   r;   r;   r<   rN      s$   
 rN   rE   TmodelnamerF   skip_ddp_prefixskip_compiler_prefixreturnc                    s  | td}d|vr|hS |d}g }| }t|D ]\}}	t|tr:|	dkr/td|	 d|j}|s9||	 qt|t	r|t
|d k rl||d  tkrld| t|t}
 r`  d  fdd|
jD   S t|t}|	tkr||	 t||	}qt|tjjjr|	d	krtd
|	 d|j}|s||	 qt||rt|| |	 }rt||rt||}||	 |	tjjjkr|t
|d krtdqt||	}qd| tdhS )a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .modulezExpected 'module', got ''   c                    s   h | ]}  | qS r;   r;   .0fqnprefixr;   r<   	<setcomp>       z_get_fqns.<locals>.<setcomp>	_orig_modzExpected '_orig_mod', got 'z-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPAssertionErrorrg   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr   rX   _dynamo
eval_frameOptimizedModulerq   hasattrgetr^   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)r`   ra   rF   rb   rc   	obj_namesfqn_obj_namescurr_objicurr_obj_name
flat_paramremoved_fqnr;   rm   r<   	_get_fqns   s\   



 










r   c                   @   s   e Zd ZdS )_EXTRA_STATEN)rG   rH   rI   r;   r;   r;   r<   r      s    r   c                 #   s<    t  dtjdtdtf fdd| dE d H  d S )Nrg   curr_fqnrd   c                 3   s    |  |r| dnd}|  D ]-\}}|v rqt|  r2|t|    v r2|d d }n| | }||E d H  qt| jdd| jddD ]\}}|| jv rXqN| | }||fV  qNt| j	dt
jjt
jjkr| t
jjj }|t fV  d S d S )Nrf   re   F)recurseget_extra_state)addnamed_childrenr   r}   valuesr   named_buffersnamed_parameters_non_persistent_buffers_set	__class__r^   r_   r   r   rg   r   r   )rg   r   ra   	submodulenew_fqnobjrF   r   visited_modulesr;   r<   r      s2   

z+_iterate_valid_model_state.<locals>.recursere   )rZ   r^   r_   rM   r   )r`   rF   r;   r   r<   _iterate_valid_model_state   s    "r   )
submodulesoptionsoptims.
optim_onlyr   r   c                C   s(  |r
t jdtdd |r|std|pt }i }i }t| D ]?\}}t|tr)qt| |}	|	|}
|
durJt
tt || |	 || ||< n|	 ||< |	D ]}
t|ts]|||
< qRqt| D ]\}}|D ]
}
t
tj|||
< qkqet }|rt|}|  D ]"\}}||vrqt| |}	t|	dkrtd|dd	 |	D  q|jr|jstd
t| }|r|jrt|j|jd}t|j|jp|jd}tj}nt |jd}t!|jd}tj"}t#j$dd }t%j&|| |||d}nt#j'}t(di t)|||||t
tt*j+ || t|dkdS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.   
stacklevelz;Optimizers are not passed in but optim_only is set to True.Nri   z)Submodule FQN should only have 1 instancec                 s   s    | ]}| d V  qdS )rf   Nr;   rj   r;   r;   r<   	<genexpr>M      z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              	   s   s    t  5 t jddtd tj| |||d d V  W d    n1 s%w   Y  W d    d S W d    d S 1 s=w   Y  d S )NignorezFSDP.state_dict_type)messagecategoryrg   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningry   r   r   r;   r;   r<   $fsdp_state_dict_type_without_warningk  s    
"z=_verify_options.<locals>.fsdp_state_dict_type_without_warningr   r   )rP   rQ   rR   rU   rV   rS   rT   r;   ),r   warnr   r   r)   r   ru   r   r   r   r
   rZ   rM   updatecopyr]   itemsrX   rY   named_modulesrz   rw   rC   r>   
ValueErrorry   rV   r   r?   r   r   FULL_STATE_DICTr   r   SHARDED_STATE_DICTr[   contextmanager	functoolspartialr\   rN   r   r^   r_   )r`   r   r   r   r   rP   rQ   ra   paramfqnsrl   param_fqns_rR   rg   rV   r   r   r   r   rU   r;   r;   r<   _verify_options  s   










r   model_state_dictoptim_state_dictinfoc                 C   s   |j D ]}t|}|d u rtdq|jr3| s3|js3|js3|jr#|js3|jr3|j	s3t
dt d|jrH|sH|jr>|jsH|j	sHt
d| | D ]}t|v rZt
| dt dqJd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rf   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rV   r   rw   rS   rR   r@   r?   r>   rB   rC   r   distget_rankrT   r{   )r   r   r   rg   
fsdp_statekeyr;   r;   r<   _verify_state_dict  s^   
	r   r   apic                 C   s,   t | |}|tv rtjt | j|| d}|S )N)self)r}   r4   r   r   r   )r   r   callr;   r;   r<   _state_dict_fn  s   
r   
state_dictc                 C   s@   |j r|jrtj sdnd}t| |j|dS |jrt| S | S )Nr;   )r   )r?   
ranks_only)r>   r?   rX   distributedis_initializedr   r   )r   r   r   r;   r;   r<   _maybe_full_or_cpu_state_dict  s   r   c                 C   sz  |j si S |  t| d }W d    n1 sw   Y  t| D ]C}t| |}t|dkrAtd| dt| d| tt	|}||krhdt
fdd}|||satd	| d
| ||||< q%|jri }|D ]&}|jD ] }||s}qu|jr|| ||< qu|t|d  }	|| ||	< quqp|}|jr|  D ]\}}
|
jrqt| |}|D ]}|| qqt||S )Nr   ri   Expected 1 FQN for key '', got z: rd   c                 S   s   t |t | kr
dS |d}| d}d}t|D ]&\}}||| kr9|d7 }|t |kr8|t |d k  S q|dv r>q dS dS )NFrf   r   ri   )rg   rq   T)rz   rs   rt   )r   rl   	fqn_split	key_splitfqn_idxkey_idxkey_namer;   r;   r<   verify  s   

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rS   rU   r   r]   keysr   rz   rw   nextiterrK   r   poprR   
startswithrA   r@   r   requires_gradr   )r`   r   r   r   r   rl   r   new_state_dictrn   r   r   r;   r;   r<   _get_model_state_dict  sN   






r   c                 C   s  |j r|s|jsti i S i }t| |jD ]J\}}t| ||j}t| ||jddd}t||D ]/\}}	|jr<t dkrZ||	krZ|	|d }
|
d u rV|j
rUtd| dn|
||	< |||	< q/qd}|jsh|jrt }| D ]}t|r| dkr||j qotd|v r|td d}t|dkr|tj  n
t|dkrtd	|jrt|||	 |j
|jd
 n|jrt|||	 d || |  ttt| d||j
|dW  d    S 1 sw   Y  d S )NF)rb   rc   r   zMissing key: rf   metaTri   zMultiple devices found)devicerB   r?   r   load_state_dict)r   rB   assign) rS   rC   r    r   rF   r   zipr   r   r   rB   r   r>   rZ   r   rX   	is_tensordimr   r   removerz   distributed_c10d_get_pg_default_devicer   r   r?   r   r   rU   r
   r   )r`   r   r   local_state_dictr   valuer   fqns_with_prefixrl   fqn_with_prefix
load_valuer   devicesr;   r;   r<   _load_model_state_dict  st   



$r   optimc                 C   s   | j rdS | jD ]}|t D ]}|jdur  dS qq| jD ]}|t D ]}|jr/t||_q$qg }| jD ]}d|v rT||d  t|d tj	rPt
dnd|d< q6| jdd | jD ]}d|v rk|d|d< q^| jdd dS )zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)set_to_none)r3   r1   _PARAMSgradr   rX   
zeros_likerx   ru   rY   tensorstepr   	zero_grad)r   param_groupr   lrsr;   r;   r<   _init_optim_statec  s:   




r  c           	   
      s   dt ttf dtdt ttf f fdd dd i }tt| t  D ]\}}t d| }| tt ttf || q%tt	| t
 D ]&}|t}ttt |D ]}| D ]\}}||t
 d| d| < q[qUqG|S )	a3  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_groups": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_groups.layer1.weight.lr": 0.1,
        "param_groups.layer2.weight.lr": 0.1,
        "param_groups.layer1.weight.betas": (0.9, 0.95),
        "param_groups.layer2.weight.betas": (0.9, 0.95),
    }

    The "state" section supports arbitrary levels of nesting for optimizers like Shampoo.
    nested_dictrn   rd   c                    sd   i }|   D ])\}}t|}|r| d| n|}t|tr'| || q| |||< q|S )a  
        Recursively flatten a nested dictionary with dot-separated keys.

        Args:
            nested_dict: The dictionary to flatten
            prefix: The prefix to prepend to all keys

        Returns:
            Flattened dictionary with dot-separated keys
        rf   )r   rM   ru   rW   r   )r  rn   	flattenedr   r   str_keyfull_key_flatten_state_nested_dict_raise_if_type_not_supportedr;   r<   r    s   

z=_flatten_optim_state_dict.<locals>._flatten_state_nested_dictc                 S   s,   t | tjtttfstdt|  dd S )Nz[Flattening optimizer state_dict only supports tensor, int, float, dict states now. Type is rf   )ru   rX   rY   intfloatrW   NotImplementedErrortype)vr;   r;   r<   r    s   z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedrf   )rW   rM   r	   r%   r
   r&   _STATEr   r   r'   _PGr   r   r]   )	r   retrl   r3   state_prefixr  r   kr  r;   r  r<   _flatten_optim_state_dict  s,   )


r  c                 C   s   dt dtt tf dtt tf fdd}i }g }t|t|i}| jD ]}|tg i |t D ]}|j| D ]z}	|	|j	v rXd}
|D ]}|tkrFq?t d|	 d| }||v rVd}
 nd}
|
s]q4|d	 t }t
|tsqtd
t| ||	 |jszq4i ||	< | j| D ]*}t d|	 d| }||vr|||}|tt||	 |< q|| tt||	 |< qq4q-ttt  |d	 t d }|D ]=}|tkrq|t d| d|  }||d	 vr||d	 |< q|d	 | |krtd| d| d| d|d	 |  d	qq |S )z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    Supports arbitrary levels of nesting in the state section through recursive reconstruction.

    See the docstring of _flatten_optim_state_dict() for more detail.
    flattened_keyflattened_dictrd   c           
      S   s   |  d}i }|  D ]=\}}||sq|t|d }|d}|}|dd D ]}	|	|vr4i ||	< t||	 ts=J ||	 }q*|||d < q|S )z
        Reconstructs a potentially nested value from flattened keys.
        For non-nested values, returns the value directly.
        For nested values, reconstructs the nested structure with string keys.
        rf   Nr   )r   r   rz   rs   ru   rW   )
r  r  rn   r  r   r   remaining_keypartscurrentpartr;   r;   r<   _reconstruct_nested_dict  s   



z=_unflatten_optim_state_dict.<locals>._reconstruct_nested_dictFrf   Tr   Expected list, got r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )rM   rW   r%   r  r  r1   rx   r   rP   rQ   ru   r]   rw   r  r   r3   r
   r&   r   )r   r   r   r   r3   pg_state
return_osdr  r   rl   	in_paramsr  flatten_keyr2   
state_nameflattened_state_keyreconstructed_valuefirst_param_fqnr   r;   r;   r<   _unflatten_optim_state_dict  s   


1



/
r*  
optimizersc              	      s0  |j si S ti tg i}|D ]}t| t|d }|jrm|  t| ||}W d    n1 s2w   Y  |s:qt	|t 
 D ]}d|v rW|t ||t |dd< qB|t D ]}dd |t D }||t< q\nt	tdd |jD }tt|tt|}	i  |  D ]2\}
}t| |
}t|d	krtd
|
 dt| tt|}||	vrq|	| }| |< | |< qt	|t 
 D ]}
 |
 }|t |
|t |< q|t D ]} fdd|t D |t< q|sqtt|t |t  tt|t |t  q|jrtt t!|}t"||S )Nr   rq   
_orig_mod.re   c                 S   s   g | ]}| d dqS )r,  re   rr   rk   r  r;   r;   r<   
<listcomp>  rp   z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s    | ]}|t  V  qd S r5   )r   )rk   gr;   r;   r<   r     s    z(_get_optim_state_dict.<locals>.<genexpr>ri   r   r   c                    s   g | ]} | qS r;   r;   )rk   pidfqn_pid_mappingr;   r<   r/    s    )#rT   r  r  r  r   rV   rU   ry   r   r]   r   r   rr   r   r   from_iterabler1   rW   r   rangerz   r   r   rw   r   r   r
   r&   r   r'   extendrD   r(   r  r   )r`   r+  r   r   r   osdr  r0  r2   param_pid_mappingr   r   r   rl   r1  groupr;   r2  r<   _get_optim_state_dicts  sb   




r:  c              	   C   st  i }g }t |t|i}i }tdd tt|t  D r|S |jD ]}|tg i |t D ]}	|j|	 D ]}
|
|j	v rVd}tt
|t D ]}|
ttt |t v rTd} nqCnd}|s[q3|d t }t|tsotdt| ||
 |	jr|
tt|t  v rtt|t  |
 ||
< n|jrtd|
 dtt
|t D ]}|
ttt |t v rt|t d	 |t|< qq3q,t|t d
krg }tt
|t D ]}tttt |t d
kr|| qt|d	krtdt|t t|jkrtdt|t d	 |t|< qtt
|t D ])}|t|d}|dkrq| D ]\}}|tkr.q#||| |< q#q|S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s    | ]}t |tV  qd S r5   )ru   r  r.  r;   r;   r<   r     r   z*_split_optim_state_dict.<locals>.<genexpr>FTr   r!  z'Missing optimizer state for parameter 'z' in checkpoint. The parameter requires gradients but has no saved optimizer state. To load anyway, use StateDictOptions(strict=False).ri   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r  r  allr
   r&   r1   rx   r   rP   rQ   r'   r]   rM   ru   rw   r  r   rB   r   rz   idr   r   r   )r`   r   r   r   r3   r"  r#  
pg_mappingr  r   rl   r$  loaded_param_groupr2   r  pg_idxr   r   r;   r;   r<   _split_optim_state_dict  s   




!


r@  c              	      s  |j sd S |D ]E}t| |r*t|v rt| |||}nt|ttttf ||}ni }|j	r| 
 D ]n\}}t| |}t| |dd}	||	krHq3t|dkrZtd| dt| | |	 |t D ]}
ttttf |
}fdd|t D }||t< qftt|t }t| D ]}|v r||||< qq3|  t| ||}W d    n1 sw   Y  n|jrEd|_t| |f|}d|_d   fd	d
}ttj||} d u rtdt|\}}t|\}}|jrt || d nt!|| d |D ]!}||vr&||vrtd| d|| ||< || ||< qt"||}|t D ]}t|vrCg ttttf |t< q1t#|d|d qd S )NF)rc   ri   zExpected 1 FQN for 'r   c                    s   g | ]}|  qS r;   r-  )rk   r   )rl   fqn_with_compilerr;   r<   r/  I  s    z*_load_optim_state_dict.<locals>.<listcomp>Tc                    s4   |   dkr d u r| j | S  | jkrtd| S )Nr   zDevice mismatch)r   r   r   )tr   r;   r<   _device\  s   
z'_load_optim_state_dict.<locals>._devicezExpected device to be setr   zExpected key 'z' in osd_mappingr   r   )$rT   r  r  r@  r*  r
   rW   rM   r%   rV   r   r   rz   rw   r   r  r	   r   r&   r]   r   rr   rU   ry   optim_state_dict_to_loadr>   r:  r"   rX   rY   r   rC   r   r   r   r   )r`   r+  r   r   r   r   original_fqn_r   fqns_with_compilerr0  valr2   	osd_stater  r   rC  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keypgr;   )r   rl   rA  r<   _load_optim_state_dict  s   



	



rQ  c                C   sV   t   t| dd||d}t| |}t|i | |W  d   S 1 s$w   Y  dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    r;   Fr   r   r   N)r=   r   r   r   )r`   r   r   r   r   r;   r;   r<   r*     s   
$r*   c                C   st   t  - t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  d   S 1 s3w   Y  dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    TrR  N)	r=   ru   rX   r   	Optimizertupler   r:  r   )r`   r+  r   r   r   r   r;   r;   r<   r+     s    $r+   c                C   s   t  4 t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  d   S 1 s:w   Y  dS )a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
        ...     fsdp_model, fsdp_optim
        ... )

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    FrR  N)
r=   ru   rX   r   rS  rT  r   r   r:  r   )r`   r+  r   r   r   r   r   r;   r;   r<   r,     s"   H
$r,   c           	         s   |si S t tt| tjrgtjdtdd t	t
tjt
ttf f |}i }| D ]8\}}|  D ]/\}}||kr=q4t| |}t|dkrLtdtt| d | fdd| D  q4q,|S t	t
ttf |S )	NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.r   r   ri   z/FQNs for a submodule should only have 1 elementrf   c                    s   i | ]	\}} | |qS r;   r;   )rk   subfqnr   rm   r;   r<   
<dictcomp>R  s    z/_unflatten_model_state_dict.<locals>.<dictcomp>)ru   r   r   r   r^   r_   r   r   r   r
   rW   rM   r%   r   r   r   rz   rw   r   )	r`   r   cast_state_dictr   r   sub_state_dictra   mr   r;   rm   r<   _unflatten_model_state_dict4  s4   
rZ  )r   c                C   s\   t | |}t  t| dd|d}t|i | t| ||W  d   S 1 s'w   Y  dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    r;   Fr   r   N)rZ  r=   r   r   r   )r`   r   r   r   r;   r;   r<   r-   Y  s   
$r-   c                C   sr   t  , t|tjjr|fnt|}t| |d|d}ti || t| ||| W d   dS 1 s2w   Y  dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
        ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
        initialized correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr[  N)	r=   ru   rX   r   rS  rT  r   r   rQ  )r`   r+  r   r   r   r;   r;   r<   r.     s   "r.   c                C   s   t | |}t 2 t|tjjr|fnt|}t| || |d}t||| t	| ||| t
| ||W  d   S 1 s=w   Y  dS )a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
        is called on the optimizers. Otherwise, the optimizer states won't be initialized
        correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r[  N)rZ  r=   ru   rX   r   rS  rT  r   r   rQ  r   )r`   r+  r   r   r   r   r;   r;   r<   r/     s   .

$r/   c                   sj   t jt| |dfdd}|| _t jt| |d dtttf f fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r   c                           S r5   r;   r;   _state_dict_callr;   r<   state_dict_call     z0_patch_model_state_dict.<locals>.state_dict_callr   c                        | d d S )N)r   r;   rD  _load_state_dict_callr;   r<   load_state_dict_call     z5_patch_model_state_dict.<locals>.load_state_dict_callN)r   r   r*   r   r-   rW   rM   r	   r   r4   r   )r`   r   r_  rd  r;   rc  r^  r<   _patch_model_state_dict  s    
rg  c                   s   t jt| ||dfdd}t jt| ||d dtttf f fdd}t| t| t	|t
jjr9|fnt|}|D ]}||_||_q?dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r+  r   c                      r\  r5   r;   r;   r]  r;   r<   r_  J  r`  z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    ra  )N)r   r;   rD  rb  r;   r<   rd  T  re  z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r   r   r+   r.   rW   rM   r	   r4   r   ru   rX   r   rS  rT  r   r   )r`   r+  r   r_  rd  r   r;   rf  r<   _patch_optimizer_state_dict$  s0   

rh  )rE   TT)rE   )pr[   r   r6   r   collections.abcr   r   r   dataclassesr   r   r   	itertoolsr   typingr	   r
   r   r   rX   torch.distributedr   r   torch.nnr^   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   ry   r   r   r   r   r   $torch.distributed.fsdp._common_utilsr   r   torch.distributed.tensorr   torch.nn.modules.moduler    torch.nn.parallelr!   rv   torch.utils._pytreer"   __all__r{   r  r   r  rZ   rM   r#   rY   r  r  r$   r]   rT  rW   r%   r&   r'   r(   r4   rL   r   r=   r)   rN   r_   rK   r   r   r   r   rS  r   r   r   r   no_gradr   r   r  r  r*  r:  r@  rQ  r*   r+   r,   rZ  r-   r.   r/   rg  rh  r;   r;   r;   r<   <module>   s  
 (


/
K
-
 

 .



B
D*b

 G
dj

)
1
[ 

)

,
1

B6