o
    Din                     @   s   d dl mZmZmZmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZmZmZmZmZ d dlmZmZ d dlmZmZ ed	Zefd
edeg ef fddZdd ZdddZde_ G dd dZ!G dd dee Z"dS )    )SequenceListTypeVarTupleCallableN)TritonSemantic   )_core)
AutoLayoutDistributedLayoutDistributedLinearLayoutSliceLayoutSharedLayoutCoalescedLayout)GluonOpBuildercompute_tmem_reg_layout)flatten_values_to_irunflatten_ir_valuesTensorTycondmsg_fnc                 C   s   | s|| d S N )r   r   categoryr   r   v/var/www/addictedbytheproject.nl/epg/venv/lib/python3.10/site-packages/triton/experimental/gluon/language/_semantic.py_check   s   
r   c                 C   s   t | totdd | D S )Nc                 s       | ]}t |tV  qd S r   
isinstanceint.0ir   r   r   	<genexpr>       z_is_int_list.<locals>.<genexpr>)r   r   allvaluer   r   r   _is_int_list   s   r(   c                    s0  t ttdd  t dv fdd t ttfdd t dko-d @ dkd	d  tt td
d D fdd t}t |dkdd  |d u rYg }dk}|radn |ru|D ]}t t||kdd  qgt| | |t d u fdd |rd }	jst j	d d|	d gkfdd j
d|	d g ddgj	d< S jd d|	d gkr| jdtj }
t |
d kfdd j}dD ],}t|}t|D ] \}}|d|	d gkr|| |d |d< ||<     S qqJ d| S )Nc                   S      dS )Nzinstr_variant must be a stringr   r   r   r   r   <lambda>       z*_compute_tmem_reg_layout.<locals>.<lambda>)32x32b16x64b16x128b16x256b16x32bx232x32b_splitnc                      
   d  S )Nzunknown instr_variant: r   r   )instr_variantr   r   r*         
 c                      s   dt  S )Nz!num_warps must be an int but got typer   	num_warpsr   r   r*             r   r   c                   S   r)   )Nz)num_warps must be a power of two and >= 4r   r   r   r   r   r*      r+   c                 s   r   r   r   )r!   dimr   r   r   r#      r$   z+_compute_tmem_reg_layout.<locals>.<genexpr>c                      r2   )Nz#shape entries must be ints but got r   r   shaper   r   r*      r4      c                   S   r)   )Nzexpected a 2D tensorr   r   r   r   r   r*      r+   r1   r,   c                   S   r)   )Nzcga_layout basis rank mismatchr   r   r   r   r   r*   (   r+   c                      s   d  d d S )NzTMEM layout 'z' unsupported for shape z and num_warps r   r   )atom_variantr8   r=   r   r   r*   3       c                      r2   )NzJsplitn with 1 register requires the last lane basis to be [0, N / 2]. Got r   r   )
layout_objr   r   r*   ;   r4       c                      s   dd   d  dS )NzETo be able to `tmem.load` into `tl.split` you need to have more than rC    z-bit registers, as you need to use the instruction 32x32b.x1 twice. You can always load into instr_variant="32x32b" and then convert_layout to this layout otherwise.r   r   )bitwidthr   r   r*   B   s
    )
lane_bases
warp_basesFz6splitn requires at least one basis of [0, N / 2]. Got )r   r   strr   listr%   lenr   	reg_basesrF   appendprimitive_bitwidthgetattr	enumerate)
element_tyr=   layoutr8   r3   
cga_layoutranksplitnbasisNnum_regrK   	bases_strbasesr"   r   )r?   rE   r3   rB   r8   r=   r   _compute_tmem_reg_layout   sl   
" 

rZ   Tc                   @   s*   e Zd ZdefddZdd Zdd ZdS )	GluonCallerContextr8   c                 C   
   || _ d S r   r7   )selfr8   r   r   r   __init__W      
zGluonCallerContext.__init__c                 C   s   d| j  S )N_NWr7   r]   r   r   r   mangleZ   s   zGluonCallerContext.manglec                 C   s   | d|| j d S )Nzttg.num-warps)set_attrget_int32_attrr8   )r]   fnbuilderr   r   r   initialize_callee]   s   z$GluonCallerContext.initialize_calleeN)__name__
__module____qualname__r   r^   rb   rg   r   r   r   r   r[   U   s    r[   c                
       s  e Zd ZU ejZeZeed< defddZdd Z	dd Z
dee d	ee fd
dZdededefddZdededef fddZdedeeef f fddZdedee def fddZdedee defddZdededef fdd Z fd!d"Zded#ee d$ef fd%d&Zd'd( Zd)d* Zded,d-Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Z d8d9 Z!d:d; Z"d<d= Z#d>d? Z$d@dA Z%dBdC Z&dDdE Z'dFdG Z(e)dHdI Z*dJe+e dedKedeedLf fdMdNZ,dJe+e dedeedLf fdOdPZ-dedQedRedefdSdTZ.deded$edefdUdVZ/dWedXededefdYdZZ0dWedefd[d\Z1d]e+e d^e+e fd_d`Z2dadb Z3dcdd Z4  Z5S )fGluonSemanticrf   c                 C   r\   r   )rf   )r]   rf   r   r   r   r^   g   r_   zGluonSemantic.__init__c                 C   s0   |g kr|}nt ||| j|}| ||S r   )ttgldistributed_typerf   get_gluon_layout_from_tensortensor)r]   handle	scalar_tyr=   tyr   r   r   _wrap_handle_infer_layoutj   s   z'GluonSemantic._wrap_handle_infer_layoutc                 C   s   |  |j|jj|jS r   )rs   rp   r6   scalarr=   )r]   ro   r   r   r   _wrap_tensor_infer_layoutq   s   z'GluonSemantic._wrap_tensor_infer_layout	lhs_shape	rhs_shapec                 C   s   t |t |krtd| d| g }t|D ]3\}}|| }|dkr*|| q|dks2||kr8|| qtdt| d t| d t| |S )N!Cannot broadcast, rank mismatch: , r   z?Cannot make_shape_compatible: incompatible dimensions at index : z and )rJ   
ValueErrorrO   rL   rH   )r]   rv   rw   	ret_shaper"   leftrightr   r   r   _broadcast_shapest   s*   zGluonSemantic._broadcast_shapesinputaxisreturnc                    s   dd j D }| d  dk r tj 7  ttjtjfdd jjttt	t
tffdd ttt
tfpFj k fdd | jj }| |jj|S )	Nc                 S   s   g | ]}t |qS r   )rl   _unwrap_if_constexprr!   xr   r   r   
<listcomp>   r@   z-GluonSemantic.expand_dims.<locals>.<listcomp>r   r   c                         d j S Nz=expected expand_dims input to be a distributed_type but got: r5   r   r   r   r   r*          z+GluonSemantic.expand_dims.<locals>.<lambda>c                      r2   )Nz;expected expand_dims input to have a SliceLayout, but got: r   r   rQ   r   r   r*      r4   c                      s   d  dj  S )Nz7expected expand_dims input layout to be sliced in axis z	 but got r;   r   )r   rQ   r   r   r*          )r=   insertrJ   r   r   r6   rl   rm   rQ   r   r
   r   r;   rf   create_expand_dimsrp   rs   rt   )r]   r   r   	dst_shaperp   r   )r   r   rQ   r   expand_dims   s"   

zGluonSemantic.expand_dimsabc                    s<   |  ||\}}t|jg kdd  t ||}| |S )Nc                   S   r)   )NzCannot join scalars in gluonr   r   r   r   r   r*      r+   z$GluonSemantic.join.<locals>.<lambda>)broadcast_impl_valuer   r=   superjoinru   )r]   r   r   r'   	__class__r   r   r      s   
zGluonSemantic.joinc                    s$   t  |\}}| || |fS r   )r   splitru   )r]   r   lhsrhsr   r   r   r      s   zGluonSemantic.splitdimsc                    s   t  ||}| |S r   )r   permuteru   )r]   r   r   r'   r   r   r   r      s   
zGluonSemantic.permuter=   c                    s   t t jtj fdd  j t ttkfdd kr) S tD ]#\}}| |krP|dkrPtd|  d| d| d d	 
q-t jj	 jj
}| j j|| j}| ||S )
Nc                      r   r   r5   r   r   r   r   r*      r   z4GluonSemantic.broadcast_impl_shape.<locals>.<lambda>c                         d d  S )Nrx   ry   r   r   )r=   	src_shaper   r   r*          r   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension rz   ry   )r   r   r6   rl   rm   get_block_shapesrJ   rO   r{   rt   rQ   rf   create_broadcastrp   to_irro   )r]   r   r=   r"   itemret_tyrp   r   )r   r=   r   r   broadcast_impl_shape   s,   

 z"GluonSemantic.broadcast_impl_shaper   r   c                    s  |j  |j   r st ||S tt tj fdd tttjfdd   } }| 	||}t j
t}tj
t}|rU|sU| |j
}n|ra|sa| | j
}n j
j
krstd j
 dj
 | ||}| ||}||fS )Nc                      
   d S )Nz@expected broadcast left input to be a distributed_type but got: r   r   )lhs_tyr   r   r*      r4   z4GluonSemantic.broadcast_impl_value.<locals>.<lambda>c                      r   )NzAexpected broadcast right input to be a distributed_type but got: r   r   )rhs_tyr   r   r*      r4   zLayout mismatch in broadcast: z vs )r6   is_blockr   r   r   r   rl   rm   r   r   rQ   r
   set_auto_layoutr{   r   )r]   r   r   rv   rw   r|   is_lhs_autois_rhs_autor   )r   r   r   r      s0   

z"GluonSemantic.broadcast_impl_valuec                    s:   || g}|d u rt  }ttj||}t j|||dS )N)r   )r
   rl   rm   int32r   arange)r]   startendrQ   r=   r   r   r   r   r      s
   
zGluonSemantic.aranger   can_reorderc                    s*   t | dd  t |||}| |S )Nc                   S   r)   )Nz%can_reorder is not supported in gluonr   r   r   r   r   r*      r+   z'GluonSemantic.reshape.<locals>.<lambda>)r   r   reshaperu   )r]   r   r   r   r'   r   r   r   r      s   
zGluonSemantic.reshapec                 C   sD   t |dkr|S t|j||}| j|| j|j}t||S )Nr   )	rJ   rl   rm   dtyperf   create_splatr   rp   ro   )r]   r'   r=   rQ   r   rp   r   r   r   splat   s
   zGluonSemantic.splatc                 C   s(   |  ||}|d u rt }| |||S r   )make_scalarr
   r   )r]   r=   r'   r   rQ   rt   r   r   r   full   s   zGluonSemantic.fullFc                    s   |j tttjfdd tt tj fdd tjj }|| j	}|rR| j	
||jsRtdj d  d| jj d|  j | j	||j}t||S )Nc                      r   )Nz@expected convert_layout input to be a distributed_type but got: r   r   )rr   r   r   r*      r4   z.GluonSemantic.convert_layout.<locals>.<lambda>c                      r2   Nz4expected 'layout' to be a DistributedLayout but got r   r   r   r   r   r*      r4   zlayout conversion from z to z) is not trivial.
The linear layouts are:

)r6   r   r   rl   rm   r   rP   r=   r   rf   is_convert_layout_trivialrp   	TypeErrorrQ   to_linear_layoutcreate_convert_layoutro   )r]   r'   rQ   assert_trivialr   	ret_ty_irrp   r   )rQ   rr   r   convert_layout   s"   

zGluonSemantic.convert_layoutc                    s   t t tj fdd t tfdd t ttjfdd t }|d ur>| j|	| j|j
}n
| j|	| j}t| S )Nc                      r2   )Nz,expected 'element_ty' to be a dtype but got r   r   )rP   r   r   r*      r4   z/GluonSemantic.allocate_shared.<locals>.<lambda>c                      r2   Nz1all elements of 'shape' must be integers but got r   r   r<   r   r   r*      r4   c                      r2   Nz/expected 'layout' to be a SharedLayout but got r   r   r   r   r   r*      r4   )r   r   rl   r   r(   r   shared_memory_descriptor_typerf   create_local_allocr   rp   shared_memory_descriptor)r]   rP   r=   rQ   r'   rr   rp   r   )rP   rQ   r=   r   allocate_shared   s   
zGluonSemantic.allocate_sharedc                    sP   t t tj fdd t|j|j }| j|	| j|j
}t||S )Nc                      r2   r   r   r   r   r   r   r*     r4   z+GluonSemantic.shared_load.<locals>.<lambda>)r   r   rl   r   rm   r   r=   rf   create_local_loadr   rp   ro   )r]   mem_descrQ   r   rp   r   r   r   shared_load  s   
zGluonSemantic.shared_loadc                    sh   t ttjfdd t j jk fdd t j jk fdd | j jj d S )Nc                         dt   S )Nz+expected 'value' to be a tensor, but got a r5   r   r&   r   r   r*     r9   z,GluonSemantic.shared_store.<locals>.<lambda>c                         dj  d j  dS )Nzsource shape z and destination shape  must matchr<   r   r   r'   r   r   r*     r@   c                      r   )Nzsource dtype z and destination dtype r   r   r   r   r   r   r*     r@   )	r   r   rl   ro   r=   r   rf   create_local_storerp   )r]   r   r'   r   r   r   shared_store  s   zGluonSemantic.shared_storec                 C   s   t |tjstdt| t |tjstdt| |j|jkr1td|j d|j d|j|jkrDtd|j d|j d|j|j	t
|j d  kr^td	|j d
|j	 |j| j}|j| j}| j||t|j|jjS )NzIbank_conflicts expects the register layout to be a distributed_type, got zTbank_conflicts expects the shared layout to be a shared_memory_descriptor_type, got zregister shape z and shared shape r   z$mismatched dtypes between register (z) and shared (z	) layoutsz,bank_conflicts NYI for subslices. Got shape z and alloc_shape )r   rl   rm   r   r6   r   r=   r{   rP   alloc_shaperJ   rQ   _to_irrf   get_shared_bank_conflictsrI   rM   )r]   distr_ty	shared_tyreg_attrshared_attrr   r   r   bank_conflicts  s.   zGluonSemantic.bank_conflictsc                    sl   t t ttf fdd t|tst|}t  t ttfr(t	 S t	| j
 | j
|S )Nc                      r   )Nz2Expected a DistributedLayout or SharedLayout, got r5   r   r   r   r   r*   1  r9   z0GluonSemantic.to_linear_layout.<locals>.<lambda>)r   r   r   r   rI   rl   r   r
   r   	constexprrf   r   r   )r]   rQ   r=   r   r   r   r   /  s   



zGluonSemantic.to_linear_layoutc                 C   s   | j |j d S r   )rf   create_local_deallocrp   )r]   r   r   r   r   shared_dealloc=  s   zGluonSemantic.shared_deallocc                    sn   j }tt t fdd tt|jtfdd | j | jj	}t
|j|j }| ||S )Nc                      r2   )Nz9set_auto_layout must set to a distributed layout but got r   r   r   r   r   r*   C  r4   z/GluonSemantic.set_auto_layout.<locals>.<lambda>c                      s   d j j S )Nz4set_auto_layout input must have auto layout but got r6   rQ   r   r&   r   r   r*   E  r9   )r6   r   r   r   rQ   r
   rf   create_set_auto_layoutr   rp   rl   rm   rP   r=   ro   )r]   r'   rQ   src_tyrp   res_tyr   )rQ   r'   r   r   @  s   


zGluonSemantic.set_auto_layoutc                    s   t ttfdd t ttfdd t t t fdd dg|j }| < t|j}| < |j}t|j	|||j
j}| j}	|	||	|j|}
tj|
fi |jS )Nc                      r2   )Nz&expected 'start' to be an int but got r   r   )r   r   r   r*   K  r4   z-GluonSemantic.memdesc_slice.<locals>.<lambda>c                      r2   )Nz'expected 'length' to be an int but got r   r   )lengthr   r   r*   L  r4   c                      r2   )Nz$expected 'dim' to be an int but got r   r   r   r   r   r*   M  r4   r   )r   r   r   rS   rI   r=   rQ   rl   r   r   r6   r   rf   create_memdesc_subslicer   rp   r   __dict__)r]   r   r   r   r;   offsetsr=   rQ   rr   rf   rp   r   )r;   r   r   r   memdesc_sliceJ  s   
zGluonSemantic.memdesc_slicec                    s   |    t jtjk fdd |jdd  }|   j |j}t|j	|||}| j
}||||j }tj|fi |jS )Nc                      s   d j  S )Nz%expected 'index' to be int32 but got r5   r   indexr   r   r*   Z  r   z-GluonSemantic.memdesc_index.<locals>.<lambda>r   )	to_tensorr   r6   rl   r   r=   rp   rQ   r   r   rf   create_memdesc_indexr   r   r   )r]   r   r   r=   rQ   rr   rf   rp   r   r   r   memdesc_indexX  s   
zGluonSemantic.memdesc_indexc                    s   t tfdd t ttjkfdd fddD }jj  d t j  }| fddD 7 }| jj	}| j
|}tj|j|||dS )Nc                      r2   )Nz1all elements of 'order' must be integers but got r   r   )orderr   r   r*   d  r4   z-GluonSemantic.memdesc_trans.<locals>.<lambda>c                      s   d j  dt dS )Nzsource rank (z) and order length (z) must match)rS   rJ   r   )r   r   r   r   r*   g      c                    s   g | ]} j | qS r   r<   r    )r   r   r   r   i  r@   z/GluonSemantic.memdesc_trans.<locals>.<listcomp>c                    s&   g | ]} t  j d  | qS r   )rJ   rS   r    )r   r   r   r   r   l  s   & rP   r=   r   rQ   )r   r(   rJ   r=   r6   r   rS   rf   create_memdesc_transrp   get_gluon_layout_from_memdescrl   r   r   )r]   r   r   r=   new_alloc_shaperp   rQ   r   )r   r   r   r   memdesc_transc  s   zGluonSemantic.memdesc_transc                    s   t tfdd t tt jk fdd | j j}| j|} j	j
}t| j }|d | t }tj| j||dS )Nc                      r2   r   r   r   r<   r   r   r*   t  r4   z/GluonSemantic.memdesc_reshape.<locals>.<lambda>c                      s   d j  d S )Nz)memdesc_reshape total elements mismatch: z -> r<   r   r   r=   r   r   r*   w  s
    r   )r   r(   mathprodr=   rf   create_memdesc_reshaperp   r   r6   r   rJ   rS   rI   rl   r   r   )r]   r   r=   rp   rQ   r   
prefix_lenr   r   r   r   memdesc_reshapes  s"   zGluonSemantic.memdesc_reshapec                    s   t t tj fdd t tfdd t ttjfdd t }| j|	| j|j
}tj|fi |jS )Nc                      r2   )Nz'expected 'dtype' to be a dtype but got r   r   r   r   r   r*     r4   z3GluonSemantic.memdesc_reinterpret.<locals>.<lambda>c                      r2   r   r   r   r<   r   r   r*     r4   c                      r2   r   r   r   r   r   r   r*     r4   )r   r   rl   r   r(   r   r   rf   create_memdesc_reinterpretr   rp   r   r   )r]   r   r   r=   rQ   rr   rp   r   )r   rQ   r=   r   memdesc_reinterpret  s   
z!GluonSemantic.memdesc_reinterpretc                 C   s$   |r
t |||}n|}| ||S r   )rl   rm   ro   )r]   r   rq   r|   rQ   r   r   r   r   wrap_tensor  s   zGluonSemantic.wrap_tensorc                    sl   | D ]t tjtjfdd qdd | D d  t t fdddd  D fd	d d S )
Nc                      r   Nz#expected distributed_type but got: r5   r   )r   r   r   r*     r   z2GluonSemantic._check_same_layout.<locals>.<lambda>c                 S   s   g | ]}|j jqS r   r   r   r   r   r   r         z4GluonSemantic._check_same_layout.<locals>.<listcomp>r   c                 3   s    | ]}| kV  qd S r   r   )r!   l)l0r   r   r#     s    z3GluonSemantic._check_same_layout.<locals>.<genexpr>r   c                      r2   )Nz3Expected inputs to have matching layouts, but got: r   r   )layoutsr   r   r*     r4   )r   r   r6   rl   rm   r%   )xsr   )r   r   r   r   _check_same_layout  s   
z GluonSemantic._check_same_layoutinputsreverse.c                    s    d j jt}| |  kr|k s!n J d| d| d|dk r)||7 } D ]}|j jks7J dq+jdd  D |||  sOJ t fdd	tt D S )
Nr   z
scan axis z must be < inputs rank ()z(all scan inputs must have the same shapec                 S      g | ]}|j qS r   rp   r!   tr   r   r   r     r   z2GluonSemantic.associative_scan.<locals>.<listcomp>c                 3   s,    | ]} | | jjV  qd S r   rs   
get_resultr6   rt   r    r  scan_opr]   r=   r   r   r#     
    
z1GluonSemantic.associative_scan.<locals>.<genexpr>)r6   r=   rJ   rf   create_scanverifytuplerange)r]   r  r   region_builder_fnr  rS   r  r   r  r   associative_scan  s   .
zGluonSemantic.associative_scanc                    s    d u rt fddD d d jjttd   ko%k n   fdd   fddtD tfddD sNJ d	j	d
d D  | 
 sdJ t fddttD S )Nc                 3   s&    | ]} j ||jjgd dV  qdS )F)r   N)r   numelr'   r  ra   r   r   r#     s   $ z*GluonSemantic.reduction.<locals>.<genexpr>r   c                      r   )Nz/expected reduction axis to be in the range [0, z
) but got r   r   r   rS   r   r   r*     r   z)GluonSemantic.reduction.<locals>.<lambda>c                    s   g | ]
\}}| kr|qS r   r   )r!   r"   sr   r   r   r     s    z+GluonSemantic.reduction.<locals>.<listcomp>c                 3   s    | ]	}|j j kV  qd S r   )r6   r=   r  r<   r   r   r#     s    z-all reduction inputs must have the same shapec                 S   r  r   r  r  r   r   r   r     r   c                 3   s,    | ]} | | jjV  qd S r   r	  r    )r  	reduce_opr|   r]   r   r   r#     r  )r  r6   r=   rJ   r   r  rO   r%   rf   create_reducer  r  )r]   r  r   r  r   )r   r  rS   r  r|   r]   r=   r   	reduction  s   (

zGluonSemantic.reductionnum_binsmaskc                 C   s   t t|jdkdd  t |j dd  t |d udd  |d ur9| ||\}}t |jj dd  |j	}|
| j}| j|j	|||}| |tj|g|S )Nr   c                   S   r)   )Nz histogram only supports 1D inputr   r   r   r   r   r*     r+   z)GluonSemantic.histogram.<locals>.<lambda>c                   S   r)   )Nz%histogram only supports integer inputr   r   r   r   r   r*     r+   c                   S   r)   )Nz'histogram requires a destination layoutr   r   r   r   r   r*     r+   c                   S   r)   )Nz"Mask must have boolean scalar typer   r   r   r   r   r*     r+   )r   rJ   r=   r   is_intr   r6   rt   is_boolrp   r   rf   create_histogramr   rl   r   )r]   r   r  r  rQ   layout_attrrp   r   r   r   	histogram  s   zGluonSemantic.histogramc              	   C   s   t |d udd  t |dd  t t|jdkdd  t|jj|jd |jd  g|}| | j	|j
|j
|| j|S )Nc                   S   r)   )Nz!cat requires a destination layoutr   r   r   r   r   r*     r+   z#GluonSemantic.cat.<locals>.<lambda>c                   S   r)   )Nz;current implementation of `cat` always may reorder elementsr   r   r   r   r   r*     r+   r   c                   S   r)   )Nzcat requires a rank-1 inputr   r   r   r   r   r*     r+   r   )r   rJ   r=   rl   rm   r6   rt   ro   rf   
create_catrp   r   )r]   r   r   r   rQ   ret_typer   r   r   cat  s
   $$zGluonSemantic.catsrcr   c                    s   t tjtjfdd t tjtjfdd t jj fdd tjjt tjjkdd  t     koGk n   fdd  dk rY 7  t	D ]}| krdq]t jj| jj| k fdd q]| j
jj }| |jjjjjjS )	Nc                      r   r   r5   r   )r%  r   r   r*     r   z&GluonSemantic.gather.<locals>.<lambda>c                      r   r   r5   r   r   r   r   r*     r   c                      s   d j jS )Nz&expected integer scalar type but got: )r6   rt   r   r   r   r   r*     r9   c                   S   r)   )Nz0source and index tensors must have the same rankr   r   r   r   r   r*     r+   c                      s   d  d dS )Nzgather axis z must be < source rank (r  r   r   r  r   r   r*     r   r   c                      s   d  dS )Nz
index dim z( must match the corresponding source dimr   r   r  r   r   r*     r   )r   r   r6   rl   rm   rt   r  rJ   r=   r  rf   create_gatherrp   r   rQ   )r]   r%  r   r   dgatherr   )r   r   rS   r%  r   r(    s&   
*
zGluonSemantic.gatherc                 C   sD   | j |j|| j |}t|jj}||  d9  < | |||S )Nr>   )rf   create_fp4_to_fprp   r   rI   r6   r=   rs   )r]   r%  	elem_typer   resultr=   r   r   r   	fp4_to_fp  s   zGluonSemantic.fp4_to_fpworker_num_warpsworker_num_regsc                    sb  |D ]\}t tttjffdd qt|dks J d|d \}}t|d }|dd  }	|t|ksEJ d| dt| d|t|ksXJ d| dt| d	| j}
|
 }|
 }|
| |j	||i d
}g }|d urzt
|}|
| dd |D }dd |	D }t|g }|
| |
||| | | |
 g  |
|}dd |D }d t|	D ]E\}\}t|| d}|
||||| } fddtt|D }t|dd D }|j	||i |d |
   t|7  q|
  fddtt|D }|d u r%d S tt|dd |D S )Nc                      r   )Nz9function arguments must be a tuple of arguments, but got r5   r   )argsr   r   r*     r9   z/GluonSemantic.warp_specialize.<locals>.<lambda>r   z8expected at least one function for the default partitionr   zwarp specialize got z partitions but z warp countsz register counts)kwargsc                 S      g | ]}|  qS r   get_typer!   rr   r   r   r     r   z1GluonSemantic.warp_specialize.<locals>.<listcomp>c                 S   s   g | ]\}}t |qS r   )r   )r!   _r/  r   r   r   r     r   c                 S   r1  r   r2  r!   argr   r   r   r   $  r   r7   c                    s   g | ]	}  | qS r   )get_argument)r!   j)arg_itblockr   r   r   *  s    c                 S   r  r   r5   r7  r   r   r   r   +  r   )r0  caller_contextc                    s   g | ]}  |qS r   )r
  r    )ws_opr   r   r   1  r@   c                 S   r  r   r5   r4  r   r   r   r   4  r   )r   r   r  rl   rJ   rf   get_insertion_point	new_blockset_insertion_point_to_startcall_JitFunctionr   create_warp_yieldsumrestore_insertion_pointcreate_warp_specializeget_default_region	push_backset_requested_registerscreate_block_with_parentget_partition_op_holder!create_warp_specialize_partitionsrO   r[   
get_regionr  r   create_warp_returnset_insertion_point_afterget_operation)r]   functions_and_argsr-  r.  	generatorr6  default_partitiondefault_argsnum_partitionsworkersrf   	insert_ptdefault_blockdefault_resultsmlir_resultsresult_typesworker_args	mlir_argspartitions_op	arg_typesr"   funcr=  
block_argsr   )r;  r/  r<  r>  r   warp_specialize  sh   







zGluonSemantic.warp_specializec                 C   s   t | jjjS r   )rl   r   rf   optionsnum_ctasra   r   r   r   rd  6  s   zGluonSemantic.num_ctasc                 C   s8   |j d urt|j tsJ t|j jS t| jjjS r   )r=  r   r[   rl   r   r8   rf   rc  )r]   rR  r   r   r   r8   9  s   
zGluonSemantic.num_warps)F)6rh   ri   rj   rl   ro   langr   __annotations__r^   rs   ru   r   r   r   r   r   r   r   r   r   r   r   r   boolr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr  r   r  r  r!  r$  r(  r,  rb  rd  r8   __classcell__r   r   r   r   rk   a   s^   
 

	


"<rk   r   )#typingr   r   r   r   r   r   triton.language.semanticr    r	   rl   _layoutsr
   r   r   r   r   r   triton._C.libtriton.gluon_irr   r   triton.compiler.code_generatorr   r   r   r{   rg  rH   r   r(   rZ   __triton_builtin__r[   rk   r   r   r   r   <module>   s     
=