-<tr class="memitem:acd423b6f992354e7c00137b20d687281"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#acd423b6f992354e7c00137b20d687281">gemm_interleave4x4_f32</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
-<tr class="memdesc:acd423b6f992354e7c00137b20d687281"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#acd423b6f992354e7c00137b20d687281">More...</a><br /></td></tr>
-<tr class="separator:acd423b6f992354e7c00137b20d687281"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:ae333c12d780666b2591f7c8e9faaf5a9"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#ae333c12d780666b2591f7c8e9faaf5a9">gemm_interleave4x4_f16</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
-<tr class="memdesc:ae333c12d780666b2591f7c8e9faaf5a9"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#ae333c12d780666b2591f7c8e9faaf5a9">More...</a><br /></td></tr>
-<tr class="separator:ae333c12d780666b2591f7c8e9faaf5a9"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:a830ba1cc0ad3c8cffa4a14424b2d0411"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#a830ba1cc0ad3c8cffa4a14424b2d0411">gemm_interleave4x4_u8</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
-<tr class="memdesc:a830ba1cc0ad3c8cffa4a14424b2d0411"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#a830ba1cc0ad3c8cffa4a14424b2d0411">More...</a><br /></td></tr>
-<tr class="separator:a830ba1cc0ad3c8cffa4a14424b2d0411"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a560b2d50aa886edc0f5daf4fe729717f"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#a560b2d50aa886edc0f5daf4fe729717f">gemm_interleave4x4_32bit</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
+<tr class="memdesc:a560b2d50aa886edc0f5daf4fe729717f"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#a560b2d50aa886edc0f5daf4fe729717f">More...</a><br /></td></tr>
+<tr class="separator:a560b2d50aa886edc0f5daf4fe729717f"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a2f32d740e780059f68da8aa589ed0a5b"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#a2f32d740e780059f68da8aa589ed0a5b">gemm_interleave4x4_16bit</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
+<tr class="memdesc:a2f32d740e780059f68da8aa589ed0a5b"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#a2f32d740e780059f68da8aa589ed0a5b">More...</a><br /></td></tr>
+<tr class="separator:a2f32d740e780059f68da8aa589ed0a5b"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:aa66a7b6b0420e54ec173743e6c5bfc45"><td class="memItemLeft" align="right" valign="top">__kernel void </td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_8cl.xhtml#aa66a7b6b0420e54ec173743e6c5bfc45">gemm_interleave4x4_8bit</a> (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)</td></tr>
+<tr class="memdesc:aa66a7b6b0420e54ec173743e6c5bfc45"><td class="mdescLeft"> </td><td class="mdescRight">This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values. <a href="#aa66a7b6b0420e54ec173743e6c5bfc45">More...</a><br /></td></tr>
+<tr class="separator:aa66a7b6b0420e54ec173743e6c5bfc45"><td class="memSeparator" colspan="2"> </td></tr>