25 #ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ 26 #define __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ 36 const float32x4x3_t r =
40 vld1q_dup_f32(1 + ptr),
41 vld1q_dup_f32(2 + ptr)
/**
 * Primary template declaration of the 3x3 convolution helper.
 *
 * Only the declaration lives here; the behaviour is provided by the explicit
 * specializations for the template arguments 1, 2 and 3 found further down.
 *
 * @tparam stridex Compile-time selector for the specialization
 *                 (presumably the horizontal stride — TODO confirm against callers).
 *
 * @param in_top               Pointer to the float data of the first input row.
 * @param in_mid               Pointer to the float data of the second input row.
 * @param in_low               Pointer to the float data of the third input row.
 * @param m0                   Three vectors multiplied with values taken from in_top.
 * @param m1                   Three vectors multiplied with values taken from in_mid.
 * @param m2                   Three vectors multiplied with values taken from in_low.
 * @param fixed_point_position Forwarded unchanged by the <2> and <3> specializations to <1>.
 *
 * @return Two 4-lane float vectors holding the convolution results.
 */
47 template <unsigned int stridex>
48 float32x4x2_t
convolve_3x3(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2,
int fixed_point_position);
/**
 * Specialization of convolve_3x3 for template argument 1.
 *
 * For each of the three input rows, the row vector and the same data shifted
 * by one and by two lanes (via vextq_f32) are multiplied by val[0], val[1]
 * and val[2] of the matching coefficient set and accumulated. out.val[0] is
 * built from the val[0]/val[1] row vectors, out.val[1] from val[1]/val[2].
 *
 * NOTE(review): this listing omits several original lines (braces, parts of
 * the vtop/vmid/vlow initializers, the declaration of `out` and the return
 * statement). The comments below describe only what is visible.
 *
 * @param in_top               Pointer to the float data of the first input row.
 * @param in_mid               Pointer to the float data of the second input row.
 * @param in_low               Pointer to the float data of the third input row.
 * @param m0                   Multipliers applied to the in_top data (val[0..2] for shift 0, 1, 2).
 * @param m1                   Multipliers applied to the in_mid data (val[0..2] for shift 0, 1, 2).
 * @param m2                   Multipliers applied to the in_low data (val[0..2] for shift 0, 1, 2).
 * @param fixed_point_position Not referenced by any visible line of this block.
 *
 * @return float32x4x2_t (presumably `out`; the return statement is not visible — TODO confirm).
 */
51 inline float32x4x2_t
convolve_3x3<1>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2,
int fixed_point_position)
// Row vectors for the first input row; only the load at offset +4 is visible here.
55 const float32x4x3_t vtop =
59 vld1q_f32(in_top + 4),
// Row vectors for the second input row; only the load at offset +4 is visible here.
63 const float32x4x3_t vmid =
67 vld1q_f32(in_mid + 4),
// Row vectors for the third input row; only the load at offset +4 is visible here.
71 const float32x4x3_t vlow =
75 vld1q_f32(in_low + 4),
// Unshifted first-row products with m0.val[0]; presumably the two initial
// elements of `out` (its declaration is missing from the listing — TODO confirm).
82 vmulq_f32(vtop.val[0], m0.val[0]),
83 vmulq_f32(vtop.val[1], m0.val[0])
// out.val[0], first row: add the data shifted by 1 and 2 lanes.
86 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
87 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
// out.val[0], second row: shifts 0, 1 and 2.
89 out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
90 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
91 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
// out.val[0], third row: shifts 0, 1 and 2.
93 out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
94 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
95 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
// out.val[1], first row: same pattern, one vector further along (val[1]/val[2]).
97 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
98 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
// out.val[1], second row: shifts 0, 1 and 2.
100 out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
101 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
102 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
// out.val[1], third row: shifts 0, 1 and 2.
104 out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
105 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
106 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
/**
 * Specialization of convolve_3x3 for template argument 2.
 *
 * Runs the <1> specialization with the same arguments and then compacts the
 * result: lanes 1, 2 and 3 of out.val[0] are overwritten with out.val[0]
 * lane 2, out.val[1] lane 0 and out.val[1] lane 2. Lane 0 is left as is, so
 * out.val[0] ends up holding every second value of the <1> result.
 * out.val[1] is not modified by the visible lines.
 *
 * NOTE(review): braces and the return statement are missing from this listing.
 *
 * @param in_top               Forwarded to convolve_3x3<1>.
 * @param in_mid               Forwarded to convolve_3x3<1>.
 * @param in_low               Forwarded to convolve_3x3<1>.
 * @param m0                   Forwarded to convolve_3x3<1>.
 * @param m1                   Forwarded to convolve_3x3<1>.
 * @param m2                   Forwarded to convolve_3x3<1>.
 * @param fixed_point_position Forwarded to convolve_3x3<1>.
 *
 * @return float32x4x2_t (presumably `out` — the return statement is not visible).
 */
111 inline float32x4x2_t
convolve_3x3<2>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2,
int fixed_point_position)
// Start from the full <1> result.
113 float32x4x2_t out =
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
// out.val[0][1] <- out.val[0][2]
114 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
// out.val[0][2] <- out.val[1][0]
115 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
// out.val[0][3] <- out.val[1][2]
116 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
/**
 * Specialization of convolve_3x3 for template argument 3.
 *
 * Runs the <1> specialization with the same arguments and then copies lane 3
 * of out.val[0] into lane 1, so the first two lanes of out.val[0] hold the
 * first and fourth value of the <1> result. The remaining lanes and
 * out.val[1] are not modified by the visible lines.
 *
 * NOTE(review): braces and the return statement are missing from this listing.
 *
 * @param in_top               Forwarded to convolve_3x3<1>.
 * @param in_mid               Forwarded to convolve_3x3<1>.
 * @param in_low               Forwarded to convolve_3x3<1>.
 * @param m0                   Forwarded to convolve_3x3<1>.
 * @param m1                   Forwarded to convolve_3x3<1>.
 * @param m2                   Forwarded to convolve_3x3<1>.
 * @param fixed_point_position Forwarded to convolve_3x3<1>.
 *
 * @return float32x4x2_t (presumably `out` — the return statement is not visible).
 */
121 inline float32x4x2_t
convolve_3x3<3>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2,
int fixed_point_position)
// Start from the full <1> result.
123 float32x4x2_t out =
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
// out.val[0][1] <- out.val[0][3]
124 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
/**
 * Stores a float32x4x2_t array into a memory location.
 *
 * Primary template declaration only; the amount of data written is decided
 * by the explicit specializations for the template arguments 1, 2 and 3.
 *
 * @tparam stridex Compile-time selector for the specialization
 *                 (presumably the horizontal stride — TODO confirm against callers).
 *
 * @param buffer Destination float buffer.
 * @param values Pair of 4-lane float vectors to store.
 */
128 template <unsigned int stridex>
129 void store_results(
float *buffer,
const float32x4x2_t &values);
134 vst1q_f32(buffer, values.val[0]);
135 vst1q_f32(buffer + 4, values.val[1]);
141 vst1q_f32(buffer, values.val[0]);
147 vst1_f32(buffer, vget_low_f32(values.val[0]));
150 template <
unsigned int str
idex>
156 return num_elems_written_per_iteration;
162 return num_elems_written_per_iteration << 1;
168 return num_elems_written_per_iteration * 3;
float32x4x2_t convolve_3x3< 2 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
float32x4x2_t convolve_3x3< 1 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
int get_input_num_elems_processed< 3 >(unsigned int num_elems_written_per_iteration)
This file contains all available output stages for GEMMLowp on OpenCL.
int get_input_num_elems_processed< 2 >(unsigned int num_elems_written_per_iteration)
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
void store_results(float *buffer, const float32x4x2_t &values)
Stores a float32x4x2_t array into a memory location.
float32x4x2_t convolve_3x3< 3 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
void store_results< 3 >(float *buffer, const float32x4x2_t &values)
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
void store_results< 1 >(float *buffer, const float32x4x2_t &values)
float32x4x3_t load_matrix_row(const float *ptr)
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration)
Get the number of elements processed on 3x3 convolution.
int get_input_num_elems_processed< 1 >(unsigned int num_elems_written_per_iteration)
void store_results< 2 >(float *buffer, const float32x4x2_t &values)