#include <nnstreamer_log.h>
#if defined(__aarch64__)
+#include <arm_neon.h>
+
#define NEON64_ENABLED
+#define GRAYSCALE_HEX (0x00010101)
+#define ALPHA_HEX (0xFF000000)
#endif
#define DEFAULT_LABELS (20)
guint height; /**< Input video height */
GRand *rand; /**< random value generator */
+ guint rgb_modifier; /**< rgb modifier according to # labels */
} image_segments;
/** @brief tensordec-plugin's GstTensorDecoderDef callback */
idata->max_labels = DEFAULT_LABELS;
idata->segment_map = NULL;
idata->color_map = NULL;
+ idata->rgb_modifier = 0;
return TRUE;
}
guint i;
idata->color_map[0] = 0; /* background */
+
+#if defined (NEON64_ENABLED)
+ idata->rgb_modifier = 0xFFFFFF / (idata->max_labels + 1);
+ for (i = 1; i <= idata->max_labels; i++) {
+ /* colors should be the same with neon calculations */
+ idata->color_map[i] = idata->rgb_modifier * i;
+ ((guint8 *)&idata->color_map[i])[3] = '\xff'; /* alpha */
+ }
+#else
for (i = 1; i <= idata->max_labels; i++) {
/* any color value would be acceptable */
idata->color_map[i] = g_rand_int_range (idata->rand, 0x101010, 0xFFFFFF);
((guint8 *)&idata->color_map[i])[3] = '\xff'; /* alpha */
}
+#endif
}
/** @brief tensordec-plugin's GstTensorDecoderDef callback */
guint num_pixels = idata->height * idata->width;
guint label_idx, idx = 0;
+#if defined (NEON64_ENABLED)
+ float32x4_t v_src_float;
+
+ uint32x4_t v_src_uint;
+ uint32x4_t v_magic;
+ uint32x4_t v_mask;
+ uint32x4_t v_alpha;
+ uint32x4_t v_zero;
+
+ guint num_lanes = 4;
+
+ v_magic = vdupq_n_u32 (idata->rgb_modifier);
+ v_alpha = vdupq_n_u32 (ALPHA_HEX);
+ v_zero = vdupq_n_u32 (0);
+
+ for (idx = 0; idx < num_pixels; idx += num_lanes) {
+ /* load float32 vector */
+ v_src_float = vld1q_f32 (input);
+ input += num_lanes;
+
+ /* convert float32 vector to uint32 vector */
+ v_src_uint = vcvtq_u32_f32 (v_src_float);
+
+ /* multiply by magic number to fill RGB values */
+ v_src_uint = vmulq_u32 (v_src_uint, v_magic);
+
+ /* check whether the label is zero (i.e., background) */
+ v_mask = vceqq_u32 (v_src_uint, v_zero);
+ v_mask = vbslq_u32 (v_mask, v_zero, v_alpha);
+
+ /* set the alpha value unless it's background */
+ v_src_uint = vorrq_u32 (v_src_uint, v_mask);
+
+ /* store uint32 vector */
+ vst1q_u32 (output, v_src_uint);
+ output += num_lanes;
+ }
+
+ if (num_pixels == idx)
+ return;
+
+ /* handle remaining data */
+ input = (float *) idata->segment_map;
+ output = (uint32_t *) out_info->data;
+ idx -= num_lanes;
+#endif
for (; idx < num_pixels; idx++) {
label_idx = (guint) input[idx];
guint num_pixels = idata->height * idata->width;
guint idx = 0;
+#if defined (NEON64_ENABLED)
+ float32x4_t v_src, v_max;
+ guint num_lanes = 4;
+
+ v_max = vdupq_n_f32 (0);
+
+ /* find the maximum value per lane */
+ for (idx = 0; idx < num_pixels; idx += num_lanes) {
+ v_src = vld1q_f32 (input);
+ input += num_lanes;
+
+ v_max = vmaxq_f32 (v_src, v_max);
+ }
+
+ /* find the maximum value among all lanes */
+ gray_max = MAX (gray_max, vgetq_lane_f32 (v_max, 0));
+ gray_max = MAX (gray_max, vgetq_lane_f32 (v_max, 1));
+ gray_max = MAX (gray_max, vgetq_lane_f32 (v_max, 2));
+ gray_max = MAX (gray_max, vgetq_lane_f32 (v_max, 3));
+
+ if (num_pixels == idx)
+ return gray_max;
+
+ /* handle remaining data */
+ input = idata->segment_map;
+ idx -= num_lanes;
+#endif
for (; idx < num_pixels; idx++)
gray_max = MAX (gray_max, input [idx]);
if (G_UNLIKELY (max_grayscale == 0.0))
return;
+#if defined (NEON64_ENABLED)
+ {
+ float32x4_t v_src_float;
+ float32x4_t v_max_gray;
+ float32x4_t v_max_rgb;
+
+ uint32x4_t v_src_uint;
+ uint32x4_t v_magic;
+ uint32x4_t v_alpha;
+
+ guint num_lanes = 4;
+
+ v_max_gray = vdupq_n_f32 (max_grayscale);
+ v_max_rgb = vdupq_n_f32 (MAX_RGB);
+ v_magic = vdupq_n_u32 (GRAYSCALE_HEX);
+ v_alpha = vdupq_n_u32 (ALPHA_HEX);
+
+ for (idx = 0; idx < num_pixels; idx += num_lanes) {
+ /* load float32 vector */
+ v_src_float = vld1q_f32 (input);
+ input += num_lanes;
+
+ /* normalized_gray = (gray / max_gray) x max_rgb */
+ v_src_float = vdivq_f32 (v_src_float, v_max_gray);
+ v_src_float = vmulq_f32 (v_src_float, v_max_rgb);
+
+ /* convert float32 vector to uint32 vector */
+ v_src_uint = vcvtq_u32_f32 (v_src_float);
+
+ /* multiply by magic number to fill the same RGB values */
+ v_src_uint = vmulq_u32 (v_src_uint, v_magic);
+ v_src_uint = vaddq_u32 (v_src_uint, v_alpha);
+
+ /* store uint32 vector */
+ vst1q_u32 (output, v_src_uint);
+ output += num_lanes;
+ }
+
+ if (num_pixels == idx)
+ return;
+
+ /* handle remaining data */
+ input = idata->segment_map;
+ output = (uint32_t *) out_info->data;
+ idx -= num_lanes;
+ }
+#endif
for (; idx < num_pixels; idx++) {
/* normalize grayscale values to RGB_MAX */
grayscale = (guint) ((input[idx] / max_grayscale) * MAX_RGB);