From 998261202191ff08039f573da9480af762aafe7e Mon Sep 17 00:00:00 2001
From: Ramin Zaghi <ramin@arm.com>
Date: Tue, 8 May 2012 11:10:30 +0000
Subject: [PATCH] Add the DST==SRC condition test case to all the applicable
 units.

---
 headers/unit_test_abs_operation_x.h        | 31 +++++++++
 headers/unit_test_common.h                 |  4 +-
 headers/unit_test_invmat_operation_x.h     | 30 ++++++++
 headers/unit_test_mla_operation_x.h        | 54 +++++++++++++++
 headers/unit_test_mlac_operation_x.h       | 55 +++++++++++++++
 headers/unit_test_mulcmatvec_operation_x.h | 29 ++++++++
 headers/unit_test_normalize_operation_x.h  | 31 +++++++++
 headers/unit_test_x_operation_x.h          | 35 ++++++++++
 headers/unit_test_x_operation_x_tolerant.h | 81 ++++++++++++++++++----
 headers/unit_test_xc_operation_x.h         | 35 ++++++++++
 headers/unit_test_xmat_operation_x.h       | 33 +++++++++
 11 files changed, 403 insertions(+), 15 deletions(-)

diff --git a/headers/unit_test_abs_operation_x.h b/headers/unit_test_abs_operation_x.h
index 94d4ce7..cb2d832 100644
--- a/headers/unit_test_abs_operation_x.h
+++ b/headers/unit_test_abs_operation_x.h
@@ -52,6 +52,11 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -74,6 +79,32 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+
+  // test the especial case where dst == src
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
diff --git a/headers/unit_test_common.h b/headers/unit_test_common.h
index 1999075..cd62a11 100644
--- a/headers/unit_test_common.h
+++ b/headers/unit_test_common.h
@@ -157,8 +157,8 @@ inline int EQUALS_FLOAT( float fa, float fb , unsigned int err )
   {  // then we have an unacceptable error
 
      // report an unaaceptable error
-     memcpy( &ui1,  &fa, sizeof(float) );
-     memcpy( &ui2,  &fb, sizeof(float) );
+     memcpy( &ui1,  &fa, sizeof(fa) ); // copy exactly the bytes of the float argument; ui1/ui2 are printed in hex below
+     memcpy( &ui2,  &fb, sizeof(fb) );
 
      fprintf( stderr, "HINT: %e (0x%04X) != %e (0x%04X) ", fa, ui1, fb, ui2 );
 
diff --git a/headers/unit_test_invmat_operation_x.h b/headers/unit_test_invmat_operation_x.h
index 850abef..a1b9ac3 100644
--- a/headers/unit_test_invmat_operation_x.h
+++ b/headers/unit_test_invmat_operation_x.h
@@ -52,6 +52,10 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -74,6 +78,32 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+  // Special case dst == src: an in-place run must match an out-of-place run on a copy of the same input.
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  // Only esp_buf[0] is filled: this operation takes a single input and esp_buf[1] is never allocated here.
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len );
+
+  for ( i = 0;  i < tmp_len * (opcode+1) * (opcode+1); i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
diff --git a/headers/unit_test_mla_operation_x.h b/headers/unit_test_mla_operation_x.h
index abe2b70..8d56e28 100644
--- a/headers/unit_test_mla_operation_x.h
+++ b/headers/unit_test_mla_operation_x.h
@@ -56,6 +56,10 @@ arm_float_t * thesrc2 = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Nine buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[9];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -88,6 +92,56 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+  // Special case: the destination aliases one of the inputs (DST == ACC first, then DST == SRC).
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t); // sized by the element type of esp_buf
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // input 3
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[5] = (arm_float_t*) malloc( inbytes ); // copy of 3rd input
+  esp_buf[6] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[2], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[3], esp_buf[0], inbytes );
+  memcpy( esp_buf[4], esp_buf[1], inbytes );
+  memcpy( esp_buf[5], esp_buf[2], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1],  esp_buf[2], tmp_len ); // DST == ACC
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4],  esp_buf[5], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[6][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: MLA Operation number %d implementation [%d] has failed the DST==ACC test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
+          exit( NE10_ERR );
+     }
+  }
+
+  memcpy( esp_buf[3], esp_buf[0], inbytes );
+  memcpy( esp_buf[4], esp_buf[1], inbytes );
+  memcpy( esp_buf[5], esp_buf[2], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , esp_buf[0], esp_buf[1],  esp_buf[2], tmp_len ); // DST == SRC
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4],  esp_buf[5], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[1][i] != esp_buf[6][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: MLA Operation number %d implementation [%d] has failed the DST==SRC test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]); free(esp_buf[5]); free(esp_buf[6]); 
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc1, thesrc2, ARRLEN );
diff --git a/headers/unit_test_mlac_operation_x.h b/headers/unit_test_mlac_operation_x.h
index 5b7dc6e..beb0d8a 100644
--- a/headers/unit_test_mlac_operation_x.h
+++ b/headers/unit_test_mlac_operation_x.h
@@ -56,6 +56,10 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Nine buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[9];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -91,6 +95,57 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+
+  // Special case: the destination aliases one of the inputs (DST == ACC first, then DST == SRC).
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t); // sized by the element type of esp_buf
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // input 3
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[5] = (arm_float_t*) malloc( inbytes ); // copy of 3rd input
+  esp_buf[6] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[2], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[3], esp_buf[0], inbytes );
+  memcpy( esp_buf[4], esp_buf[1], inbytes );
+  memcpy( esp_buf[5], esp_buf[2], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1],  esp_buf[2], tmp_len ); // DST == ACC
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4],  esp_buf[5], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[6][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: MLAC Operation number %d implementation [%d] has failed the DST==ACC test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
+          exit( NE10_ERR );
+     }
+  }
+
+  memcpy( esp_buf[3], esp_buf[0], inbytes );
+  memcpy( esp_buf[4], esp_buf[1], inbytes );
+  memcpy( esp_buf[5], esp_buf[2], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , esp_buf[0], esp_buf[1],  esp_buf[2], tmp_len ); // DST == SRC
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4],  esp_buf[5], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[1][i] != esp_buf[6][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: MLAC Operation number %d implementation [%d] has failed the DST==SRC test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]); free(esp_buf[5]); free(esp_buf[6]);
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc, thecst, ARRLEN );
diff --git a/headers/unit_test_mulcmatvec_operation_x.h b/headers/unit_test_mulcmatvec_operation_x.h
index 6bbc02e..8d2cdca 100644
--- a/headers/unit_test_mulcmatvec_operation_x.h
+++ b/headers/unit_test_mulcmatvec_operation_x.h
@@ -55,6 +55,10 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -82,6 +86,31 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+  // Special case dst == src: an in-place run must match an out-of-place run on a copy of the same input.
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
+  memcpy( esp_buf[3], esp_buf[1], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , thecst, esp_buf[1], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , thecst, esp_buf[3], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[1][i] != esp_buf[4][i] ) // the in-place result lives in esp_buf[1]; esp_buf[0] is never allocated here
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[1]); free(esp_buf[3]); free(esp_buf[4]);
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thecst, thesrc, ARRLEN );
diff --git a/headers/unit_test_normalize_operation_x.h b/headers/unit_test_normalize_operation_x.h
index 75307f7..d413a44 100644
--- a/headers/unit_test_normalize_operation_x.h
+++ b/headers/unit_test_normalize_operation_x.h
@@ -52,6 +52,10 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -74,6 +78,33 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+
+  // Special case dst == src: run in place, then compare against an out-of-place run on a copy of the input.
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t); // sized by the element type of esp_buf
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
+
+
   // sample run
   MEASURE( elapsed,
       // call the function
diff --git a/headers/unit_test_x_operation_x.h b/headers/unit_test_x_operation_x.h
index 0a969f3..6acffee 100644
--- a/headers/unit_test_x_operation_x.h
+++ b/headers/unit_test_x_operation_x.h
@@ -54,6 +54,10 @@ arm_float_t * thesrc2 = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -81,6 +85,37 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+
+  // test the especial case where dst == src
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t); 
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+  memcpy( esp_buf[3], esp_buf[1], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
+
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
diff --git a/headers/unit_test_x_operation_x_tolerant.h b/headers/unit_test_x_operation_x_tolerant.h
index 5aa19b1..dfb2489 100644
--- a/headers/unit_test_x_operation_x_tolerant.h
+++ b/headers/unit_test_x_operation_x_tolerant.h
@@ -45,42 +45,97 @@ struct timezone zone;
 // there is a max of "4" components in a vec
 #define MAX_VEC_COMPONENTS 4
 
+arm_float_t * guarded_src1 = NULL;
+arm_float_t * guarded_src2 = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
 arm_float_t * thesrc1 = NULL;
 arm_float_t * thesrc2 = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+
   // initialize if not done so
   if ( 0 == done_init )
   {
-    thesrc1 = (arm_float_t*) malloc( ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
+    guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
+    GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
+    thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16);
     FILL_FLOAT_ARRAY_LIMIT( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
 
-    thesrc2 = (arm_float_t*) malloc( ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
+    guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
+    GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
+    thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
     FILL_FLOAT_ARRAY_LIMIT( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
 
     for ( i = 0; i<IMPL_COUNT; i++ )
     {
-      thedst[i] = (arm_float_t*) malloc( ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
+      guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
+      GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
+      thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
     }
 
     done_init = 1;
   }
 
+  // test the especial case where dst == src
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+  memcpy( esp_buf[3], esp_buf[1], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
+
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
   );
 
-          MEASURE( elapsed,
-            for ( i = 0; i < max; i++  )
-            {
-               // call the function
-               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
-            }
-           );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+
+  MEASURE( elapsed,
+    for ( i = 0; i < max; i++  )
+    {
+      // call the function
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
+    }
+   );
 
   if ( !mute )
        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
@@ -200,11 +255,11 @@ arm_result_t run_test( int argc, char **argv )
 
 
   // free any allocated memory...
-  free( thesrc1 );
-  free( thesrc2 );
+  free( guarded_src1 );
+  free( guarded_src2 );
   for ( i = 0; i<IMPL_COUNT; i++ )
   {
-    free( thedst[i] );
+    free( guarded_dst[i] );
   }
 
   return NE10_OK;
diff --git a/headers/unit_test_xc_operation_x.h b/headers/unit_test_xc_operation_x.h
index 2bc81f6..4885b7c 100644
--- a/headers/unit_test_xc_operation_x.h
+++ b/headers/unit_test_xc_operation_x.h
@@ -54,6 +54,10 @@ arm_float_t * thesrc = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -81,6 +85,37 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+
+  // Special case dst == src: run in place, then compare against an out-of-place run on copies of the inputs.
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t); // sized by the element type of esp_buf
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+  memcpy( esp_buf[3], esp_buf[1], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
+
+  for ( i = 0;  i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
+
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, thecst, ARRLEN );
diff --git a/headers/unit_test_xmat_operation_x.h b/headers/unit_test_xmat_operation_x.h
index 9ae0468..c1c6bc5 100644
--- a/headers/unit_test_xmat_operation_x.h
+++ b/headers/unit_test_xmat_operation_x.h
@@ -55,6 +55,10 @@ arm_float_t * thesrc2 = NULL;
 arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
 int done_init = 0;
 
+// Eight buffers that are used for special test cases, such as when the destination and source point to the same address.
+// Their sizes may vary from one case to another and from one function to another.
+arm_float_t*  esp_buf[8];
+
 arm_result_t test_operation()
 {
   const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
@@ -82,6 +86,35 @@ arm_result_t test_operation()
     done_init = 1;
   }
 
+  // test the especial case where dst == src
+  unsigned int tmp_len = 13; // Just an odd number bigger than 8
+  unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
+  esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
+  esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
+  esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
+  esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
+  esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
+
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
+  memcpy( esp_buf[2], esp_buf[0], inbytes );
+  memcpy( esp_buf[3], esp_buf[1], inbytes );
+
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
+  ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
+
+  for ( i = 0;  i < tmp_len * (opcode+1) * (opcode+1); i++ ) // at this point the two outputs must be identical
+  {
+      if ( esp_buf[0][i] != esp_buf[4][i] )
+      {
+          fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
+          fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
+          exit( NE10_ERR );
+     }
+  }
+
+  free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
+
   // sample run
   MEASURE( dt_test_sample,
     ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
-- 
2.34.1