/**
 * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @file   tensor.cpp
 * @date   04 December 2019
 * @brief  This is Tensor class for calculation
 * @see    https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug    No known bugs except for NYI items
 */
#include "include/tensor.h"
#include <assert.h>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <regex>
#include <stdexcept>

#ifdef USE_BLAS
extern "C" {
#include <cblas.h>
}
#endif

#ifdef USE_CUBLAS
#include <cublas_v2.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#endif
void TensorDim::setTensorDim(std::string input_shape) {
  std::regex words_regex("[^\\s.,:;!?]+");
  auto words_begin = std::sregex_iterator(input_shape.begin(), input_shape.end(), words_regex);
  auto words_end = std::sregex_iterator();
  int cur_dim = std::distance(words_begin, words_end);
  if (cur_dim > MAXDIM) {
    std::cout << "Tensor Dimension should be less than 4" << std::endl;
    return;
  }
  int cn = 0;
  // Right-align the parsed dimensions, e.g. "3:4" fills Dim[2] and Dim[3].
  for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
    Dim[MAXDIM - cur_dim + cn] = std::stoi((*i).str());
    cn++;
  }
}
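/**
 * Usage sketch (illustrative, not part of the original source), assuming
 * MAXDIM == 4 and Dim default-initialized to {1, 1, 1, 1}:
 * @code
 *   TensorDim dim;
 *   dim.setTensorDim("32:28:28");  // Dim becomes {1, 32, 28, 28}
 * @endcode
 * Leading dimensions that are not given stay untouched because the parsed
 * values are right-aligned into Dim.
 */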
Tensor::Tensor(int height, int width) {
  this->height = height;
  this->width = width;
  this->batch = 1;
  this->len = height * width * batch;
  this->data = std::vector<float>(len);
}

Tensor::Tensor(int batch, int height, int width) {
  this->height = height;
  this->width = width;
  this->batch = batch;
  this->len = height * width * batch;
  this->data = std::vector<float>(len);
}
float Tensor::getValue(int batch, int h, int w) {
  // Row-major layout: index = batch * (height * width) + h * width + w.
  return this->data[batch * height * width + h * width + w];
}

void Tensor::setValue(int batch, int h, int w, float value) {
  this->data[batch * height * width + h * width + w] = value;
}
Tensor::Tensor(std::vector<std::vector<float>> const &d) {
  assert(d.size() != 0);
  this->height = d.size();
  this->width = d[0].size();
  this->batch = 1;
  this->len = height * width * batch;
  this->data = std::vector<float>(len);

  for (int j = 0; j < height; ++j)
    for (int k = 0; k < width; ++k)
      this->setValue(0, j, k, d[j][k]);
}
Tensor::Tensor(std::vector<std::vector<std::vector<float>>> const &d) {
  assert(d.size() != 0 && d[0].size() != 0);
  this->batch = d.size();
  this->height = d[0].size();
  this->width = d[0][0].size();
  this->len = this->batch * this->height * this->width;
  this->data = std::vector<float>(len);

  for (int i = 0; i < this->batch; ++i)
    for (int j = 0; j < this->height; ++j)
      for (int k = 0; k < this->width; ++k)
        this->setValue(i, j, k, d[i][j][k]);
}
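/**
 * Usage sketch (illustrative, not part of the original source): the nested
 * std::vector constructors infer the dimensions from the input.
 * @code
 *   Tensor t({{1.0f, 2.0f}, {3.0f, 4.0f}});  // batch=1, height=2, width=2
 *   float v = t.getValue(0, 1, 0);           // v == 3.0f
 * @endcode
 */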
Tensor Tensor::multiply(float const &value) {
  Tensor result(batch, height, width);
#ifdef USE_BLAS
  memset(result.data.data(), 0, sizeof(float) * result.len);
  cblas_saxpy(this->len, value, this->data.data(), 1, result.data.data(), 1);
#else
  for (int k = 0; k < len; ++k) {
    result.data[k] = data[k] * value;
  }
#endif
  return result;
}
Tensor Tensor::divide(float const &value) {
  Tensor result(batch, height, width);
#ifdef USE_BLAS
  memset(result.data.data(), 0, sizeof(float) * result.len);
  cblas_saxpy(this->len, 1.0 / value, this->data.data(), 1, result.data.data(), 1);
#else
  for (int k = 0; k < len; ++k) {
    result.data[k] = data[k] / value;
  }
#endif
  return result;
}
Tensor Tensor::add(float const &value) {
  Tensor result(batch, height, width);
#ifdef USE_BLAS
  cblas_scopy(this->len, this->data.data(), 1, result.data.data(), 1);
  // saxpy has no scalar-broadcast form, so add value * (tensor of ones).
  Tensor tmp(batch, height, width);
  for (int i = 0; i < tmp.len; ++i)
    tmp.data[i] = 1.0;
  cblas_saxpy(this->len, value, tmp.data.data(), 1, result.data.data(), 1);
#else
  for (int k = 0; k < len; ++k) {
    result.data[k] = data[k] + value;
  }
#endif
  return result;
}
Tensor Tensor::add(Tensor const &m) const {
  assert(height == m.height && width == m.width);

  Tensor result(batch, height, width);
#ifdef USE_BLAS
  cblas_scopy(this->len, this->data.data(), 1, result.data.data(), 1);
  int size = this->width * this->height;
  if (m.batch == 1) {
    // Broadcast the single slice of m across every batch slice.
    for (int k = 0; k < batch; ++k) {
      cblas_saxpy(size, 1.0, m.data.data(), 1, &(result.data.data()[k * size]), 1);
    }
  } else {
    assert(batch == m.batch);
    cblas_saxpy(this->len, 1.0, m.data.data(), 1, result.data.data(), 1);
  }
#else
  int i, j, k;
  if (m.batch == 1) {
    for (k = 0; k < batch; ++k) {
      for (i = 0; i < m.len; ++i) {
        j = k * m.len;
        result.data[j + i] = data[j + i] + m.data[i];
      }
    }
  } else {
    assert(batch == m.batch);
    for (k = 0; k < len; ++k) {
      result.data[k] = data[k] + m.data[k];
    }
  }
#endif
  return result;
}
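/**
 * Usage sketch (illustrative, not part of the original source): when
 * m.batch == 1 the single slice of m is added to every batch slice, which
 * is how a shared bias is applied across a mini-batch.
 * @code
 *   Tensor x(8, 1, 10);   // activations for a batch of 8
 *   Tensor b(1, 1, 10);   // one bias row
 *   Tensor y = x.add(b);  // b is reused for all 8 slices
 * @endcode
 */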
Tensor Tensor::subtract(Tensor const &m) const {
  assert(height == m.height && width == m.width);
  Tensor result(batch, height, width);
#ifdef USE_BLAS
  cblas_scopy(this->len, this->data.data(), 1, result.data.data(), 1);
  int size = this->width * this->height;
  float alpha = -1.0;
  if (m.batch == 1) {
    for (int k = 0; k < batch; ++k) {
      cblas_saxpy(size, alpha, m.data.data(), 1, &(result.data.data()[k * size]), 1);
    }
  } else {
    assert(batch == m.batch);
    cblas_saxpy(this->len, alpha, m.data.data(), 1, result.data.data(), 1);
  }
#else
  int i, j, k;
  if (m.batch == 1) {
    for (k = 0; k < batch; ++k) {
      for (i = 0; i < m.len; ++i) {
        j = k * m.len;
        result.data[j + i] = data[j + i] - m.data[i];
      }
    }
  } else {
    assert(batch == m.batch);
    for (k = 0; k < len; ++k) {
      result.data[k] = data[k] - m.data[k];
    }
  }
#endif
  return result;
}
Tensor Tensor::subtract(float const &value) {
  Tensor result(batch, height, width);
#ifdef USE_BLAS
  cblas_scopy(this->len, this->data.data(), 1, result.data.data(), 1);
  // Subtract by adding value * (tensor of -1).
  Tensor tmp(batch, height, width);
  for (int i = 0; i < tmp.len; ++i)
    tmp.data[i] = -1.0;
  cblas_saxpy(this->len, value, tmp.data.data(), 1, result.data.data(), 1);
#else
  for (int k = 0; k < len; ++k) {
    result.data[k] = data[k] - value;
  }
#endif
  return result;
}
Tensor Tensor::multiply(Tensor const &m) const {
  assert(height == m.height && width == m.width);
  Tensor result(batch, height, width);

  // Manually unrolled by 4; the tail loops handle the len % 4 leftovers.
  int end = this->len / 4;
  int e = width * height / 4;
  int i;
  if (m.batch == 1) {
    for (int k = 0; k < batch; ++k) {
      int b = k * width * height;
      for (i = 0; i < e * 4; i += 4) {
        result.data[b + i + 0] = this->data[b + i + 0] * m.data[i + 0];
        result.data[b + i + 1] = this->data[b + i + 1] * m.data[i + 1];
        result.data[b + i + 2] = this->data[b + i + 2] * m.data[i + 2];
        result.data[b + i + 3] = this->data[b + i + 3] * m.data[i + 3];
      }
      for (int j = i; j < width * height; j++)
        result.data[b + j] = this->data[b + j] * m.data[j];
    }
  } else {
    assert(batch == m.batch);
    for (i = 0; i < end * 4; i += 4) {
      result.data[i + 0] = this->data[i + 0] * m.data[i + 0];
      result.data[i + 1] = this->data[i + 1] * m.data[i + 1];
      result.data[i + 2] = this->data[i + 2] * m.data[i + 2];
      result.data[i + 3] = this->data[i + 3] * m.data[i + 3];
    }
    for (int j = i; j < len; ++j)
      result.data[j] = this->data[j] * m.data[j];
  }
  return result;
}
Tensor Tensor::divide(Tensor const &m) const {
  assert(height == m.height && width == m.width);
  Tensor result(batch, height, width);

  int end = this->len / 4;
  int e = width * height / 4;
  int i;
  if (m.batch == 1) {
    for (int k = 0; k < batch; ++k) {
      int b = k * width * height;
      for (i = 0; i < e * 4; i += 4) {
        result.data[b + i + 0] = this->data[b + i + 0] / m.data[i + 0];
        result.data[b + i + 1] = this->data[b + i + 1] / m.data[i + 1];
        result.data[b + i + 2] = this->data[b + i + 2] / m.data[i + 2];
        result.data[b + i + 3] = this->data[b + i + 3] / m.data[i + 3];
      }
      for (int j = i; j < width * height; ++j)
        result.data[b + j] = this->data[b + j] / m.data[j];
    }
  } else {
    assert(batch == m.batch);
    for (i = 0; i < end * 4; i += 4) {
      result.data[i + 0] = this->data[i + 0] / m.data[i + 0];
      result.data[i + 1] = this->data[i + 1] / m.data[i + 1];
      result.data[i + 2] = this->data[i + 2] / m.data[i + 2];
      result.data[i + 3] = this->data[i + 3] / m.data[i + 3];
    }
    for (int j = i; j < len; ++j)
      result.data[j] = this->data[j] / m.data[j];
  }
  return result;
}
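/**
 * Usage sketch (illustrative, not part of the original source): multiply()
 * and divide() are element-wise (Hadamard) operations with the same
 * batch-broadcast rule as add():
 * @code
 *   Tensor x(4, 2, 3);
 *   Tensor mask(1, 2, 3);          // single slice
 *   Tensor y = x.multiply(mask);   // mask applied to each of the 4 slices
 * @endcode
 */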
/**
 * This sums the Tensor data along the batch axis.
 * Therefore the result has dimension (batch, 1, 1).
 */
Tensor Tensor::sum() const {
  int k;
  Tensor ret(batch, 1, 1);
#ifdef USE_BLAS
  // Note: cblas_sasum sums absolute values, which matches a plain sum only
  // for non-negative data.
  for (k = 0; k < batch; ++k)
    ret.data[k] = cblas_sasum(width * height, &(data.data()[k * width * height]), 1);
#else
  int i;
  for (k = 0; k < batch; ++k) {
    int id = k * width * height;
    ret.data[k] = 0.0;
    for (i = 0; i < height * width; ++i) {
      ret.data[k] += data[id + i];
    }
  }
#endif
  return ret;
}
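/**
 * Usage sketch (illustrative, not part of the original source):
 * @code
 *   Tensor t(2, 3, 4);   // 2 batches of 3x4, zero-initialized
 *   Tensor s = t.sum();  // s has dimension (2, 1, 1);
 *                        // s.data[k] is the sum of batch slice k
 * @endcode
 */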
Tensor Tensor::sum(int axis) const {
  Tensor ret;

  switch (axis) {
  case 0: {
    // Sum over batch: (batch, height, width) -> (1, height, width).
    ret = Tensor(1, height, width);
    for (int i = 0; i < height; ++i) {
      int I = i * width;
      for (int j = 0; j < width; ++j) {
        for (int k = 0; k < batch; ++k) {
          int K = k * width * height;
          ret.data[I + j] += data[K + I + j];
        }
      }
    }
  } break;
  case 1: {
    // Sum over height: (batch, height, width) -> (batch, 1, width).
    ret = Tensor(batch, 1, width);
    for (int k = 0; k < batch; ++k) {
      int K = k * width;
      for (int j = 0; j < width; ++j) {
        for (int i = 0; i < height; ++i) {
          ret.data[K + j] += data[k * height * width + i * width + j];
        }
      }
    }
  } break;
  case 2: {
    // Sum over width: (batch, height, width) -> (batch, height, 1).
    ret = Tensor(batch, height, 1);
    for (int k = 0; k < batch; ++k) {
      int K = k * height;
      for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
          ret.data[K + i] += data[k * height * width + i * width + j];
        }
      }
    }
  } break;
  default:
    throw std::runtime_error("Error: axis cannot exceed 2");
  }

  return ret;
}
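/**
 * Usage sketch (illustrative, not part of the original source):
 * @code
 *   Tensor t(2, 3, 4);
 *   Tensor s0 = t.sum(0);  // (1, 3, 4): summed across the 2 batches
 *   Tensor s1 = t.sum(1);  // (2, 1, 4): summed across the 3 rows
 *   Tensor s2 = t.sum(2);  // (2, 3, 1): summed across the 4 columns
 * @endcode
 */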
/**
 * If the batch size of m is one, then it is reused for
 * every calculation along the batch.
 */
Tensor Tensor::dot(Tensor const &m) const {
  assert(width == m.height);
  int mwidth = m.width;
  Tensor result(batch, height, mwidth);

#ifdef USE_BLAS
  float alpha_dgemm = 1.0;
  // result is zero-initialized, so beta = 1.0 accumulates into zeros.
  float beta_dgemm = 1.0;
  if (m.batch == 1) {
    for (int k = 0; k < batch; k++) {
      int i = k * width * height;
      int ii = k * height * mwidth;
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, height, mwidth, width, alpha_dgemm, &(data.data()[i]),
                  width, m.data.data(), mwidth, beta_dgemm, &(result.data.data()[ii]), mwidth);
    }
  } else {
    assert(batch == m.batch);
    for (int k = 0; k < batch; k++) {
      int i = k * width * height;
      int j = k * m.width * m.height;
      int ii = k * height * mwidth;
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, height, mwidth, width, alpha_dgemm, &(data.data()[i]),
                  width, &(m.data.data()[j]), mwidth, beta_dgemm, &(result.data.data()[ii]), mwidth);
    }
  }
#elif defined(USE_CUBLAS)
  int devID = 0;
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, devID);
  float *d_A, *d_B, *d_C;

  unsigned int size_A = this->width * height * sizeof(float);
  unsigned int size_B = m.width * m.height * sizeof(float);
  unsigned int size_C = result.width * result.height * sizeof(float);

  if (m.batch == 1) {
    for (int k = 0; k < batch; k++) {
      int i = k * width * height;
      int ii = k * height * mwidth;

      checkCudaErrors(cudaMalloc((void **)&d_A, size_A));
      checkCudaErrors(cudaMalloc((void **)&d_B, size_B));
      checkCudaErrors(cudaMemcpy(d_A, &data.data()[i], size_A, cudaMemcpyHostToDevice));
      checkCudaErrors(cudaMemcpy(d_B, m.data.data(), size_B, cudaMemcpyHostToDevice));
      checkCudaErrors(cudaMalloc((void **)&d_C, size_C));

      const float alpha = 1.0f;
      const float beta = 0.0f;
      cublasHandle_t handle;
      checkCudaErrors(cublasCreate(&handle));
      // cuBLAS is column-major, so compute C^T = B^T * A^T.
      checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m.width, height, width, &alpha, d_B, m.width,
                                  d_A, width, &beta, d_C, m.width));
      checkCudaErrors(cudaMemcpy(&result.data.data()[ii], d_C, size_C, cudaMemcpyDeviceToHost));
      checkCudaErrors(cublasDestroy(handle));
      cudaFree(d_A);
      cudaFree(d_B);
      cudaFree(d_C);
    }
  } else {
    assert(batch == m.batch);
    for (int k = 0; k < batch; k++) {
      int i = k * width * height;
      int j = k * m.width * m.height;
      int ii = k * height * mwidth;

      checkCudaErrors(cudaMalloc((void **)&d_A, size_A));
      checkCudaErrors(cudaMalloc((void **)&d_B, size_B));
      checkCudaErrors(cudaMemcpy(d_A, &data.data()[i], size_A, cudaMemcpyHostToDevice));
      checkCudaErrors(cudaMemcpy(d_B, &m.data.data()[j], size_B, cudaMemcpyHostToDevice));
      checkCudaErrors(cudaMalloc((void **)&d_C, size_C));

      const float alpha = 1.0f;
      const float beta = 0.0f;
      cublasHandle_t handle;
      checkCudaErrors(cublasCreate(&handle));
      checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m.width, height, width, &alpha, d_B, m.width,
                                  d_A, width, &beta, d_C, m.width));
      checkCudaErrors(cudaMemcpy(&result.data.data()[ii], d_C, size_C, cudaMemcpyDeviceToHost));
      checkCudaErrors(cublasDestroy(handle));
      cudaFree(d_A);
      cudaFree(d_B);
      cudaFree(d_C);
    }
  }
#else
  float w = 0.0;
  int i, j, k, h;
  if (m.batch == 1) {
    for (k = 0; k < batch; ++k) {
      for (i = 0; i < height; ++i) {
        for (j = 0; j < mwidth; ++j) {
          for (h = 0; h < width; ++h) {
            w += data[k * height * width + i * width + h] * m.data[h * mwidth + j];
          }
          result.data[k * height * mwidth + i * mwidth + j] = w;
          w = 0.0;
        }
      }
    }
  } else {
    assert(batch == m.batch);
    for (k = 0; k < batch; k++) {
      for (i = 0; i < height; i++) {
        for (j = 0; j < mwidth; j++) {
          for (h = 0; h < width; h++) {
            w += data[k * height * width + i * width + h] * m.data[k * width * mwidth + h * mwidth + j];
          }
          result.data[k * height * mwidth + i * mwidth + j] = w;
          w = 0.0;
        }
      }
    }
  }
#endif

  return result;
}
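/**
 * Usage sketch (illustrative, not part of the original source): dot()
 * performs a per-batch matrix product, broadcasting m when m.batch == 1,
 * which is the fully-connected-layer pattern (input . weight):
 * @code
 *   Tensor input(32, 1, 128);        // batch of 32 row vectors
 *   Tensor weight(1, 128, 10);       // one shared weight matrix
 *   Tensor out = input.dot(weight);  // (32, 1, 10)
 * @endcode
 */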
Tensor Tensor::transpose() const {
  Tensor result(batch, width, height);
  int i, j, k;
  for (k = 0; k < batch; ++k) {
    int b = k * width * height;
    for (i = 0; i < width; ++i) {
      for (j = 0; j < height; ++j) {
        result.data[b + i * height + j] = data[b + j * width + i];
      }
    }
  }
  return result;
}
Tensor Tensor::apply(float (*function)(float)) const {
  Tensor result(batch, height, width);
  int i;

  for (i = 0; i < this->len; ++i)
    result.data[i] = (*function)(data[i]);

  return result;
}

Tensor Tensor::apply(Tensor (*function)(Tensor)) const { return (*function)(*this); }
void Tensor::print(std::ostream &out) const {
  int i, j, k;
  for (k = 0; k < batch; k++) {
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j++) {
        out << data[k * width * height + i * width + j] << " ";
      }
      out << std::endl;
    }
    out << std::endl;
  }
}
std::ostream &operator<<(std::ostream &out, Tensor const &m) {
  m.print(out);
  return out;
}
Tensor &Tensor::copy(const Tensor &from) {
  if (this != &from && from.len != 0) {
    height = from.height;
    width = from.width;
    batch = from.batch;
    len = from.len;
    data.resize(len);
#ifdef USE_BLAS
    cblas_scopy(this->len, from.data.data(), 1, this->data.data(), 1);
#else
    for (int i = 0; i < len; ++i)
      data[i] = from.data[i];
#endif
  }
  return *this;
}
/**
 * This generates a one-dimensional vector containing every element
 * of the Tensor.
 */
std::vector<float> Tensor::Mat2Vec() {
  std::vector<float> ret;

  for (int i = 0; i < this->len; i++)
    ret.push_back(data[i]);

  return ret;
}
void Tensor::save(std::ofstream &file) {
  for (int i = 0; i < this->len; i++)
    file.write((char *)&data[i], sizeof(float));
}

void Tensor::read(std::ifstream &file) {
  for (int i = 0; i < this->len; i++)
    file.read((char *)&data[i], sizeof(float));
}
/**
 * This calculates the average value along the batch direction.
 * That is why the result has (1, height, width) dimension.
 */
Tensor Tensor::average() const {
  if (batch == 1)
    return *this;

  Tensor result(1, height, width);
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      result.data[i * width + j] = 0.0;
      for (int k = 0; k < batch; k++) {
        result.data[i * width + j] += data[k * width * height + i * width + j];
      }
      result.data[i * width + j] = result.data[i * width + j] / (float)batch;
    }
  }
  return result;
}
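/**
 * Usage sketch (illustrative, not part of the original source):
 * @code
 *   Tensor grad(32, 5, 5);              // per-sample values
 *   Tensor mean_grad = grad.average();  // (1, 5, 5) mean over the batch
 * @endcode
 */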
void Tensor::setZero() { memset(this->data.data(), 0, sizeof(float) * this->len); }
Tensor Tensor::softmax() const {
  Tensor result(batch, height, width);
  Tensor divisor(batch, height, 1);

  divisor.setZero();

  // Accumulate exp() per row of each batch slice.
  for (int k = 0; k < batch; k++) {
    int index = k * height;
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        divisor.data[index + i] += exp(this->data[k * height * width + i * width + j]);
      }
    }
  }

  // Collapse the per-row sums so each batch normalizes over its whole slice.
  for (int k = 0; k < batch; ++k) {
    int index = k * height;
    for (int i = 1; i < height; ++i) {
      divisor.data[index] += divisor.data[index + i];
    }
  }

  for (int k = 0; k < batch; k++) {
    int index = k * height;
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        int id = k * height * width + i * width + j;
        result.data[id] = exp(this->data[id]) / divisor.data[index];
      }
    }
  }

  return result;
}
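/**
 * Usage sketch (illustrative, not part of the original source). Note that
 * the divisor is collapsed per batch, so the softmax is normalized over the
 * entire height * width slice of each batch, not per row:
 * @code
 *   Tensor logits(1, 1, 10);
 *   Tensor probs = logits.softmax();  // each batch slice of probs sums to 1
 * @endcode
 */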
int Tensor::argmax() {
  int index = 0;
  float maximum = std::numeric_limits<float>::lowest();
  for (int i = 0; i < len; i++) {
    if (this->data[i] > maximum) {
      maximum = this->data[i];
      index = i;
    }
  }
  return index;
}
float Tensor::l2norm() const {
  float sum = 0.0;
  for (int i = 0; i < len; i++) {
    sum += this->data[i] * this->data[i];
  }
  return sqrt(sum);
}
Tensor Tensor::normalization() const {
  Tensor results(batch, height, width);
  float Min = std::numeric_limits<float>::max();
  float Max = std::numeric_limits<float>::lowest();

  for (int k = 0; k < batch; ++k) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        int id = k * height * width + i * width + j;
        if (this->data[id] < Min)
          Min = this->data[id];
        if (this->data[id] > Max)
          Max = this->data[id];
      }
    }
  }
  float dif = Max - Min;

  for (int k = 0; k < batch; ++k) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        int id = k * height * width + i * width + j;
        results.data[id] = (this->data[id] - Min) / dif;
      }
    }
  }

  return results;
}
Tensor Tensor::standardization() const {
  Tensor result(batch, height, width);

  for (int k = 0; k < batch; ++k) {
    int K = k * height * width;
    float mean;
    float mean_tmp = 0.0;
    float std_dev;
    float std_tmp = 0.0;

    for (int i = 0; i < height; ++i) {
      int I = K + i * width;
      for (int j = 0; j < width; ++j) {
        int J = I + j;
        mean_tmp += this->data[J];
      }
    }

    mean = mean_tmp / (this->width * this->height);

    for (int i = 0; i < height; ++i) {
      int I = K + i * width;
      for (int j = 0; j < width; ++j) {
        int J = I + j;
        std_tmp += (this->data[J] - mean) * (this->data[J] - mean);
      }
    }

    std_dev = sqrt(std_tmp / (this->width * this->height));

    for (int i = 0; i < height; ++i) {
      int I = K + i * width;
      for (int j = 0; j < width; ++j) {
        int J = I + j;
        result.data[J] = (this->data[J] - mean) / std_dev;
      }
    }
  }

  return result;
}
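/**
 * Usage sketch (illustrative, not part of the original source): each batch
 * slice is standardized independently to zero mean and unit variance:
 * @code
 *   Tensor x(2, 28, 28);
 *   Tensor z = x.standardization();  // per-slice (x - mean) / std_dev
 * @endcode
 */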